001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers; 002 003/*- 004 * #%L 005 * HAPI FHIR - Server Framework 006 * %% 007 * Copyright (C) 2014 - 2022 Smile CDR, Inc. 008 * %% 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 * #L% 021 */ 022 023import java.util.ArrayList; 024import java.util.Arrays; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Set; 030import java.util.regex.Pattern; 031import java.util.stream.Collectors; 032 033/** 034 * Standardizes text literals by removing noise characters. 035 */ 036public class TextStandardizer implements IStandardizer { 037 038 public static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); 039 040 public static final int EXT_ASCII_RANGE_START = 155; 041 public static final int EXT_ASCII_RANGE_END = 255; 042 043 private List<Range> myAllowedExtendedAscii; 044 private Set<Integer> myAllowedNonLetterAndDigitCharacters = new HashSet<>(); 045 private NoiseCharacters myNoiseCharacters = new NoiseCharacters(); 046 private Map<Integer, Character> myTranslates = new HashMap<>(); 047 048 public TextStandardizer() { 049 myNoiseCharacters.initializeFromClasspath(); 050 051 initializeAllowedNonLetterAndDigitCharacters(); 052 initializeTranslates(); 053 initializeAllowedExtendedAscii(); 054 } 055 056 protected void initializeAllowedNonLetterAndDigitCharacters() { 057 addAllowedNonLetterAndDigitCharacters('.', '\'', ',', '-', '#', '/', '\\', ' '); 058 } 059 060 protected TextStandardizer addAllowedNonLetterAndDigitCharacters(Character... theCharacters) { 061 myAllowedNonLetterAndDigitCharacters.addAll(asSet(theCharacters)); 062 return this; 063 } 064 065 protected Set<Integer> asSet(Character... theCharacters) { 066 return Arrays.stream(theCharacters) 067 .map(c -> (int) c) 068 .collect(Collectors.toSet()); 069 } 070 071 protected TextStandardizer addTranslate(int theTranslate, char theMapping) { 072 myTranslates.put(theTranslate, theMapping); 073 return this; 074 } 075 076 protected void initializeTranslates() { 077 addTranslate(0x0080, '\''); // PAD 078 addTranslate(0x00A0, ' '); //   079 addTranslate((int) ' ', ' '); //   080 addTranslate(0x201C, '"'); 081 addTranslate(0x201D, '"'); 082 addTranslate(0x2019, ' '); 083 addTranslate(0x2018, ' '); 084 addTranslate(0x02BD, ' '); 085 addTranslate(0x00B4, ' '); 086 addTranslate(0x02DD, '"'); 087 addTranslate((int) '–', '-'); 088 addTranslate((int) '-', '-'); 089 addTranslate((int) '~', '-'); 090 } 091 092 protected void initializeAllowedExtendedAscii() { 093 myAllowedExtendedAscii = new ArrayList<>(); 094 095 // refer to https://www.ascii-code.com for the codes 096 for (int[] i : new int[][]{{192, 214}, {216, 246}, {248, 255}}) { 097 addAllowedExtendedAsciiRange(i[0], i[1]); 098 } 099 } 100 101 protected TextStandardizer addAllowedExtendedAsciiRange(int theRangeStart, int theRangeEnd) { 102 myAllowedExtendedAscii.add(new Range(theRangeStart, theRangeEnd)); 103 return this; 104 } 105 106 public String standardize(String theString) { 107 theString = replaceTranslates(theString); 108 return removeNoise(theString); 109 } 110 111 protected String replaceTranslates(String theString) { 112 StringBuilder buf = new StringBuilder(theString.length()); 113 for (char ch : theString.toCharArray()) { 114 if (myTranslates.containsKey((int) ch)) { 115 buf.append(myTranslates.get((int) ch)); 116 } else { 117 buf.append(ch); 118 } 119 } 120 return buf.toString(); 121 } 122 123 protected String replaceAccents(String theString) { 124 String string = java.text.Normalizer.normalize(theString, java.text.Normalizer.Form.NFD); 125 return DIACRITICAL_MARKS.matcher(string).replaceAll(""); 126 } 127 128 protected String removeNoise(String theToken) { 129 StringBuilder token = new StringBuilder(theToken.length()); 130 for (int offset = 0; offset < theToken.length(); ) { 131 int codePoint = theToken.codePointAt(offset); 132 offset += Character.charCount(codePoint); 133 134 switch (Character.getType(codePoint)) { 135 case Character.CONTROL: // \p{Cc} 136 case Character.FORMAT: // \p{Cf} 137 case Character.PRIVATE_USE: // \p{Co} 138 case Character.SURROGATE: // \p{Cs} 139 case Character.UNASSIGNED: // \p{Cn} 140 break; 141 default: 142 if (!isNoiseCharacter(codePoint)) { 143 token.append(Character.toChars(codePoint)); 144 } 145 break; 146 } 147 } 148 return token.toString(); 149 } 150 151 protected boolean isTranslate(int theChar) { 152 return myTranslates.containsKey(theChar); 153 } 154 155 protected boolean isNoiseCharacter(int theChar) { 156 if (myAllowedExtendedAscii.stream().anyMatch(r -> r.isInRange(theChar))) { 157 return false; 158 } 159 boolean isExtendedAscii = (theChar >= EXT_ASCII_RANGE_START && theChar <= EXT_ASCII_RANGE_END); 160 if (isExtendedAscii) { 161 return true; 162 } 163 return myNoiseCharacters.isNoise(theChar); 164 } 165 166}