001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;
002
003/*-
004 * #%L
005 * HAPI FHIR - Server Framework
006 * %%
007 * Copyright (C) 2014 - 2022 Smile CDR, Inc.
008 * %%
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *      http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 * #L%
021 */
022
023import java.util.ArrayList;
024import java.util.Arrays;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Set;
030import java.util.regex.Pattern;
031import java.util.stream.Collectors;
032
033/**
034 * Standardizes text literals by removing noise characters.
035 */
036public class TextStandardizer implements IStandardizer {
037
038        public static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
039
040        public static final int EXT_ASCII_RANGE_START = 155;
041        public static final int EXT_ASCII_RANGE_END = 255;
042
043        private List<Range> myAllowedExtendedAscii;
044        private Set<Integer> myAllowedNonLetterAndDigitCharacters = new HashSet<>();
045        private NoiseCharacters myNoiseCharacters = new NoiseCharacters();
046        private Map<Integer, Character> myTranslates = new HashMap<>();
047
048        public TextStandardizer() {
049                myNoiseCharacters.initializeFromClasspath();
050
051                initializeAllowedNonLetterAndDigitCharacters();
052                initializeTranslates();
053                initializeAllowedExtendedAscii();
054        }
055
056        protected void initializeAllowedNonLetterAndDigitCharacters() {
057                addAllowedNonLetterAndDigitCharacters('.', '\'', ',', '-', '#', '/', '\\', ' ');
058        }
059
060        protected TextStandardizer addAllowedNonLetterAndDigitCharacters(Character... theCharacters) {
061                myAllowedNonLetterAndDigitCharacters.addAll(asSet(theCharacters));
062                return this;
063        }
064
065        protected Set<Integer> asSet(Character... theCharacters) {
066                return Arrays.stream(theCharacters)
067                        .map(c -> (int) c)
068                        .collect(Collectors.toSet());
069        }
070
071        protected TextStandardizer addTranslate(int theTranslate, char theMapping) {
072                myTranslates.put(theTranslate, theMapping);
073                return this;
074        }
075
076        protected void initializeTranslates() {
077                addTranslate(0x0080, '\''); // PAD
078                addTranslate(0x00A0, ' '); // &nbsp
079                addTranslate((int) ' ', ' '); // &nbsp
080                addTranslate(0x201C, '"');
081                addTranslate(0x201D, '"');
082                addTranslate(0x2019, ' ');
083                addTranslate(0x2018, ' ');
084                addTranslate(0x02BD, ' ');
085                addTranslate(0x00B4, ' ');
086                addTranslate(0x02DD, '"');
087                addTranslate((int) '–', '-');
088                addTranslate((int) '-', '-');
089                addTranslate((int) '~', '-');
090        }
091
092        protected void initializeAllowedExtendedAscii() {
093                myAllowedExtendedAscii = new ArrayList<>();
094
095                // refer to https://www.ascii-code.com for the codes
096                for (int[] i : new int[][]{{192, 214}, {216, 246}, {248, 255}}) {
097                        addAllowedExtendedAsciiRange(i[0], i[1]);
098                }
099        }
100
101        protected TextStandardizer addAllowedExtendedAsciiRange(int theRangeStart, int theRangeEnd) {
102                myAllowedExtendedAscii.add(new Range(theRangeStart, theRangeEnd));
103                return this;
104        }
105
106        public String standardize(String theString) {
107                theString = replaceTranslates(theString);
108                return removeNoise(theString);
109        }
110
111        protected String replaceTranslates(String theString) {
112                StringBuilder buf = new StringBuilder(theString.length());
113                for (char ch : theString.toCharArray()) {
114                        if (myTranslates.containsKey((int) ch)) {
115                                buf.append(myTranslates.get((int) ch));
116                        } else {
117                                buf.append(ch);
118                        }
119                }
120                return buf.toString();
121        }
122
123        protected String replaceAccents(String theString) {
124                String string = java.text.Normalizer.normalize(theString, java.text.Normalizer.Form.NFD);
125                return DIACRITICAL_MARKS.matcher(string).replaceAll("");
126        }
127
128        protected String removeNoise(String theToken) {
129                StringBuilder token = new StringBuilder(theToken.length());
130                for (int offset = 0; offset < theToken.length(); ) {
131                        int codePoint = theToken.codePointAt(offset);
132                        offset += Character.charCount(codePoint);
133
134                        switch (Character.getType(codePoint)) {
135                                case Character.CONTROL:     // \p{Cc}
136                                case Character.FORMAT:      // \p{Cf}
137                                case Character.PRIVATE_USE: // \p{Co}
138                                case Character.SURROGATE:   // \p{Cs}
139                                case Character.UNASSIGNED:  // \p{Cn}
140                                        break;
141                                default:
142                                        if (!isNoiseCharacter(codePoint)) {
143                                                token.append(Character.toChars(codePoint));
144                                        }
145                                        break;
146                        }
147                }
148                return token.toString();
149        }
150
151        protected boolean isTranslate(int theChar) {
152                return myTranslates.containsKey(theChar);
153        }
154
155        protected boolean isNoiseCharacter(int theChar) {
156                if (myAllowedExtendedAscii.stream().anyMatch(r -> r.isInRange(theChar))) {
157                        return false;
158                }
159                boolean isExtendedAscii = (theChar >= EXT_ASCII_RANGE_START && theChar <= EXT_ASCII_RANGE_END);
160                if (isExtendedAscii) {
161                        return true;
162                }
163                return myNoiseCharacters.isNoise(theChar);
164        }
165
166}