001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;
002
003/*-
004 * #%L
005 * HAPI FHIR - Server Framework
006 * %%
007 * Copyright (C) 2014 - 2022 Smile CDR, Inc.
008 * %%
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *      http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 * #L%
021 */
022
023import ca.uhn.fhir.i18n.Msg;
024import ca.uhn.fhir.rest.server.interceptor.ConfigLoader;
025
026import java.util.HashSet;
027import java.util.Scanner;
028import java.util.Set;
029
030public class NoiseCharacters {
031
032        private static final int RANGE_THRESHOLD = 150;
033
034        private Set<Integer> myNoiseCharacters = new HashSet<>();
035        private Set<Range> myNoiseCharacterRanges = new HashSet<>();
036
037        private int size;
038
039        public int getSize() {
040                return myNoiseCharacters.size();
041        }
042
043        public void initializeFromClasspath() {
044                String noiseChars = ConfigLoader.loadResourceContent("classpath:noise-chars.txt");
045                try (Scanner scanner = new Scanner(noiseChars)) {
046                        while (scanner.hasNext()) {
047                                parse(scanner.nextLine());
048                        }
049                }
050        }
051
052        public boolean isNoise(int theChar) {
053                if (myNoiseCharacters.contains(theChar)) {
054                        return true;
055                }
056
057                for (Range r : myNoiseCharacterRanges) {
058                        if (r.isInRange(theChar)) {
059                                return true;
060                        }
061                }
062
063                return false;
064        }
065
066        private void parse(String theString) {
067                if (theString.contains("-")) {
068                        addRange(theString);
069                } else {
070                        add(theString);
071                }
072        }
073
074        public NoiseCharacters add(String theLiteral) {
075                myNoiseCharacters.add(toInt(theLiteral));
076                return this;
077        }
078
079        public NoiseCharacters addRange(String theRange) {
080                if (!theRange.contains("-")) {
081                        throw new IllegalArgumentException(Msg.code(350) + String.format("Invalid range %s", theRange));
082                }
083
084                String[] range = theRange.split("-");
085                if (range.length < 2) {
086                        throw new IllegalArgumentException(Msg.code(351) + String.format("Invalid range %s", theRange));
087                }
088
089                addRange(range[0].trim(), range[1].trim());
090                return this;
091        }
092
093        public NoiseCharacters addRange(String theLowerBound, String theUpperBound) {
094                int lower = toInt(theLowerBound);
095                int upper = toInt(theUpperBound);
096
097                if (lower > upper) {
098                        throw new IllegalArgumentException(Msg.code(352) + String.format("Invalid character range %s-%s", theLowerBound, theUpperBound));
099                }
100
101                if (upper - lower >= RANGE_THRESHOLD) {
102                        myNoiseCharacterRanges.add(new Range(lower, upper));
103                        return this;
104                }
105
106                for (int i = lower; i <= upper; i++) {
107                        myNoiseCharacters.add(i);
108                }
109                return this;
110        }
111
112        private int toInt(String theLiteral) {
113                if (!theLiteral.startsWith("#x")) {
114                        throw new IllegalArgumentException(Msg.code(353) + "Unable to parse " + theLiteral);
115                }
116
117                return Integer.parseInt(theLiteral.substring(2), 16);
118        }
119
120}