001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers; 002 003/*- 004 * #%L 005 * HAPI FHIR - Server Framework 006 * %% 007 * Copyright (C) 2014 - 2022 Smile CDR, Inc. 008 * %% 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 * #L% 021 */ 022 023import ca.uhn.fhir.i18n.Msg; 024import ca.uhn.fhir.rest.server.interceptor.ConfigLoader; 025 026import java.util.HashSet; 027import java.util.Scanner; 028import java.util.Set; 029 030public class NoiseCharacters { 031 032 private static final int RANGE_THRESHOLD = 150; 033 034 private Set<Integer> myNoiseCharacters = new HashSet<>(); 035 private Set<Range> myNoiseCharacterRanges = new HashSet<>(); 036 037 private int size; 038 039 public int getSize() { 040 return myNoiseCharacters.size(); 041 } 042 043 public void initializeFromClasspath() { 044 String noiseChars = ConfigLoader.loadResourceContent("classpath:noise-chars.txt"); 045 try (Scanner scanner = new Scanner(noiseChars)) { 046 while (scanner.hasNext()) { 047 parse(scanner.nextLine()); 048 } 049 } 050 } 051 052 public boolean isNoise(int theChar) { 053 if (myNoiseCharacters.contains(theChar)) { 054 return true; 055 } 056 057 for (Range r : myNoiseCharacterRanges) { 058 if (r.isInRange(theChar)) { 059 return true; 060 } 061 } 062 063 return false; 064 } 065 066 private void parse(String theString) { 067 if (theString.contains("-")) { 068 addRange(theString); 069 } else { 070 add(theString); 071 } 072 } 073 074 public NoiseCharacters add(String theLiteral) { 075 myNoiseCharacters.add(toInt(theLiteral)); 076 return this; 077 } 078 079 public NoiseCharacters addRange(String theRange) { 080 if (!theRange.contains("-")) { 081 throw new IllegalArgumentException(Msg.code(350) + String.format("Invalid range %s", theRange)); 082 } 083 084 String[] range = theRange.split("-"); 085 if (range.length < 2) { 086 throw new IllegalArgumentException(Msg.code(351) + String.format("Invalid range %s", theRange)); 087 } 088 089 addRange(range[0].trim(), range[1].trim()); 090 return this; 091 } 092 093 public NoiseCharacters addRange(String theLowerBound, String theUpperBound) { 094 int lower = toInt(theLowerBound); 095 int upper = toInt(theUpperBound); 096 097 if (lower > upper) { 098 throw new IllegalArgumentException(Msg.code(352) + String.format("Invalid character range %s-%s", theLowerBound, theUpperBound)); 099 } 100 101 if (upper - lower >= RANGE_THRESHOLD) { 102 myNoiseCharacterRanges.add(new Range(lower, upper)); 103 return this; 104 } 105 106 for (int i = lower; i <= upper; i++) { 107 myNoiseCharacters.add(i); 108 } 109 return this; 110 } 111 112 private int toInt(String theLiteral) { 113 if (!theLiteral.startsWith("#x")) { 114 throw new IllegalArgumentException(Msg.code(353) + "Unable to parse " + theLiteral); 115 } 116 117 return Integer.parseInt(theLiteral.substring(2), 16); 118 } 119 120}