001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers; 002 003/*- 004 * #%L 005 * HAPI FHIR - Server Framework 006 * %% 007 * Copyright (C) 2014 - 2022 Smile CDR, Inc. 008 * %% 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 * #L% 021 */ 022 023import org.apache.commons.lang3.StringUtils; 024 025import java.util.ArrayList; 026import java.util.Arrays; 027import java.util.HashSet; 028import java.util.List; 029import java.util.Set; 030import java.util.stream.Collectors; 031 032/** 033 * 034 */ 035public class TitleStandardizer extends LastNameStandardizer { 036 037 private Set<String> myExceptions = new HashSet<>(Arrays.asList("EAS", "EPS", "LLC", "LLP", "of", "at", "in", "and")); 038 private Set<String[]> myBiGramExceptions = new HashSet<String[]>(); 039 040 public TitleStandardizer() { 041 super(); 042 addDelimiters("/", ".", "|", ">", "<", "(", ")", ":", "!"); 043 addAllowed('(', ')', '@', ':', '!', '|', '>', '<'); 044 myBiGramExceptions.add(new String[] {"'", "s"}); 045 } 046 047 private void addAllowed(char... theCharacter) { 048 for (char ch : theCharacter) { 049 addAllowedExtendedAsciiRange((int) ch, (int) ch); 050 addAllowedNonLetterAndDigitCharacters(ch); 051 } 052 } 053 054 @Override 055 public String standardize(String theString) { 056 theString = replaceTranslates(theString); 057 058 return Arrays.stream(theString.split("\\s+")) 059 .map(String::trim) 060 .map(this::standardizeText) 061 .filter(s -> !StringUtils.isEmpty(s)) 062 .map(this::checkTitleExceptions) 063 .collect(Collectors.joining(" ")); 064 } 065 066 private List<String> split(String theString) { 067 int cursor = 0; 068 int start = 0; 069 070 List<String> retVal = new ArrayList<>(); 071 StringBuilder buf = new StringBuilder(); 072 073 while (cursor < theString.length()) { 074 int codePoint = theString.codePointAt(cursor); 075 cursor += Character.charCount(codePoint); 076 if (isNoiseCharacter(codePoint)) { 077 continue; 078 } 079 080 String str = new String(Character.toChars(codePoint)); 081 if (isDelimiter(str)) { 082 if (buf.length() != 0) { 083 retVal.add(buf.toString()); 084 buf.setLength(0); 085 } 086 retVal.add(str); 087 continue; 088 } 089 090 buf.append(str); 091 } 092 093 if (buf.length() != 0) { 094 retVal.add(buf.toString()); 095 } 096 097 return retVal; 098 } 099 100 protected String standardizeText(String theToken) { 101 StringBuilder buf = new StringBuilder(); 102 List<String> parts = split(theToken); 103 104 String prevPart = null; 105 for(String part : parts) { 106 if (isAllText(part)) { 107 part = standardizeNameToken(part); 108 } 109 110 part = checkBiGram(prevPart, part); 111 buf.append(part); 112 prevPart = part; 113 } 114 return buf.toString(); 115 } 116 117 private String checkBiGram(String thePart0, String thePart1) { 118 for (String[] biGram : myBiGramExceptions) { 119 if (biGram[0].equalsIgnoreCase(thePart0) 120 && biGram[1].equalsIgnoreCase(thePart1)) { 121 return biGram[1]; 122 } 123 } 124 return thePart1; 125 } 126 127 private boolean isAllText(String thePart) { 128 for (int offset = 0; offset < thePart.length(); ) { 129 int codePoint = thePart.codePointAt(offset); 130 if (!Character.isLetter(codePoint)) { 131 return false; 132 } 133 offset += Character.charCount(codePoint); 134 } 135 return true; 136 } 137 138 @Override 139 protected String standardizeNameToken(String theToken) { 140 String exception = myExceptions.stream() 141 .filter(s -> s.equalsIgnoreCase(theToken)) 142 .findFirst() 143 .orElse(null); 144 if (exception != null) { 145 return exception; 146 } 147 148 return super.standardizeNameToken(theToken); 149 } 150 151 private String checkTitleExceptions(String theString) { 152 return myExceptions.stream() 153 .filter(s -> s.equalsIgnoreCase(theString)) 154 .findFirst() 155 .orElse(theString); 156 } 157}