001package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;
002
003/*-
004 * #%L
005 * HAPI FHIR - Server Framework
006 * %%
007 * Copyright (C) 2014 - 2022 Smile CDR, Inc.
008 * %%
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *      http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 * #L%
021 */
022
023import org.apache.commons.lang3.StringUtils;
024
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.HashSet;
028import java.util.List;
029import java.util.Set;
030import java.util.stream.Collectors;
031
032/**
033 *
034 */
035public class TitleStandardizer extends LastNameStandardizer {
036
037        private Set<String> myExceptions = new HashSet<>(Arrays.asList("EAS", "EPS", "LLC", "LLP", "of", "at", "in", "and"));
038        private Set<String[]> myBiGramExceptions = new HashSet<String[]>();
039
040        public TitleStandardizer() {
041                super();
042                addDelimiters("/", ".", "|", ">", "<", "(", ")", ":", "!");
043                addAllowed('(', ')', '@', ':', '!', '|', '>', '<');
044                myBiGramExceptions.add(new String[] {"'", "s"});
045        }
046
047        private void addAllowed(char... theCharacter) {
048                for (char ch : theCharacter) {
049                        addAllowedExtendedAsciiRange((int) ch, (int) ch);
050                        addAllowedNonLetterAndDigitCharacters(ch);
051                }
052        }
053
054        @Override
055        public String standardize(String theString) {
056                theString = replaceTranslates(theString);
057
058                return Arrays.stream(theString.split("\\s+"))
059                        .map(String::trim)
060                        .map(this::standardizeText)
061                        .filter(s -> !StringUtils.isEmpty(s))
062                        .map(this::checkTitleExceptions)
063                        .collect(Collectors.joining(" "));
064        }
065
066        private List<String> split(String theString) {
067                int cursor = 0;
068                int start = 0;
069
070                List<String> retVal = new ArrayList<>();
071                StringBuilder buf = new StringBuilder();
072
073                while (cursor < theString.length()) {
074                        int codePoint = theString.codePointAt(cursor);
075                        cursor += Character.charCount(codePoint);
076                        if (isNoiseCharacter(codePoint)) {
077                                continue;
078                        }
079
080                        String str = new String(Character.toChars(codePoint));
081                        if (isDelimiter(str)) {
082                                if (buf.length() != 0) {
083                                        retVal.add(buf.toString());
084                                        buf.setLength(0);
085                                }
086                                retVal.add(str);
087                                continue;
088                        }
089
090                        buf.append(str);
091                }
092
093                if (buf.length() != 0) {
094                        retVal.add(buf.toString());
095                }
096
097                return retVal;
098        }
099
100        protected String standardizeText(String theToken) {
101                StringBuilder buf = new StringBuilder();
102                List<String> parts = split(theToken);
103
104                String prevPart = null;
105                for(String part : parts) {
106                        if (isAllText(part)) {
107                                part = standardizeNameToken(part);
108                        }
109
110                        part = checkBiGram(prevPart, part);
111                        buf.append(part);
112                        prevPart = part;
113                }
114                return buf.toString();
115        }
116
117        private String checkBiGram(String thePart0, String thePart1) {
118                for (String[] biGram : myBiGramExceptions) {
119                        if (biGram[0].equalsIgnoreCase(thePart0)
120                                && biGram[1].equalsIgnoreCase(thePart1)) {
121                                return biGram[1];
122                        }
123                }
124                return thePart1;
125        }
126
127        private boolean isAllText(String thePart) {
128                for (int offset = 0; offset < thePart.length(); ) {
129                        int codePoint = thePart.codePointAt(offset);
130                        if (!Character.isLetter(codePoint)) {
131                                return false;
132                        }
133                        offset += Character.charCount(codePoint);
134                }
135                return true;
136        }
137
138        @Override
139        protected String standardizeNameToken(String theToken) {
140                String exception = myExceptions.stream()
141                        .filter(s -> s.equalsIgnoreCase(theToken))
142                        .findFirst()
143                        .orElse(null);
144                if (exception != null) {
145                        return exception;
146                }
147
148                return super.standardizeNameToken(theToken);
149        }
150
151        private String checkTitleExceptions(String theString) {
152                return myExceptions.stream()
153                        .filter(s -> s.equalsIgnoreCase(theString))
154                        .findFirst()
155                        .orElse(theString);
156        }
157}