package org.apache.nutch.util;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.mortbay.jetty.MimeTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/util/EncodingDetector.class */
/**
 * Collects "clues" about the character encoding of a piece of content and
 * picks one of them.
 *
 * <p>Clues come from two places: {@link #autoDetectClues(Content, boolean)}
 * (byte-level detection plus the {@code Content-Type} metadata entry) and
 * explicit calls to {@link #addClue(String, String)} /
 * {@link #addClue(String, String, int)}. {@link #guessEncoding(Content, String)}
 * then walks the clues in insertion order and chooses an encoding.
 *
 * <p>Clues accumulate until {@link #clearClues()} is called.
 */
public class EncodingDetector {
    /** Confidence value of a clue that carries no confidence score. */
    public static final int NO_THRESHOLD = -1;

    /** Configuration key holding the minimum confidence; a negative value disables byte-level detection. */
    public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";

    /** Content must be longer than this many bytes before byte-level detection is attempted. */
    private static final int MIN_LENGTH = 4;

    /** Parameter prefix searched for (case-insensitively) in a Content-Type value. */
    private static final String CHARSET_PARAM = "charset=";

    public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);

    /** Canonical charset name -> charset name that is reported instead of it. */
    private static final HashMap<String, String> ALIASES = new HashMap<>();

    /** Content types for which byte-level detection is attempted. */
    private static final HashSet<String> DETECTABLES = new HashSet<>();

    static {
        DETECTABLES.add(MimeTypes.TEXT_HTML);
        DETECTABLES.add("text/plain");
        DETECTABLES.add("text/richtext");
        DETECTABLES.add("text/rtf");
        DETECTABLES.add("text/sgml");
        DETECTABLES.add("text/tab-separated-values");
        DETECTABLES.add("text/xml");
        DETECTABLES.add("application/rss+xml");
        DETECTABLES.add("application/xhtml+xml");
        ALIASES.put("ISO-8859-1", "windows-1252");
        ALIASES.put("EUC-KR", "x-windows-949");
        ALIASES.put("x-EUC-CN", "GB18030");
        ALIASES.put("GBK", "GB18030");
    }

    private final int minConfidence;
    private final CharsetDetector detector = new CharsetDetector();
    private final List<EncodingClue> clues = new ArrayList<>();

    /**
     * A single hint about the encoding: the charset name, where the hint came
     * from, and an optional confidence (negative when absent).
     */
    public class EncodingClue {
        private final String value;
        private final String source;
        private final int confidence;

        /**
         * Creates a clue without a confidence score.
         *
         * @param encodingDetector not used by this constructor; kept so existing callers still compile
         * @param value            charset name (stored lower-cased)
         * @param source           label describing where the clue came from
         */
        public EncodingClue(EncodingDetector encodingDetector, String value, String source) {
            this(value, source, NO_THRESHOLD);
        }

        /**
         * @param value      charset name (stored lower-cased; {@code null} is kept as {@code null})
         * @param source     label describing where the clue came from
         * @param confidence confidence score, or a negative value for "none"
         */
        public EncodingClue(String value, String source, int confidence) {
            // Locale.ROOT: charset names are ASCII identifiers and must not be
            // mangled by locale-specific case rules (e.g. Turkish dotless i).
            this.value = value == null ? null : value.toLowerCase(Locale.ROOT);
            this.source = source;
            this.confidence = confidence;
        }

        public String getSource() {
            return this.source;
        }

        public String getValue() {
            return this.value;
        }

        @Override
        public String toString() {
            String confidencePart = this.confidence >= 0 ? ", " + this.confidence + "% confidence" : "";
            return this.value + " (" + this.source + confidencePart + ")";
        }

        /** @return {@code true} when the clue has no charset name */
        public boolean isEmpty() {
            return this.value == null || "".equals(this.value);
        }

        /**
         * @return {@code true} when the clue has no confidence score, or when a
         *         minimum confidence is configured and this clue reaches it
         */
        public boolean meetsThreshold() {
            if (this.confidence < 0) {
                return true;
            }
            int threshold = EncodingDetector.this.minConfidence;
            return threshold >= 0 && this.confidence >= threshold;
        }
    }

    /**
     * @param configuration source of the {@link #MIN_CONFIDENCE_KEY} setting;
     *                      {@link #NO_THRESHOLD} is used when the key is absent
     */
    public EncodingDetector(Configuration configuration) {
        this.minConfidence = configuration.getInt(MIN_CONFIDENCE_KEY, NO_THRESHOLD);
    }

    /**
     * Adds clues derived from the content itself.
     *
     * <p>When a minimum confidence is configured, the content type is one of
     * the detectable types and the content is longer than {@value #MIN_LENGTH}
     * bytes, every match reported by the byte-level detector is added with
     * source {@code "detect"}. Afterwards the charset found in the
     * {@code Content-Type} metadata entry (if any) is added with source
     * {@code "header"}.
     *
     * @param content content to inspect
     * @param filter  passed to the detector's input-filter switch
     */
    public void autoDetectClues(Content content, boolean filter) {
        byte[] data = content.getContent();
        if (this.minConfidence >= 0
                && DETECTABLES.contains(content.getContentType())
                && data.length > MIN_LENGTH) {
            CharsetMatch[] matches = null;
            try {
                this.detector.enableInputFilter(filter);
                this.detector.setText(data);
                matches = this.detector.detectAll();
            } catch (Exception e) {
                // Detection is best effort: a detector failure must not stop
                // the header clue below from being recorded.
                LOG.debug("Exception from ICU4J (ignoring): ", e);
            }
            if (matches != null) {
                for (CharsetMatch match : matches) {
                    addClue(match.getName(), "detect", match.getConfidence());
                }
            }
        }
        addClue(parseCharacterEncoding(content.getMetadata().get("Content-Type")), "header");
    }

    /**
     * Records a clue, after resolving the charset name through
     * {@link #resolveEncodingAlias(String)}. Null, empty and unresolvable
     * names are silently ignored.
     *
     * @param value      charset name
     * @param source     label describing where the clue came from
     * @param confidence confidence score, or {@link #NO_THRESHOLD} for none
     */
    public void addClue(String value, String source, int confidence) {
        if (value == null || "".equals(value)) {
            return;
        }
        String resolved = resolveEncodingAlias(value);
        if (resolved != null) {
            this.clues.add(new EncodingClue(resolved, source, confidence));
        }
    }

    /** Records a clue without a confidence score; see {@link #addClue(String, String, int)}. */
    public void addClue(String value, String source) {
        addClue(value, source, NO_THRESHOLD);
    }

    /**
     * Chooses an encoding from the recorded clues.
     *
     * <p>Clues are visited in insertion order. The first clue whose confidence
     * reaches the configured minimum wins immediately. Otherwise the first
     * clue without a confidence score is used, and if there is none the
     * default is returned.
     *
     * @param content      content being decoded; only its base URL is read, for trace logging
     * @param defaultValue encoding to fall back to
     * @return the chosen encoding, lower-cased; {@code null} only when
     *         {@code defaultValue} is {@code null} and no clue was usable
     */
    public String guessEncoding(Content content, String defaultValue) {
        String base = content.getBaseUrl();
        if (LOG.isTraceEnabled()) {
            findDisagreements(base, this.clues);
        }
        EncodingClue defaultClue = new EncodingClue(defaultValue, "default", NO_THRESHOLD);
        EncodingClue bestClue = defaultClue;
        for (EncodingClue clue : this.clues) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(base + ": charset " + clue);
            }
            String charset = clue.value;
            if (this.minConfidence >= 0 && clue.confidence >= this.minConfidence) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace(base + ": Choosing encoding: " + charset + " with confidence " + clue.confidence);
                }
                // Re-resolution can yield null (name not supported by this
                // JVM); fall back to the stored name instead of failing.
                String resolved = resolveEncodingAlias(charset);
                return (resolved != null ? resolved : charset).toLowerCase(Locale.ROOT);
            }
            // Remember only the first score-less clue as the fallback.
            if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
                bestClue = clue;
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace(base + ": Choosing encoding: " + bestClue);
        }
        // Clue values are already lower-cased by the EncodingClue constructor.
        return bestClue.value;
    }

    /** Discards all recorded clues. */
    public void clearClues() {
        this.clues.clear();
    }

    /**
     * Trace-logs the full clue list when clues from different sources that
     * meet the threshold name different charsets. Only the first clue of each
     * source is considered.
     */
    private void findDisagreements(String url, List<EncodingClue> newClues) {
        HashSet<String> valuesSeen = new HashSet<>();
        HashSet<String> sourcesSeen = new HashSet<>();
        boolean disagreement = false;
        for (EncodingClue clue : newClues) {
            if (clue.isEmpty() || sourcesSeen.contains(clue.source)) {
                continue;
            }
            if (clue.meetsThreshold()) {
                if (!valuesSeen.isEmpty() && !valuesSeen.contains(clue.value)) {
                    disagreement = true;
                }
                valuesSeen.add(clue.value);
            }
            sourcesSeen.add(clue.source);
        }
        if (disagreement) {
            StringBuilder message = new StringBuilder();
            message.append("Disagreement: ").append(url).append("; ");
            for (int i = 0; i < newClues.size(); i++) {
                if (i > 0) {
                    message.append(", ");
                }
                message.append(newClues.get(i));
            }
            LOG.trace(message.toString());
        }
    }

    /**
     * Maps a charset name to the name this class reports for it.
     *
     * @param encoding charset name or alias; may be {@code null}
     * @return the replacement registered for the charset's canonical name, or
     *         the canonical name itself when there is none; {@code null} when
     *         {@code encoding} is {@code null}, malformed or not supported
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            String canonical = Charset.forName(encoding).name();
            String replacement = ALIASES.get(canonical);
            return replacement != null ? replacement : canonical;
        } catch (IllegalArgumentException e) {
            // Thrown for syntactically illegal charset names.
            LOG.warn("Invalid encoding " + encoding + " detected, using default.");
            return null;
        }
    }

    /**
     * Extracts the {@code charset} parameter from a Content-Type value.
     *
     * <p>The parameter name is matched case-insensitively. The value ends at
     * the next {@code ';'} (or the end of the string), is trimmed, and has one
     * pair of surrounding double quotes removed.
     *
     * @param contentType Content-Type value; may be {@code null}
     * @return the charset value (possibly empty), or {@code null} when
     *         {@code contentType} is {@code null} or has no charset parameter
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }
        int start = indexOfIgnoreCase(contentType, CHARSET_PARAM);
        if (start < 0) {
            return null;
        }
        String encoding = contentType.substring(start + CHARSET_PARAM.length());
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        if (encoding.length() > 2 && encoding.startsWith("\"") && encoding.endsWith("\"")) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Returns the index of the first case-insensitive occurrence of
     * {@code needle} in {@code text}, or -1. Works on the original string so
     * the returned index is always valid for {@code text}.
     */
    private static int indexOfIgnoreCase(String text, String needle) {
        int lastStart = text.length() - needle.length();
        for (int i = 0; i <= lastStart; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Command-line entry point: reads the given file, treats it as
     * {@code text/html} and prints the guessed encoding.
     *
     * @param args exactly one element, the path of the file to inspect
     * @throws IOException when the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.err.println("Usage: EncodingDetector <file>");
            System.exit(1);
        }
        Configuration conf = NutchConfiguration.create();
        EncodingDetector encodingDetector = new EncodingDetector(conf);

        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(args[0]))) {
            byte[] chunk = new byte[1000];
            int read;
            // Only -1 signals end of stream; a short read does not.
            while ((read = in.read(chunk)) != -1) {
                collected.write(chunk, 0, read);
            }
        }

        Content content = new Content("", "", collected.toByteArray(), MimeTypes.TEXT_HTML, new Metadata(), conf);
        encodingDetector.autoDetectClues(content, true);
        System.out.println("Guessed encoding: "
                + encodingDetector.guessEncoding(content, conf.get("parser.character.encoding.default")));
    }
}
