package org.apache.nutch.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import shaded.org.apache.commons.io.IOUtils;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/TextProfileSignature.class */
/**
 * Signature that hashes a quantized token-frequency profile of the parsed
 * text instead of the raw content.
 *
 * <p>The profile is built as follows:
 * <ol>
 * <li>the text is split into runs of letters/digits, each lower-cased;</li>
 * <li>runs whose length is not greater than
 * {@code db.signature.text_profile.min_token_len} (default 2) are dropped;</li>
 * <li>occurrences of each remaining token are counted;</li>
 * <li>counts are rounded down to a multiple of a quantum derived from the
 * highest count and {@code db.signature.text_profile.quant_rate}
 * (default 0.01), and tokens whose rounded count falls below the quantum are
 * dropped;</li>
 * <li>the surviving tokens are sorted by descending count, rendered one per
 * line as {@code "token count"}, and the resulting string is MD5-hashed.</li>
 * </ol>
 *
 * <p>When the parse is {@code null} or has no text, the {@link #fallback}
 * signature is used instead.
 */
public class TextProfileSignature extends Signature {
    /** Separator written between profile entries (and between input lines in {@link #main}). */
    private static final String LINE_SEPARATOR = "\n";

    /** Signature used when there is no parsed text to profile. */
    Signature fallback = new MD5Signature();

    /** A token together with its (possibly quantized) occurrence count. */
    public static class Token {
        public int cnt;
        public String val;

        /**
         * @param i   initial occurrence count
         * @param str token text
         */
        public Token(int i, String str) {
            this.cnt = i;
            this.val = str;
        }

        /** Renders the token as {@code "<val> <cnt>"}; this form is what gets hashed. */
        @Override
        public String toString() {
            return this.val + " " + this.cnt;
        }
    }

    /** Orders tokens by descending count. */
    public static class TokenComparator implements Comparator<Token> {
        private TokenComparator() {
        }

        /**
         * Compares by {@code cnt}, larger counts first. Uses explicit
         * comparisons rather than subtraction so that extreme values of the
         * public {@code cnt} field cannot overflow and flip the sign.
         */
        @Override // java.util.Comparator
        public int compare(Token token, Token token2) {
            if (token.cnt == token2.cnt) {
                return 0;
            }
            return token2.cnt > token.cnt ? 1 : -1;
        }
    }

    /**
     * Computes the text-profile signature for a page.
     *
     * @param content fetched content; only passed on to the fallback signature
     * @param parse   parse result supplying the text to profile; may be {@code null}
     * @return the MD5 digest of the rendered profile, or the fallback
     *         signature's result when {@code parse} is {@code null} or its
     *         text is {@code null} or empty
     */
    @Override // org.apache.nutch.crawl.Signature
    public byte[] calculate(Content content, Parse parse) {
        int minTokenLen = getConf().getInt("db.signature.text_profile.min_token_len", 2);
        float quantRate = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
        String text = parse != null ? parse.getText() : null;
        if (text == null || text.length() == 0) {
            return this.fallback.calculate(content, parse);
        }

        // Tokenize and count; maxFreq tracks the highest count seen so far.
        Map<String, Token> tokens = new HashMap<String, Token>();
        StringBuilder current = new StringBuilder();
        int maxFreq = 0;
        for (int pos = 0; pos < text.length(); pos++) {
            char c = text.charAt(pos);
            if (Character.isLetterOrDigit(c)) {
                current.append(Character.toLowerCase(c));
            } else if (current.length() > 0) {
                // A non-alphanumeric character ends the current token.
                if (current.length() > minTokenLen) {
                    maxFreq = Math.max(maxFreq, countToken(tokens, current.toString()));
                }
                current.setLength(0);
            }
        }
        // The text may end in the middle of a token; account for that last one.
        if (current.length() > minTokenLen) {
            maxFreq = Math.max(maxFreq, countToken(tokens, current.toString()));
        }

        // Quantum is never below 1, and is at least 2 once any token repeats.
        int quant = Math.round(maxFreq * quantRate);
        if (quant < 2) {
            quant = maxFreq > 1 ? 2 : 1;
        }

        // Round counts down to a multiple of the quantum; drop tokens that round to zero.
        List<Token> profile = new ArrayList<Token>();
        for (Token tok : tokens.values()) {
            tok.cnt = (tok.cnt / quant) * quant;
            if (tok.cnt >= quant) {
                profile.add(tok);
            }
        }
        // Collections.sort is stable, so tokens with equal counts keep the
        // relative order in which the map yielded them.
        Collections.sort(profile, new TokenComparator());

        StringBuilder rendered = new StringBuilder();
        for (Token tok : profile) {
            if (rendered.length() > 0) {
                rendered.append(LINE_SEPARATOR);
            }
            rendered.append(tok.toString());
        }
        return MD5Hash.digest(rendered.toString()).getDigest();
    }

    /**
     * Increments the count of {@code value} in {@code tokens}, creating the
     * entry on first sight, and returns the updated count.
     */
    private static int countToken(Map<String, Token> tokens, String value) {
        Token tok = tokens.get(value);
        if (tok == null) {
            tok = new Token(0, value);
            tokens.put(value, tok);
        }
        tok.cnt++;
        return tok.cnt;
    }

    /**
     * Command-line helper: computes the signature of every regular file in
     * the directory named by the first argument (files are read as UTF-8) and
     * prints {@code <path>\t<hex signature>} for each.
     *
     * @param strArr command-line arguments; {@code strArr[0]} is the directory
     * @throws IllegalArgumentException if no directory is given or it cannot be listed
     * @throws Exception if a file cannot be read
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr == null || strArr.length < 1) {
            throw new IllegalArgumentException("Usage: TextProfileSignature <directory>");
        }
        TextProfileSignature signature = new TextProfileSignature();
        signature.setConf(NutchConfiguration.create());

        File[] files = new File(strArr[0]).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list directory: " + strArr[0]);
        }

        Map<String, byte[]> results = new HashMap<String, byte[]>();
        for (int i = 0; i < files.length; i++) {
            if (!files[i].isFile()) {
                // Sub-directories and other non-regular entries cannot be opened as streams.
                continue;
            }
            StringBuilder text = new StringBuilder();
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(files[i]), "UTF-8"));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (text.length() > 0) {
                        text.append(LINE_SEPARATOR);
                    }
                    text.append(line);
                }
            } finally {
                reader.close();
            }
            results.put(files[i].toString(), signature.calculate(null, new ParseImpl(text.toString(), (ParseData) null)));
        }

        for (Map.Entry<String, byte[]> entry : results.entrySet()) {
            System.out.println(entry.getKey() + "\t" + StringUtil.toHexString(entry.getValue()));
        }
    }
}
