package org.apache.nutch.net.urlnormalizer.regex;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.tika.mime.MimeTypesReaderMetKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

/* loaded from: input_file:plugins/urlnormalizer-regex/urlnormalizer-regex.jar:org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.class */
/**
 * URL normalizer that rewrites a URL by applying an ordered list of regular-expression
 * substitution rules.
 *
 * <p>Rules are grouped by scope name. The rules of the {@code "default"} scope are loaded in
 * {@link #setConf(Configuration)}; the rules of any other scope are loaded lazily, on first use,
 * from the resource named by the {@code urlnormalizer.regex.file.<scope>} property. A scope
 * without rules of its own falls back to the default rules.
 *
 * <p>The rule file is an XML document whose root element is {@code <regex-normalize>} and whose
 * child elements ({@code <regex>}) each contain a {@code <pattern>} and a {@code <substitution>}
 * element.
 *
 * <p>{@link #regexNormalize(String, String)} and {@link #normalize(String, String)} are
 * synchronized because they may populate the per-scope rule cache.
 */
public class RegexURLNormalizer extends Configured implements URLNormalizer {
    private static final Logger LOG = LoggerFactory.getLogger(RegexURLNormalizer.class);

    /** Name of the scope whose rules are used when a scope has no rules of its own. */
    private static final String DEFAULT_SCOPE = "default";

    /**
     * Shared immutable marker for "no rules". It is stored in the cache for scopes that have no
     * usable rule file so that the lookup is not retried on every call.
     */
    private static final List<Rule> EMPTY_RULES = Collections.<Rule>emptyList();

    /** Rules per scope name; created by {@link #setConf(Configuration)} or the file constructor. */
    private HashMap<String, List<Rule>> scopedRules;

    /** A single normalization rule: every match of {@code pattern} is replaced by {@code substitution}. */
    public static class Rule {
        /** Compiled regular expression that is searched for in the URL. */
        public Pattern pattern;
        /** Replacement text, in {@link java.util.regex.Matcher#replaceAll(String)} syntax. */
        public String substitution;

        private Rule() {
        }
    }

    /** Creates an unconfigured normalizer; {@link #setConf(Configuration)} must be called before use. */
    public RegexURLNormalizer() {
        super((Configuration) null);
    }

    /**
     * Creates a normalizer bound to the given configuration.
     *
     * @param configuration configuration handed to the {@code Configured} superclass
     */
    public RegexURLNormalizer(Configuration configuration) {
        super(configuration);
    }

    /**
     * Creates a normalizer whose default rules are read from a file on the local file system.
     *
     * @param configuration configuration handed to the {@code Configured} superclass; may be null
     * @param str path of the XML rule file that provides the default-scope rules
     * @throws IOException declared for compatibility with existing callers
     * @throws PatternSyntaxException declared for compatibility with existing callers
     */
    public RegexURLNormalizer(Configuration configuration, String str) throws IOException, PatternSyntaxException {
        super(configuration);
        // setConf() creates the rule map only for a non-null configuration, so it can still be
        // missing here; create it instead of failing with a NullPointerException.
        if (this.scopedRules == null) {
            this.scopedRules = new HashMap<String, List<Rule>>();
        }
        this.scopedRules.put(DEFAULT_SCOPE, readConfigurationFile(str));
    }

    /**
     * Stores the configuration and, the first time a non-null configuration is supplied, loads the
     * default-scope rules.
     *
     * <p>Inline XML in {@code urlnormalizer.regex.rules} takes precedence over the resource named
     * by {@code urlnormalizer.regex.file}. If neither yields rules, the default scope is empty.
     *
     * @param configuration the configuration to use; may be null
     */
    public void setConf(Configuration configuration) {
        super.setConf(configuration);
        if (configuration == null || this.scopedRules != null) {
            return;
        }
        String rulesResource = getConf().get("urlnormalizer.regex.file");
        String inlineRules = getConf().get("urlnormalizer.regex.rules");
        this.scopedRules = new HashMap<String, List<Rule>>();

        Reader reader = inlineRules != null
                ? new StringReader(inlineRules)
                : getConf().getConfResourceAsReader(rulesResource);
        List<Rule> rules;
        if (reader == null) {
            LOG.warn("Can't load the default rules! ");
            rules = EMPTY_RULES;
        } else {
            try {
                rules = readConfiguration(reader);
            } catch (Exception e) {
                LOG.warn("Couldn't read default config: {}", e.toString());
                rules = EMPTY_RULES;
            } finally {
                // The reader was opened here, so it is closed here rather than left to the parser.
                closeQuietly(reader);
            }
        }
        this.scopedRules.put(DEFAULT_SCOPE, rules);
    }

    /**
     * Replaces the rules of one scope with the rules parsed from {@code reader}.
     *
     * <p>The reader is supplied by the caller and is not closed by this method.
     *
     * @param reader source of the XML rule document
     * @param str name of the scope to (re)define
     */
    void setConfiguration(Reader reader, String str) {
        List<Rule> rules = readConfiguration(reader);
        this.scopedRules.put(str, rules);
        LOG.debug("Set config for scope '{}': {} rules.", str, rules.size());
    }

    /**
     * Applies the rules of the given scope to a URL, in the order they appear in the rule file.
     *
     * <p>If the scope has not been seen before, its rules are loaded and cached first. A scope
     * without rules uses the rules of the default scope.
     *
     * @param str the URL to normalize
     * @param str2 the scope name
     * @return the URL after all rules have been applied
     */
    public synchronized String regexNormalize(String str, String str2) {
        List<Rule> rules = this.scopedRules.get(str2);
        if (rules == null) {
            rules = loadScopeRules(str2);
        }
        if (rules.isEmpty()) {
            rules = this.scopedRules.get(DEFAULT_SCOPE);
        }
        String url = str;
        for (Rule rule : rules) {
            url = rule.pattern.matcher(url).replaceAll(rule.substitution);
        }
        return url;
    }

    /**
     * Normalizes a URL; delegates to {@link #regexNormalize(String, String)}.
     *
     * @param str the URL to normalize
     * @param str2 the scope name
     * @return the normalized URL
     * @throws MalformedURLException declared by the {@code URLNormalizer} contract
     */
    public synchronized String normalize(String str, String str2) throws MalformedURLException {
        return regexNormalize(str, str2);
    }

    /**
     * Loads and caches the rules of a scope from the resource named by
     * {@code urlnormalizer.regex.file.<scope>}.
     *
     * @param scope the scope name
     * @return the scope's rules, or {@link #EMPTY_RULES} if none could be loaded; never null
     */
    private List<Rule> loadScopeRules(String scope) {
        List<Rule> rules = null;
        String resource = getConf().get("urlnormalizer.regex.file." + scope);
        if (resource != null) {
            LOG.debug("resource for scope '{}': {}", scope, resource);
            Reader reader = null;
            try {
                reader = getConf().getConfResourceAsReader(resource);
                if (reader == null) {
                    // The property is set but the resource cannot be opened.
                    LOG.warn("Can't load resource for config file: {}", resource);
                } else {
                    rules = readConfiguration(reader);
                }
            } catch (Exception e) {
                LOG.warn("Couldn't load resource '{}': {}", resource, e.toString());
            } finally {
                closeQuietly(reader);
            }
        }
        if (rules == null || rules.isEmpty()) {
            LOG.info("can't find rules for scope '{}', using default", scope);
            rules = EMPTY_RULES;
        }
        // Cache the outcome, including "no rules", so the resource is looked up only once per scope.
        this.scopedRules.put(scope, rules);
        return rules;
    }

    /**
     * Reads rules from an XML file on the local file system.
     *
     * @param str path of the rule file
     * @return the parsed rules, or {@link #EMPTY_RULES} if the file cannot be read or parsed
     */
    private List<Rule> readConfigurationFile(String str) {
        LOG.info("loading {}", str);
        Reader reader = null;
        try {
            reader = new FileReader(str);
            return readConfiguration(reader);
        } catch (Exception e) {
            LOG.error("Error loading rules from '{}': {}", str, e.toString());
            return EMPTY_RULES;
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Parses an XML rule document.
     *
     * <p>An unexpected root or child element name is only logged; the element is still processed.
     * Any failure (malformed XML, invalid regular expression, unexpected node type) discards the
     * whole document. The reader is not closed by this method.
     *
     * @param reader source of the XML document
     * @return the rules in document order, or {@link #EMPTY_RULES} if there are none or parsing failed
     */
    private List<Rule> readConfiguration(Reader reader) {
        List<Rule> rules = new ArrayList<Rule>();
        try {
            Element root = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder()
                    .parse(new InputSource(reader))
                    .getDocumentElement();
            if (!"regex-normalize".equals(root.getTagName())) {
                LOG.error("bad conf file: top-level element not <regex-normalize>");
            }
            NodeList regexNodes = root.getChildNodes();
            for (int i = 0; i < regexNodes.getLength(); i++) {
                Node node = regexNodes.item(i);
                if (!(node instanceof Element)) {
                    continue;
                }
                Element regexElement = (Element) node;
                if (!"regex".equals(regexElement.getTagName())) {
                    LOG.warn("bad conf file: element not <regex>");
                }
                Rule rule = parseRule(regexElement);
                if (rule != null) {
                    rules.add(rule);
                }
            }
        } catch (Exception e) {
            LOG.error("error parsing conf file: {}", e.toString());
            return EMPTY_RULES;
        }
        return rules.isEmpty() ? EMPTY_RULES : rules;
    }

    /**
     * Builds one rule from the {@code <pattern>} and {@code <substitution>} children of an element.
     *
     * @param regexElement the element holding the rule definition
     * @return the rule, or null if the pattern or the substitution is missing
     * @throws PatternSyntaxException if the pattern is not a valid regular expression
     */
    private static Rule parseRule(Element regexElement) {
        String patternText = null;
        String substitutionText = null;
        NodeList fields = regexElement.getChildNodes();
        for (int i = 0; i < fields.getLength(); i++) {
            Node node = fields.item(i);
            if (!(node instanceof Element)) {
                continue;
            }
            Element field = (Element) node;
            String tag = field.getTagName();
            if ("pattern".equals(tag)) {
                if (field.hasChildNodes()) {
                    patternText = ((Text) field.getFirstChild()).getData();
                }
            } else if ("substitution".equals(tag)) {
                // Only an empty <substitution/> means "replace with nothing"; other empty
                // elements must not overwrite a substitution that was already read.
                substitutionText = field.hasChildNodes()
                        ? ((Text) field.getFirstChild()).getData()
                        : "";
            }
        }
        if (patternText == null || substitutionText == null) {
            return null;
        }
        Rule rule = new Rule();
        rule.pattern = Pattern.compile(patternText);
        rule.substitution = substitutionText;
        return rule;
    }

    /** Closes a reader, ignoring a null argument and any failure to close. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // The rules have already been read (or reading already failed and was logged);
            // a failure to close changes neither outcome.
        }
    }

    /** Prints each rule as {@code "  <pattern> -> <substitution>"} on standard output. */
    private static void printRules(List<Rule> rules) {
        for (Rule rule : rules) {
            System.out.print("  " + rule.pattern.pattern() + " -> ");
            System.out.println(rule.substitution);
        }
    }

    /**
     * Command-line helper: prints the configured rules and optionally normalizes one URL.
     *
     * @param strArr optional arguments: {@code [url [scope]]}
     * @throws PatternSyntaxException declared for compatibility with existing callers
     * @throws IOException if normalization reports a malformed URL
     */
    public static void main(String[] strArr) throws PatternSyntaxException, IOException {
        RegexURLNormalizer normalizer = new RegexURLNormalizer();
        normalizer.setConf(NutchConfiguration.create());
        System.out.println("* Rules for 'DEFAULT' scope:");
        printRules(normalizer.scopedRules.get(DEFAULT_SCOPE));
        if (strArr.length > 1) {
            // Normalizing a dummy URL forces the rules of the requested scope to be loaded.
            normalizer.normalize("http://test.com", strArr[1]);
        }
        if (normalizer.scopedRules.size() > 1) {
            for (String scope : normalizer.scopedRules.keySet()) {
                if (!DEFAULT_SCOPE.equals(scope)) {
                    System.out.println("* Rules for '" + scope + "' scope:");
                    printRules(normalizer.scopedRules.get(scope));
                }
            }
        }
        if (strArr.length > 0) {
            System.out.println("\n---------- Normalizer test -----------");
            String scope = strArr.length > 1 ? strArr[1] : DEFAULT_SCOPE;
            System.out.println("Scope: " + scope);
            System.out.println("Input url:  '" + strArr[0] + "'");
            System.out.println("Output url: '" + normalizer.normalize(strArr[0], scope) + "'");
        }
        System.exit(0);
    }
}
