package org.apache.nutch.parse.js;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.PDPrintFieldAttributeObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import shaded.org.apache.commons.io.IOUtils;

/* loaded from: input_file:plugins/parse-js/parse-js.jar:org/apache/nutch/parse/js/JSParseFilter.class */
/**
 * Extracts outlinks from JavaScript.
 *
 * <p>As an {@link HtmlParseFilter} it walks a parsed HTML DOM and scans the
 * text of {@code <script>} elements, the values of attributes whose name
 * starts with {@code on}, and {@code href} values containing
 * {@code javascript:}. As a {@link Parser} it scans a whole document whose
 * content type is empty or starts with {@code application/x-javascript}.
 *
 * <p>Link detection is heuristic: every quoted string literal matching
 * {@link #STRING_PATTERN} whose content also matches {@link #URI_PATTERN} is
 * treated as a candidate URL.
 */
public class JSParseFilter implements HtmlParseFilter, Parser {
    public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);

    /** Maximum number of characters taken from the script text as the title in {@link #getParse}. */
    private static final int MAX_TITLE_LEN = 80;

    /** Matches a quoted run of non-whitespace, non-quote characters; group 2 is the content. */
    private static final String STRING_PATTERN = "(\\\\*(?:\"|'))([^\\s\"']+?)(?:\\1)";

    /** Matches strings that look like a URI: they contain a '/' or '.' between non-space characters. */
    private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";

    /** Compiler options shared by both patterns (numerically 32777). */
    private static final int PATTERN_OPTIONS =
            Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK;

    private Configuration conf;

    /**
     * Adds outlinks found in the JavaScript portions of {@code documentFragment}
     * to the parse stored in {@code parseResult} under the content URL.
     *
     * @param content          the fetched content; supplies the URL key and the base URL
     * @param parseResult      the result to update in place
     * @param hTMLMetaTags     meta tags of the document (only passed through to the DOM walk)
     * @param documentFragment the DOM to scan
     * @return {@code parseResult}, with the entry for the content URL replaced
     *         when at least one JavaScript outlink was found
     */
    @Override // org.apache.nutch.parse.HtmlParseFilter
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags hTMLMetaTags, DocumentFragment documentFragment) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // No parse is registered for this URL, so there is nothing to extend.
            return parseResult;
        }
        List<Outlink> outlinks = new ArrayList<Outlink>();
        walk(documentFragment, parse, hTMLMetaTags, content.getBaseUrl(), outlinks);
        if (!outlinks.isEmpty()) {
            ParseData data = parse.getData();
            // JavaScript links come first, followed by the links already present.
            outlinks.addAll(Arrays.asList(data.getOutlinks()));
            Outlink[] merged = outlinks.toArray(new Outlink[outlinks.size()]);
            parseResult.put(
                    content.getUrl(),
                    new ParseText(parse.getText()),
                    new ParseData(data.getStatus(), data.getTitle(), merged, data.getContentMeta(), data.getParseMeta()));
        }
        return parseResult;
    }

    /**
     * Recursively collects JavaScript outlinks from {@code node} and its descendants.
     *
     * <p>A {@code <script>} element with children contributes the links found
     * in its concatenated child values and is not descended into further. Any
     * other element contributes links from its event-handler and
     * {@code javascript:} attributes before its children are visited.
     *
     * @param node         the node to inspect
     * @param parse        unused by this method; forwarded on recursion
     * @param hTMLMetaTags unused by this method; forwarded on recursion
     * @param str          base URL used to resolve relative links
     * @param list         accumulator receiving the discovered outlinks
     */
    private void walk(Node node, Parse parse, HTMLMetaTags hTMLMetaTags, String str, List<Outlink> list) {
        if (node instanceof Element) {
            if (node.getNodeName().equalsIgnoreCase("script")) {
                NodeList scriptChildren = node.getChildNodes();
                int childCount = scriptChildren.getLength();
                if (childCount > 0) {
                    StringBuilder script = new StringBuilder();
                    for (int i = 0; i < childCount; i++) {
                        if (i > 0) {
                            script.append('\n');
                        }
                        String value = scriptChildren.item(i).getNodeValue();
                        // A null value would otherwise be appended as the text "null".
                        if (value != null) {
                            script.append(value);
                        }
                    }
                    list.addAll(Arrays.asList(getJSLinks(script.toString(), "", str)));
                    return;
                }
            } else {
                NamedNodeMap attributes = node.getAttributes();
                int attributeCount = attributes.getLength();
                for (int i = 0; i < attributeCount; i++) {
                    Node attribute = attributes.item(i);
                    String attributeName = attribute.getNodeName();
                    if (attributeName.startsWith("on")) {
                        // Event-handler attributes (names beginning with "on") hold script text.
                        list.addAll(Arrays.asList(getJSLinks(attribute.getNodeValue(), "", str)));
                    } else if (attributeName.equalsIgnoreCase("href")) {
                        String href = attribute.getNodeValue();
                        // Locale.ROOT keeps the match independent of the default locale.
                        if (href != null && href.toLowerCase(Locale.ROOT).indexOf("javascript:") != -1) {
                            list.addAll(Arrays.asList(getJSLinks(href, "", str)));
                        }
                    }
                }
            }
        }
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            walk(children.item(i), parse, hTMLMetaTags, str, list);
        }
    }

    /**
     * Parses {@code content} as a JavaScript document.
     *
     * <p>The parse text is the whole script, the title is the first line
     * truncated to {@link #MAX_TITLE_LEN} characters, and the outlinks are
     * whatever {@link #getJSLinks} extracts using the content URL as base.
     *
     * @param content the fetched content
     * @return a successful parse result, or an empty result with status
     *         {@code FAILED_INVALID_FORMAT} when a non-blank content type does
     *         not start with {@code application/x-javascript}
     */
    @Override // org.apache.nutch.parse.Parser
    public ParseResult getParse(Content content) {
        String contentType = content.getContentType();
        if (contentType != null
                && !contentType.trim().equals("")
                && !contentType.toLowerCase(Locale.ROOT).startsWith("application/x-javascript")) {
            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + contentType + "'")
                    .getEmptyParseResult(content.getUrl(), getConf());
        }
        // NOTE(review): decodes with the platform default charset; confirm whether
        // the content's declared encoding should be used instead.
        String script = new String(content.getContent());
        Outlink[] outlinks = getJSLinks(script, "", content.getUrl());

        int firstNewline = script.indexOf('\n');
        int titleEnd = firstNewline != -1
                ? Math.min(firstNewline, MAX_TITLE_LEN)
                : Math.min(MAX_TITLE_LEN, script.length());
        String title = script.substring(0, titleEnd);

        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
        return ParseResult.createParseResult(content.getUrl(), new ParseImpl(script, parseData));
    }

    /**
     * Scans {@code str} for quoted strings that look like URIs and turns them into outlinks.
     *
     * <p>Candidates starting with {@code www.} are prefixed with {@code http://};
     * all others are resolved against the base URL. Candidates that cannot be
     * resolved are skipped. Extraction is best-effort: any other exception is
     * logged and the links collected so far are returned.
     *
     * @param str  the text to scan
     * @param str2 the anchor text given to every created outlink
     * @param str3 the base URL; if malformed, an error is logged and only
     *             candidates that are absolute on their own can be resolved
     * @return the outlinks found, never {@code null}
     */
    private Outlink[] getJSLinks(String str, String str2, String str3) {
        List<Outlink> outlinks = new ArrayList<Outlink>();
        URL base = null;
        try {
            base = new URL(str3);
        } catch (MalformedURLException e) {
            LOG.error("getJSLinks", e);
        }
        try {
            Perl5Compiler compiler = new Perl5Compiler();
            Pattern stringPattern = compiler.compile(STRING_PATTERN, PATTERN_OPTIONS);
            Pattern uriPattern = compiler.compile(URI_PATTERN, PATTERN_OPTIONS);
            Perl5Matcher stringMatcher = new Perl5Matcher();
            Perl5Matcher uriMatcher = new Perl5Matcher();
            PatternMatcherInput input = new PatternMatcherInput(str);
            while (stringMatcher.contains(input, stringPattern)) {
                String candidate = stringMatcher.getMatch().group(2);
                if (!uriMatcher.matches(new PatternMatcherInput(candidate), uriPattern)) {
                    continue;
                }
                if (candidate.startsWith("www.")) {
                    candidate = "http://" + candidate;
                } else {
                    try {
                        candidate = new URL(base, candidate).toString();
                    } catch (MalformedURLException e) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace(" - failed URL parse '" + candidate + "' and baseURL '" + base + "'", e);
                        }
                        // Not a usable URL: move on instead of emitting the raw string.
                        continue;
                    }
                }
                String target = candidate.replace("&amp;", "&");
                if (LOG.isTraceEnabled()) {
                    LOG.trace(" - outlink from JS: '" + target + "'");
                }
                outlinks.add(new Outlink(target, str2));
            }
        } catch (Exception e) {
            // Best effort: keep whatever was collected before the failure.
            LOG.error("getJSLinks", e);
        }
        return outlinks.toArray(new Outlink[outlinks.size()]);
    }

    /**
     * Command-line driver: prints the outlinks extracted from a JavaScript file.
     *
     * @param strArr {@code [0]} path of the file to scan (read as UTF-8),
     *               {@code [1]} base URL for resolving relative links
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length < 2) {
            System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
            return;
        }
        StringBuilder script = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(strArr[0]), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                script.append(line).append('\n');
            }
        } finally {
            reader.close();
        }
        JSParseFilter jSParseFilter = new JSParseFilter();
        jSParseFilter.setConf(NutchConfiguration.create());
        Outlink[] jSLinks = jSParseFilter.getJSLinks(script.toString(), "", strArr[1]);
        System.out.println("Outlinks extracted: " + jSLinks.length);
        for (Outlink outlink : jSLinks) {
            System.out.println(" - " + outlink);
        }
    }

    /** Stores the configuration later returned by {@link #getConf()}. */
    @Override // org.apache.hadoop.conf.Configurable
    public void setConf(Configuration configuration) {
        this.conf = configuration;
    }

    /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
    @Override // org.apache.hadoop.conf.Configurable
    public Configuration getConf() {
        return this.conf;
    }
}
