package org.apache.nutch.parse.html;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.mortbay.jetty.MimeTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import shaded.org.apache.http.protocol.HTTP;

/* loaded from: input_file:plugins/parse-html/parse-html.jar:org/apache/nutch/parse/html/HtmlParser.class */
/**
 * {@link Parser} implementation that turns HTML content into a {@link ParseResult}.
 *
 * <p>The raw bytes are parsed into a DOM fragment with either TagSoup or NekoHTML
 * (selected through the {@code parser.html.impl} configuration key, default
 * {@code "neko"}). Text, title and outlinks are then extracted from that fragment,
 * subject to the robots directives found in the document's meta tags, and the result
 * is passed through the configured {@link HtmlParseFilters}.
 */
public class HtmlParser implements Parser {
    /** Upper bound on the number of leading bytes inspected when sniffing a charset. */
    private static final int CHUNK_SIZE = 2000;

    public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.html");

    /** Matches a {@code <meta ... http-equiv=content-type ...>} tag; group 1 is the attribute list. */
    private static final Pattern metaPattern = Pattern.compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Matches {@code charset=<name>}; group 1 is the charset name. */
    private static final Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    /** Name of the DOM parser backend, read from {@code parser.html.impl}. */
    private String parserImpl;

    /** Fallback encoding, read from {@code parser.character.encoding.default}. */
    private String defaultCharEncoding;

    private Configuration conf;
    private DOMContentUtils utils;
    private HtmlParseFilters htmlParseFilters;

    /** Value stored under the caching-forbidden key, read from {@code parser.caching.forbidden.policy}. */
    private String cachingPolicy;

    /**
     * Looks for a charset declaration inside a content-type meta tag near the start
     * of the document.
     *
     * <p>Only the first {@link #CHUNK_SIZE} bytes are examined, decoded as US-ASCII.
     *
     * @param bArr raw document bytes
     * @return the declared charset name, or {@code null} when no content-type meta tag
     *         with a {@code charset=} token is found in the inspected prefix
     */
    private static String sniffCharacterEncoding(byte[] bArr) {
        int length = Math.min(bArr.length, CHUNK_SIZE);
        // The Charset overload cannot throw UnsupportedEncodingException, so no handler is needed.
        String head = new String(bArr, 0, length, StandardCharsets.US_ASCII);

        Matcher metaMatcher = metaPattern.matcher(head);
        if (!metaMatcher.find()) {
            return null;
        }
        Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
        return charsetMatcher.find() ? charsetMatcher.group(1) : null;
    }

    /**
     * Parses the given HTML content.
     *
     * <p>Steps, in order: guess the character encoding, build a DOM fragment, read the
     * meta tags, extract text and title (skipped when the meta tags say "noindex"),
     * extract outlinks (skipped when they say "nofollow"), record a meta refresh in the
     * parse status, run the HTML parse filters, and finally mark every resulting parse
     * with the caching policy when the meta tags say "nocache".
     *
     * @param content the fetched content to parse
     * @return the filtered parse result; when the base URL is malformed or parsing
     *         throws, the result of {@code ParseStatus#getEmptyParseResult} built from
     *         the exception
     */
    @Override // org.apache.nutch.parse.Parser
    public ParseResult getParse(Content content) {
        HTMLMetaTags metaTags = new HTMLMetaTags();

        URL baseUrl;
        try {
            baseUrl = new URL(content.getBaseUrl());
        } catch (MalformedURLException e) {
            return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
        }

        String text = "";
        String title = "";
        Outlink[] outlinks = new Outlink[0];
        Metadata parseMeta = new Metadata();

        try {
            byte[] rawBytes = content.getContent();
            InputSource input = new InputSource(new ByteArrayInputStream(rawBytes));

            // Combine auto-detected clues with the charset sniffed from the meta tag.
            EncodingDetector detector = new EncodingDetector(this.conf);
            detector.autoDetectClues(content, true);
            detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
            String encoding = detector.guessEncoding(content, this.defaultCharEncoding);

            parseMeta.set(Nutch.ORIGINAL_CHAR_ENCODING, encoding);
            parseMeta.set(Nutch.CHAR_ENCODING_FOR_CONVERSION, encoding);
            input.setEncoding(encoding);

            LOG.trace("Parsing...");
            DocumentFragment root = parse(input);

            HTMLMetaProcessor.getMetaTags(metaTags, root, baseUrl);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Meta tags for " + baseUrl + ": " + metaTags.toString());
            }

            if (!metaTags.getNoIndex()) {
                // One buffer is reused for both the body text and the title.
                StringBuffer buffer = new StringBuffer();
                LOG.trace("Getting text...");
                this.utils.getText(buffer, root);
                text = buffer.toString();

                buffer.setLength(0);
                LOG.trace("Getting title...");
                this.utils.getTitle(buffer, root);
                title = buffer.toString().trim();
            }

            if (!metaTags.getNoFollow()) {
                ArrayList<Outlink> collected = new ArrayList<>();
                // A base URL found in the document takes precedence over the content's base URL.
                URL declaredBase = this.utils.getBase(root);
                LOG.trace("Getting links...");
                this.utils.getOutlinks(declaredBase != null ? declaredBase : baseUrl, collected, root);
                outlinks = collected.toArray(new Outlink[collected.size()]);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
                }
            }

            // Major code 1; NOTE(review): presumably ParseStatus.SUCCESS - confirm.
            ParseStatus status = new ParseStatus(1);
            if (metaTags.getRefresh()) {
                // Minor code 100; NOTE(review): presumably ParseStatus.SUCCESS_REDIRECT - confirm.
                status.setMinorCode((short) 100);
                status.setArgs(new String[]{metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())});
            }

            ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), parseMeta);
            ParseResult initial = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
            ParseResult filtered = this.htmlParseFilters.filter(content, initial, metaTags, root);

            if (metaTags.getNoCache()) {
                Iterator<Map.Entry<Text, Parse>> entries = filtered.iterator();
                while (entries.hasNext()) {
                    entries.next().getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, this.cachingPolicy);
                }
            }
            return filtered;
        } catch (IOException | DOMException | SAXException e) {
            // These are reported through the parse status only, without an error log entry.
            return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
        } catch (Exception e) {
            LOG.error("Error: ", e);
            return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
        }
    }

    /**
     * Builds a DOM fragment from the input using the configured parser backend.
     *
     * @param inputSource the document to parse
     * @return the parsed fragment
     * @throws Exception if the selected backend fails
     */
    private DocumentFragment parse(InputSource inputSource) throws Exception {
        // Constant-first comparison: anything other than "tagsoup" (including null) selects NekoHTML.
        if ("tagsoup".equalsIgnoreCase(this.parserImpl)) {
            return parseTagSoup(inputSource);
        }
        return parseNeko(inputSource);
    }

    /**
     * Parses the input with TagSoup, feeding SAX events into a {@link DOMBuilder}.
     *
     * @param inputSource the document to parse
     * @return the fragment populated by the builder
     * @throws Exception if TagSoup configuration or parsing fails
     */
    private DocumentFragment parseTagSoup(InputSource inputSource) throws Exception {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        DocumentFragment fragment = doc.createDocumentFragment();
        DOMBuilder builder = new DOMBuilder((Document) doc, fragment);

        org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
        reader.setContentHandler(builder);
        reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
        reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
        reader.setProperty(org.ccil.cowan.tagsoup.Parser.lexicalHandlerProperty, builder);
        reader.parse(inputSource);
        return fragment;
    }

    /**
     * Parses the input with NekoHTML's {@link DOMFragmentParser}.
     *
     * <p>After the first parse call, the parser is invoked repeatedly on the same input
     * and every non-empty fragment it produces is appended to the result. Collection
     * stops at the first empty fragment or at the first exception; an exception during
     * these follow-up calls is logged and the fragments gathered so far are returned.
     *
     * @param inputSource the document to parse
     * @return a fragment holding all parsed nodes
     * @throws Exception if the initial parse call fails
     */
    private DocumentFragment parseNeko(InputSource inputSource) throws Exception {
        DOMFragmentParser fragmentParser = new DOMFragmentParser();
        try {
            fragmentParser.setFeature("http://cyberneko.org/html/features/augmentations", true);
            fragmentParser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.defaultCharEncoding);
            fragmentParser.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
            fragmentParser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
            fragmentParser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
            fragmentParser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled());
        } catch (SAXException e) {
            // Configuration is best effort: parsing proceeds with whatever settings were applied.
            LOG.debug("Could not apply all NekoHTML parser settings", e);
        }

        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment result = doc.createDocumentFragment();

        DocumentFragment first = doc.createDocumentFragment();
        fragmentParser.parse(inputSource, first);
        result.appendChild(first);

        // The try block encloses the whole loop so that a failing parse call ends the
        // collection instead of being retried indefinitely.
        try {
            while (true) {
                DocumentFragment next = doc.createDocumentFragment();
                fragmentParser.parse(inputSource, next);
                if (!next.hasChildNodes()) {
                    break;
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info(" - new frag, " + next.getChildNodes().getLength() + " nodes.");
                }
                result.appendChild(next);
            }
        } catch (Exception e) {
            LOG.error("Error: ", e);
        }
        return result;
    }

    /**
     * Command-line entry point: parses a local HTML file and prints the parse data and
     * extracted text to standard output.
     *
     * @param strArr command-line arguments; the first element is the path of the file to parse
     * @throws IllegalArgumentException if no file path is supplied
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length < 1) {
            throw new IllegalArgumentException("Usage: HtmlParser <file>");
        }
        String path = strArr[0];
        String url = "file:" + path;
        File file = new File(path);

        byte[] bytes = new byte[(int) file.length()];
        // try-with-resources closes the underlying file stream even when reading fails.
        try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
            in.readFully(bytes);
        }

        Configuration configuration = NutchConfiguration.create();
        HtmlParser htmlParser = new HtmlParser();
        htmlParser.setConf(configuration);

        Content content = new Content(url, url, bytes, MimeTypes.TEXT_HTML, new Metadata(), configuration);
        Parse parse = htmlParser.getParse(content).get(url);
        System.out.println("data: " + parse.getData());
        System.out.println("text: " + parse.getText());
    }

    /**
     * Stores the configuration and (re)initialises everything derived from it: the
     * HTML parse filters, the parser backend name, the default character encoding,
     * the DOM utilities and the caching policy.
     *
     * @param configuration the configuration to use
     */
    @Override // org.apache.hadoop.conf.Configurable
    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.htmlParseFilters = new HtmlParseFilters(getConf());
        this.parserImpl = getConf().get("parser.html.impl", "neko");
        this.defaultCharEncoding = getConf().get("parser.character.encoding.default", "windows-1252");
        this.utils = new DOMContentUtils(configuration);
        this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", "content");
    }

    /**
     * Returns the configuration last passed to {@link #setConf(Configuration)}.
     *
     * @return the current configuration, or {@code null} if none has been set
     */
    @Override // org.apache.hadoop.conf.Configurable
    public Configuration getConf() {
        return this.conf;
    }
}
