package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLString;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/nuxeo/ecm/core/convert/plugins/text/extractors/HtmlParser.class */
public class HtmlParser extends AbstractSAXParser {
    private static final Log log = LogFactory.getLog(HtmlParser.class);
    protected StringBuilder buffer;
    protected String tagFilter;
    protected Boolean inFilter;
    protected Boolean noFilter;
    protected String skipUntillClosed;
    protected final Set<String> newLinesTags;
    protected final Set<String> skippedTags;

    public HtmlParser() {
        super(new HTMLConfiguration());
        this.newLinesTags = new HashSet();
        this.skippedTags = new HashSet();
        init(null);
    }

    public HtmlParser(String str) {
        super(new HTMLConfiguration());
        this.newLinesTags = new HashSet();
        this.skippedTags = new HashSet();
        init(str);
    }

    public void init(String str) {
        try {
            setFeature("http://xml.org/sax/features/validation", false);
            setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        } catch (SAXException e) {
            log.debug("Could not switch parser to non-validating: " + e.getMessage());
        }
        this.inFilter = false;
        if (str == null || "".equals(str)) {
            this.noFilter = true;
        } else {
            this.tagFilter = str;
            this.noFilter = false;
        }
        this.skippedTags.add("script");
        this.skippedTags.add("style");
        this.skippedTags.add("link");
        this.newLinesTags.add("div");
        this.newLinesTags.add("p");
        this.newLinesTags.add("br");
        this.newLinesTags.add("pre");
        this.newLinesTags.add("h1");
        this.newLinesTags.add("h2");
        this.newLinesTags.add("h3");
        this.newLinesTags.add("h4");
        this.newLinesTags.add("h5");
        this.newLinesTags.add("h6");
    }

    public void startElement(QName qName, XMLAttributes xMLAttributes, Augmentations augmentations) {
        super.startElement(qName, xMLAttributes, augmentations);
        if (!this.noFilter.booleanValue() && this.tagFilter.equalsIgnoreCase(qName.localpart)) {
            this.inFilter = true;
        }
        if (this.skipUntillClosed == null && this.skippedTags.contains(qName.localpart.toLowerCase())) {
            this.skipUntillClosed = qName.localpart.toLowerCase();
        }
    }

    public void endElement(QName qName, Augmentations augmentations) {
        super.endElement(qName, augmentations);
        if (!this.noFilter.booleanValue() && this.tagFilter.equals(qName.localpart)) {
            this.inFilter = false;
        }
        if (this.skipUntillClosed != null && this.skipUntillClosed.equals(qName.localpart.toLowerCase())) {
            this.skipUntillClosed = null;
        }
        if (this.newLinesTags.contains(qName.localpart.toLowerCase())) {
            this.buffer.append("\n\n");
        }
    }

    public void startDocument(XMLLocator xMLLocator, String str, NamespaceContext namespaceContext, Augmentations augmentations) {
        super.startDocument(xMLLocator, str, namespaceContext, augmentations);
        this.buffer = new StringBuilder();
    }

    public void characters(XMLString xMLString, Augmentations augmentations) {
        super.characters(xMLString, augmentations);
        if ((this.noFilter.booleanValue() || this.inFilter.booleanValue()) && this.skipUntillClosed == null) {
            this.buffer.append(xMLString.toString());
        }
    }

    public String getContents() {
        return this.buffer.toString().replaceAll(" *\n", "\n").replaceAll(" +", " ").replaceAll("\\n\\n+", "\n\n");
    }
}
