package org.nuxeo.ecm.core.storage;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.ecm.core.api.DocumentLocation;
import org.nuxeo.runtime.api.Framework;

/* loaded from: input_file:org/nuxeo/ecm/core/storage/DefaultFulltextParser.class */
/**
 * Default {@link FulltextParser}: optionally strips HTML markup from the input, unescapes HTML
 * entities, splits the result into words with {@link #WORD_SPLIT_PATTERN} and lower-cases each
 * word.
 */
public class DefaultFulltextParser implements FulltextParser {
    /** Name of the framework property that can override the word-splitting regular expression. */
    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";

    /** Default word separator: one or more whitespace or punctuation characters. */
    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";

    /** Compiled word separator, read once from {@link #WORD_SPLIT_PROP} when the class is loaded. */
    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP, WORD_SPLIT_DEF));

    /** Maximum number of leading characters inspected when sniffing for HTML content. */
    protected static final int HTML_MAGIC_OFFSET = 8192;

    /** MIME type for which markup is stripped before the text is split into words. */
    protected static final String TEXT_HTML = "text/html";

    /**
     * Parses the text without a MIME type or document location.
     *
     * @param str the text to parse; {@code null} yields an empty string
     * @param str2 forwarded to the other overloads; not read by this implementation
     * @return the lower-cased words joined by single spaces
     */
    @Override
    public String parse(String str, String str2) {
        return parse(str, str2, null, null);
    }

    /**
     * Parses the text without a MIME type or document location, appending the words to a list.
     *
     * @param str the text to parse; {@code null} adds nothing
     * @param str2 forwarded to the other overloads; not read by this implementation
     * @param list the list that receives the lower-cased words
     */
    @Override
    public void parse(String str, String str2, List<String> list) {
        parse(str, str2, null, null, list);
    }

    /**
     * Parses the text and returns the resulting words as one space-separated string.
     *
     * @param str the text to parse; {@code null} yields an empty string
     * @param str2 forwarded to {@link #preprocessField}; not read by this implementation
     * @param str3 the MIME type of the text, or {@code null}/empty to sniff for HTML
     * @param documentLocation forwarded to the list-based overload; not read by this implementation
     * @return the lower-cased words joined by single spaces
     */
    @Override
    public String parse(String str, String str2, String str3, DocumentLocation documentLocation) {
        List<String> words = new ArrayList<String>();
        parse(str, str2, str3, documentLocation, words);
        return StringUtils.join(words, ' ');
    }

    /**
     * Preprocesses the text, splits it on {@link #WORD_SPLIT_PATTERN} and appends every non-empty
     * word, lower-cased, to the given list.
     *
     * @param str the text to parse; {@code null} adds nothing
     * @param str2 forwarded to {@link #preprocessField}; not read by this implementation
     * @param str3 the MIME type of the text, or {@code null}/empty to sniff for HTML
     * @param documentLocation not read by this implementation
     * @param list the list that receives the lower-cased words
     */
    @Override
    public void parse(String str, String str2, String str3, DocumentLocation documentLocation, List<String> list) {
        String text = preprocessField(str, str2, str3);
        if (text == null) {
            // preprocessField passes null through; Pattern.split(null) would throw.
            return;
        }
        for (String word : WORD_SPLIT_PATTERN.split(text)) {
            if (!word.isEmpty()) {
                // Locale.ROOT keeps the produced tokens independent of the JVM default locale.
                list.add(word.toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * Prepares raw text for word splitting: strips markup when the content is HTML, then
     * unescapes HTML entities.
     * <p>
     * When no MIME type is given, the first {@link #HTML_MAGIC_OFFSET} characters are inspected
     * and the text is treated as HTML if they start with an HTML doctype or contain an
     * {@code <html} tag (case-insensitively).
     *
     * @param str the raw text, may be {@code null}
     * @param str2 not read by this implementation
     * @param str3 the MIME type of the text, or {@code null}/empty to sniff for HTML
     * @return the preprocessed text, or {@code null} if {@code str} is {@code null}
     */
    protected String preprocessField(String str, String str2, String str3) {
        if (str == null) {
            return null;
        }
        if (StringUtils.isEmpty(str3)) {
            String head = str.substring(0, Math.min(str.length(), HTML_MAGIC_OFFSET)).toLowerCase(Locale.ROOT);
            if (head.startsWith("<!doctype html") || head.contains("<html")) {
                str3 = TEXT_HTML;
            }
        }
        if (TEXT_HTML.equals(str3)) {
            str = removeHtml(str);
        }
        return StringEscapeUtils.unescapeHtml(str);
    }

    /**
     * Renders HTML source as text using the Jericho {@link Renderer}, with hyperlink URLs and
     * font-style decoration turned off.
     *
     * @param str the HTML source
     * @return the rendered text
     */
    protected String removeHtml(String str) {
        Renderer renderer = new Source(str).getRenderer();
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        return renderer.toString();
    }
}
