package org.nuxeo.ecm.core.storage;

import com.ibm.icu.text.PluralRules;
import com.sun.istack.localization.Localizable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import net.htmlparser.jericho.Source;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.text.StringEscapeUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.IdRef;
import org.nuxeo.ecm.core.api.IterableQueryResult;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.api.repository.FulltextConfiguration;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.query.sql.NXQL;
import org.nuxeo.ecm.core.repository.RepositoryService;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.core.utils.StringsExtractor;
import org.nuxeo.ecm.core.work.AbstractWork;
import org.nuxeo.ecm.core.work.api.Work;
import org.nuxeo.runtime.api.Framework;

/* loaded from: input_file:org/nuxeo/ecm/core/storage/FulltextExtractorWork.class */
/**
 * Work unit that extracts the fulltext (simple text from string properties and/or binary text converted from
 * blobs) of a document and stores it in fulltext system properties of one or more target documents.
 * <p>
 * The targets are either the document itself, or every non-proxy document whose {@code ecm:fulltextJobId} equals
 * this work's document id, depending on the {@code useJobId} flag.
 */
public class FulltextExtractorWork extends AbstractWork {
    private static final long serialVersionUID = 1L;

    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);

    public static final String SYSPROP_FULLTEXT_SIMPLE = "fulltextSimple";

    public static final String SYSPROP_FULLTEXT_BINARY = "fulltextBinary";

    public static final String SYSPROP_FULLTEXT_JOBID = "fulltextJobId";

    public static final String FULLTEXT_DEFAULT_INDEX = "default";

    protected static final String CATEGORY = "fulltextExtractor";

    protected static final String TITLE = "Fulltext Extractor";

    protected static final String ANY2TEXT_CONVERTER = "any2text";

    /** Number of leading characters inspected when deciding whether a string looks like HTML. */
    protected static final int HTML_MAGIC_OFFSET = 8192;

    protected static final String TEXT_HTML = "text/html";

    // Transient state: (re)computed inside work(), never serialized with the work instance.
    protected transient FulltextConfiguration fulltextConfiguration;

    protected transient DocumentModel document;

    protected transient List<DocumentRef> docsToUpdate;

    protected final boolean updateSimpleText;

    protected final boolean updateBinaryText;

    protected final boolean useJobId;

    /**
     * Creates the work.
     *
     * @param repositoryName first argument handed to {@code setDocument}
     * @param docId second argument handed to {@code setDocument}
     * @param updateSimpleText whether the simple (string properties) fulltext must be extracted
     * @param updateBinaryText whether the binary (blobs) fulltext must be extracted
     * @param useJobId whether target documents are looked up through their fulltext job id instead of updating
     *            only the document itself
     */
    public FulltextExtractorWork(String repositoryName, String docId, boolean updateSimpleText,
            boolean updateBinaryText, boolean useJobId) {
        setDocument(repositoryName, docId);
        this.updateSimpleText = updateSimpleText;
        this.updateBinaryText = updateBinaryText;
        this.useJobId = useJobId;
    }

    @Override
    public String getCategory() {
        return CATEGORY;
    }

    @Override
    public String getTitle() {
        return TITLE;
    }

    @Override
    public int getRetryCount() {
        return 1;
    }

    /**
     * Opens a system session, loads the document, determines the documents to update, then extracts the
     * fulltext, writes it and saves the session. Does nothing if the session has no principal, if the document
     * does not exist, or if there is no document to update.
     */
    @Override
    public void work() {
        openSystemSession();
        if (session.getPrincipal() == null) {
            // session is not usable; nothing can be done
            return;
        }
        IdRef ref = new IdRef(docId);
        if (!session.exists(ref)) {
            return;
        }
        document = session.getDocument(ref);
        findDocsToUpdate();
        if (docsToUpdate.isEmpty()) {
            return;
        }
        initFulltextConfiguration();

        setStatus("Extracting");
        setProgress(Work.Progress.PROGRESS_0_PC);
        extractAndUpdate();

        setStatus("Saving");
        session.save();

        setProgress(Work.Progress.PROGRESS_100_PC);
        setStatus("Done");
    }

    /** Fetches the fulltext configuration of this work's repository from the {@link RepositoryService}. */
    protected void initFulltextConfiguration() {
        RepositoryService repositoryService = Framework.getService(RepositoryService.class);
        fulltextConfiguration = repositoryService.getRepository(repositoryName).getFulltextConfiguration();
    }

    /**
     * Computes {@link #docsToUpdate}: the document itself when job ids are not used, otherwise every non-proxy
     * document whose {@code ecm:fulltextJobId} is this work's document id.
     */
    protected void findDocsToUpdate() {
        if (!useJobId) {
            docsToUpdate = Collections.singletonList(document.getRef());
            return;
        }
        String query = String.format(
                "SELECT ecm:uuid FROM Document WHERE ecm:fulltextJobId = '%s' AND ecm:isProxy = 0", docId);
        docsToUpdate = new ArrayList<>();
        // try-with-resources guarantees the query result is closed, including when iteration fails
        try (IterableQueryResult result = session.queryAndFetch(query, NXQL.NXQL)) {
            for (Map<String, Serializable> row : result) {
                docsToUpdate.add(new IdRef((String) row.get(NXQL.ECM_UUID)));
            }
        }
    }

    /**
     * Runs the requested extractions, then resets the fulltext job id system property of every updated
     * document to {@code null}.
     */
    protected void extractAndUpdate() {
        if (updateSimpleText) {
            extractAndUpdateSimpleText();
        }
        if (updateBinaryText) {
            extractAndUpdateBinaryText();
        }
        for (DocumentRef ref : docsToUpdate) {
            session.setDocumentSystemProp(ref, SYSPROP_FULLTEXT_JOBID, null);
        }
    }

    /**
     * For each configured index that covers simple text, joins the cleaned-up strings of the document (space
     * separated, with a leading and a trailing space), truncates the result to the configured size limit and
     * writes it to the matching system property of every document to update.
     * <p>
     * Does nothing when fulltext search is disabled in the configuration.
     */
    protected void extractAndUpdateSimpleText() {
        if (fulltextConfiguration.fulltextSearchDisabled) {
            return;
        }
        for (String indexName : fulltextConfiguration.indexNames) {
            boolean indexesAll = fulltextConfiguration.indexesAllSimple.contains(indexName);
            if (!indexesAll && fulltextConfiguration.propPathsByIndexSimple.get(indexName) == null) {
                // this index has no simple-text configuration
                continue;
            }
            // null included paths are passed when the index covers all simple properties
            String joined = new StringsExtractor().findStrings(document,
                    indexesAll ? null : fulltextConfiguration.propPathsByIndexSimple.get(indexName),
                    fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName))
                                                  .stream()
                                                  .map(this::stringToText)
                                                  .collect(Collectors.joining(" ", " ", " "));
            String text = limitStringSize(joined, fulltextConfiguration.fulltextFieldSizeLimit);
            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName);
            for (DocumentRef ref : docsToUpdate) {
                session.setDocumentSystemProp(ref, property, text);
            }
        }
    }

    /**
     * For each configured index that covers binary text, converts the document's blobs to text, joins the
     * results (space separated, with a leading and a trailing space), truncates to the configured size limit
     * and writes the value to the matching system property of every document to update.
     * <p>
     * Unlike {@link #extractAndUpdateSimpleText()}, this method does not look at the
     * {@code fulltextSearchDisabled} flag.
     */
    protected void extractAndUpdateBinaryText() {
        BlobsExtractor blobsExtractor = new BlobsExtractor();
        // conversions are cached per blob instance so a blob shared by several indexes is converted only once
        Map<Blob, String> textByBlob = new IdentityHashMap<>();
        for (String indexName : fulltextConfiguration.indexNames) {
            boolean indexesAll = fulltextConfiguration.indexesAllBinary.contains(indexName);
            if (!indexesAll && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
                // this index has no binary-text configuration
                continue;
            }
            blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), indexesAll);
            List<String> texts = new ArrayList<>();
            for (Blob blob : blobsExtractor.getBlobs(document)) {
                texts.add(textByBlob.computeIfAbsent(blob, this::blobToText));
            }
            String text = limitStringSize(" " + String.join(" ", texts) + " ",
                    fulltextConfiguration.fulltextFieldSizeLimit);
            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName);
            for (DocumentRef ref : docsToUpdate) {
                session.setDocumentSystemProp(ref, property, text);
            }
        }
    }

    /**
     * Cleans up a string property value: strips HTML markup when the value looks like an HTML document, then
     * unescapes HTML entities.
     *
     * @param string the raw value
     * @return the cleaned-up text
     */
    protected String stringToText(String string) {
        return removeEntities(removeHtml(string));
    }

    /**
     * Renders the string as plain text if it looks like an HTML document, i.e. if its first
     * {@link #HTML_MAGIC_OFFSET} characters, lower-cased, start with {@code <!doctype html} or contain
     * {@code <html}. Otherwise the string is returned untouched.
     *
     * @param string the value to inspect
     * @return the rendered text, or the original string
     */
    protected String removeHtml(String string) {
        String head = string.substring(0, Math.min(string.length(), HTML_MAGIC_OFFSET)).toLowerCase();
        if (head.startsWith("<!doctype html") || head.contains("<html")) {
            // render without hyperlink URLs and without font style decorations
            return new Source(string).getRenderer()
                                     .setIncludeHyperlinkURLs(false)
                                     .setDecorateFontStyles(false)
                                     .toString();
        }
        return string;
    }

    /**
     * Unescapes HTML 4 entities, but only when the string contains an ampersand.
     *
     * @param string the value to unescape
     * @return the unescaped text, or the original string when it holds no {@code &}
     */
    protected String removeEntities(String string) {
        if (string.indexOf('&') >= 0) {
            return StringEscapeUtils.unescapeHtml4(string);
        }
        return string;
    }

    /**
     * Converts a blob to text through the {@value #ANY2TEXT_CONVERTER} converter and replaces NUL characters
     * by spaces.
     *
     * @param blob the blob to convert
     * @return the extracted text, or an empty string when no conversion service is available, when the
     *         conversion yields no blob, or when the conversion fails (the failure is logged, not rethrown)
     */
    protected String blobToText(Blob blob) {
        try {
            ConversionService conversionService = Framework.getService(ConversionService.class);
            if (conversionService == null) {
                log.debug("No ConversionService available");
                return "";
            }
            BlobHolder converted = conversionService.convert(ANY2TEXT_CONVERTER, new SimpleBlobHolder(blob), null);
            if (converted == null) {
                return "";
            }
            Blob textBlob = converted.getBlob();
            if (textBlob == null) {
                return "";
            }
            String text = textBlob.getString();
            if (text.indexOf('\0') >= 0) {
                // NUL characters are replaced by spaces before the text is stored
                text = text.replace("\0", " ");
            }
            return text;
        } catch (IOException | ConversionException e) {
            String message = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: "
                    + docId + ": " + e;
            // short message at WARN level, full stack trace only at DEBUG level
            log.warn(message);
            log.debug(message, e);
            return "";
        }
    }

    /**
     * Truncates a string to at most {@code maxSize} characters.
     *
     * @param string the string to limit
     * @param maxSize the maximum length; {@code 0} disables the limit
     * @return the string, truncated if needed
     */
    protected String limitStringSize(String string, int maxSize) {
        if (maxSize == 0 || string.length() <= maxSize) {
            return string;
        }
        if (log.isDebugEnabled()) {
            log.debug(String.format("Fulltext extract of length: %s for document: %s truncated to length: %s",
                    string.length(), docId, maxSize));
        }
        return string.substring(0, maxSize);
    }

    /**
     * Builds the name of the system property holding the fulltext of an index.
     *
     * @param name the base property name
     * @param indexName the index name
     * @return {@code name} for the {@value #FULLTEXT_DEFAULT_INDEX} index, {@code name_indexName} otherwise
     */
    protected String getFulltextPropertyName(String name, String indexName) {
        if (FULLTEXT_DEFAULT_INDEX.equals(indexName)) {
            return name;
        }
        return name + '_' + indexName;
    }
}
