/*
 * Decompiled with CFR 0.152.
 */
package org.nuxeo.ecm.core.storage;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import net.htmlparser.jericho.Source;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.text.StringEscapeUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.IdRef;
import org.nuxeo.ecm.core.api.IterableQueryResult;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.api.repository.FulltextConfiguration;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.model.Repository;
import org.nuxeo.ecm.core.repository.RepositoryService;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.core.utils.StringsExtractor;
import org.nuxeo.ecm.core.work.AbstractWork;
import org.nuxeo.ecm.core.work.api.Work;
import org.nuxeo.runtime.api.Framework;

/**
 * Work that extracts fulltext (from string properties and/or from blobs) for a document and stores the
 * result in system properties on every document that has to be updated.
 * <p>
 * The documents to update are either the document itself, or (when {@code useJobId} is set) every non-proxy
 * document whose {@code ecm:fulltextJobId} equals the id of the source document.
 */
public class FulltextExtractorWork extends AbstractWork {
    private static final long serialVersionUID = 1L;
    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);

    /** Base name of the system property receiving the text extracted from string properties. */
    public static final String SYSPROP_FULLTEXT_SIMPLE = "fulltextSimple";
    /** Base name of the system property receiving the text extracted from blobs. */
    public static final String SYSPROP_FULLTEXT_BINARY = "fulltextBinary";
    /** System property holding the fulltext job id; reset to {@code null} once extraction is done. */
    public static final String SYSPROP_FULLTEXT_JOBID = "fulltextJobId";
    /** Name of the index whose properties are stored without an index-name suffix. */
    public static final String FULLTEXT_DEFAULT_INDEX = "default";

    protected static final String CATEGORY = "fulltextExtractor";
    protected static final String TITLE = "Fulltext Extractor";
    /** Name of the converter used to turn a blob into text. */
    protected static final String ANY2TEXT_CONVERTER = "any2text";
    /** Number of leading characters inspected when deciding whether a string is HTML. */
    protected static final int HTML_MAGIC_OFFSET = 8192;
    protected static final String TEXT_HTML = "text/html";

    // Transient state, (re)computed by work() on each run.
    protected transient FulltextConfiguration fulltextConfiguration;
    protected transient DocumentModel document;
    protected transient List<DocumentRef> docsToUpdate;

    protected final boolean updateSimpleText;
    protected final boolean updateBinaryText;
    protected final boolean useJobId;

    /**
     * @param repositoryName the name of the repository holding the document
     * @param docId the id of the document to extract text from
     * @param updateSimpleText whether text from string properties must be extracted
     * @param updateBinaryText whether text from blobs must be extracted
     * @param useJobId if {@code true}, {@code docId} is used as a fulltext job id to find the documents to
     *            update; otherwise only the document itself is updated
     */
    public FulltextExtractorWork(String repositoryName, String docId, boolean updateSimpleText,
            boolean updateBinaryText, boolean useJobId) {
        setDocument(repositoryName, docId);
        this.updateSimpleText = updateSimpleText;
        this.updateBinaryText = updateBinaryText;
        this.useJobId = useJobId;
    }

    public String getCategory() {
        return CATEGORY;
    }

    public String getTitle() {
        return TITLE;
    }

    public int getRetryCount() {
        return 1;
    }

    /**
     * Runs the extraction: opens a system session, loads the document, finds the documents to update,
     * extracts the text and saves the session.
     * <p>
     * Returns silently when the session has no principal, when the document does not exist, or when there
     * is no document to update.
     */
    public void work() {
        openSystemSession();
        if (session.getPrincipal() == null) {
            return;
        }
        DocumentRef docRef = new IdRef(docId);
        if (!session.exists(docRef)) {
            return;
        }
        document = session.getDocument(docRef);
        findDocsToUpdate();
        if (docsToUpdate.isEmpty()) {
            return;
        }
        initFulltextConfiguration();

        setStatus("Extracting");
        setProgress(Work.Progress.PROGRESS_0_PC);
        extractAndUpdate();

        setStatus("Saving");
        session.save();
        setProgress(Work.Progress.PROGRESS_100_PC);
        setStatus("Done");
    }

    /** Looks up the fulltext configuration of the repository this work runs on. */
    protected void initFulltextConfiguration() {
        RepositoryService repositoryService = Framework.getService(RepositoryService.class);
        Repository repository = repositoryService.getRepository(repositoryName);
        fulltextConfiguration = repository.getFulltextConfiguration();
    }

    /**
     * Computes {@link #docsToUpdate}: the document itself, or, when a job id is used, every non-proxy
     * document returned by the NXQL query on {@code ecm:fulltextJobId}.
     */
    protected void findDocsToUpdate() {
        if (!useJobId) {
            docsToUpdate = Collections.singletonList(document.getRef());
            return;
        }
        String query = String.format(
                "SELECT ecm:uuid FROM Document WHERE ecm:fulltextJobId = '%s' AND ecm:isProxy = 0", docId);
        docsToUpdate = new ArrayList<>();
        try (IterableQueryResult it = session.queryAndFetch(query, "NXQL")) {
            for (Map<String, Serializable> row : it) {
                docsToUpdate.add(new IdRef((String) row.get("ecm:uuid")));
            }
        }
    }

    /**
     * Performs the requested extractions, then resets the fulltext job id on all the documents to update.
     */
    protected void extractAndUpdate() {
        if (updateSimpleText) {
            extractAndUpdateSimpleText();
        }
        if (updateBinaryText) {
            extractAndUpdateBinaryText();
        }
        // the job is finished for these documents: clear the job id
        for (DocumentRef docRef : docsToUpdate) {
            session.setDocumentSystemProp(docRef, SYSPROP_FULLTEXT_JOBID, null);
        }
    }

    /**
     * Extracts, for each configured index, the text of the document's string properties and stores it on
     * every document to update. Does nothing when there is no fulltext configuration or when fulltext
     * search is disabled.
     */
    protected void extractAndUpdateSimpleText() {
        if (fulltextConfiguration == null || fulltextConfiguration.fulltextSearchDisabled) {
            return;
        }
        for (String indexName : fulltextConfiguration.indexNames) {
            boolean allSimple = fulltextConfiguration.indexesAllSimple.contains(indexName);
            Set<String> configuredPaths = fulltextConfiguration.propPathsByIndexSimple.get(indexName);
            if (!allSimple && configuredPaths == null) {
                // nothing to index for this index
                continue;
            }
            // null included paths are passed when the index covers all simple properties
            Set<String> includedPaths = allSimple ? null : configuredPaths;
            Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName);
            List<String> strings = new StringsExtractor().findStrings(document, includedPaths, excludedPaths);
            // the joined text is surrounded by single spaces
            String text = strings.stream().map(this::stringToText).collect(Collectors.joining(" ", " ", " "));
            text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit);
            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName);
            for (DocumentRef docRef : docsToUpdate) {
                session.setDocumentSystemProp(docRef, property, text);
            }
        }
    }

    /**
     * Extracts, for each configured index, the text of the document's blobs and stores it on every document
     * to update. Does nothing when there is no fulltext configuration.
     */
    protected void extractAndUpdateBinaryText() {
        if (fulltextConfiguration == null) {
            // same guard as for simple text: without configuration there is no index to fill
            return;
        }
        BlobsExtractor blobsExtractor = new BlobsExtractor();
        // conversion results are cached per blob instance, so a blob appearing in several indexes is
        // converted only once
        Map<Blob, String> blobsText = new IdentityHashMap<>();
        for (String indexName : fulltextConfiguration.indexNames) {
            boolean allBinary = fulltextConfiguration.indexesAllBinary.contains(indexName);
            Set<String> includedPaths = fulltextConfiguration.propPathsByIndexBinary.get(indexName);
            if (!allBinary && includedPaths == null) {
                // nothing to index for this index
                continue;
            }
            Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName);
            blobsExtractor.setExtractorProperties(includedPaths, excludedPaths, allBinary);
            List<String> strings = new ArrayList<>();
            for (Blob blob : blobsExtractor.getBlobs(document)) {
                strings.add(blobsText.computeIfAbsent(blob, this::blobToText));
            }
            // the joined text is surrounded by single spaces
            String text = " " + String.join(" ", strings) + " ";
            text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit);
            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName);
            for (DocumentRef docRef : docsToUpdate) {
                session.setDocumentSystemProp(docRef, property, text);
            }
        }
    }

    /**
     * Converts a string property value to plain text by rendering HTML (if any) then unescaping entities.
     *
     * @param string the string to convert
     * @return the plain text
     */
    protected String stringToText(String string) {
        string = removeHtml(string);
        string = removeEntities(string);
        return string;
    }

    /**
     * Renders the string as text if it looks like an HTML document, i.e. if its first
     * {@link #HTML_MAGIC_OFFSET} characters start with {@code <!doctype html} or contain {@code <html}
     * (case-insensitively).
     *
     * @param string the string to check
     * @return the rendered text, or the unchanged string if it does not look like HTML
     */
    protected String removeHtml(String string) {
        String initial = string.substring(0, Math.min(string.length(), HTML_MAGIC_OFFSET)).toLowerCase();
        if (initial.startsWith("<!doctype html") || initial.contains("<html")) {
            string = new Source(string).getRenderer()
                                       .setIncludeHyperlinkURLs(false)
                                       .setDecorateFontStyles(false)
                                       .toString();
        }
        return string;
    }

    /**
     * Unescapes HTML 4 entities. The unescaping is skipped when the string contains no {@code &}.
     *
     * @param string the string to unescape
     * @return the unescaped string
     */
    protected String removeEntities(String string) {
        if (string.indexOf('&') >= 0) {
            string = StringEscapeUtils.unescapeHtml4(string);
        }
        return string;
    }

    /**
     * Converts a blob to text using the {@value #ANY2TEXT_CONVERTER} converter.
     * <p>
     * Conversion failures are logged and never propagated.
     *
     * @param blob the blob to convert
     * @return the extracted text with NUL characters replaced by spaces, or the empty string if there is no
     *         conversion service, no conversion result, or if the conversion failed
     */
    protected String blobToText(Blob blob) {
        try {
            ConversionService conversionService = Framework.getService(ConversionService.class);
            if (conversionService == null) {
                log.debug("No ConversionService available");
                return "";
            }
            BlobHolder blobHolder = conversionService.convert(ANY2TEXT_CONVERTER, new SimpleBlobHolder(blob), null);
            if (blobHolder == null) {
                return "";
            }
            Blob resultBlob = blobHolder.getBlob();
            if (resultBlob == null) {
                return "";
            }
            String string = resultBlob.getString();
            if (string == null) {
                // treat a missing string like a missing result rather than failing on the dereference below
                return "";
            }
            // NUL characters are replaced by spaces before the text is stored
            return string.replace('\0', ' ');
        } catch (IOException | ConversionException e) {
            String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
                    + ": " + e;
            // one-line warning; the stack trace is only logged at debug level
            log.warn(msg);
            log.debug(msg, e);
            return "";
        }
    }

    /**
     * Truncates a string to a maximum number of characters.
     *
     * @param string the string to truncate
     * @param maxSize the maximum length, or {@code 0} for no limit
     * @return the string, truncated to {@code maxSize} characters if it was longer
     */
    protected String limitStringSize(String string, int maxSize) {
        if (maxSize != 0 && string.length() > maxSize) {
            if (log.isDebugEnabled()) {
                log.debug(String.format("Fulltext extract of length: %s for document: %s truncated to length: %s",
                        string.length(), docId, maxSize));
            }
            string = string.substring(0, maxSize);
        }
        return string;
    }

    /**
     * Computes the name of the system property for a given index.
     *
     * @param name the base property name
     * @param indexName the index name
     * @return {@code name} for the default index, otherwise {@code name + "_" + indexName}
     */
    protected String getFulltextPropertyName(String name, String indexName) {
        if (!FULLTEXT_DEFAULT_INDEX.equals(indexName)) {
            name = name + "_" + indexName;
        }
        return name;
    }
}

