package com.atlassian.bonnie.search.summary;

import com.atlassian.bonnie.BonnieConstants;
import com.atlassian.bonnie.ILuceneConnection;
import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
import com.atlassian.bonnie.search.BaseDocumentBuilder;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;


/**
 * Originally from org.apache.nutch.searcher.Summarizer v 0.7 (Revision: <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=179640&view=markup">179640</a>)
 * <p>
 * Implements hit summarization using a sliding window and various document fragments.
 * <p>
 * The text is tokenized with the configured {@link Analyzer}; tokens that match one of the query terms are
 * highlighted and surrounded by up to {@code sumContext} tokens of context, and the best-ranked excerpts are
 * concatenated until roughly {@code sumLength} tokens have been emitted.
 * <p>
 * An analyzer must be supplied (constructor, {@link #setAnalyzer(Analyzer)} or
 * {@link #setAnalyzerFactory(LuceneAnalyzerFactory)}) before a summary is requested.
 */
public class Summarizer {
    private static final Logger log = LoggerFactory.getLogger(Summarizer.class);

    /**
     * The default number of context terms to display preceding and following matches.
     */
    private static final int DEFAULT_SUM_CONTEXT = 10;

    /**
     * The default total number of terms to display in a summary.
     */
    private static final int DEFAULT_SUM_LENGTH = 30;

    /** Analyzer used to tokenize both the summarized text and the query. */
    private Analyzer analyzer;

    // NOTE(review): this field is not referenced anywhere in this class. The wildcard expansion in getTerms()
    // carries a comment about using a standard analyzer but actually passes 'analyzer' - confirm which is intended.
    private StandardAnalyzer standardAnalyzer = new StandardAnalyzer(BonnieConstants.LUCENE_VERSION);

    /** Number of tokens of context kept before and after a matching token. */
    private int sumContext = DEFAULT_SUM_CONTEXT;

    /** Upper bound on the number of tokens in one excerpt and in the assembled summary. */
    private int sumLength = DEFAULT_SUM_LENGTH;

    /** Optional index access; when present, wildcard queries are expanded against the indexed terms. */
    private ILuceneConnection luceneConnection;

    /**
     * Creates a summarizer with default context/length and no analyzer; an analyzer has to be injected through
     * one of the setters before use.
     */
    @SuppressWarnings("unused")
    public Summarizer() {
    }

    /**
     * Creates a summarizer with default context/length.
     *
     * @param analyzer the analyzer used to tokenize text and queries
     */
    public Summarizer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Creates a fully configured summarizer.
     *
     * @param analyzer         the analyzer used to tokenize text and queries
     * @param sumContext       number of context tokens shown around each match
     * @param sumLength        maximum number of tokens in the summary
     * @param luceneConnection index access used to expand wildcard queries, may be {@code null}
     */
    public Summarizer(Analyzer analyzer, int sumContext, int sumLength, ILuceneConnection luceneConnection) {
        this.analyzer = analyzer;
        this.sumContext = sumContext;
        this.sumLength = sumLength;
        this.luceneConnection = luceneConnection;
    }

    /**
     * Returns a summary of the given text without any highlighting; equivalent to
     * {@code getSummary(text, null)}.
     *
     * @param text the text to summarize
     * @return the summary, empty when the text yields no tokens
     * @throws IOException if tokenizing the text fails
     */
    public Summary getSummary(String text) throws IOException {
        return this.getSummary(text, null);
    }

    /**
     * Returns a summary of the given text, highlighting the tokens that match the query.
     * <p>
     * Simplistic implementation: finds the fragments in the document containing any query terms and ranks them
     * by the number of distinct query terms they contain. When no query term occurs in the text, the summary
     * is the first {@code sumLength} tokens of the text.
     *
     * @param text  the text to summarize; {@code null} or blank text produces an empty summary
     * @param query the query whose terms are highlighted; may be {@code null} or empty
     * @return the assembled summary
     * @throws IOException if tokenizing the text fails
     */
    public Summary getSummary(String text, String query) throws IOException {
        // TODO: check that phrases in the query are matched in the fragment

        log.debug("\n\ntext = {}", text);
        log.debug("query = {}", query);

        Attributes[] tokens = parseText(text);             // parse text to token array

        if (log.isDebugEnabled()) {
            final StringBuilder buf = new StringBuilder();
            for (int i = 0; i < tokens.length; i++) {
                buf.append(tokens[i].getText());
                if (i != (tokens.length - 1)) {
                    buf.append(", ");
                }
            }
            log.debug("tokens = {}", buf);
        }

        if (tokens.length == 0) {
            return new Summary();
        }

        Set<String> highlight = getTerms(query);            // put query terms in table

        log.debug("highlight = {}", highlight);

        // Create a SortedSet that ranks excerpts according to how many query terms are present.
        // Note: a TreeSet treats elements that compare as equal as duplicates, so an excerpt with the same
        // unique-token and fragment counts as one already stored is not added.
        SortedSet<Excerpt> excerptSet = new TreeSet<Excerpt>(Summarizer::compareExcerpts);

        int lastExcerptPos = 0;

        if (!highlight.isEmpty()) // if we have any query terms
        {
            // Iterate through all terms in the document
            for (int i = 0; i < tokens.length; i++) {
                // If we find a term that's in the query...
                if (highlight.contains(tokens[i].getText())) {
                    // Start searching at a point sumContext terms back,
                    // and move sumContext terms into the future.
                    int startToken = (i > sumContext) ? i - sumContext : 0;
                    int endToken = Math.min(i + sumContext, tokens.length);
                    int startOffset = tokens[startToken].getStartOffset();
                    int currentToken = startToken;

                    // Iterate from the start point to the finish, adding
                    // terms all the way.  The end of the passage is always
                    // sumContext beyond the last query-term.
                    Excerpt excerpt = new Excerpt();
                    if (startOffset != 0) {
                        excerpt.add(new Summary.Ellipsis());
                    }

                    // Iterate through as long as we're before the end of
                    // the document and we haven't hit the max-number-of-items
                    // -in-a-summary.
                    while ((currentToken < endToken) && (currentToken - startToken < sumLength)) {
                        // Now grab the hit-element, if present
                        Attributes t = tokens[currentToken];
                        if (highlight.contains(t.getText())) {
                            excerpt.addToken(t.getText());
                            // A token that starts before the text already emitted (e.g. several tokens stacked
                            // on the same offsets) still counts as a hit, but emitting it again would call
                            // substring with begin > end and duplicate text, so only its hit is recorded.
                            if (t.getStartOffset() >= startOffset) {
                                excerpt.add(new Summary.Fragment(text.substring(startOffset, t.getStartOffset())));
                                excerpt.add(new Summary.Highlight(text.substring(t.getStartOffset(), t.getEndOffset())));
                                startOffset = t.getEndOffset();
                            }
                            endToken = Math.min(currentToken + sumContext, tokens.length);
                        }

                        currentToken++;
                    }

                    lastExcerptPos = endToken;

                    // We found the series of search-term hits and added
                    // them (with intervening text) to the excerpt.  Now
                    // we need to add the trailing edge of text.
                    //
                    // So if (currentToken < tokens.length) then there is still trailing
                    // text to add.  (We haven't hit the end of the source doc.)
                    // Add the words since the last hit-term insert.
                    // The end offsets are clamped to startOffset so overlapping tokens cannot invert the range.
                    if (currentToken < tokens.length) {
                        int endOffset = Math.max(startOffset, tokens[currentToken].getEndOffset());
                        excerpt.add(new Summary.Fragment(text.substring(startOffset, endOffset)));
                    } else {
                        // This else block is the fix for JST-884 (Search results truncated after keyword).
                        int endOffset = Math.max(startOffset, tokens[(tokens.length - 1)].getEndOffset());
                        String trailingFragment = text.substring(startOffset, endOffset);
                        if (!StringUtils.isEmpty(trailingFragment)) {
                            excerpt.add(new Summary.Fragment(trailingFragment));
                        }
                    }

                    // Remember how many terms are in this excerpt
                    excerpt.setNumTerms(currentToken - startToken);

                    // Store the excerpt for later sorting
                    excerptSet.add(excerpt);

                    // Skip ahead: the next search for relevant excerpts
                    // resumes sumContext tokens past the end of this one.
                    i = currentToken + sumContext;
                }
            }
        }

        // If the target text doesn't appear, then we just
        // excerpt the first sumLength words from the document.
        if (excerptSet.size() == 0) {
            int excerptLen = Math.min(sumLength, tokens.length);
            lastExcerptPos = excerptLen;

            // A non-positive sumLength leaves nothing to excerpt (and would otherwise index tokens[-1]).
            if (excerptLen > 0) {
                Excerpt excerpt = new Excerpt();
                excerpt.add(new Summary.Fragment(text.substring(tokens[0].getStartOffset(), tokens[excerptLen - 1].getEndOffset())));
                excerpt.setNumTerms(excerptLen);
                excerptSet.add(excerpt);
            }
        }

        log.debug("Found excerpts = {}", excerptSet.size());

        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= sumLength && excerptSet.size() > 0) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

            // Each fragment is charged an equal share of the excerpt's token count.
            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Enumeration<?> e = excerpt.elements(); e.hasMoreElements();) {
                Summary.Fragment f = (Summary.Fragment) e.nextElement();
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= sumLength) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

    /**
     * Orders excerpts by their number of unique query tokens, then by their number of fragments;
     * {@code null} sorts before any excerpt.
     */
    private static int compareExcerpts(Excerpt excerpt1, Excerpt excerpt2) {
        if (excerpt1 == null || excerpt2 == null) {
            if (excerpt1 == excerpt2) {
                return 0;
            }
            return (excerpt1 == null) ? -1 : 1;
        }

        int byUniqueTokens = Integer.compare(excerpt1.numUniqueTokens(), excerpt2.numUniqueTokens());
        if (byUniqueTokens != 0) {
            return byUniqueTokens;
        }
        return Integer.compare(excerpt1.numFragments(), excerpt2.numFragments());
    }

    /**
     * We use Lucene queries, not Nutch ones - so getting terms is a little different for us.
     * <p>
     * The query is run through the analyzer and every resulting token becomes a highlight term. In addition,
     * when a Lucene connection is available and the query contains a {@code *}, the whitespace-separated query
     * words are matched as wildcard patterns against the terms in the index and every matching term is added.
     *
     * @param query the query
     * @return the individual terms of the query; empty when the query is empty or analysis fails
     */
    private Set<String> getTerms(final String query) {
        if (StringUtils.isEmpty(query)) {
            return Collections.emptySet();
        }

        try {
            Set<String> tokens = new HashSet<>();
            if (luceneConnection != null && query.indexOf('*') > -1)            // only expand wildcard queries
            {
                // NOTE(review): the original comment here said "use standardanalyzer to avoid potential
                // double-stem", yet the configured analyzer is what is passed - confirm the intent.
                QueryParser qp = new QueryParser(BonnieConstants.LUCENE_VERSION,
                        BaseDocumentBuilder.FieldName.CONTENT_BODY, analyzer);
                try {
                    // The parsed query itself is not used; parsing only verifies the query is valid Lucene
                    // syntax so that expansion is skipped (with a warning) for malformed queries.
                    qp.parse(query);

                    final List<Pattern> patterns = compileWildcardPatterns(query);
                    @SuppressWarnings("unchecked") // the action below always returns a Set<String>
                    Set<String> expanded = (Set<String>) luceneConnection.withReader(
                            reader -> collectMatchingTerms(MultiFields.getFields(reader), patterns));
                    tokens.addAll(expanded);
                } catch (ParseException e) {
                    log.warn("Error encountered parsing query: {} for wildcard match.", query, e);
                }
            }

            // Reuse parseText so the token stream is always closed, even when analysis fails.
            for (Attributes token : parseText(query)) {
                tokens.add(token.getText());
            }

            return tokens;
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }

        return Collections.emptySet();
    }

    /**
     * Compiles one pattern per non-empty, space-separated word of the query. Compiling up front avoids
     * recompiling the same expressions for every term of every field.
     */
    private static List<Pattern> compileWildcardPatterns(String query) {
        List<Pattern> patterns = new ArrayList<>();
        for (String word : query.split(" ")) {
            if (!word.isEmpty()) {
                patterns.add(wildcardToPattern(word));
            }
        }
        return patterns;
    }

    /**
     * Translates a wildcard expression into a regular expression: {@code *} matches any run of characters,
     * {@code ?} matches a single character and everything else is matched literally. Literal text is quoted,
     * so characters such as {@code .}, {@code +} or {@code (} neither act as regex operators nor make the
     * pattern fail to compile.
     */
    private static Pattern wildcardToPattern(String wildcard) {
        StringBuilder regex = new StringBuilder();
        StringBuilder literal = new StringBuilder();
        for (int i = 0; i < wildcard.length(); i++) {
            char c = wildcard.charAt(i);
            if (c == '*' || c == '?') {
                if (literal.length() > 0) {
                    regex.append(Pattern.quote(literal.toString()));
                    literal.setLength(0);
                }
                regex.append(c == '*' ? ".*" : ".");
            } else {
                literal.append(c);
            }
        }
        if (literal.length() > 0) {
            regex.append(Pattern.quote(literal.toString()));
        }
        return Pattern.compile(regex.toString());
    }

    /**
     * Walks every term of every field and returns those that fully match at least one of the patterns.
     *
     * @param fields   the fields to scan, may be {@code null}
     * @param patterns the patterns a term is tested against
     * @return the matching terms, never {@code null}
     * @throws IOException if reading the terms fails
     */
    private static Set<String> collectMatchingTerms(Fields fields, List<Pattern> patterns) throws IOException {
        Set<String> matches = new HashSet<>();
        if (fields == null || patterns.isEmpty()) {
            return matches;
        }

        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms == null) {
                continue;
            }

            TermsEnum termEnum = terms.iterator(null);
            BytesRef term;
            while ((term = termEnum.next()) != null) {
                String termString = term.utf8ToString();
                for (Pattern pattern : patterns) {
                    if (pattern.matcher(termString).matches()) {
                        matches.add(termString);
                        break;
                    }
                }
            }
        }
        return matches;
    }

    /**
     * Tokenizes the text with the configured analyzer, recording each token's text and character offsets.
     *
     * @param text the text to tokenize
     * @return the tokens in stream order; an empty array for {@code null} or blank text
     * @throws IOException if the analyzer fails
     */
    private Attributes[] parseText(String text) throws IOException {
        if (text == null || text.trim().isEmpty()) {
            return new Attributes[0];
        }

        final List<Attributes> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text))) {
            OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

            ts.reset();
            while (ts.incrementToken()) {
                result.add(new Attributes(charTermAttribute.toString(), offsetAttribute.startOffset(), offsetAttribute.endOffset()));
            }
            ts.end();
        }

        return result.toArray(new Attributes[0]);
    }

    /**
     * Immutable snapshot of one token: its term text and its start/end character offsets as reported by the
     * token stream.
     */
    private static class Attributes {
        private final String text;
        private final int startOffset;
        private final int endOffset;

        private Attributes(String text, int startOffset, int endOffset) {
            this.text = text;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        public String getText() {
            return text;
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int getEndOffset() {
            return endOffset;
        }
    }

    /**
     * Sets the analyzer used to tokenize text and queries.
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Sets the number of context tokens shown around each match.
     */
    @SuppressWarnings("unused")
    public void setSumContext(int sumContext) {
        this.sumContext = sumContext;
    }

    /**
     * Sets the maximum number of tokens in a summary.
     */
    @SuppressWarnings("unused")
    public void setSumLength(int sumLength) {
        this.sumLength = sumLength;
    }

    /**
     * Sets the analyzer to the one created by the given factory.
     */
    @SuppressWarnings("unused")
    public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
        this.analyzer = f.createAnalyzer();
    }

    /**
     * Sets the index connection used to expand wildcard queries.
     */
    @SuppressWarnings("unused")
    public void setLuceneConnection(ILuceneConnection luceneConnection) {
        this.luceneConnection = luceneConnection;
    }
}
