/*
 * Copyright (c) 2003 by Atlassian Software Systems Pty. Ltd.
 * All rights reserved.
 */

package com.atlassian.bonnie.search;

import com.atlassian.bonnie.BonnieConstants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * The <code>SearchWordsLister</code> is like a reverse builder: it takes a query string and decomposes it into
 * the words that will be searched on and the words that will be ignored.
 * <p>
 * The decomposition is done by running the query through two analyzers and comparing their output: every token
 * produced by the reference analyzer that is not also produced (in order) by the query analyzer is reported as
 * an ignored word.
 * <p>
 * Instances are stateful: the word lists are never cleared, so the results of successive calls to
 * {@link #parseQuery(String)} accumulate in the same lists.
 *
 * @author ROSS
 */
public class SearchWordsLister
{
    /**
     * Upper-case boolean operators that are stripped from the query before analysis. The word boundaries ensure
     * that only stand-alone operators are removed and words that merely contain them (e.g. "ORANGE") are kept.
     */
    private static final String KEYWORDS_REGEX = "\\b(?:AND|NOT|OR)\\b";

    private final List<String> ignoredWords = new ArrayList<String>();
    private final List<String> searchWords = new ArrayList<String>();
    private final Analyzer referenceAnalyzer;
    private final Analyzer queryAnalyzer;

    /**
     * Default Constructor uses the StandardAnalyzer to parse queries. The reference analyzer is built with an
     * empty stop word set, the query analyzer with StandardAnalyzer's own defaults.
     */
    public SearchWordsLister()
    {
        this(new StandardAnalyzer(BonnieConstants.LUCENE_VERSION, CharArraySet.EMPTY_SET),
                new StandardAnalyzer(BonnieConstants.LUCENE_VERSION));
    }

    /**
     * Construct a SearchWordsLister that uses the supplied analyzers. The ignored words are the difference between
     * the output of referenceAnalyzer and queryAnalyzer.
     *
     * @param referenceAnalyzer a query analyzer which will not remove any stop words; must not be null.
     * @param queryAnalyzer a query Analyzer which may remove stop words from the query; must not be null.
     */
    public SearchWordsLister(Analyzer referenceAnalyzer, Analyzer queryAnalyzer)
    {
        this.referenceAnalyzer = referenceAnalyzer;
        this.queryAnalyzer = queryAnalyzer;
    }

    /**
     * @return the live list of words dropped by the query analyzer, without duplicates, in the order in which
     *         they were first seen. Never null.
     */
    public List<String> getIgnoredWords()
    {
        return ignoredWords;
    }

    /**
     * @return the live list of words produced by both analyzers, in query order (duplicates are kept).
     *         Never null.
     */
    public List<String> getSearchWords()
    {
        return searchWords;
    }

    /**
     * @return the ignored words joined with ", ", or an empty string if there are none.
     */
    public String getIgnoredWordsAsString()
    {
        return listToDelimitedString(ignoredWords, ", ");
    }

    /**
     * @return the search words joined with ", ", or an empty string if there are none.
     */
    public String getSearchWordsAsString()
    {
        return listToDelimitedString(searchWords, ", ");
    }

    /**
     * Joins the elements of the list, placing the delimiter between (but not after) the elements.
     *
     * @param list the elements to join
     * @param delimiter the text inserted between two consecutive elements
     * @return the joined string; empty for an empty list
     */
    private String listToDelimitedString(List<String> list, String delimiter)
    {
        StringBuilder buffer = new StringBuilder();

        for (int i = 0; i < list.size(); i++)
        {
            if (i > 0)
            {
                buffer.append(delimiter);
            }
            buffer.append(list.get(i));
        }
        return buffer.toString();
    }

    /**
     * Analyzes the query and records which of its words will be searched on and which will be ignored. The
     * results are appended to the lists available through {@link #getSearchWords()} and
     * {@link #getIgnoredWords()} (and their <code>AsString</code> variants).
     * <p>
     * Stand-alone upper-case <code>AND</code>, <code>NOT</code> and <code>OR</code> operators are removed from
     * the query before it is analyzed. A <code>null</code> query is treated as empty and records nothing.
     *
     * @param query the query being invoked
     * @throws java.io.IOException if one of the analyzers fails while tokenizing the query
     */
    public void parseQuery(String query) throws IOException
    {
        if (query == null)
        {
            return;
        }

        //strip out any search keywords
        String strippedQuery = query.replaceAll(KEYWORDS_REGEX, "");

        //the token stream used to search with
        TokenStream queryStream = queryAnalyzer.tokenStream(null, new StringReader(strippedQuery));
        try
        {
            //the token stream without StopFilter processing.  Other filters will be applied
            TokenStream referenceStream = referenceAnalyzer.tokenStream(null, new StringReader(strippedQuery));
            try
            {
                collectWords(queryStream, referenceStream);
            }
            finally
            {
                referenceStream.close();
            }
        }
        finally
        {
            // nested finally blocks guarantee that this stream is closed even if opening, consuming or
            // closing the reference stream failed
            queryStream.close();
        }
    }

    /**
     * Walks the reference stream and classifies each of its tokens: a token equal to the current token of the
     * query stream is a search word (and advances the query stream), any other token is an ignored word.
     *
     * @param queryStream the freshly obtained stream of the query analyzer
     * @param referenceStream the freshly obtained stream of the reference analyzer
     * @throws IOException if either stream fails
     */
    private void collectWords(TokenStream queryStream, TokenStream referenceStream) throws IOException
    {
        CharTermAttribute queryTerm = queryStream.addAttribute(CharTermAttribute.class);
        CharTermAttribute referenceTerm = referenceStream.addAttribute(CharTermAttribute.class);

        queryStream.reset();
        referenceStream.reset();

        // End of stream is taken from incrementToken()'s return value; the term attribute's content after
        // the stream is exhausted is not relied upon.
        boolean hasQueryToken = queryStream.incrementToken();
        while (referenceStream.incrementToken())
        {
            String referenceText = referenceTerm.toString();
            if (hasQueryToken && referenceText.equals(queryTerm.toString()))
            {
                searchWords.add(referenceText);
                hasQueryToken = queryStream.incrementToken();
            }
            else if (!ignoredWords.contains(referenceText))
            {
                // also reached once the query stream is exhausted, so trailing stop words are recorded too
                ignoredWords.add(referenceText);
            }
        }

        queryStream.end();
        referenceStream.end();
    }

}
