/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

// NOTE(review): the template arguments on the vector<...> declarations
// throughout this file had been stripped in the checked-in copy (most
// likely angle-bracket/HTML mangling).  They are restored below, inferred
// from usage: phrases hold termfreqclassarray, query parameter lists hold
// queryparamclass, and docorder holds int document numbers.

#include "mgqueryfilter.h"
#include "fileutil.h"
#include "phrasesearch.h"
#include "mgsearch.h"
#include "phrases.h"


///////////////////////////////
// methods for resultsorderer_t
///////////////////////////////

resultsorderer_t::resultsorderer_t() {
  clear ();
}

// Reset to the default ordering: by document weight only, with no
// document set attached.
void resultsorderer_t::clear() {
  compare_phrase_match = false;
  compare_terms_match = false;
  compare_doc_weight = true;

  docset = NULL;
}

// Strict-weak-ordering comparator over mg document numbers, used with
// sort() on docresultsclass::docorder.  Sorts "better" documents first:
// optionally by number of phrase matches, then by number of query terms
// matched, then by document weight, with the raw document number as the
// final tie-breaker so the ordering is total and deterministic.
// Document numbers that are not in docset are pushed to the end.
bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
  if (docset == NULL) return t1>t2;

  docresultmap::iterator t1_here = docset->find(t1);
  docresultmap::iterator t2_here = docset->find(t2);
  docresultmap::iterator end = docset->end();

  // sort all the document numbers not in the document set to
  // the end of the list
  if (t1_here == end) {
    if (t2_here == end) return t1>t2;
    else return true;
  } else if (t2_here == end) return false;

  if (compare_phrase_match) {
    if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
  }

  if (compare_terms_match) {
    if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
  }

  if (compare_doc_weight) {
    if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
  }

  return t1>t2;
}


/////////////////////////////////
// functions for mgqueryfilterclass
/////////////////////////////////

// Handle mg-specific configuration keys on top of the generic
// queryfilterclass configuration.  "indexstem" is forwarded to the
// underlying mgsearchclass.
void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure (key, cfgline);

  if (key == "indexstem") {
    ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
  }
}

// loads up phrases data structure with any phrases (that's the quoted bits)
// occuring in the querystring
//
// Walks the query string character by character, counting term breaks so
// that term number "termcount" lines up with orgterms[termcount], and
// collects the terms that fall between a pair of double quotes.  Only
// quoted groups of two or more terms are recorded as phrases.
// NOTE(review): assumes orgterms holds the parsed query terms in the same
// order they appear in querystring -- confirm against the caller.
void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
					   const termfreqclassarray &orgterms,
					   vector<termfreqclassarray> &phrases) {
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  termfreqclassarray tmpterms;

  int termcount = 0;
  bool foundquote = false;
  bool foundbreak = false;
  bool start = true;
  while (here != end) {
    if (*here == '\"') {
      if (foundquote) {
	// closing quote: flush the term we are currently inside (if any)
	if (!foundbreak && !start) {
	  tmpterms.push_back (orgterms[termcount]);
	  ++termcount;
	}
	// only a quoted group of 2+ terms counts as a phrase
	if (tmpterms.size() > 1) {
	  phrases.push_back (tmpterms);
	}
	tmpterms.erase (tmpterms.begin(), tmpterms.end());

	foundquote = false;
	foundbreak = true;
      } else foundquote = true;
    } else if (!is_unicode_letdig(*here)) {
      // found a break between terms
      if (!foundbreak && !start) {
	if (foundquote) {
	  tmpterms.push_back (orgterms[termcount]);
	}
	++termcount;
      }
      foundbreak = true;
    } else {
      start = false;
      foundbreak = false;
    }
    ++here;
  }
}

// do aditional query processing
//
// Extracts any quoted phrases from the query string and, for each phrase,
// runs an exact phrase search over every matched document, incrementing
// that document's num_phrase_match count for each phrase it contains.
// Sets num_phrases as a side effect (read later by filter()).
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
				       queryresultsclass &queryresults) {
  // post-process the results if needed
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
      while (this_phrase != end_phrase) {

	// process each of the matched documents
	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	while (docs_here != docs_end) {
	  if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr,
				 queryparams.index, queryparams.subcollection,
				 queryparams.language, longindex,
				 queryparams.collection, *this_phrase,
				 (*docs_here).second.docnum)) {
	    ++docs_here->second.num_phrase_match;
	  }
	  ++docs_here;
	}
	++this_phrase;
      }
    }
  }
}

// do query that might involve multiple sub queries
// textsearchptr and db_ptr are assumed to be valid
//
// Runs each queryparamclass in query_params through mg, post-processes
// phrase matches where requested, and combines the per-query document
// sets into multiresults using each query's combinequery operator
// ("and"/"or"/"not").  Term info from all sub-queries is appended and
// later uniqued.  On search failure sets err = systemProblem and returns.
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	num_phrases = 0;
      }
      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;
      } else {
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	multiresults.docs_matched = multiresults.docs.docset.size();
	// a combined result is exact, whatever the sub-queries reported
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}

// Sort the matched documents, phrase matches first (then terms matched /
// weight / doc number via resultsorderer_t's defaults).
void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
					   docresultsclass &docs) {
  resultsorderer_t resultsorderer;
  resultsorderer.compare_phrase_match = true;
  resultsorderer.docset = &(docs.docset);

  // first get a list of document numbers
  docs.docnum_order();

  sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
}

mgqueryfilterclass::mgqueryfilterclass ()
  : queryfilterclass() {
  num_phrases = 0;
}

mgqueryfilterclass::~mgqueryfilterclass () {
}

// Main entry point: run the (possibly multi-part) query described by
// request, then assemble the matching-document and term-information
// sections of response.  Sets err on configuration or system problems.
void mgqueryfilterclass::filter (const FilterRequest_t &request,
				 FilterResponse_t &response,
				 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;

  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }

  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
    err = configurationError;
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
	   << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);

  // do any mg specific diddling with query parameters that may be required
  mg_parse_query_params (request, queryfilterparams, startresults,
			 endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    // only want to sort the docs if we have done a ranked search or there were phrases
    if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
      sort_doc_results (request, queryresults.docs);
    }

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // documents containing matching phrases will be sorted to the top so
    // we can break out once we're past those that match the PhraseMatch
    // option -- "all_phrases" = return only those documents containing all
    //                           phrases in query string
    //           "some_phrases" = return only those documents containing
    //                            at least 1 of the phrases in the document
    //           "all_docs" = return all documents regardless
    if (num_phrases > 0) {
      int numdocs = 0;
      while (docorder_here != docorder_end) {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	if (((phrasematch == "all_phrases") &&
	     ((*docset_here).second.num_phrase_match < num_phrases)) ||
	    ((phrasematch == "some_phrases") &&
	     ((*docset_here).second.num_phrase_match < 1))) {
	  // past the documents that satisfy the PhraseMatch option;
	  // truncate the match count here
	  queryresults.docs_matched = numdocs;
	  break;
	}
	++numdocs;
	++docorder_here;
      }
    }

    if (endresults == -1) endresults = MAXNUMDOCS;
    docorder_here = queryresults.docs.docorder.begin();
    while (docorder_here != docorder_end) {
      if (resultnum > endresults || resultnum > queryresults.docs_matched) break;

      // translate the document number
      if (!translate(db_ptr, *docorder_here, trans_OID)) {
	logout << text_t2ascii
	       << "warning: could not translate mg document number \""
	       << *docorder_here << "\" to OID.\n\n";
      } else {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// see if there is a result for this number,
	// if it is in the request set (or the request set is empty)
	if (docset_here != queryresults.docs.docset.end() &&
	    (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
	  if (resultnum >= startresults) {
	    // add this document
	    resultdoc.OID = trans_OID;
	    resultdoc.result_num = resultnum;
	    resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
	    // these next two are not available on all versions of mg
	    resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	    resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

	    response.docInfo.push_back (resultdoc);
	  }
	  ++resultnum;
	}
      }

      ++docorder_here;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;

      // the term variants are only reported once, on the first term
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

// mg-specific rewriting of the parsed query parameters before the query
// is run.  For queries containing real (multi-word) phrases: switch to a
// boolean search, ask for all documents, and move the search to the
// finest-granularity index of the same type that exists (paragraph, then
// section).  The GSDL_BBC_COLLECTION section additionally strips
// non-alphanumeric characters for the ProgNumber index.
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
						vector<queryparamclass> &query_params,
						int &/*startresults*/,
						int &/*endresults*/,
						text_t &/*phrasematch*/,
						ostream &/*logout*/) {
  // outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
    // [kjdon 12/2005] we don't want to do a phrase search if the only
    // phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);
    if (phrases.size() > 0) {
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex;
      text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;

      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	if (!found && granularity == "document" &&
	    indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)
    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex;
    text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];

    if (indextype == "ProgNumber") {
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	if ((*here >= 'a' && *here <= 'z') ||
	    (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif

    ++query_here;
  }
}