/**********************************************************************
 *
 * querytools.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "querytools.h"
#include <ctype.h>
#include "unitool.h" // for is_unicode_letdig

// Fills in the query-type related cgi arguments:
//   ct     - taken from the collection's buildType when not supplied
//            ("mgpp" -> "1", "lucene" -> "2", anything else -> "0")
//   qt/qto - forced to "0" when ct is "0"; otherwise any that are still
//            empty are derived from the collection's "SearchTypes" format
//            entry (or from the plain,form default when there is none)
//   sqlqto - "1" when infodbType is sqlite/mssql and SearchTypes contains
//            "sqlform", otherwise "0" (only reached when SearchTypes exists
//            and qt/qto were not both supplied)
void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {

  // work out the collection type if the caller did not give one
  if (args["ct"].empty()) {
    const text_t indexer = cinfo->buildType;
    if (indexer == "mgpp") {
      args["ct"] = "1";
    } else if (indexer == "lucene") {
      args["ct"] = "2";
    } else {
      args["ct"] = "0";
    }
  }

  // ct "0": qt and qto are always "0"
  if (args["ct"] == "0") {
    args["qt"] = "0";
    args["qto"] = "0";
    return;
  }

  // both already supplied, nothing to derive
  if (!args["qt"].empty() && !args["qto"].empty()) {
    return;
  }

  text_tmap::iterator st_entry = cinfo->format.find("SearchTypes");
  if (st_entry == cinfo->format.end() || st_entry->second.empty()) {
    // no SearchTypes given: assume plain,form
    if (args["qto"].empty()) args["qto"] = "3";
    if (args["qt"].empty()) {
      args["qt"] = (args.getintarg("qto") == 2) ? "1" : "0";
    }
    return;
  }
  text_t search_types = st_entry->second;

  // qto is a bit mask: 1 = plain listed, 2 = form listed
  if (args["qto"].empty()) {
    unsigned int type = 0;
    if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
      type |= 1;
    }
    if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
      type |= 2;
    }
    args.setintarg("qto", type);
  }

  // qt is "1" when qto is 2, or when qto is 3 and SearchTypes starts
  // with "form"
  if (args["qt"].empty()) {
    const int qto = args.getintarg("qto");
    const bool form_first = (qto == 3) && starts_with(search_types, "form");
    args["qt"] = (qto == 2 || form_first) ? "1" : "0";
  }

  // decide if sqlqto should be set or not
  const text_t infodb_type = cinfo->infodbType;
  const bool sql_backend = (infodb_type == "sqlite") || (infodb_type == "mssql");
  const bool sql_form = sql_backend &&
    (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end());
  args["sqlqto"] = sql_form ? "1" : "0";
}

// Sets the ks, ss and afs (casesupport, stemsupport, accentfoldsupport)
// args to 1 for each matching bit that is set in the collection's
// stemIndexes. Args whose bit is not set are left untouched.
void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
  const int available = cinfo->stemIndexes;

  if ((available & SIcasefold) != 0) args["ks"] = 1;
  if ((available & SIstem) != 0) args["ss"] = 1;
  if ((available & SIaccentfold) != 0) args["afs"] = 1;
}


// Appends the filter options shared by every query type to
// request.filterOptions. At present that is only "Maxdocs", taken from
// the "m" argument.
void set_basequeryfilter_options (FilterRequest_t &request, 
				  cgiargsclass &args) 
{
  const int max_docs = args.getintarg("m");

  OptionValue_t maxdocs_option;
  maxdocs_option.name = "Maxdocs";
  maxdocs_option.value = max_docs;
  request.filterOptions.push_back (maxdocs_option);

  // The StartResults/EndResults options are currently disabled:
  //
  //  option.name = "StartResults";
  //  option.value = args["r"];
  //  request.filterOptions.push_back (option);

  //  option.name = "EndResults";
  //  int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
  //  if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
  //  option.value = endresults;
  //  request.filterOptions.push_back (option);
}


// Builds the "QueryFilter" request for a single full text query.
//
// Options are appended to request.filterOptions in this order: Term,
// QueryType, MatchMode, Casefold, Stem, AccentFold, then whichever of
// Index, Subcollection, Language, Level, FilterString, SortField and
// Fuzziness have a value, and finally the base options (Maxdocs).
//
// request.filterResultOptions and request.fields (if required) should
// be set from the calling code
void set_fulltext_queryfilter_options (FilterRequest_t &request, 
				       const text_t &querystring,
				       cgiargsclass &args) 
{
  // better if this function, and the two-query companion function
  // was implemented in queryaction.cpp
  // Has to be done here so documentaction.cpp can call it directly

  // on/off options: integer argument -> "true"/"false" option
  static const struct { const char *arg; const char *name; } flag_options[] = {
    { "k",  "Casefold" },
    { "s",  "Stem" },
    { "af", "AccentFold" }
  };
  // pass-through options, only sent when the argument is non-empty
  static const struct { const char *arg; const char *name; } optional_options[] = {
    { "h",  "Index" },
    { "j",  "Subcollection" },
    { "n",  "Language" },
    { "g",  "Level" },        // granularity for mgpp
    { "fs", "FilterString" }, // filter string for lucene
    { "sf", "SortField" }     // sort field for lucene
  };

  request.filterName = "QueryFilter";

  OptionValue_t option;

  option.name = "Term";
  option.value = querystring;
  request.filterOptions.push_back (option);

  const bool ranked = (args.getintarg("t") != 0);

  option.name = "QueryType";
  option.value = ranked ? "ranked" : "boolean";
  request.filterOptions.push_back (option);

  // mgpp in advanced mode always uses a "some" query; otherwise ranked
  // queries match some terms and boolean queries match all of them
  const bool mgpp_advanced = (args.getintarg("ct") == 1 && args.getintarg("b") == 1);
  option.name = "MatchMode";
  option.value = (mgpp_advanced || ranked) ? "some" : "all";
  request.filterOptions.push_back (option);

  for (unsigned int f = 0; f < sizeof(flag_options) / sizeof(flag_options[0]); ++f) {
    option.name = flag_options[f].name;
    option.value = (args.getintarg(flag_options[f].arg)) ? "true" : "false";
    request.filterOptions.push_back (option);
  }

  for (unsigned int o = 0; o < sizeof(optional_options) / sizeof(optional_options[0]); ++o) {
    if (args[optional_options[o].arg].empty()) continue;
    option.name = optional_options[o].name;
    option.value = args[optional_options[o].arg];
    request.filterOptions.push_back (option);
  }

  // fuzziness value for lucene: sent as "0." followed by the argument
  // text, and left out entirely when the argument is "100"
  if (!args["fuzziness"].empty() && args["fuzziness"] != "100") {
    option.name = "Fuzziness";
    option.value = text_t("0.") + args["fuzziness"];
    request.filterOptions.push_back (option);
  }

  set_basequeryfilter_options(request, args);
}


// Two-query variant: sets up the request for querystring1 exactly as the
// single-query version does and then, when the "cq2" argument is not
// empty, appends a second group of options (CombineQuery, Term,
// QueryType, Casefold, Stem, AccentFold and any of Index/Subcollection/
// Language given by h2/j2/n2) describing querystring2.
void set_fulltext_queryfilter_options (FilterRequest_t &request, 
				       const text_t &querystring1,
				       const text_t &querystring2, 
				       cgiargsclass &args) 
{
  // on/off options shared with the first query
  static const struct { const char *arg; const char *name; } flag_options[] = {
    { "k",  "Casefold" },
    { "s",  "Stem" },
    { "af", "AccentFold" }
  };
  // per-query restrictions for the second query
  static const struct { const char *arg; const char *name; } second_query_options[] = {
    { "h2", "Index" },
    { "j2", "Subcollection" },
    { "n2", "Language" }
  };

  set_fulltext_queryfilter_options (request, querystring1, args);

  // fill in the second query if needed
  const bool have_second_query = !args["cq2"].empty();
  if (have_second_query) {
    OptionValue_t option;

    option.name = "CombineQuery";
    option.value = args["cq2"];
    request.filterOptions.push_back (option);
    
    option.name = "Term";
    option.value = querystring2;
    request.filterOptions.push_back (option);
    
    option.name = "QueryType";
    option.value = (args.getintarg("t")) ? "ranked" : "boolean";
    request.filterOptions.push_back (option);

    for (unsigned int f = 0; f < sizeof(flag_options) / sizeof(flag_options[0]); ++f) {
      option.name = flag_options[f].name;
      option.value = (args.getintarg(flag_options[f].arg)) ? "true" : "false";
      request.filterOptions.push_back (option);
    }

    for (unsigned int o = 0; o < sizeof(second_query_options) / sizeof(second_query_options[0]); ++o) {
      if (args[second_query_options[o].arg].empty()) continue;
      option.name = second_query_options[o].name;
      option.value = args[second_query_options[o].arg];
      request.filterOptions.push_back (option);
    }
  }

  // this is probably redundant, as the first line of this function will
  // have already caused set_basequeryfilter_options to be invoked

  set_basequeryfilter_options(request, args);
}


// Adds the SQL query options to request.filterOptions: a "SortField"
// option when the "sqlsf" argument is non-empty, followed by the base
// options (Maxdocs).
//
// request.filterResultOptions and request.fields (if required) should
// be set from the calling code
void set_sql_queryfilter_options (FilterRequest_t &request, 
				  cgiargsclass &args) 
{
  const bool have_sort_field = !args["sqlsf"].empty();

  if (have_sort_field) { // sort field for the sql query
    OptionValue_t sort_option;
    sort_option.name = "SortField";
    sort_option.value = args["sqlsf"];
    request.filterOptions.push_back (sort_option);
  }

  set_basequeryfilter_options(request, args);
}


// Returns true if character is one of the characters treated as special
// for the given indexer type:
//   1 (mgpp):   # / *
//   2 (lucene): ? * ~ ^
// Any other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }
  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }
  default:
    return false;
  }
}

// Rewrites querystring in place:
//  - when querymode is 0 (simple search) each of ( ) | ! & is replaced by
//    a space, so boolean queries become plain "all the words" queries
//  - when segment is true every CJK character is surrounded by zero width
//    spaces (0x200b); adjacent CJK characters share a single one
// An advanced query (querymode 1) without segmenting is left untouched.
void format_querystring (text_t &querystring, int querymode, bool segment) {

  // advanced search, no segmenting, don't need to do anything
  if (querymode == 1 && !segment) return;

  const bool strip_operators = (querymode == 0);
  text_t result;

  // true while the previous character copied was a CJK character, in
  // which case a zero width space has already been written after it
  bool after_cjk = false;

  text_t::const_iterator ch = querystring.begin();
  const text_t::const_iterator last = querystring.end();

  for (; ch != last; ++ch) {
    const bool is_operator = (*ch == '(' || *ch == ')' || *ch == '|' ||
			      *ch == '!' || *ch == '&');
    if (strip_operators && is_operator) {
      // note: this does not touch after_cjk
      result.push_back(' ');
      continue;
    }

    if (!segment) {
      result.push_back (*ch);
      continue;
    }

    /* text_t not big enough to handle the ranges 0x20000-0x2a6d6 and
       0x2f800-0x2fa1d, so only these two are tested. */
    const bool is_cjk = (*ch >= 0x2e80 && *ch <= 0xd7a3) ||
			(*ch >= 0xf900 && *ch <= 0xfa6a);
    if (is_cjk) {
      if (!after_cjk) result.push_back (0x200b); // zero width space
      result.push_back (*ch);
      result.push_back (0x200b);
    } else {
      result.push_back (*ch);
    }
    after_cjk = is_cjk;
  }

  querystring = result;
}

// Turns a query string into its plain terms separated by single spaces.
// still working on this...
//
// Steps:
//  1. field syntax is removed: mgpp "[xxx]:TI" and lucene "TI:(xxx)"
//     both become "xxx" (mgpp brackets are looked for first; if the
//     query has any '[' the lucene form is not considered)
//  2. when arg_ct is "2" (lucene) the words AND, OR and NOT are removed
//  3. "#..." and "/..." modifiers (e.g. #is, /10) are skipped, every run
//     of characters that are not letters or digits becomes one space,
//     and the result is trimmed
text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  // lets look for [] and () first - these are a pain.
  text_t::const_iterator bracket;
  text_t query_no_brackets = "";

  // mgpp brackets: [xxx]:TI
  if (findchar(here, end, '[') != end) {
    while ((bracket = findchar(here, end, '[')) != end) {
      // get the first bit
      query_no_brackets += substr(here, bracket);
      ++bracket;
      here = bracket;
      // get the end bracket
      bracket = findchar(here, end, ']');
      query_no_brackets += substr(here, bracket);
      // skip the :TI bits. The bracket != end test must come first: when
      // bracket == end we are past the string and *bracket is invalid
      while (bracket != end && *bracket != ' ') {
	++bracket;
      }
      here = bracket;
    }
    if (here != end) {
      query_no_brackets += substr(here,end);
    }
  } else if (findchar(here, end, '(') != end) {
    // lucene brackets TI:(xxx)
    while ((bracket = findchar(here, end, '(')) != end) {
      // back up over the field name (test bracket != here first, as above)
      text_t::const_iterator old_bracket = bracket;
      while (bracket != here && *bracket != ' ') {
	--bracket;
      }
      if (bracket != here) {
	// get the first bit
	query_no_brackets += substr(here, bracket+1);
      }
      here = old_bracket +1;
      // get the end bracket
      bracket = findchar(here, end, ')');
      query_no_brackets += substr(here, bracket);
      if (bracket != end) {
	here = bracket+1;
      } else {
	// unmatched '(': everything up to the end has just been copied,
	// so stop here. Leaving "here" behind would copy the same text
	// again below (and once more for every later '(').
	here = end;
      }
    }
    if (here != end) {
      query_no_brackets += substr(here,end);
    }
  } else {
    // was no brackets
    query_no_brackets = querystring;
  }
  
  
  if (arg_ct == "2") { // lucene
    // look for AND OR NOT and remove
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    text_tlist terms;
    splitword(here, end, "AND", terms);
    joinchar(terms, ' ', query_no_brackets);
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    splitword(here, end, "OR", terms);
    joinchar(terms, ' ', query_no_brackets);
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    splitword(here, end, "NOT", terms);
    joinchar(terms, ' ', query_no_brackets);
    
  }
  text_t terms = "";
  bool space = false; // true when the last character written was a space
  here = query_no_brackets.begin();
  end = query_no_brackets.end();
  
  while (here != end) {
    if (*here ==  '#' || *here == '/') {
      // skip over #is /10 etc 
      ++here;
      while (here != end && *here != ' ') {
	++here;
      }
      if (here == end) break;
    }
    if (is_unicode_letdig(*here)) {
      terms.push_back(*here);
      space = false;
    } else {
      // collapse any run of separators into a single space
      if (!space) {
	terms.push_back(' ');
	space = true;
      }
    }
    ++here;
  }
  return trim(terms);
    
}

// search history tool 
// also used for form query macros
//
// Returns a copy of querystring in which every single or double quote is
// preceded by two backslashes and every newline or carriage return is
// replaced by a space. All other characters are copied unchanged.
text_t escape_quotes(const text_t &querystring) {

  text_t escaped = "";

  const text_t::const_iterator last = querystring.end();
  for (text_t::const_iterator ch = querystring.begin(); ch != last; ++ch) {
    if (*ch == '\n' || *ch == '\r') {
      escaped.push_back(' ');
    } else if (*ch == '\'' || *ch == '\"') {
      escaped += "\\\\";
      escaped.push_back(*ch);
    } else {
      escaped.push_back(*ch);
    }
  }

  return escaped;
}

// Parses the terms into words, and adds #si if necessary
//
// Each word (a run of letters, digits and mgpp special characters) gets
// "#" plus "s" when stem is "1" and "i" when fold is "1". Words starting
// with NEAR or WITHIN are copied without a qualifier. Between words only
// double quotes, the boolean operators & | ! ( ) and white space are
// kept. A word that ends the input always gets the qualifier and is
// followed by a space.
//
// terms is returned unchanged when stem and fold are both "0", or when
// indexer_type is not 1 (mgpp).
text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
		   const int indexer_type) {

  // the default stem and case are set to 0 if this is being used, so we
  // only add qualifiers if stem or fold is 1.
  const bool nothing_requested = (stem == "0" && fold == "0");
  // this is only for mgpp collections, shouldn't be called for anything else
  if (nothing_requested || indexer_type != 1) {
    return terms;
  }

  // the qualifier is the same for every word, so build it once
  text_t qualifier = "#";
  if (stem == "1") qualifier += "s";
  if (fold == "1") qualifier += "i";

  text_t result;
  text_t current;

  const text_t::const_iterator last = terms.end();
  for (text_t::const_iterator ch = terms.begin(); ch != last; ++ch) {

    if (is_unicode_letdig(*ch) || is_special_character(indexer_type, *ch)) {
      // still inside a word
      current.push_back(*ch);
      continue;
    }

    // word boundary: flush the word collected so far
    if (!current.empty()) {
      const bool proximity_op =
	starts_with(current, "NEAR") || starts_with(current, "WITHIN");
      if (!proximity_op) current += qualifier;
      result += current;
      current.clear();
    }

    // this is only used in the advanced form, so boolean operators stay in
    const bool keep = (*ch == '\"' || *ch == '&' || *ch == '|' ||
		       *ch == '!' || *ch == '(' || *ch == ')' ||
		       is_unicode_space(*ch));
    if (keep) result.push_back(*ch);
  }

  // get last word
  if (!current.empty()) {
    current += qualifier;
    current += " ";
    result += current;
  }
  return result;
}


// some query form parsing functions for use with mgpp & lucene

// Builds querystring from the simple ("regular") query form.
//
// The form arrives as two comma separated lists: "fqf" (field names) and
// "fqv" (the value typed against each field). Every non-empty value is
// cleaned with format_querystring, tagged with its field by
// format_field_info and added to querystring:
//  - lucene (ct == 2): each term is followed by a space; AND/OR is left
//    to the default combine operator
//  - otherwise (mgpp): terms are joined with " & " when t is 0 and
//    " | " when it is not
//
// querystring is left empty when either list is missing. Values that
// have no corresponding field entry are ignored.
void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
{
  querystring.clear();

  int argct = args.getintarg("ct");
  int argt = args.getintarg("t");// t=0 -and, t=1 - or
  int argb = args.getintarg("b");
    
  text_t combine;

  // lucene uses global combine, so only need this for mgpp
  if (argct==1) {
    if (argt == 0) combine = "&";
    else combine = "|";
  }
  
  text_t field = args["fqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields); 
  
  text_t value = args["fqv"];
  if (value.empty()) return; // somethings wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  // Both lists come from the request and need not be the same length.
  // Only index positions that exist in both, so that a short field list
  // cannot cause a read past the end of fields.
  const text_tarray::size_type num_terms =
    (values.size() < fields.size()) ? values.size() : fields.size();

  for (text_tarray::size_type i = 0; i < num_terms; ++i) {
    if (values[i].empty()) continue;

    text_t this_value = values[i];

    // remove operators for simple search, segments text if necessary
    format_querystring(this_value, argb, segment); 
           
    // add tag info for this field (and other processing)
    format_field_info(this_value, fields[i], argct, argt, argb);

    // add into query string
    if (argct == 2) {
      // lucene
      // we don't worry about AND/OR, cos this is done by defaultcombineoperator
      querystring += this_value+" ";
    } else {
      // mgpp
      if (!querystring.empty()) {
	querystring += " "+ combine+ " ";
      }
      querystring += this_value;
    }
  }
}


// Builds querystring from the advanced query form.
//
// The form arrives as parallel comma separated lists:
//   fqf - field name for each row
//   fqv - value for each row
//   fqc - combiner ("and", "or", "not") between row i-1 and row i
//   fqs - stem flag for each row (mgpp only)
//   fqk - casefold flag for each row (mgpp only)
// Every non-empty value is cleaned with format_querystring, given
// stem/case qualifiers (mgpp only), tagged with its field, and joined to
// the previous terms with the combiner in force: & | ! for mgpp
// (ct == 1), AND OR NOT otherwise. An unrecognised combiner keeps the
// previous one.
//
// querystring is left empty when a required list is missing. The lists
// come from the request and need not be the same length: values with no
// field entry are ignored, a missing combiner keeps the previous one,
// and a missing stem or fold flag is treated as "0".
void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
  querystring.clear();

  const int argct = args.getintarg("ct");
  int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
  int argb = args.getintarg("b");
  text_t combine;
  if (argct==1) {
    combine = "&";
  }
  else { // lucene
    combine = "AND";
  }

  text_t field = args["fqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields); 
  
  text_t value = args["fqv"];
  if (value.empty()) return; // somethings wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  text_t comb = args["fqc"];
  if (comb.empty()) return; //somethings wrong
  text_tarray combs;
  splitchar(comb.begin(), comb.end(), ',', combs);

  text_tarray stems;
  text_tarray folds;
  if (argct == 1) {// mgpp - lucene doesn't do stem/case
    text_t stem = args["fqs"];
    if (stem.empty()) return; // somethings wrong
    splitchar(stem.begin(), stem.end(), ',', stems);
    
    text_t fold = args["fqk"];
    if (fold.empty()) return; // somethings wrong
    splitchar(fold.begin(), fold.end(), ',', folds);
  }

  // used for rows that have no stem/fold entry
  const text_t flag_off = "0";

  // never index past the end of fields
  const text_tarray::size_type num_terms =
    (values.size() < fields.size()) ? values.size() : fields.size();
  
  for (text_tarray::size_type i = 0; i < num_terms; ++i) {
    if (values[i].empty()) continue;

    // pick up the combiner between the previous row and this one
    if (i != 0 && (i - 1) < combs.size()) {
      const text_t &op = combs[i-1];
      if (argct==1) {
	if (op=="and") combine = "&";
	else if (op=="or") combine = "|";
	else if (op=="not") combine = "!";
      }
      else { // lucene
	if (op=="and") combine = "AND";
	else if (op=="or") combine = "OR";
	else if (op=="not") combine = "NOT";
      }
    }

    text_t this_value = values[i];
    // remove operators for simple search, segments text if necessary
    format_querystring(this_value, argb, segment); 
    if (argct == 1) { // mgpp only
      const text_t &stem_flag = (i < stems.size()) ? stems[i] : flag_off;
      const text_t &fold_flag = (i < folds.size()) ? folds[i] : flag_off;
      this_value = addstemcase(this_value, stem_flag, fold_flag, argct);
    }
    // add tag info for this field (and other processing)
    format_field_info(this_value, fields[i], argct, argt, argb);
    // add into query string
    if (!querystring.empty()) {
      querystring += " "+ combine+ " ";
    }
    querystring += this_value;
  }
}


// SQL versions for parsing query form

// Builds an SQL query from the simple ("regular") SQL query form.
//
// The form arrives as parallel comma separated lists:
//   sqlfqf - metadata field(s) for each row
//   sqlfqc - comparison for each row (e.g. "=", "LIKE", "STARTINGWITH")
//   fqv    - value for each row
// Every non-empty value becomes
//   SELECT DISTINCT docOID FROM document_metadata WHERE <condition>
// with the condition produced by format_field_info_sql. The rows are
// combined with an INNER JOIN ... USING (docOID) when t is 0 (AND) and
// with UNION otherwise (OR).
//
// querystring is left empty when any list is missing. The lists come
// from the request and need not be the same length; rows that lack a
// field or comparison entry are ignored.
void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
{
  querystring.clear();

  int argt = args.getintarg("t");// t=0 -and, t=1 - or
  int argb = args.getintarg("b");

  const bool combine_with_and = (argt == 0);
  
  text_t field = args["sqlfqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields); 

  text_t sqlcomb = args["sqlfqc"];
  if (sqlcomb.empty()) return; //somethings wrong
  text_tarray sqlcombs;
  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
  
  text_t value = args["fqv"];
  if (value.empty()) return; // somethings wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
  const text_t LIKE_CONDITION = "LIKE";
  const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";

  // only index rows that exist in all three lists
  text_tarray::size_type num_terms = values.size();
  if (fields.size() < num_terms) num_terms = fields.size();
  if (sqlcombs.size() < num_terms) num_terms = sqlcombs.size();

  for (text_tarray::size_type i = 0; i < num_terms; ++i) {
    if (values[i].empty()) continue;

    text_t this_value = values[i];

    // STARTINGWITH is not an SQL operator: it is turned into LIKE with
    // a '%' added to the end of the value, so that the field is matched
    // on how it starts.
    const bool starting_with = (sqlcombs[i] == STARTINGWITH_CONDITION);
    if (starting_with) this_value += "%";
    const text_t &condition = starting_with ? LIKE_CONDITION : sqlcombs[i];

    // remove operators for simple search, segments text if necessary
    format_querystring(this_value, argb, segment); 
    // add tag info for this field (and other processing)
    format_field_info_sql(this_value, fields[i], condition, argt, argb);

    if (querystring.empty()) {
      // first query term
      querystring = DISTINCT_SELECT_WHERE + this_value;
    }
    else {
      this_value = DISTINCT_SELECT_WHERE + this_value;

      if (combine_with_and) {
	// INNER JOIN to restrict to only matching docOIDs
	querystring = "SELECT docOID FROM (" + querystring + ")"
	  + " INNER JOIN (" + this_value +") USING (docOID)";
      }
      else {
	// Union to allow union of the two
	querystring = querystring + " UNION " + this_value;
      }
    }
  }
}


// Builds an SQL query from the advanced SQL query form.
//
// The form arrives as parallel comma separated lists:
//   sqlfqf - metadata field(s) for each row
//   sqlfqc - comparison for each row (e.g. "=", "LIKE", "STARTINGWITH")
//   fqv    - value for each row
//   fqc    - combiner ("and", "or", "not") between row i-1 and row i
// Every non-empty value becomes
//   SELECT DISTINCT docOID FROM document_metadata WHERE <condition>
// with the condition produced by format_field_info_sql. Rows are joined
// to what has been built so far with INNER JOIN ... USING (docOID) for
// "and" and UNION for "or". "not" is not supported: the row is dropped
// and a message is written to cerr.
//
// querystring is left empty when any list is missing. The lists come
// from the request and need not be the same length: rows that lack a
// field or comparison entry are ignored, and a missing combiner keeps
// the previous one (initially AND).
void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args, 
			     bool segment)
{
  querystring.clear();

  int argt = 0; // set it to 0 = AND, by default
  int argb = args.getintarg("b");
  text_t combine = "AND";

  text_t field = args["sqlfqf"];

  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields); 
  
  text_t sqlcomb = args["sqlfqc"];
  if (sqlcomb.empty()) return; //somethings wrong
  text_tarray sqlcombs;
  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);

  text_t value = args["fqv"];
  if (value.empty()) return; // somethings wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  text_t comb = args["fqc"];
  if (comb.empty()) return; //somethings wrong
  text_tarray combs;
  splitchar(comb.begin(), comb.end(), ',', combs);

  const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
  const text_t LIKE_CONDITION = "LIKE";
  const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";

  // only index rows that exist in the value, field and comparison lists
  text_tarray::size_type num_terms = values.size();
  if (fields.size() < num_terms) num_terms = fields.size();
  if (sqlcombs.size() < num_terms) num_terms = sqlcombs.size();

  for (text_tarray::size_type i = 0; i < num_terms; ++i) {
    if (values[i].empty()) continue;

    // pick up the combiner between the previous row and this one
    if (i > 0 && (i - 1) < combs.size()) {
      if (combs[i-1]=="and") { combine = "AND"; }
      else if (combs[i-1]=="or") { combine = "OR"; }
      else if (combs[i-1]=="not") { combine = "NOT"; }
    }

    text_t this_value = values[i];

    // STARTINGWITH is not an SQL operator: it is turned into LIKE with
    // a '%' added to the end of the value, so that the field is matched
    // on how it starts.
    const bool starting_with = (sqlcombs[i] == STARTINGWITH_CONDITION);
    if (starting_with) this_value += "%";
    const text_t &condition = starting_with ? LIKE_CONDITION : sqlcombs[i];

    // remove operators for simple search, segments text if necessary
    format_querystring(this_value, argb, segment); 
    // add tag info for this field (and other processing)
    format_field_info_sql(this_value, fields[i], condition, argt, argb);

    if (querystring.empty()) {
      // first query term
      querystring = DISTINCT_SELECT_WHERE + this_value;
    }
    else {
      this_value = DISTINCT_SELECT_WHERE + this_value;

      if (combine=="AND") {	
	// INNER JOIN to restrict to only matching docOIDs
	querystring = "SELECT docOID FROM (" + querystring + ")"
	  + " INNER JOIN (" + this_value +") USING (docOID)";
      }
      else if (combine=="OR") {
	// Union to allow union of the two
	querystring = querystring + " UNION " + this_value;
      }
      else {
	cerr << "Unsupported combination operation: " << combine << endl;
      }
    }
  }  
}


// Extended addqueryelem for Human Info project
void addqueryelem_ex(text_t &querystring, const text_t &tag, 
		     const text_t &terms, const text_t &stem, 
		     const text_t &fold,
		     const text_t& combine, const text_t& word_combine) {

  if (!querystring.empty()) { // have to put and/or
    querystring += " " + combine + " ";
  }
  text_t outtext; outtext.reserve(512);
  text_t word; word.reserve(100);
  //unsigned short c;                                                            
  text_t::const_iterator here = terms.begin();
  text_t::const_iterator end = terms.end();
  bool inquote = false, firstword = true;

  text_t word2; word2.reserve(256);
	
  while (here !=end) {
    if (is_unicode_space(*here)) {
      if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
      else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
      else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
      else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
      else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
      if (inquote) {
	word2.push_back(*here);
      }
      word.append(word2); word2.clear();
			
      if (!inquote && !word.empty() ) {
	// found word boundary   
				
	if (stem == "1" || fold =="1") {
	  word += "#";
	  if (stem == "1") word += "s";
	  //else word += "u";
					
	  if (fold == "1") word += "i";
	  //else word += "c";
	}
	if (firstword) {
	  firstword = false;
	} else {
	  outtext += " " + word_combine + " ";
	}
	outtext += "[" + word + "]:"+tag;
	word.clear();
      }
      ++here;
    } else if (*here == '\"') {
      word2.push_back(*here);
      inquote = !inquote;
      ++here;
    } else {
      // not word boundary
      word2.push_back(*here);
      ++here;    
    }
  }
	
  // get last word
  if (!word2.empty()) {
    if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
    else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
    else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
    else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
    else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
    word.append(word2); word2.clear();
		
    if (stem == "1"|| fold == "1") {
      word += "#";
      if (stem == "1") word += "s";
      //else word += "u";
			
      if (fold == "1") word += "i";
      //else word += "c";
    }
    if (!outtext.empty()) outtext += " " + word_combine + " ";
    outtext += "[" + word + "]:"+tag;
  }
  querystring += "(" + outtext + ")";
}

// Wraps querystring in the field syntax of the given indexer type:
//   1 (mgpp):   [querystring]:tag
//   2 (lucene): tag:(querystring)
// Nothing is done when tag is empty, when tag is "ZZ" for mgpp (mgpp
// doesn't use the ZZ tag internally), or for any other type.
void add_field_info(text_t &querystring, const text_t &tag, int type) {

  if (tag.empty()) return; // do nothing

  switch (type) {
  case 1: // mgpp
    if (tag == "ZZ") return;
    querystring = "["+querystring+"]:"+tag;
    break;
  case 2: // lucene
    querystring = tag+":("+querystring+")";
    break;
  default:
    break;
  }
}


void add_field_info_sql(text_t &querystring, const text_t &tagseq, 
			const text_t& sqlcomb) 
{

  if (tagseq == "") return; // do nothing

  text_t element_in = "(element IN (";

  text_tlist mdterms;

  splitword(tagseq.begin(), tagseq.end(), "/", mdterms);

  text_t tags_in = "";

  while (!mdterms.empty()) {
    text_t tag = mdterms.front();
    mdterms.pop_front();

    if (!tag.empty()) {

      if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
	tag = substr (tag.begin()+3, tag.end());
      }

      if (!tags_in.empty()) {
	tags_in += ",";
      }
      
      tags_in += "'" + tag + "'";
    }
  }

  element_in += tags_in + ") AND (";

  
  if (sqlcomb == "=") {
    // override what it means to do equality, to make it more like full text
    // searching

    text_t orterms = "";
    text_t term = "";
    bool in_phrase = false;
    
    text_t::const_iterator here = querystring.begin();
    text_t::const_iterator end = querystring.end();
    while (here != end) {
      if (is_unicode_letdig(*here)) {
	term.push_back(*here);
      }
      else if (*here == '"') {
	term.push_back(*here);
	if (!in_phrase) {
	  in_phrase = true;
	} else {
	  in_phrase = false;
	}
      }      
      else if (in_phrase) {
        // Found word boundary, but in a phrase, so does not complete term
	term.push_back(*here);
      }
      else {
        // Found a word boundary
	if (!orterms.empty()) {
	  orterms += " OR ";
	}
	orterms += "value LIKE '%" + term + "%'";
	term.clear();
      }
      ++here;
    }

    if (!term.empty()) {
	if (!orterms.empty()) {
	  orterms += " OR ";
	}
      	orterms += "value LIKE '%" + term + "%'";
    }

    element_in += orterms;
  }
  //We cast the value from STRING to REAL to allow numeric sorting
  else if (sqlcomb == "<num") {
    element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
  }
  else if (sqlcomb == ">num") {
    element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
  }
   else if (sqlcomb == "<=num") {
    element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
  }
  else if (sqlcomb == ">=num") {
    element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
  }
  else if (sqlcomb == "=num") {
    element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
  }
  else {
    // search on value is "as is" querystring
    element_in += "value " + sqlcomb + " '" + querystring+"'";
  }

  
  querystring = element_in + "))";
    
}


// Prepares querystring for lucene and tags it with the field.
//
// Simple mode (argb == 0): any & or | should already have been removed,
// so the whole query is tagged as it stands.
// Otherwise: every run of one or more '&' becomes "&&" and every run of
// one or more '|' becomes "||", and the result is tagged.
//
// Tagging is done by add_field_info, giving tag:(query); an empty tag
// leaves the query untagged. argt is not used.
void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {

  const int type = 2; //lucene

  if (argb==0) { // simple
    // there will be no & or | as they should have already been removed
    // just tag the entire thing
    if (tag != "") {
      add_field_info(querystring, tag, type);
    }
    return;
  }

  // need to replace & with &&, | with ||
  text_t::const_iterator here = querystring.begin();
  const text_t::const_iterator end = querystring.end();

  text_t finalquery = "";
  while (here != end) {
    if (*here == '&' || *here == '|') {
      const text_t::const_iterator op = here;
      finalquery.push_back(*op);
      finalquery.push_back(*op);
      // swallow the rest of a run of the same operator. The end test
      // has to come first: when the operator is the last character,
      // here+1 is the end of the string and must not be dereferenced.
      while ((here + 1) != end && *(here + 1) == *op) {
	++here;
      }
    } 
    else {
      finalquery.push_back(*here);
    }
    ++here;
  }
  querystring = finalquery;
  add_field_info(querystring, tag, type);
}


// Rewrites querystring for an mgpp collection so that its terms are
// restricted to the field given by tag.
//
//   querystring - in/out: the query text; replaced with the tagged query
//   tag         - field tag (taken by value because "ZZ" is mapped to "")
//   argt        - only tested together with argb: argt==0 && argb==0 selects
//                 the "simple_and" behaviour described below
//   argb        - 0 = simple query, 1 = advanced query
//
// The query is scanned character by character. Characters are collected into
// a token (queryelem); completed tokens are collected into fieldpart; each
// fieldpart is passed through add_field_info() and appended to the output.
// In the simple_and case all terms end up in a single fieldpart that is
// tagged once; otherwise each term normally gets its own tagged fieldpart,
// unless in_field is set, in which case the term joins the current fieldpart.
void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {

  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
  if (tag == "" && argb == 1) {
    return; // no field specifier, advanced mode, the query stays as written
  }

  int type = 1; // mgpp

  bool simple_and = (argb==0 && argt==0); 
  text_t finalquery = "";   // the output being assembled
  text_t fieldpart ="";     // terms waiting to be tagged together
  text_t queryelem = "";    // the token currently being read
  bool in_phrase = false;   // true between an opening and a closing '"'
  bool in_field = false;    // true when the next term should join fieldpart

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();
  while (here != end) {
    // token characters: letters/digits, '&', and whatever
    // is_special_character() accepts for this query type
    if (is_unicode_letdig(*here)  || *here == '&' || is_special_character(type, *here)) {
      queryelem.push_back(*here);
    }
    else if (*here == '|') {
      // the '|' itself is not copied to the output; it only ends the
      // "join the current fieldpart" state.
      // NOTE(review): queryelem is not flushed here, so text on either side
      // of a '|' with no surrounding whitespace is joined into one token -
      // confirm this is intended.
      in_field = false;
    }
    else if (*here == '!' || *here == '(' || *here == ')') {
      if (!in_phrase) { // ignore these if in_phrase
	// output field, then output operator
	in_field = false;
	if (!queryelem.empty()) {
	  // outside simple_and mode the pending fieldpart is tagged and
	  // emitted on its own before the current token starts a new one
	  if (!simple_and && !fieldpart.empty()) {
	    add_field_info(fieldpart, tag, type);
	    finalquery += fieldpart;
	    finalquery.push_back(' ');
	    fieldpart.clear();
	  }
	  fieldpart += queryelem;
	}
	if (!fieldpart.empty()) {
	  add_field_info(fieldpart, tag, type);
	  finalquery += fieldpart;
	  finalquery.push_back(' ');
	}
	fieldpart.clear();
	queryelem.clear();
	// the operator character is copied through untagged, followed by a space
	finalquery.push_back(*here);
	finalquery.push_back(' ');
      }
    }
    else if (*here == '"') {
      // quotes are kept as part of the token and toggle phrase mode
      queryelem.push_back(*here);
      if (in_phrase == false) in_phrase = true;
      else {
	in_phrase = false;
      }
    }

    // Found word boundary, in a phrase
    else if (in_phrase) {
      // inside a phrase the boundary character stays part of the token
      queryelem.push_back(*here);
    }
    // Found a word boundary
    else {
      if (!queryelem.empty()) {
	if (queryelem == "&") {
	  // a stand-alone '&' is dropped; the following term joins the
	  // current fieldpart instead of starting a new one
	  in_field = true;
	  queryelem.clear();
	}
	else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
	  
	  if (argb==1) {
	    // advanced search (argb==1): keep the NEAR/WITHIN token inside the
	    // current fieldpart. In a simple search these are not allowed, so
	    // the token is dropped by the clear() below.
	    in_field = true;
	    fieldpart += queryelem;
	    fieldpart.push_back(' ');
	  }
	  queryelem.clear();
	  
	} 
	else {
	  // ordinary term: unless terms are being grouped (simple_and or
	  // in_field), tag and emit the pending fieldpart first.
	  // NOTE(review): in_field is not reset after the term is appended;
	  // it stays set until a '|', '!', '(' or ')' is seen.
	  if (!simple_and && !in_field) {
	    if (!fieldpart.empty()) {
	      add_field_info(fieldpart, tag, type);
	      finalquery += fieldpart;
	      finalquery.push_back(' ');
	      fieldpart.clear();
	    } 
	  }
	  
	  fieldpart += queryelem;
	  fieldpart.push_back(' ');
	  queryelem.clear();
	}
      }
    }
    ++here;
  }
  // at the end
  // flush a token that was still being read when the input ran out
  if (!queryelem.empty()) {
    if (!simple_and && !in_field && !fieldpart.empty()) {
      add_field_info(fieldpart, tag, type);
      finalquery += fieldpart;
      finalquery.push_back(' '); 
      fieldpart.clear();
    }
    fieldpart += queryelem;
  }
  // tag and emit whatever is left in fieldpart
  if (!fieldpart.empty()) {
    add_field_info(fieldpart, tag, type);
    finalquery += fieldpart;
    fieldpart.clear();

    // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
    // consider cutting this line
    finalquery.push_back(' '); 
  }
 
  querystring  = finalquery;
}


// SQL counterpart of the mgpp/lucene field formatters: forwards querystring,
// tagseq and sqlcomb unchanged to add_field_info_sql().
// argt and argb are accepted but not used by this function.
void format_field_info_sql(text_t &querystring, const text_t &tagseq, 
			   const text_t &sqlcomb, 
			   int argt, int argb) 
{
  add_field_info_sql(querystring, tagseq, sqlcomb);
}


// Picks the field formatter that matches the collection type argct:
// 1 selects the mgpp formatter, 2 the lucene formatter. For any other
// value querystring is left exactly as it was passed in.
void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
  switch (argct) {
  case 1: // mgpp
    format_field_info_mgpp(querystring, tag, argt, argb);
    break;

  case 2: // lucene
    format_field_info_lucene(querystring, tag, argt, argb);
    break;

  default:
    // no field formatting for other collection types
    break;
  }
}

// Appends one date term in mgpp form to querystring: " [<date>]:CV".
// A negative date is written as "bc" followed by its absolute value.
void mgpp_adddateelem(text_t& querystring, const int date)
{
  const bool is_bc = (date < 0);

  querystring.appendcstr(" [");
  if (is_bc) {
    querystring.appendcstr("bc");
  }
  querystring.appendint(is_bc ? (date*-1) : date);
  querystring.appendcstr("]:CV");
}

// Appends one date term in lucene form to querystring: " CV:(<date>)".
// A negative date is written as "bc" followed by its absolute value.
void lucene_adddateelem(text_t& querystring, const int date)
{
  const bool is_bc = (date < 0);

  querystring.appendcstr(" CV:(");
  if (is_bc) {
    querystring.appendcstr("bc");
  }
  querystring.appendint(is_bc ? (date*-1) : date);
  querystring.appendcstr(")");
}


// Appends a date restriction to querystring.
//
//   querystring - in/out: the query text the date terms are appended to
//   startdate   - first date of the range; 0 means "no date restriction",
//                 in which case querystring is left untouched
//   enddate     - last date of the range; 0 means only startdate is searched
//   startbc     - non-zero: a positive startdate is negated (BCE)
//   endbc       - non-zero: a positive enddate is negated (BCE)
//   ct          - 1 writes mgpp date terms; any other value writes lucene
//                 date terms
//
// If querystring already holds something other than whitespace, " AND" is
// appended before the date terms. With an end date, one term is written for
// every value from startdate to enddate inclusive (0 is skipped), wrapped in
// " (" ... " )". If the end date lies before the start date, a message is
// printed and querystring is not modified.
void add_dates(text_t &querystring, int startdate, int enddate, 
	       int startbc, int endbc, int ct)
{
  if (!startdate) {
    return;
  }

  // does the query contain anything besides whitespace?
  bool has_query_text = false;
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();
  for (; here != end; ++here) {
    // isspace() is only defined for values that fit in an unsigned char
    // (or EOF). Query characters may be wider than that, so anything
    // above 0xFF is treated as non-space without calling isspace().
    if (*here > 0xFF || !isspace(*here)) {
      has_query_text = true;
      break;
    }
  }

  //converting BCE dates
  if (startbc && startdate > 0) {
    startdate *= -1;
  }
  if (endbc && enddate > 0) {
    enddate *= -1;
  }

  if (enddate != 0 && enddate < startdate) {
    // NOTE(review): this diagnostic goes to standard output - confirm that
    // is the intended channel for it.
    cout<<"enddate too small"<<endl;
    return;
  }

  if (has_query_text) {
    querystring.appendcstr(" AND");
  }

  if (!enddate) {
    // single date
    if (ct==1) {
      mgpp_adddateelem(querystring,startdate);
    }
    else { // lucene
      lucene_adddateelem(querystring,startdate);
    }
    return;
  }

  // date range: one term per value, skipping 0
  querystring.appendcstr(" (");
  for (int nextdate = startdate; nextdate <= enddate; ++nextdate) {
    if (nextdate == 0) {
      continue;
    }
    if (ct==1) {
      mgpp_adddateelem(querystring,nextdate);
    }
    else { // lucene
      lucene_adddateelem(querystring,nextdate);
    }
  }
  querystring.appendcstr(" )");
}