Changeset 12784


Ignore:
Timestamp:
2006-09-20T09:53:39+12:00 (18 years ago)
Author:
kjdon
Message:

Modified query parsing for mgpp and lucene; hopefully it will work properly now. Lucene no longer uses is_unicode_letdig (which is only valid for Unicode 2). Advanced form search should now behave the same as plain search, allowing operators etc. Lucene no longer does stem/case modifications.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/recpt/querytools.cpp

    r12771 r12784  
    2828#include "unitool.h" // for is_unicode_letdig
    2929
     30// sets the ct, qt, qto arguments
    3031void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
    3132
     
    5354 
    5455  text_tmap::iterator check = cinfo->format.find("SearchTypes");
    55   text_t search_types = "plain,form";
    56   if(check != cinfo->format.end()){
     56  text_t search_types;
     57  if(check != cinfo->format.end() && !(*check).second.empty()){
    5758    search_types = (*check).second;
    58     if (search_types.empty()) {
    59       search_types = "plain,form";
    60     }
    61   }
     59  } else {
     60    // assume plain,form
     61    if (args["qto"].empty()) args["qto"] = "3";
     62    if (args["qt"].empty()) {
     63      int arg_qto = args.getintarg("qto");
     64      if (arg_qto > 1) {
     65    args["qt"] = "1";
     66      } else {
     67    args["qt"] = "0";
     68      }
     69    }
     70    return;
     71  }
     72 
    6273 
    6374  if (args["qto"].empty()) {
     
    7384 
    7485  if (args["qt"].empty()) {
    75     bool form_default = false;
    7686    int arg_qto = args.getintarg("qto");
    7787    if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
     
    8595// request.filterResultOptions and request.fields (if required) should
    8696// be set from the calling code
    87 void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring,
     97void set_queryfilter_options (FilterRequest_t &request,
     98                  const text_t &querystring,
    8899                  cgiargsclass &args) {
    89100
     
    162173}
    163174
    164 void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring1,
     175void set_queryfilter_options (FilterRequest_t &request,
     176                  const text_t &querystring1,
    165177                  const text_t &querystring2, cgiargsclass &args) {
    166178
     
    212224}
    213225
    214 void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args) {
     226void set_more_queryfilter_options (FilterRequest_t &request,
     227                   cgiargsclass &args) {
    215228
    216229  OptionValue_t option;
     
    238251  }
    239252  // lucene
    240   else if (indexer_type ==2) {
     253  else if (indexer_type == 2) {
    241254    return (character == '?' || character == '*' || character == '~' ||
    242255        character == '^');
     
    245258}
    246259
     260// This function removes boolean operators from simple searches, and segments
     261// chinese characters if segment=true
    247262void format_querystring (text_t &querystring, int querymode, bool segment) {
    248263  text_t formattedstring;
    249264
     265  // advanced search, no segmenting, don't need to do anything
    250266  if (querymode == 1 && !segment) return;
    251267 
     
    290306
    291307   
     308
     309// search history tool
     310// also used for form query macros
     311text_t escape_quotes(const text_t &querystring) {
     312
     313  text_t::const_iterator here = querystring.begin();
     314  text_t::const_iterator end = querystring.end();
     315 
     316  text_t escquery = "";
     317  while (here != end) {
     318    if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
     319    else if (*here == '\n' || *here == '\r') {
     320      escquery.push_back(' ');
     321    } else {
     322      escquery +="\\\\";
     323      escquery.push_back(*here);
     324    }
     325
     326    ++here;
     327  }
     328  return escquery;
     329
     330}
     331
     332// Parses the terms into words, and adds #si if necessary
     333text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
     334           const int indexer_type) {
     335 
     336  // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
     337  if (stem == "0" && fold == "0") {
     338    return;
     339  }
     340  // this is only for mgpp collections, shouldn't be called for anything else
     341  if (indexer_type != 1) {
     342    return;
     343  }
     344 
     345  text_t outtext;
     346  text_t word;
     347
     348  text_t::const_iterator here = terms.begin();
     349  text_t::const_iterator end = terms.end();
     350
     351  while (here !=end) {
     352
     353    if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
     354      // not word boundary
     355      word.push_back(*here);
     356      ++here;   
     357    }
     358    else {
     359      // found word boundary   
     360      if (!word.empty() ) {
     361    if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
     362      outtext += word;
     363      word.clear();
     364    }
     365    else {
     366      word += "#";
     367      if (stem == "1") word += "s";
     368      if (fold == "1") word += "i";
     369      outtext += word;
     370      word.clear();
     371    }
     372      }
     373      // this only used in advanced form, so we leave in boolean operators
     374      if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' || is_unicode_space(*here)) {
     375    outtext.push_back(*here);
     376      }
     377      ++here;
     378    }
     379  }
     380   
     381  // get last word
     382  if (!word.empty()) {
     383    word += "#";
     384    if (stem == "1") word += "s";
     385    if (fold == "1") word += "i";
     386    word += " ";
     387    outtext += word;
     388  }
     389  return outtext;
     390}
     391
     392
     393// some query form parsing functions for use with mgpp & lucene
     394
     395void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
     396{
     397  querystring.clear();
     398
     399  int argct = args.getintarg("ct");
     400  int argt = args.getintarg("t");// t=0 -and, t=1 - or
     401  int argb = args.getintarg("b");
     402   
     403  text_t combine;
     404
     405  // lucene uses global combine, so only need this for mgpp
     406  if (argct==1) {
     407    if (argt == 0) combine = "&";
     408    else combine = "|";
     409  }
     410 
     411  text_t field = args["fqf"];
     412  if (field.empty()) return; // no query
     413  text_tarray fields;
     414  splitchar(field.begin(), field.end(), ',', fields);
     415 
     416  text_t value = args["fqv"];
     417  if (value.empty()) return; // somethings wrong
     418  text_tarray values;
     419  splitchar(value.begin(), value.end(), ',', values);
     420
     421
     422  for (int i=0; i< values.size(); ++i) {
     423    if (!values[i].empty()) {
     424      text_t this_value = values[i];
     425      // remove operators for simple search, segments text if necessary
     426      format_querystring(this_value, argb, segment);
     427      // add tag info for this field (and other processing)
     428      format_field_info(this_value, fields[i], argct, argt, argb);
     429      // add into query string
     430      if (argct == 2) {
     431    // lucene
     432    // we don't worry about AND/OR, cos this is done by defaultcombineoperator
     433    querystring += this_value+" ";
     434      } else {
     435    // mgpp
     436    if (!querystring.empty()) {
     437      querystring += " "+ combine+ " ";
     438    }
     439    querystring += this_value;
     440      }
     441    }
     442  }
     443}
     444
     445
     446void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
     447  querystring.clear();
     448
     449  const int argct = args.getintarg("ct");
     450  int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
     451  int argb = args.getintarg("b");
     452  text_t combine;
     453  if (argct==1) {
     454    combine = "&";
     455  }
     456  else { // lucene
     457    combine = "AND";
     458  }
     459
     460  text_t field = args["fqf"];
     461  if (field.empty()) return; // no query
     462  text_tarray fields;
     463  splitchar(field.begin(), field.end(), ',', fields);
     464 
     465  text_t value = args["fqv"];
     466  if (value.empty()) return; // somethings wrong
     467  text_tarray values;
     468  splitchar(value.begin(), value.end(), ',', values);
     469
     470  text_t comb = args["fqc"];
     471  if (comb.empty()) return; //somethings wrong
     472  text_tarray combs;
     473  splitchar(comb.begin(), comb.end(), ',', combs);
     474
     475  text_tarray stems;
     476  text_tarray folds;
     477  if (argct == 1) {// mgpp - lucene doesn't do stem/case
     478    text_t stem = args["fqs"];
     479    if (stem.empty()) return; // somethings wrong
     480    splitchar(stem.begin(), stem.end(), ',', stems);
     481   
     482    text_t fold = args["fqk"];
     483    if (fold.empty()) return; // somethings wrong
     484    splitchar(fold.begin(), fold.end(), ',', folds);
     485  }
     486 
     487  for(int i=0; i< values.size(); ++i) {
     488    if (!values[i].empty()) {
     489      if (i!=0) {
     490    if (argct==1) {
     491      if (combs[i-1]=="and") combine = "&";
     492      else if (combs[i-1]=="or")combine = "|";
     493      else if (combs[i-1]=="not")combine = "!";
     494    }
     495    else { // lucene
     496      if (combs[i-1]=="and") combine = "AND";
     497      else if (combs[i-1]=="or")combine = "OR";
     498      else if (combs[i-1]=="not")combine = "NOT";
     499    }
     500      }
     501      text_t this_value = values[i];
     502      // remove operators for simple search, segments text if necessary
     503      format_querystring(this_value, argb, segment);
     504      if (argct == 1) { // mgpp only
     505    this_value = addstemcase(this_value, stems[i], folds[i], argct);
     506      }
     507      // add tag info for this field (and other processing)
     508      format_field_info(this_value, fields[i], argct, argt, argb);
     509      // add into query string
     510      if (!querystring.empty()) {
     511    querystring += " "+ combine+ " ";
     512      }
     513      querystring += this_value;
     514     
     515    }
     516  }
     517}
     518
     519// Extended addqueryelem for Human Info project
     520void addqueryelem_ex(text_t &querystring, const text_t &tag,
     521             const text_t &terms, const text_t &stem,
     522             const text_t &fold,
     523             const text_t& combine, const text_t& word_combine) {
     524
     525  if (!querystring.empty()) { // have to put and/or
     526    querystring += " " + combine + " ";
     527  }
     528  text_t outtext; outtext.reserve(512);
     529  text_t word; word.reserve(100);
     530  //unsigned short c;                                                           
     531  text_t::const_iterator here = terms.begin();
     532  text_t::const_iterator end = terms.end();
     533  bool inquote = false, firstword = true;
     534
     535  text_t word2; word2.reserve(256);
     536   
     537  while (here !=end) {
     538    if (is_unicode_space(*here)) {
     539      if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
     540      else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
     541      else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
     542      else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
     543      else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
     544      if (inquote) {
     545    word2.push_back(*here);
     546      }
     547      word.append(word2); word2.clear();
     548           
     549      if (!inquote && !word.empty() ) {
     550    // found word boundary   
     551               
     552    if (stem == "1" || fold =="1") {
     553      word += "#";
     554      if (stem == "1") word += "s";
     555      //else word += "u";
     556                   
     557      if (fold == "1") word += "i";
     558      //else word += "c";
     559    }
     560    if (firstword) {
     561      firstword = false;
     562    } else {
     563      outtext += " " + word_combine + " ";
     564    }
     565    outtext += "[" + word + "]:"+tag;
     566    word.clear();
     567      }
     568      ++here;
     569    } else if (*here == '\"') {
     570      word2.push_back(*here);
     571      inquote = !inquote;
     572      ++here;
     573    } else {
     574      // not word boundary
     575      word2.push_back(*here);
     576      ++here;   
     577    }
     578  }
     579   
     580  // get last word
     581  if (!word2.empty()) {
     582    if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
     583    else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
     584    else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
     585    else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
     586    else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
     587    word.append(word2); word2.clear();
     588       
     589    if (stem == "1"|| fold == "1") {
     590      word += "#";
     591      if (stem == "1") word += "s";
     592      //else word += "u";
     593           
     594      if (fold == "1") word += "i";
     595      //else word += "c";
     596    }
     597    if (!outtext.empty()) outtext += " " + word_combine + " ";
     598    outtext += "[" + word + "]:"+tag;
     599  }
     600  querystring += "(" + outtext + ")";
     601}
     602
     603void add_field_info(text_t &querystring, const text_t &tag, int type) {
     604
     605  if (tag == "" || tag == "ZZ") return; // do nothing
     606  if (type == 1) { //mgpp
     607    querystring = "["+querystring+"]:"+tag;
     608  } else if (type == 2) { // lucene
     609    querystring = tag+":("+querystring+")";
     610  }
     611   
     612}
     613
     614
     615void format_field_info_lucene(text_t &querystring, text_t tag, int argt, int argb) {
     616  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
     617  int type = 2; //lucene
     618
     619  if (argb==0) { // simple
     620    // there will be no & or | as they should have already been removed
     621    // just tag the entire thing
     622    if (tag != "") {
     623      add_field_info(querystring, tag, type);
     624    }
     625    return;
     626  }
     627
     628  // need to replace & with &&, | with ||
     629  text_t::const_iterator here = querystring.begin();
     630  text_t::const_iterator end = querystring.end();
     631
     632  text_t finalquery = "";
     633  while (here != end) {
     634    if (*here ==  '&') {
     635      finalquery.push_back('&');
     636      finalquery.push_back('&');
     637      while (*(here+1) == '&') {
     638    ++here;
     639      }
     640    }
     641    else if (*here == '|') {
     642      finalquery.push_back('|');
     643      finalquery.push_back('|');
     644      while (*(here+1) == '|') {
     645    ++here;
     646      }
     647    }
     648    else {
     649      finalquery.push_back(*here);
     650    }
     651    ++here;
     652  }
     653  querystring = finalquery;
     654  add_field_info(querystring, tag, type);
     655}
     656
     657
     // Rewrites querystring for mgpp, attaching the field tag to its parts
     // through add_field_info() with type 1.
     //
     // tag "ZZ" is treated as no tag. With no tag and argb == 1 (advanced) the
     // query is returned untouched. Otherwise the query is scanned one
     // character at a time: characters are collected into queryelem (the
     // current word), completed words are collected into fieldpart, and each
     // finished fieldpart is tagged and appended to finalquery, which replaces
     // querystring at the end.
     //
     // When argb == 0 and argt == 0 (simple_and) the words accumulate in a
     // single fieldpart that is tagged once; otherwise a new word normally
     // flushes the previous fieldpart so that it is tagged on its own.
     658void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
     659
     660  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
     661  if (tag == "" && argb == 1) {
     662    return; // no field specifier, advanced mode, the query stays as written
     663  }
     664
     665  int type = 1; // mgpp
     666
     667  bool simple_and = (argb==0 && argt==0);
     668  text_t finalquery = "";
     669  text_t fieldpart ="";
     670  text_t queryelem = "";
     671  bool in_phrase = false;
          // in_field is set after a lone "&" or (advanced mode) a NEAR/WITHIN
          // word; while set, the next word joins the current fieldpart instead
          // of starting a new one
     672  bool in_field = false;
     673
     674  text_t::const_iterator here = querystring.begin();
     675  text_t::const_iterator end = querystring.end();
     676  while (here != end) {
            // word characters, '&' and the indexer's special characters all
            // extend the current word
     677    if (is_unicode_letdig(*here)  || *here == '&' || is_special_character(type, *here)) {
     678      queryelem.push_back(*here);
     679    }
            // '|' is not copied to the output; it only resets in_field
     680    else if (*here == '|') {
     681      in_field = false;
     682    }
     683    else if (*here == '!' || *here == '(' || *here == ')') {
     684      if (!in_phrase) { // ignore these if in_phrase
     685    // output field, then output operator
     686    in_field = false;
     687    if (!queryelem.empty()) {
     688      if (!simple_and && !fieldpart.empty()) {
     689        add_field_info(fieldpart, tag, type);
     690        finalquery += fieldpart;
     691        finalquery.push_back(' ');
     692        fieldpart.clear();
     693      }
     694      fieldpart += queryelem;
     695    }
     696    if (!fieldpart.empty()) {
     697      add_field_info(fieldpart, tag, type);
     698      finalquery += fieldpart;
     699      finalquery.push_back(' ');
     700    }
     701    fieldpart.clear();
     702    queryelem.clear();
            // the operator character itself goes out untagged, followed by a space
     703    finalquery.push_back(*here);
     704    finalquery.push_back(' ');
     705      }
     706    }
            // a double quote is kept in the word and toggles the phrase state
     707    else if (*here == '"') {
     708      queryelem.push_back(*here);
     709      if (in_phrase == false) in_phrase = true;
     710      else {
     711    in_phrase = false;
     712      }
     713    }
     714
     715    // Found word boundary, in a phrase
     716    else if (in_phrase) {
     717      queryelem.push_back(*here);
     718    }
     719    // Found a word boundary
     720    else {
     721      if (!queryelem.empty()) {
            // a lone "&" is dropped; it only marks that the next word belongs
            // to the same fieldpart
     722    if (queryelem == "&") {
     723      in_field = true;
     724      queryelem.clear();
     725    }
     726    else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
     727     
     728      if (argb==1) {
     729        // advanced search: keep the operator inside the current fieldpart
                // (in simple search these are not allowed and are dropped)
     730        in_field = true;
     731        fieldpart += queryelem;
     732        fieldpart.push_back(' ');
     733      }
     734      queryelem.clear();
     735     
     736    }
     737    else {
            // ordinary word: unless it continues the current field, flush the
            // previous fieldpart as its own tagged element first
     738      if (!simple_and && !in_field) {
     739        if (!fieldpart.empty()) {
     740          add_field_info(fieldpart, tag, type);
     741          finalquery += fieldpart;
     742          finalquery.push_back(' ');
     743          fieldpart.clear();
     744        }
     745      }
     746     
     747      fieldpart += queryelem;
     748      fieldpart.push_back(' ');
     749      queryelem.clear();
     750    }
     751      }
     752    }
     753    ++here;
     754  }
     755  // at the end
          // a word still being collected is handled like an ordinary word, but
          // without the trailing space
     756  if (!queryelem.empty()) {
     757    if (!simple_and && !in_field && !fieldpart.empty()) {
     758      add_field_info(fieldpart, tag, type);
     759      finalquery += fieldpart;
     760      finalquery.push_back(' ');
     761      fieldpart.clear();
     762    }
     763    fieldpart += queryelem;
     764  }
     765  if (!fieldpart.empty()) {
     766    add_field_info(fieldpart, tag, type);
     767    finalquery += fieldpart;
     768    fieldpart.clear();
     769    finalquery.push_back(' ');
     770  }
     771
     772  querystring  = finalquery;
     773}
     774
     775
     776void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) { 
     777  if (argct == 1) {
     778    format_field_info_mgpp(querystring, tag, argt, argb);
     779  } else if (argct == 2) {
     780    format_field_info_lucene(querystring, tag, argt, argb);
     781  }
     782}
     783
     784void mgpp_adddateelem(text_t& querystring, const int date)
     785{
     786  querystring.appendcstr(" [");
     787  if(date<0) {
     788      querystring.appendcstr("bc");
     789      querystring.appendint((date*-1));
     790  }
     791  else {
     792    querystring.appendint(date);
     793  }
     794  querystring.appendcstr("]:CV");
     795}
     796
     797void lucene_adddateelem(text_t& querystring, const int date)
     798{
     799  querystring.appendcstr(" CV:(");
     800  if(date<0) {
     801      querystring.appendcstr("bc");
     802      querystring.appendint((date*-1));
     803  }
     804  else {
     805    querystring.appendint(date);
     806  }
     807  querystring.appendcstr(")");
     808}
     809
     810
    292811void add_dates(text_t &querystring, int startdate, int enddate,
    293812           int startbc, int endbc, int ct)
     
    352871
    353872}
    354 
    355 // search history tool
    356 // also used for form query macros
    357 text_t escape_quotes(const text_t &querystring) {
    358 
    359   text_t::const_iterator here = querystring.begin();
    360   text_t::const_iterator end = querystring.end();
    361  
    362   text_t escquery = "";
    363   while (here != end) {
    364     if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
    365     else if (*here == '\n' || *here == '\r') {
    366       escquery.push_back(' ');
    367     } else {
    368       escquery +="\\\\";
    369       escquery.push_back(*here);
    370     }
    371 
    372     ++here;
    373   }
    374   return escquery;
    375 
    376 }
    377 
    378 // some query form parsing functions for use with mgpp & lucene
    379 
    380 void parse_reg_query_form(text_t &querystring, cgiargsclass &args)
    381 {
    382   querystring.clear();
    383 
    384   const int ct = args.getintarg("ct");
    385   int argt = args.getintarg("t");// t=0 -and, t=1 - or
    386 
    387   text_t combine;
    388   if (ct==1) {
    389     if (argt == 0) combine = "&";
    390     else combine = "|";
    391   }
    392   else { // lucene
    393     if (argt == 0) combine = "AND";
    394     else combine = "OR";
    395   }
    396  
    397   text_t field = args["fqf"];
    398   if (field.empty()) return; // no query
    399   text_tarray fields;
    400   splitchar(field.begin(), field.end(), ',', fields);
    401  
    402   text_t value = args["fqv"];
    403   if (value.empty()) return; // somethings wrong
    404   text_tarray values;
    405   splitchar(value.begin(), value.end(), ',', values);
    406 
    407 
    408   for (int i=0; i< values.size(); ++i) {
    409     if (!values[i].empty()) {
    410       if (ct == 1) {
    411     mgpp_addqueryelem(querystring, fields[i], values[i], combine);
    412       }
    413       else { // lucene
    414     lucene_addqueryelem(querystring, fields[i], values[i], combine);
    415       }
    416     }
    417   }
    418  
    419 }
    420 
    421 
    422 void parse_adv_query_form(text_t &querystring, cgiargsclass &args){
    423 
    424   querystring.clear();
    425 
    426   const int ct = args.getintarg("ct");
    427   text_t combine;
    428   if (ct==1) {
    429     combine = "&";
    430   }
    431   else { // lucene
    432     combine = "AND";
    433   }
    434 
    435   text_t field = args["fqf"];
    436   if (field.empty()) return; // no query
    437   text_tarray fields;
    438   splitchar(field.begin(), field.end(), ',', fields);
    439  
    440   text_t value = args["fqv"];
    441   if (value.empty()) return; // somethings wrong
    442   text_tarray values;
    443   splitchar(value.begin(), value.end(), ',', values);
    444 
    445   text_t stem = args["fqs"];
    446   if (stem.empty()) return; // somethings wrong
    447   text_tarray stems;
    448   splitchar(stem.begin(), stem.end(), ',', stems);
    449 
    450   text_t fold = args["fqk"];
    451   if (fold.empty()) return; // somethings wrong
    452   text_tarray folds;
    453   splitchar(fold.begin(), fold.end(), ',', folds);
    454 
    455   text_t comb = args["fqc"];
    456   if (comb.empty()) return; //somethings wrong
    457   text_tarray combs;
    458   splitchar(comb.begin(), comb.end(), ',', combs);
    459  
    460   for(int i=0; i< values.size(); ++i) {
    461     if (!values[i].empty()) {
    462       if (i!=0) {
    463     if (ct==1) {
    464       if (combs[i-1]=="and") combine = "&";
    465       else if (combs[i-1]=="or")combine = "|";
    466       else if (combs[i-1]=="not")combine = "!";
    467     }
    468     else { // lucene
    469       if (combs[i-1]=="and") combine = "AND";
    470       else if (combs[i-1]=="or")combine = "OR";
    471       else if (combs[i-1]=="not")combine = "NOT";
    472     }
    473       }
    474       text_t term = addstemcase(values[i], stems[i], folds[i], ct);
    475       mgpp_addqueryelem(querystring, fields[i], term, combine);
    476     }
    477    
    478   }
    479 }
    480 
    481 text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
    482            const int indexer_type) {
    483  
    484   text_t outtext;
    485   text_t word;
    486   //unsigned short c;                                                           
    487   text_t::const_iterator here = terms.begin();
    488   text_t::const_iterator end = terms.end();
    489 
    490   while (here !=end) {
    491 
    492     if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
    493       // not word boundary
    494       word.push_back(*here);
    495       ++here;   
    496     }
    497     else {
    498       // found word boundary   
    499       if (!word.empty() ) {
    500     if (stem == "1" || fold =="1") {
    501       word += "#";
    502       if (stem == "1") word += "s";
    503       //else word += "u";
    504      
    505       if (fold == "1") word += "i";
    506       //else word += "c";
    507     }
    508    
    509     word += " ";
    510     outtext += word;
    511     word.clear();
    512       }
    513       if (*here == '\"') {
    514     outtext.push_back(*here);
    515       }
    516       ++here;
    517     }
    518   }
    519    
    520   // get last word
    521   if (!word.empty()) {
    522     if (stem == "1"|| fold == "1") {
    523       word += "#";
    524       if (stem == "1") word += "s";
    525       //else word += "u";
    526      
    527       if (fold == "1") word += "i";
    528       //else word += "c";
    529     }
    530     word += " ";
    531     outtext += word;
    532   }
    533   return outtext;
    534 }
    535 
    536 
    537 void mgpp_adddateelem(text_t& querystring, const int date)
    538 {
    539   querystring.appendcstr(" [");
    540   if(date<0) {
    541       querystring.appendcstr("bc");
    542       querystring.appendint((date*-1));
    543   }
    544   else {
    545     querystring.appendint(date);
    546   }
    547   querystring.appendcstr("]:CV");
    548 }
    549 
    550 void lucene_adddateelem(text_t& querystring, const int date)
    551 {
    552   querystring.appendcstr(" CV:(");
    553   if(date<0) {
    554       querystring.appendcstr("bc");
    555       querystring.appendint((date*-1));
    556   }
    557   else {
    558     querystring.appendint(date);
    559   }
    560   querystring.appendcstr(")");
    561 }
    562 
    563 
    564 void mgpp_addqueryelem(text_t &querystring, text_t &tag,
    565           text_t &query, text_t &combine) {
    566   if (!querystring.empty()) { // have to put and/or
    567     querystring += " " + combine + " ";
    568  
    569   }
    570   if (tag=="ZZ" || tag=="") { // just add onto querystring
    571        querystring +=  query;
    572   }
    573   else {
    574     querystring += "["+query+"]:"+tag;
    575   }
    576 
    577 }
    578 
    579 void lucene_addqueryelem(text_t &querystring, text_t &tag,
    580           text_t &query, text_t &combine) {
    581   if (!querystring.empty()) { // have to put and/or
    582     querystring += " " + combine + " ";
    583  
    584   }
    585   if (tag=="ZZ" || tag=="") { // just add onto querystring
    586        querystring +=  query;
    587   }
    588   else {
    589     querystring += tag+":("+query+")";
    590   }
    591 }
    592 
    593 
    594 void addqueryelem_ex(text_t &querystring, const text_t &tag,
    595              const text_t &terms, const text_t &stem, const text_t &fold,
    596              const text_t& combine, const text_t& word_combine) {
    597   if (!querystring.empty()) { // have to put and/or
    598     querystring += " " + combine + " ";
    599   }
    600   text_t outtext; outtext.reserve(512);
    601   text_t word; word.reserve(100);
    602   //unsigned short c;                                                           
    603   text_t::const_iterator here = terms.begin();
    604   text_t::const_iterator end = terms.end();
    605   bool inquote = false, firstword = true;
    606 
    607   text_t word2; word2.reserve(256);
    608    
    609   while (here !=end) {
    610     if (is_unicode_space(*here)) {
    611       if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
    612       else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
    613       else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
    614       else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
    615       else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
    616       if (inquote) {
    617     word2.push_back(*here);
    618       }
    619       word.append(word2); word2.clear();
    620            
    621       if (!inquote && !word.empty() ) {
    622                 // found word boundary   
    623                
    624     if (stem == "1" || fold =="1") {
    625       word += "#";
    626       if (stem == "1") word += "s";
    627       //else word += "u";
    628                    
    629       if (fold == "1") word += "i";
    630       //else word += "c";
    631     }
    632     if (firstword) {
    633       firstword = false;
    634     } else {
    635       outtext += " " + word_combine + " ";
    636     }
    637     outtext += "[" + word + "]:"+tag;
    638     word.clear();
    639       }
    640       ++here;
    641     } else if (*here == '\"') {
    642       word2.push_back(*here);
    643       inquote = !inquote;
    644       ++here;
    645     } else {
    646       // not word boundary
    647       word2.push_back(*here);
    648       ++here;   
    649     }
    650   }
    651    
    652   // get last word
    653   if (!word2.empty()) {
    654     if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
    655     else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
    656     else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
    657     else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
    658     else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
    659     word.append(word2); word2.clear();
    660        
    661     if (stem == "1"|| fold == "1") {
    662       word += "#";
    663       if (stem == "1") word += "s";
    664       //else word += "u";
    665            
    666       if (fold == "1") word += "i";
    667       //else word += "c";
    668     }
    669     if (!outtext.empty()) outtext += " " + word_combine + " ";
    670     outtext += "[" + word + "]:"+tag;
    671   }
    672   querystring += "(" + outtext + ")";
    673 }
    674 
    675 
    676 void add_field_info(text_t &querystring, const text_t &tag, int type) {
    677 
    678   if (tag == "") return; // do nothing
    679   if (type == 1) { //mgpp
    680     querystring = "["+querystring+"]:"+tag;
    681   } else if (type == 2) { // lucene
    682     querystring = tag+":("+querystring+")";
    683   }
    684    
    685 }
    686 
    687 
    688 void format_field_info_lucene(text_t &querystring, cgiargsclass &args) {
    689   text_t tag = args["fqf"];
    690   if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
    691   int type = 2; //lucene
    692   int argt = args.getintarg("t");// t=0 -and, t=1 - or
    693   int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
    694 
    695   // lucene simple OR - the string stays as is, but may need field tag
    696   if (argb==0 && argt == 1) {
    697     // just tag the entire thing
    698     if (tag != "") {
    699       add_field_info(querystring, tag, type);
    700     }
    701     return;
    702   }
    703   bool in_phrase = false;
    704  
    705   text_t queryelem = "";
    706   text_t finalquery = "";
    707  
    708   // only add in + for simple AND search
    709   text_t combine = ((argb==0)? "+" : "");
    710 
    711   // for lucene, we need to change & to && and | to || if advanced search
    712   // we need to tag the entire string, if we have a field
    713   // if we are simple and search, then we put && in between words
    714  
    715   text_t::const_iterator here = querystring.begin();
    716   text_t::const_iterator end = querystring.end();
    717   while (here != end) {
    718     if (is_unicode_letdig(*here) || is_special_character(type, *here)) {
    719       queryelem.push_back(*here);
    720     }
    721 
    722     // Detect phrase starts/finishes
    723     else if (*here == '"') {
    724       queryelem.push_back(*here);
    725       if (in_phrase == false) in_phrase = true;
    726       else {
    727     finalquery += combine + queryelem;
    728     queryelem.clear();
    729     in_phrase = false;
    730       }
    731     }
    732 
    733     // Found word boundary, in a phrase
    734     else if (in_phrase) {
    735       queryelem.push_back(*here);
    736     }
    737     // Word boundary, but not in a phrase
    738     else {
    739       if (*here == '&') {
    740     queryelem.push_back('&');
    741     queryelem.push_back('&');
    742       } else if (*here == '|') {
    743     queryelem.push_back('|');
    744     queryelem.push_back('|');
    745       } else {
    746     if (!queryelem.empty()) {
    747       finalquery += combine + queryelem;
    748       queryelem.clear();
    749     }
    750     finalquery.push_back(*here);
    751       }
    752     }
    753 
    754     ++here;
    755   }
    756 
    757   // Get last element
    758   if (!queryelem.empty()) {
    759     finalquery += combine + queryelem;
    760   }
    761 
    762   add_field_info(finalquery, tag, type);
    763   querystring = finalquery;
    764 }
    765 
    766 void format_field_info_mgpp(text_t &querystring, cgiargsclass &args) {
    767   text_t tag = args["fqf"];
    768   if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
    769  
    770   int argt = args.getintarg("t");// t=0 -and, t=1 - or
    771   int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
    772 
    773   if (tag == "" && argb ==1) {
    774     return; // no field specifier, advanced mode, the query stays as written
    775   }
    776 
    777   int type = 1; // mgpp
    778 
    779   bool simple_and = (argb==0 && argt==0);
    780   text_t finalquery = "";
    781   text_t fieldpart ="";
    782   text_t queryelem = "";
    783   bool in_phrase = false;
    784   bool in_field = false;
    785 
    786   text_t::const_iterator here = querystring.begin();
    787   text_t::const_iterator end = querystring.end();
    788   while (here != end) {
    789     if (is_unicode_letdig(*here)  || *here == '&' || is_special_character(type, *here)) {
    790       queryelem.push_back(*here);
    791     }
    792     else if (*here == '|') {
    793       in_field = false;
    794     }
    795     else if (*here == '!' || *here == '(' || *here == ')') {
    796       if (!in_phrase) { // ignore these if in_phrase
    797     // output field, then output operator
    798     in_field = false;
    799     if (!queryelem.empty()) {
    800       if (!simple_and && !fieldpart.empty()) {
    801         add_field_info(fieldpart, tag, type);
    802         finalquery += fieldpart;
    803         finalquery.push_back(' ');
    804         fieldpart.clear();
    805       }
    806       fieldpart += queryelem;
    807     }
    808     if (!fieldpart.empty()) {
    809       add_field_info(fieldpart, tag, type);
    810       finalquery += fieldpart;
    811       finalquery.push_back(' ');
    812     }
    813     fieldpart.clear();
    814     queryelem.clear();
    815     finalquery.push_back(*here);
    816     finalquery.push_back(' ');
    817       }
    818     }
    819     else if (*here == '"') {
    820       queryelem.push_back(*here);
    821       if (in_phrase == false) in_phrase = true;
    822       else {
    823     in_phrase = false;
    824       }
    825     }
    826 
    827     // Found word boundary, in a phrase
    828     else if (in_phrase) {
    829       queryelem.push_back(*here);
    830     }
    831     // Found a word boundary
    832     else {
    833       if (!queryelem.empty()) {
    834     if (queryelem == "&") {
    835       in_field = true;
    836       queryelem.clear();
    837     }
    838     else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
    839      
    840       if (argb==1) {
    841         // simple search, these not allowed
    842         in_field = true;
    843         fieldpart += queryelem;
    844         fieldpart.push_back(' ');
    845       }
    846       queryelem.clear();
    847      
    848     }
    849     else {
    850       if (!simple_and && !in_field) {
    851         if (!fieldpart.empty()) {
    852           add_field_info(fieldpart, tag, type);
    853           finalquery += fieldpart;
    854           finalquery.push_back(' ');
    855           fieldpart.clear();
    856         }
    857       }
    858      
    859       fieldpart += queryelem;
    860       fieldpart.push_back(' ');
    861       queryelem.clear();
    862     }
    863       }
    864     }
    865     ++here;
    866   }
    867   // at the end
    868   if (!queryelem.empty()) {
    869     if (!simple_and && !in_field && !fieldpart.empty()) {
    870       add_field_info(fieldpart, tag, type);
    871       finalquery += fieldpart;
    872       finalquery.push_back(' ');
    873       fieldpart.clear();
    874     }
    875     fieldpart += queryelem;
    876   }
    877   if (!fieldpart.empty()) {
    878     add_field_info(fieldpart, tag, type);
    879     finalquery += fieldpart;
    880     fieldpart.clear();
    881     finalquery.push_back(' ');
    882   }
    883 
    884   querystring  = finalquery;
    885   cerr << "final query = "<<finalquery<<endl;
    886 }
    887 
    888 void format_field_info(text_t &querystring, cgiargsclass &args) {
    889   int argct = args.getintarg("ct");
    890   if (argct == 1) {
    891     format_field_info_mgpp(querystring, args);
    892   } else if (argct == 2) {
    893     format_field_info_lucene(querystring, args);
    894   }
    895 }
    896 
Note: See TracChangeset for help on using the changeset viewer.