Changeset 10995 for trunk/gsdl


Ignore:
Timestamp:
2005-12-15T16:57:58+13:00 (18 years ago)
Author:
kjdon
Message:

Fixed up the mgpp and lucene plain-search query string formatting. mgpp now handles phrases inside fields, and the * character inside fields. lucene doesn't need individual words to be tagged for an OR query - it can do OR and AND inside a field.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/recpt/querytools.cpp

    r10411 r10995  
    612612void add_field_info(text_t &querystring, const text_t &tag, int type) {
    613613
     614  if (tag == "") return; // do nothing
    614615  if (type == 1) { //mgpp
    615616    querystring = "["+querystring+"]:"+tag;
     
    620621}
    621622
     // Reports whether `character` is one of the punctuation characters that
     // the given indexer treats specially inside a query term.
     //
     //   indexer_type  1 selects the mgpp character set, 2 selects the lucene
     //                 character set; any other value matches nothing.
     //   character     the query-string character to classify.
     //
     // Returns true if `character` belongs to the selected set, false otherwise.
     // format_field_info uses this alongside is_unicode_letdig so that these
     // characters stay attached to the query element being built instead of
     // being treated as a word boundary.
     623bool is_special_character(int indexer_type, unsigned short character) {
     624  // mgpp: '#', '/' and '*' are kept as part of the term
     //      (presumably mgpp term modifiers / wildcard -- confirm against mgpp docs)
     625  if (indexer_type == 1) {
     626    return (character == '#' || character == '/' || character == '*');
     627  }
     628  // lucene: '?', '*', '~' and '^' are kept as part of the term
     //      (presumably Lucene wildcard, fuzzy and boost operators -- confirm)
     629  else if (indexer_type ==2) {
     630    return (character == '?' || character == '*' || character == '~' ||
     631        character == '^');
     632  }
     // Unknown indexer type: no character is considered special.
     633  return false;
     634}
    622635
    623636void format_field_info(text_t &querystring, cgiargsclass &args) {
    624637 
    625638  text_t tag = args["fqf"];
    626   if (tag == "ZZ" || tag == "") {
    627     return; // do nothing
    628   }
    629  
     639  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
     640
    630641  int argct = args.getintarg("ct");
     642  bool mgpp = (argct == 1);
     643  bool lucene = (argct == 2);
     644
     645  if (mgpp && tag == "") {
     646    return; // no field specifier: do nothing
     647  }
     648 
    631649  int argt = args.getintarg("t");// t=0 -and, t=1 - or
    632650  int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
    633651
    634   // Special code for Lucene
    635   // The default operator for Lucene is "or", so we need to add "+" symbols when t == 0
    636   // Also, we need to be careful not to mess up phrase searches
    637   if (argct == 2) {
    638     text_t processed_querystring = "";
    639     text_t queryelement = "";
    640     text_t combine = ((argt == 0) ? "+" : "");
    641     bool in_phrase = false;
    642     text_t::const_iterator here = querystring.begin();
    643     text_t::const_iterator end = querystring.end();
    644     while (here != end) {
    645       if (is_unicode_letdig(*here)) {
    646     queryelement.push_back(*here);
    647       }
    648 
    649       // Detect phrase starts/finishes
    650       else if (*here == '"') {
    651     queryelement.push_back(*here);
    652     if (in_phrase == false) in_phrase = true;
    653     else if (in_phrase == true) {
    654       add_field_info(queryelement, tag, argct);
    655       processed_querystring += combine + queryelement;
    656       queryelement.clear();
    657       in_phrase = false;
    658     }
    659       }
    660 
    661       // Found word boundary
    662       else if (in_phrase) {
    663     queryelement.push_back(*here);
    664       }
    665       else {
    666     if (!queryelement.empty()) {
    667       add_field_info(queryelement, tag, argct);
    668       processed_querystring += combine + queryelement;
    669       queryelement.clear();
    670     }
    671     processed_querystring.push_back(*here);
    672       }
    673 
    674       ++here;
    675     }
    676 
    677     // Get last element
    678     if (!queryelement.empty()) {
    679       add_field_info(queryelement, tag, argct);
    680       processed_querystring += combine + queryelement;
    681     }
    682 
    683     querystring = processed_querystring;
    684     return;
    685   }
    686 
    687   if (argb==0 && argt==0) {
    688     // simple 'and' search - just put tag info round whole query string
     652  bool simple_AND_search = (argb==0 && argt==0);
     653  bool simple_OR_search = (argb==0 && argt==1);
     654
     655  if (mgpp && simple_AND_search) {
     656    // mgpp, simple AND search, tag the whole query string
    689657    add_field_info(querystring, tag, argct);
    690658    return;
    691659  }
    692  
    693   // we need to individually tag words
    694   text_t outtext;
    695   text_t word;
    696   //unsigned short c;                                                           
     660  // resulting mgpp case - we need to tag each individual term or phrase
     661  // TODO - allow AND. OR in query string and don't tag these words
     662
     663  if (lucene && (simple_OR_search || argb == 1)) {
     664      // OR search or advanced search (here we assume that the user has added their term mods - don't need to add term mods
     665    if (tag != "") {
     666      // tag the whole string
     667      add_field_info(querystring, tag, argct);
     668    }
     669    return;
     670  }
     671 
     672
     673  // if we have got here, we need to add in combiners (lucene) or
     674  // we need to tag each individual word (mgpp OR search - mgpp can't do OR inside a field)
     675
     676  text_t combine = ((lucene)? "+" : "");
     677 
     678  text_t processed_querystring = "";
     679  text_t queryelement = "";
     680 
     681  bool in_phrase = false;
    697682  text_t::const_iterator here = querystring.begin();
    698683  text_t::const_iterator end = querystring.end();
    699 
    700   while (here !=end) {
    701 
    702     if (is_unicode_letdig(*here)|| *here == '#' || *here == '/' ) {
    703       // include term modifiers in a word just in case
    704       // not word boundary
    705       word.push_back(*here);
    706       ++here;   
    707     }
     684  while (here != end) {
     685    if (is_unicode_letdig(*here) || is_special_character(argct, *here)) {
     686      queryelement.push_back(*here);
     687    }
     688
     689    // Detect phrase starts/finishes
     690    else if (*here == '"') {
     691      queryelement.push_back(*here);
     692      if (in_phrase == false) in_phrase = true;
     693      else {
     694    if (mgpp) {add_field_info(queryelement, tag, argct);}
     695    processed_querystring += combine + queryelement;
     696    queryelement.clear();
     697    in_phrase = false;
     698      }
     699    }
     700
     701    // Found word boundary, in a phrase
     702    else if (in_phrase) {
     703      queryelement.push_back(*here);
     704    }
     705    // Word boundary, but not in a phrase
    708706    else {
    709       // found word boundary   
    710       if (!word.empty() ) {
    711     add_field_info(word, tag, argct);
    712     outtext += word;
    713     word.clear();
    714       }
    715       // everything else, we add into the query string
    716       outtext.push_back(*here);
    717       ++here;
    718     }
    719   }
    720    
    721   // get last word
    722   if (!word.empty()) {
    723     add_field_info(word, tag, argct);
    724     outtext += word;
    725   }
    726 
    727   querystring =  outtext;
    728 }
    729 
     707      if (!queryelement.empty()) {
     708    if (mgpp) {add_field_info(queryelement, tag, argct);}
     709    processed_querystring += combine + queryelement;
     710    queryelement.clear();
     711      }
     712      processed_querystring.push_back(*here);
     713    }
     714
     715    ++here;
     716  }
     717
     718  // Get last element
     719  if (!queryelement.empty()) {
     720    if (mgpp) {add_field_info(queryelement, tag, argct);}
     721    processed_querystring += combine + queryelement;
     722  }
     723
     724  querystring = processed_querystring;
     725 
     726  if (lucene) {
     727    // tag the whole query string
     728    add_field_info(querystring, tag, argct);
     729  }
     730}
     731
     732
Note: See TracChangeset for help on using the changeset viewer.