Changeset 1324


Ignore:
Timestamp:
2000-08-02T12:58:29+12:00 (24 years ago)
Author:
kjm18
Message:

mgpp incorporated. The old mgsearchclass and queryfilterclass have been changed.
There is now a base searchclass, from which mgsearchclass and mgppsearchclass inherit,
and a base queryfilterclass, from which mgqueryfilterclass and
mgppqueryfilterclass inherit. librarymain in recpt should choose the appropriate
type (mg vs mgpp) for each collection.

Location:
trunk/gsdl/src/colservr
Files:
8 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/colservr/mgsearch.cpp

    r1306 r1324  
    232232
    233233mgsearchclass::mgsearchclass ()
    234 {
    235   cache = new querycache (RESULTCACHESIZE);
     234  : searchclass() {
     235 
    236236}
    237237
     
    243243      cache = NULL;
    244244    }
    245 }
    246 
    247 
    248 void mgsearchclass::setcollectdir (const text_t &thecollectdir)
    249 {
    250   collectdir = thecollectdir;
    251245}
    252246
     
    308302bool mgsearchclass::search(const queryparamclass &queryparams,
    309303               queryresultsclass &queryresults) {
    310   assert (cache != NULL);
     304  //  assert (cache != NULL);
    311305
    312306  queryresults.clear();
    313 
     307  cerr << "mgsearch start of search"<<endl;
    314308  // first check the cache
    315   if (cache->find(queryparams, queryresults)) return true;
    316 
     309  if (cache != NULL) {
     310    if (cache->find(queryparams, queryresults)) return true;
     311  }
    317312  // make sure there is a query to be processed
    318313  if (!has_unicode_letdig(queryparams.querystring)) return true;
     
    320315  if (makeindexcurrent (queryparams.index, queryparams.subcollection,
    321316            queryparams.language, queryparams.collection)) {
     317    cerr << "made index current "<<endl;
    322318    setsearchmode (queryparams);
    323319    submitquery (queryparams);
    324320    getresults (queryparams, queryresults);
     321    cerr << "got results"<<endl;
    325322    return true;
    326323  }
     
    512509  return (bool)databaseloaded;
    513510}
     511
  • trunk/gsdl/src/colservr/mgsearch.h

    r1285 r1324  
    11/**********************************************************************
    22 *
    3  * mgsearch.h --
     3 * mgsearch.h --  implementation of search for old versions of mg
    44 * Copyright (C) 1999  The New Zealand Digital Library Project
    55 *
     
    2929
    3030
    31 #include "text_t.h"
    32 #include "querycache.h"
     31#include "search.h"
    3332
    34 #define RESULTCACHESIZE 10
    35 #define MAXNUMDOCS      1000000
    36 #define MAXNUMTERMS     100
    37 
    38 class mgsearchclass {
     33class mgsearchclass : public searchclass {
    3934public:
    4035  mgsearchclass ();
    4136  virtual ~mgsearchclass ();
    42 
    43   // the index directory must be set before any searching
    44   // is done
    45   void setcollectdir (const text_t &thecollectdir);
    4637
    4738  // you only need to use this function before doing any stemming
     
    6051  // the search results are returned in queryresults
    6152  // search returns 'true' if it was able to do a search
    62   bool search(const queryparamclass &queryparams,
     53   bool search(const queryparamclass &queryparams,
    6354          queryresultsclass &queryresults);
    6455 
     
    8778
    8879protected:
    89   querycache *cache;
    90   text_t collectdir; // the collection directory
    9180
    9281  void setsearchmode (const queryparamclass &queryparams);
  • trunk/gsdl/src/colservr/queryfilter.cpp

    r1285 r1324  
    11/**********************************************************************
    22 *
    3  * queryfilter.cpp --
     3 * queryfilter.cpp -- base class for queryfilters
    44 * Copyright (C) 1999  The New Zealand Digital Library Project
    55 *
     
    2626#include "queryfilter.h"
    2727#include "fileutil.h"
    28 #include "queryinfo.h"
    29 #include "phrasesearch.h"
    3028#include "gsdltools.h"
    3129#include <assert.h>
    3230
    3331
    34 // some useful functions
    35 
    3632// translate will return true if successful
    37 static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
     33bool queryfilterclass::translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
    3834  infodbclass info;
    3935
     
    5349
    5450// whether document results are needed
    55 static bool need_matching_docs (int filterResultOptions) {
     51bool queryfilterclass::need_matching_docs (int filterResultOptions) {
    5652  return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
    5753      (filterResultOptions & FRmetadata));
     
    5955
    6056// whether term information is needed
    61 static bool need_term_info (int filterResultOptions) {
     57bool queryfilterclass::need_term_info (int filterResultOptions) {
    6258  return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
    6359}
    64 
    65 ///////////////////////////////
    66 // methods for resultsorderer_t
    67 ///////////////////////////////
    68 
    69 resultsorderer_t::resultsorderer_t() {
    70   clear ();
    71 }
    72 
    73 void resultsorderer_t::clear() {
    74   compare_phrase_match = false;
    75   compare_terms_match = false;
    76   compare_doc_weight = true;
    77 
    78   docset = NULL;
    79 }
    80 
    81 bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
    82   if (docset == NULL) return t1>t2;
    83 
    84   docresultmap::iterator t1_here = docset->find(t1);
    85   docresultmap::iterator t2_here = docset->find(t2);
    86   docresultmap::iterator end = docset->end();
    87 
    88   // sort all the document numbers not in the document set to
    89   // the end of the list
    90   if (t1_here == end) {
    91     if (t2_here == end) return t1>t2;
    92     else return true;
    93   } else if (t2_here == end) return false;
    94  
    95   if (compare_phrase_match) {
    96     if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    97     if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
    98   }
    99 
    100   if (compare_terms_match) {
    101     if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    102     if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
    103   }
    104 
    105   if (compare_doc_weight) {
    106     if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    107     if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
    108   }
    109 
    110   return t1>t2;
    111 }
    112 
    113 
    114 
    11560
    11661/////////////////////////////////
    11762// functions for queryfilterclass
    11863/////////////////////////////////
    119 
    120 // loads up phrases data structure with any phrases (that's the quoted bits)
    121 // occuring in the querystring
    122 void queryfilterclass::get_phrase_terms (const text_t &querystring,
    123                      const termfreqclassarray &orgterms,
    124                      vector<termfreqclassarray> &phrases) {
    125 
    126   text_t::const_iterator here = querystring.begin();
    127   text_t::const_iterator end = querystring.end();
    128 
    129   termfreqclassarray tmpterms;
    130 
    131   int termcount = 0;
    132   bool foundquote = false;
    133   bool foundbreak = false;
    134   bool start = true;
    135   while (here != end) {
    136     if (*here == '\"') {
    137       if (foundquote) {
    138     if (!foundbreak && !start) {
    139       tmpterms.push_back (orgterms[termcount]);
    140       termcount ++;
    141     }
    142     if (tmpterms.size() > 1) {
    143       phrases.push_back (tmpterms);
    144       tmpterms.erase (tmpterms.begin(), tmpterms.end());
    145     }
    146     foundquote = false;
    147     foundbreak = true;
    148       } else foundquote = true;
    149     } else if (!is_unicode_letdig(*here)) {
    150       // found a break between terms
    151       if (!foundbreak && !start) {
    152     if (foundquote)
    153       tmpterms.push_back (orgterms[termcount]);
    154     termcount ++;
    155       }
    156       foundbreak = true;
    157     } else {
    158       start = false;
    159       foundbreak = false;
    160     }     
    161     here++;
    162   }
    163 }
    164 
    165 // do aditional query processing
    166 void queryfilterclass::post_process (const queryparamclass &queryparams,
    167                      queryresultsclass &queryresults) {
    168 
    169   // post-process the results if needed
    170   if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
    171 
    172     // get the terms between quotes (if any)
    173     vector<termfreqclassarray> phrases;
    174     get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
    175 
    176     num_phrases = phrases.size();
    177     if (num_phrases > 0) {
    178 
    179       // get the long version of the index
    180       text_t longindex;
    181       indexmap.to2from (queryparams.index, longindex);
    182      
    183       vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
    184       vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
    185 
    186       while (this_phrase != end_phrase) {
    187 
    188     // process each of the matched documents
    189     docresultmap::iterator docs_here = queryresults.docs.docset.begin();
    190     docresultmap::iterator docs_end = queryresults.docs.docset.end();
    191     while (docs_here != docs_end) {
    192       if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,
    193                  queryparams.subcollection, queryparams.language,
    194                  longindex, queryparams.collection, *this_phrase,
    195                  (*docs_here).second.docnum)) {
    196         (*docs_here).second.num_phrase_match++;
    197       }
    198    
    199       docs_here++;
    200     }
    201     this_phrase++;
    202       }
    203     }
    204   }
    205 }
    20664
    20765// get the query parameters
     
    23189  query.stemming = (filterOptions["Stem"].defaultValue == "true");
    23290  query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
    233 
     91  query.level = "";
    23492  OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
    23593  OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
     
    256114      query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
    257115      query.stemming = (filterOptions["Stem"].defaultValue == "true");
    258      
     116      query.level = "";
    259117      // "all", needed when combining queries where the document results are needed
    260118      if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
     
    286144    } else if ((*options_here).name == "PhraseMatch") {
    287145      phrasematch = (*options_here).value;
     146    } else if ((*options_here).name == "Level") {
     147      query.level = (*options_here).value;
    288148    } else {
    289149      logout << text_t2ascii
     
    302162
    303163
    304 // do query that might involve multiple sub queries
    305 // mgsearchptr and gdbmptr are assumed to be valid
    306 void queryfilterclass::do_multi_query (const FilterRequest_t &request,
    307                        const vector<queryparamclass> &query_params,
    308                        queryresultsclass &multiresults,
    309                        comerror_t &err, ostream &logout) {
    310   outconvertclass text_t2ascii;
    311 
    312   err = noError;
    313   mgsearchptr->setcollectdir (collectdir);
    314   multiresults.clear();
    315  
    316   vector<queryparamclass>::const_iterator query_here = query_params.begin();
    317   vector<queryparamclass>::const_iterator query_end = query_params.end();
    318   while (query_here != query_end) {
    319     queryresultsclass thisqueryresults;
    320    
    321     if (!mgsearchptr->search(*query_here, thisqueryresults)) {
    322       // most likely a system problem
    323       logout << text_t2ascii
    324          << "system problem: could not do search with mg for index \""
    325          << (*query_here).index << (*query_here).subcollection
    326          << (*query_here).language << "\".\n\n";
    327       err = systemProblem;
    328       return;
    329     }
    330 
    331     // combine the results
    332     if (need_matching_docs (request.filterResultOptions)) {
    333       // post-process the results if needed
    334       if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
    335       !thisqueryresults.docs.docset.empty()) {
    336     post_process (*query_here, thisqueryresults);
    337     thisqueryresults.postprocessed = true;
    338     multiresults.postprocessed = true;
    339       }
    340      
    341       if (query_params.size() == 1) {
    342     multiresults.docs = thisqueryresults.docs; // just one set of results
    343     multiresults.docs_matched = thisqueryresults.docs_matched;
    344     multiresults.is_approx = thisqueryresults.is_approx;
    345    
    346       } else {
    347     if ((*query_here).combinequery == "and") {
    348       multiresults.docs.combine_and (thisqueryresults.docs);
    349     } else if ((*query_here).combinequery == "or") {
    350       multiresults.docs.combine_or (thisqueryresults.docs);
    351     } else if ((*query_here).combinequery == "not") {
    352       multiresults.docs.combine_not (thisqueryresults.docs);
    353     }
    354     multiresults.docs_matched = multiresults.docs.docset.size();
    355     multiresults.is_approx = Exact;
    356       }
    357     }
    358 
    359     // combine the term information
    360     if (need_term_info (request.filterResultOptions)) {
    361       // append the terms
    362       multiresults.orgterms.insert(multiresults.orgterms.end(),
    363                    thisqueryresults.orgterms.begin(),
    364                    thisqueryresults.orgterms.end());
    365 
    366       // add the term variants
    367       text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
    368       text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
    369       while (termvar_here != termvar_end) {
    370     multiresults.termvariants.insert(*termvar_here);
    371     termvar_here++;
    372       }
    373     }
    374    
    375     query_here++;
    376   }
    377 
    378   // sort and unique the query terms
    379   multiresults.sortuniqqueryterms ();
    380 }
    381 
    382 
    383 void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
    384                      docresultsclass &docs) {
    385   resultsorderer_t resultsorderer;
    386   resultsorderer.compare_phrase_match = true;
    387   resultsorderer.docset = &(docs.docset);
    388 
    389   // first get a list of document numbers
    390   docs.docnum_order();
    391 
    392   sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
    393 }
    394 
    395 
    396164
    397165queryfilterclass::queryfilterclass () {
    398166  gdbmptr = NULL;
    399167  mgsearchptr = NULL;
    400   num_phrases = 0;
    401168
    402169  FilterOption_t filtopt;
     
    585352}
    586353
    587 void queryfilterclass::filter (const FilterRequest_t &request,
    588                    FilterResponse_t &response,
    589                    comerror_t &err, ostream &logout) {
    590   outconvertclass text_t2ascii;
    591 
    592   response.clear ();
    593   err = noError;
    594   if (gdbmptr == NULL) {
    595     // most likely a configuration problem
    596     logout << text_t2ascii
    597        << "configuration error: queryfilter contains a null gdbmclass\n\n";
    598     err = configurationError;
    599     return;
    600   }
    601   if (mgsearchptr == NULL) {
    602     // most likely a configuration problem
    603     logout << text_t2ascii
    604        << "configuration error: queryfilter contains a null mgsearchclass\n\n";
    605     err = configurationError;
    606     return;
    607   }
    608 
    609   // open the database
    610   gdbmptr->setlogout(&logout);
    611   if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
    612     // most likely a system problem (we have already checked that the
    613     // gdbm database exists)
    614     logout << text_t2ascii
    615        << "system problem: open on gdbm database \""
    616        << gdbm_filename << "\" failed\n\n";
    617     err = systemProblem;
    618     return;
    619   }
    620 
    621   // get the query parameters
    622   int startresults = filterOptions["StartResults"].defaultValue.getint();
    623   int endresults = filterOptions["EndResults"].defaultValue.getint();
    624   text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
    625 
    626   vector<queryparamclass> queryfilterparams;
    627   parse_query_params (request, queryfilterparams, startresults,
    628               endresults, phrasematch, logout); 
    629  
    630   // do query
    631   queryresultsclass queryresults;
    632   do_multi_query (request, queryfilterparams, queryresults, err, logout);
    633   if (err != noError) return;
    634  
    635   // assemble document results
    636   if (need_matching_docs (request.filterResultOptions)) {
    637     // sort the query results
    638     sort_doc_results (request, queryresults.docs);
    639    
    640     int resultnum = 1;
    641     ResultDocInfo_t resultdoc;
    642     text_t trans_OID;
    643     vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    644     vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
    645 
    646     if (endresults == -1) endresults = MAXNUMDOCS;
    647     while (docorder_here != docorder_end) {
    648       if (resultnum > endresults) break;
    649      
    650       // translate the document number
    651       if (!translate(gdbmptr, *docorder_here, trans_OID)) {
    652     logout << text_t2ascii
    653            << "warning: could not translate mg document number \""
    654            << *docorder_here << "\"to OID.\n\n";
    655    
    656       } else {
    657     docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
    658    
    659     // documents containing matching phrases will be sorted to the top so
    660     // we can break out once we're past those that match the PhraseMatch
    661     // option -- "all_phrases" = return only those documents containing all
    662     //                       phrases in query string
    663     //           "some_phrases" = return only those documents containing
    664     //                            at least 1 of the phrases in the document
    665     //           "all_docs" = return all documents regardless
    666     if (num_phrases > 0) {
    667       if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
    668         queryresults.docs_matched = response.docInfo.size();
    669         break;
    670       }
    671       if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
    672         queryresults.docs_matched = response.docInfo.size();
    673         break;
    674       }
    675     }
    676 
    677     // see if there is a result for this number,
    678     // if it is in the request set (or the request set is empty)
    679     if (docset_here != queryresults.docs.docset.end() &&
    680         (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
    681       if (resultnum >= startresults) {
    682         // add this document
    683         resultdoc.OID = trans_OID;
    684         resultdoc.result_num = resultnum;
    685         resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
    686 
    687         // these next two are not available on all versions of mg
    688         resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
    689         resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
    690        
    691         response.docInfo.push_back (resultdoc);
    692       }
    693      
    694       resultnum++;
    695     }
    696       }
    697      
    698       docorder_here++;
    699     }
    700   }
    701 
    702   // assemble the term results
    703   if (need_term_info(request.filterResultOptions)) {
    704     // note: the terms have already been sorted and uniqued
    705 
    706     TermInfo_t terminfo;
    707     bool terms_first = true;
    708     termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    709     termfreqclassarray::iterator terms_end = queryresults.terms.end();
    710 
    711     while (terms_here != terms_end) {
    712       terminfo.clear();
    713       terminfo.term = (*terms_here).termstr;
    714       terminfo.freq = (*terms_here).termfreq;
    715       if (terms_first) {
    716     text_tset::iterator termvariants_here = queryresults.termvariants.begin();
    717     text_tset::iterator termvariants_end = queryresults.termvariants.end();
    718     while (termvariants_here != termvariants_end) {
    719       terminfo.matchTerms.push_back (*termvariants_here);
    720       termvariants_here++;
    721     }
    722       }
    723       terms_first = false;
    724 
    725       response.termInfo.push_back (terminfo);
    726 
    727       terms_here++;
    728     }
    729   }
    730 
    731   response.numDocs = queryresults.docs_matched;
    732   response.isApprox = queryresults.is_approx;
    733 }
     354
     355
     356
  • trunk/gsdl/src/colservr/queryfilter.h

    r1285 r1324  
    11/**********************************************************************
    22 *
    3  * queryfilter.h --
     3 * queryfilter.h -- abstract base class for queryfilters
     4 *          mgqueryfilter and mgppqueryfilter implement this
    45 * Copyright (C) 1999  The New Zealand Digital Library Project
    56 *
     
    3536#include "infodbclass.h"
    3637#include "maptools.h"
    37 #include "mgsearch.h"
     38#include "search.h"
    3839#include "queryinfo.h"
    39 
    40 
    41 // resultsorderer_t is used to sort the query results
    42 struct resultsorderer_t {
    43   bool compare_phrase_match;
    44   bool compare_terms_match;
    45   bool compare_doc_weight;
    46   docresultmap *docset;
    47 
    48   resultsorderer_t();
    49   void clear();
    50   bool operator()(const int &t1, const int &t2) const;
    51 };
    52 
    5340
    5441
     
    6350  gdbmclass *gdbmptr;
    6451 
    65   mgsearchclass *mgsearchptr;
     52  searchclass *mgsearchptr;
    6653 
    67   int num_phrases;
    68 
    69   void get_phrase_terms (const text_t &querystring,
    70              const termfreqclassarray &orgterms,
    71              vector<termfreqclassarray> &phrases);
    72 
    73   // do aditional query processing
    74   virtual void post_process (const queryparamclass &queryparams,
    75                  queryresultsclass &queryresults);
    76 
    77   // get the query parameters
     54   // get the query parameters
    7855  void parse_query_params (const FilterRequest_t &request,
    7956               vector<queryparamclass> &query_params,
     
    8360  // do query that might involve multiple sub queries
    8461  // mgsearchptr and gdbmptr are assumed to be valid
    85   void do_multi_query (const FilterRequest_t &request,
     62  virtual void do_multi_query (const FilterRequest_t &request,
    8663               const vector<queryparamclass> &query_params,
    8764               queryresultsclass &multiresults,
    88                comerror_t &err, ostream &logout);
     65               comerror_t &err, ostream &logout)=0;
    8966
    90   virtual void sort_doc_results (const FilterRequest_t &request,
    91                  docresultsclass &docs);
    92 
    93  
    9467public:
    9568  queryfilterclass ();
     
    10073
    10174  // the mgsearchptr remains the responsability of the calling code
    102   void set_mgsearchptr (mgsearchclass *themgsearchptr) {mgsearchptr=themgsearchptr;}
     75  void set_mgsearchptr (searchclass *themgsearchptr) {mgsearchptr=themgsearchptr;}
    10376
    104   void configure (const text_t &key, const text_tarray &cfgline);
     77  virtual void configure (const text_t &key, const text_tarray &cfgline);
    10578  bool init (ostream &logout);
     79
    10680  text_t get_filter_name () {return "QueryFilter";}
    107   void filter (const FilterRequest_t &request,
     81  bool translate(gdbmclass *gdbmptr, int docnum, text_t &trans_OID);
     82  bool need_matching_docs (int filterResultOptions);
     83  bool need_term_info (int filterResultOptions);
     84
     85
     86  virtual void filter (const FilterRequest_t &request,
    10887           FilterResponse_t &response,
    109            comerror_t &err, ostream &logout);
     88           comerror_t &err, ostream &logout)=0;
     89
     90
     91
    11092};
    11193
    11294
    11395#endif
     96
Note: See TracChangeset for help on using the changeset viewer.