Changeset 1324
- Timestamp:
- 2000-08-02T12:58:29+12:00 (24 years ago)
- Location:
- trunk/gsdl/src/colservr
- Files:
-
- 8 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/colservr/mgsearch.cpp
r1306 r1324 232 232 233 233 mgsearchclass::mgsearchclass () 234 {235 cache = new querycache (RESULTCACHESIZE);234 : searchclass() { 235 236 236 } 237 237 … … 243 243 cache = NULL; 244 244 } 245 }246 247 248 void mgsearchclass::setcollectdir (const text_t &thecollectdir)249 {250 collectdir = thecollectdir;251 245 } 252 246 … … 308 302 bool mgsearchclass::search(const queryparamclass &queryparams, 309 303 queryresultsclass &queryresults) { 310 assert (cache != NULL);304 // assert (cache != NULL); 311 305 312 306 queryresults.clear(); 313 307 cerr << "mgsearch start of search"<<endl; 314 308 // first check the cache 315 if (cache->find(queryparams, queryresults)) return true; 316 309 if (cache != NULL) { 310 if (cache->find(queryparams, queryresults)) return true; 311 } 317 312 // make sure there is a query to be processed 318 313 if (!has_unicode_letdig(queryparams.querystring)) return true; … … 320 315 if (makeindexcurrent (queryparams.index, queryparams.subcollection, 321 316 queryparams.language, queryparams.collection)) { 317 cerr << "made index current "<<endl; 322 318 setsearchmode (queryparams); 323 319 submitquery (queryparams); 324 320 getresults (queryparams, queryresults); 321 cerr << "got results"<<endl; 325 322 return true; 326 323 } … … 512 509 return (bool)databaseloaded; 513 510 } 511 -
trunk/gsdl/src/colservr/mgsearch.h
r1285 r1324 1 1 /********************************************************************** 2 2 * 3 * mgsearch.h -- 3 * mgsearch.h -- implementation of search for old versions of mg 4 4 * Copyright (C) 1999 The New Zealand Digital Library Project 5 5 * … … 29 29 30 30 31 #include "text_t.h" 32 #include "querycache.h" 31 #include "search.h" 33 32 34 #define RESULTCACHESIZE 10 35 #define MAXNUMDOCS 1000000 36 #define MAXNUMTERMS 100 37 38 class mgsearchclass { 33 class mgsearchclass : public searchclass { 39 34 public: 40 35 mgsearchclass (); 41 36 virtual ~mgsearchclass (); 42 43 // the index directory must be set before any searching44 // is done45 void setcollectdir (const text_t &thecollectdir);46 37 47 38 // you only need to use this function before doing any stemming … … 60 51 // the search results are returned in queryresults 61 52 // search returns 'true' if it was able to do a search 62 bool search(const queryparamclass &queryparams,53 bool search(const queryparamclass &queryparams, 63 54 queryresultsclass &queryresults); 64 55 … … 87 78 88 79 protected: 89 querycache *cache;90 text_t collectdir; // the collection directory91 80 92 81 void setsearchmode (const queryparamclass &queryparams); -
trunk/gsdl/src/colservr/queryfilter.cpp
r1285 r1324 1 1 /********************************************************************** 2 2 * 3 * queryfilter.cpp -- 3 * queryfilter.cpp -- base class for queryfilters 4 4 * Copyright (C) 1999 The New Zealand Digital Library Project 5 5 * … … 26 26 #include "queryfilter.h" 27 27 #include "fileutil.h" 28 #include "queryinfo.h"29 #include "phrasesearch.h"30 28 #include "gsdltools.h" 31 29 #include <assert.h> 32 30 33 31 34 // some useful functions35 36 32 // translate will return true if successful 37 static booltranslate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {33 bool queryfilterclass::translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) { 38 34 infodbclass info; 39 35 … … 53 49 54 50 // whether document results are needed 55 static boolneed_matching_docs (int filterResultOptions) {51 bool queryfilterclass::need_matching_docs (int filterResultOptions) { 56 52 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) || 57 53 (filterResultOptions & FRmetadata)); … … 59 55 60 56 // whether term information is needed 61 static boolneed_term_info (int filterResultOptions) {57 bool queryfilterclass::need_term_info (int filterResultOptions) { 62 58 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms)); 63 59 } 64 65 ///////////////////////////////66 // methods for resultsorderer_t67 ///////////////////////////////68 69 resultsorderer_t::resultsorderer_t() {70 clear ();71 }72 73 void resultsorderer_t::clear() {74 compare_phrase_match = false;75 compare_terms_match = false;76 compare_doc_weight = true;77 78 docset = NULL;79 }80 81 bool resultsorderer_t::operator()(const int &t1, const int &t2) const {82 if (docset == NULL) return t1>t2;83 84 docresultmap::iterator t1_here = docset->find(t1);85 docresultmap::iterator t2_here = docset->find(t2);86 docresultmap::iterator end = docset->end();87 88 // sort all the document numbers not in the document set to89 // the end of the list90 if (t1_here == end) {91 if (t2_here == end) return t1>t2;92 else return true;93 } else if (t2_here == end) return false;94 95 if (compare_phrase_match) {96 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;97 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;98 }99 100 if (compare_terms_match) {101 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;102 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;103 }104 105 if (compare_doc_weight) {106 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;107 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;108 }109 110 return t1>t2;111 }112 113 114 115 60 116 61 ///////////////////////////////// 117 62 // functions for queryfilterclass 118 63 ///////////////////////////////// 119 120 // loads up phrases data structure with any phrases (that's the quoted bits)121 // occuring in the querystring122 void queryfilterclass::get_phrase_terms (const text_t &querystring,123 const termfreqclassarray &orgterms,124 vector<termfreqclassarray> &phrases) {125 126 text_t::const_iterator here = querystring.begin();127 text_t::const_iterator end = querystring.end();128 129 termfreqclassarray tmpterms;130 131 int termcount = 0;132 bool foundquote = false;133 bool foundbreak = false;134 bool start = true;135 while (here != end) {136 if (*here == '\"') {137 if (foundquote) {138 if (!foundbreak && !start) {139 tmpterms.push_back (orgterms[termcount]);140 termcount ++;141 }142 if (tmpterms.size() > 1) {143 phrases.push_back (tmpterms);144 tmpterms.erase (tmpterms.begin(), tmpterms.end());145 }146 foundquote = false;147 foundbreak = true;148 } else foundquote = true;149 } else if (!is_unicode_letdig(*here)) {150 // found a break between terms151 if (!foundbreak && !start) {152 if (foundquote)153 tmpterms.push_back (orgterms[termcount]);154 termcount ++;155 }156 foundbreak = true;157 } else {158 start = false;159 foundbreak = false;160 }161 here++;162 }163 }164 165 // do aditional query processing166 void queryfilterclass::post_process (const queryparamclass &queryparams,167 queryresultsclass &queryresults) {168 169 // post-process the results if needed170 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {171 172 // get the terms between quotes (if any)173 vector<termfreqclassarray> phrases;174 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);175 176 num_phrases = phrases.size();177 if (num_phrases > 0) {178 179 // get the long version of the index180 text_t longindex;181 indexmap.to2from (queryparams.index, longindex);182 183 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();184 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();185 186 while (this_phrase != end_phrase) {187 188 // process each of the matched documents189 docresultmap::iterator docs_here = queryresults.docs.docset.begin();190 docresultmap::iterator docs_end = queryresults.docs.docset.end();191 while (docs_here != docs_end) {192 if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,193 queryparams.subcollection, queryparams.language,194 longindex, queryparams.collection, *this_phrase,195 (*docs_here).second.docnum)) {196 (*docs_here).second.num_phrase_match++;197 }198 199 docs_here++;200 }201 this_phrase++;202 }203 }204 }205 }206 64 207 65 // get the query parameters … … 231 89 query.stemming = (filterOptions["Stem"].defaultValue == "true"); 232 90 query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint(); 233 91 query.level = ""; 234 92 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin(); 235 93 OptionValue_tarray::const_iterator options_end = request.filterOptions.end(); … … 256 114 query.casefolding = (filterOptions["Casefold"].defaultValue == "true"); 257 115 query.stemming = (filterOptions["Stem"].defaultValue == "true"); 258 116 query.level = ""; 259 117 // "all", needed when combining queries where the document results are needed 260 118 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1; … … 286 144 } else if ((*options_here).name == "PhraseMatch") { 287 145 phrasematch = (*options_here).value; 146 } else if ((*options_here).name == "Level") { 147 query.level = (*options_here).value; 288 148 } else { 289 149 logout << text_t2ascii … … 302 162 303 163 304 // do query that might involve multiple sub queries305 // mgsearchptr and gdbmptr are assumed to be valid306 void queryfilterclass::do_multi_query (const FilterRequest_t &request,307 const vector<queryparamclass> &query_params,308 queryresultsclass &multiresults,309 comerror_t &err, ostream &logout) {310 outconvertclass text_t2ascii;311 312 err = noError;313 mgsearchptr->setcollectdir (collectdir);314 multiresults.clear();315 316 vector<queryparamclass>::const_iterator query_here = query_params.begin();317 vector<queryparamclass>::const_iterator query_end = query_params.end();318 while (query_here != query_end) {319 queryresultsclass thisqueryresults;320 321 if (!mgsearchptr->search(*query_here, thisqueryresults)) {322 // most likely a system problem323 logout << text_t2ascii324 << "system problem: could not do search with mg for index \""325 << (*query_here).index << (*query_here).subcollection326 << (*query_here).language << "\".\n\n";327 err = systemProblem;328 return;329 }330 331 // combine the results332 if (need_matching_docs (request.filterResultOptions)) {333 // post-process the results if needed334 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&335 !thisqueryresults.docs.docset.empty()) {336 post_process (*query_here, thisqueryresults);337 thisqueryresults.postprocessed = true;338 multiresults.postprocessed = true;339 }340 341 if (query_params.size() == 1) {342 multiresults.docs = thisqueryresults.docs; // just one set of results343 multiresults.docs_matched = thisqueryresults.docs_matched;344 multiresults.is_approx = thisqueryresults.is_approx;345 346 } else {347 if ((*query_here).combinequery == "and") {348 multiresults.docs.combine_and (thisqueryresults.docs);349 } else if ((*query_here).combinequery == "or") {350 multiresults.docs.combine_or (thisqueryresults.docs);351 } else if ((*query_here).combinequery == "not") {352 multiresults.docs.combine_not (thisqueryresults.docs);353 }354 multiresults.docs_matched = multiresults.docs.docset.size();355 multiresults.is_approx = Exact;356 }357 }358 359 // combine the term information360 if (need_term_info (request.filterResultOptions)) {361 // append the terms362 multiresults.orgterms.insert(multiresults.orgterms.end(),363 thisqueryresults.orgterms.begin(),364 thisqueryresults.orgterms.end());365 366 // add the term variants367 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();368 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();369 while (termvar_here != termvar_end) {370 multiresults.termvariants.insert(*termvar_here);371 termvar_here++;372 }373 }374 375 query_here++;376 }377 378 // sort and unique the query terms379 multiresults.sortuniqqueryterms ();380 }381 382 383 void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,384 docresultsclass &docs) {385 resultsorderer_t resultsorderer;386 resultsorderer.compare_phrase_match = true;387 resultsorderer.docset = &(docs.docset);388 389 // first get a list of document numbers390 docs.docnum_order();391 392 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);393 }394 395 396 164 397 165 queryfilterclass::queryfilterclass () { 398 166 gdbmptr = NULL; 399 167 mgsearchptr = NULL; 400 num_phrases = 0;401 168 402 169 FilterOption_t filtopt; … … 585 352 } 586 353 587 void queryfilterclass::filter (const FilterRequest_t &request, 588 FilterResponse_t &response, 589 comerror_t &err, ostream &logout) { 590 outconvertclass text_t2ascii; 591 592 response.clear (); 593 err = noError; 594 if (gdbmptr == NULL) { 595 // most likely a configuration problem 596 logout << text_t2ascii 597 << "configuration error: queryfilter contains a null gdbmclass\n\n"; 598 err = configurationError; 599 return; 600 } 601 if (mgsearchptr == NULL) { 602 // most likely a configuration problem 603 logout << text_t2ascii 604 << "configuration error: queryfilter contains a null mgsearchclass\n\n"; 605 err = configurationError; 606 return; 607 } 608 609 // open the database 610 gdbmptr->setlogout(&logout); 611 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) { 612 // most likely a system problem (we have already checked that the 613 // gdbm database exists) 614 logout << text_t2ascii 615 << "system problem: open on gdbm database \"" 616 << gdbm_filename << "\" failed\n\n"; 617 err = systemProblem; 618 return; 619 } 620 621 // get the query parameters 622 int startresults = filterOptions["StartResults"].defaultValue.getint(); 623 int endresults = filterOptions["EndResults"].defaultValue.getint(); 624 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue; 625 626 vector<queryparamclass> queryfilterparams; 627 parse_query_params (request, queryfilterparams, startresults, 628 endresults, phrasematch, logout); 629 630 // do query 631 queryresultsclass queryresults; 632 do_multi_query (request, queryfilterparams, queryresults, err, logout); 633 if (err != noError) return; 634 635 // assemble document results 636 if (need_matching_docs (request.filterResultOptions)) { 637 // sort the query results 638 sort_doc_results (request, queryresults.docs); 639 640 int resultnum = 1; 641 ResultDocInfo_t resultdoc; 642 text_t trans_OID; 643 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin(); 644 vector<int>::iterator docorder_end = queryresults.docs.docorder.end(); 645 646 if (endresults == -1) endresults = MAXNUMDOCS; 647 while (docorder_here != docorder_end) { 648 if (resultnum > endresults) break; 649 650 // translate the document number 651 if (!translate(gdbmptr, *docorder_here, trans_OID)) { 652 logout << text_t2ascii 653 << "warning: could not translate mg document number \"" 654 << *docorder_here << "\"to OID.\n\n"; 655 656 } else { 657 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here); 658 659 // documents containing matching phrases will be sorted to the top so 660 // we can break out once we're past those that match the PhraseMatch 661 // option -- "all_phrases" = return only those documents containing all 662 // phrases in query string 663 // "some_phrases" = return only those documents containing 664 // at least 1 of the phrases in the document 665 // "all_docs" = return all documents regardless 666 if (num_phrases > 0) { 667 if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) { 668 queryresults.docs_matched = response.docInfo.size(); 669 break; 670 } 671 if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) { 672 queryresults.docs_matched = response.docInfo.size(); 673 break; 674 } 675 } 676 677 // see if there is a result for this number, 678 // if it is in the request set (or the request set is empty) 679 if (docset_here != queryresults.docs.docset.end() && 680 (request.docSet.empty() || in_set(request.docSet, trans_OID))) { 681 if (resultnum >= startresults) { 682 // add this document 683 resultdoc.OID = trans_OID; 684 resultdoc.result_num = resultnum; 685 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5); 686 687 // these next two are not available on all versions of mg 688 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched; 689 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match; 690 691 response.docInfo.push_back (resultdoc); 692 } 693 694 resultnum++; 695 } 696 } 697 698 docorder_here++; 699 } 700 } 701 702 // assemble the term results 703 if (need_term_info(request.filterResultOptions)) { 704 // note: the terms have already been sorted and uniqued 705 706 TermInfo_t terminfo; 707 bool terms_first = true; 708 termfreqclassarray::iterator terms_here = queryresults.terms.begin(); 709 termfreqclassarray::iterator terms_end = queryresults.terms.end(); 710 711 while (terms_here != terms_end) { 712 terminfo.clear(); 713 terminfo.term = (*terms_here).termstr; 714 terminfo.freq = (*terms_here).termfreq; 715 if (terms_first) { 716 text_tset::iterator termvariants_here = queryresults.termvariants.begin(); 717 text_tset::iterator termvariants_end = queryresults.termvariants.end(); 718 while (termvariants_here != termvariants_end) { 719 terminfo.matchTerms.push_back (*termvariants_here); 720 termvariants_here++; 721 } 722 } 723 terms_first = false; 724 725 response.termInfo.push_back (terminfo); 726 727 terms_here++; 728 } 729 } 730 731 response.numDocs = queryresults.docs_matched; 732 response.isApprox = queryresults.is_approx; 733 } 354 355 356 -
trunk/gsdl/src/colservr/queryfilter.h
r1285 r1324 1 1 /********************************************************************** 2 2 * 3 * queryfilter.h -- 3 * queryfilter.h -- abstract base class for queryfilters 4 * mgqueryfilter and mgppqueryfilter implement this 4 5 * Copyright (C) 1999 The New Zealand Digital Library Project 5 6 * … … 35 36 #include "infodbclass.h" 36 37 #include "maptools.h" 37 #include " mgsearch.h"38 #include "search.h" 38 39 #include "queryinfo.h" 39 40 41 // resultsorderer_t is used to sort the query results42 struct resultsorderer_t {43 bool compare_phrase_match;44 bool compare_terms_match;45 bool compare_doc_weight;46 docresultmap *docset;47 48 resultsorderer_t();49 void clear();50 bool operator()(const int &t1, const int &t2) const;51 };52 53 40 54 41 … … 63 50 gdbmclass *gdbmptr; 64 51 65 mgsearchclass *mgsearchptr;52 searchclass *mgsearchptr; 66 53 67 int num_phrases; 68 69 void get_phrase_terms (const text_t &querystring, 70 const termfreqclassarray &orgterms, 71 vector<termfreqclassarray> &phrases); 72 73 // do aditional query processing 74 virtual void post_process (const queryparamclass &queryparams, 75 queryresultsclass &queryresults); 76 77 // get the query parameters 54 // get the query parameters 78 55 void parse_query_params (const FilterRequest_t &request, 79 56 vector<queryparamclass> &query_params, … … 83 60 // do query that might involve multiple sub queries 84 61 // mgsearchptr and gdbmptr are assumed to be valid 85 v oid do_multi_query (const FilterRequest_t &request,62 virtual void do_multi_query (const FilterRequest_t &request, 86 63 const vector<queryparamclass> &query_params, 87 64 queryresultsclass &multiresults, 88 comerror_t &err, ostream &logout) ;65 comerror_t &err, ostream &logout)=0; 89 66 90 virtual void sort_doc_results (const FilterRequest_t &request,91 docresultsclass &docs);92 93 94 67 public: 95 68 queryfilterclass (); … … 100 73 101 74 // the mgsearchptr remains the responsability of the calling code 102 void set_mgsearchptr ( mgsearchclass *themgsearchptr) {mgsearchptr=themgsearchptr;}75 void set_mgsearchptr (searchclass *themgsearchptr) {mgsearchptr=themgsearchptr;} 103 76 104 v oid configure (const text_t &key, const text_tarray &cfgline);77 virtual void configure (const text_t &key, const text_tarray &cfgline); 105 78 bool init (ostream &logout); 79 106 80 text_t get_filter_name () {return "QueryFilter";} 107 void filter (const FilterRequest_t &request, 81 bool translate(gdbmclass *gdbmptr, int docnum, text_t &trans_OID); 82 bool need_matching_docs (int filterResultOptions); 83 bool need_term_info (int filterResultOptions); 84 85 86 virtual void filter (const FilterRequest_t &request, 108 87 FilterResponse_t &response, 109 comerror_t &err, ostream &logout); 88 comerror_t &err, ostream &logout)=0; 89 90 91 110 92 }; 111 93 112 94 113 95 #endif 96
Note:
See TracChangeset
for help on using the changeset viewer.