source: main/trunk/greenstone2/runtime-src/src/colservr/mgqueryfilter.cpp@ 28762

Last change on this file since 28762 was 27064, checked in by kjdon, 11 years ago

adding reverse sort/sort order in for lucene search results sorting. reorganising code to avoid duplication, added fieldedqueryfilter in the chain of inheritance

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
RevLine 
[1324]1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include "mgsearch.h"
[11002]30#include "phrases.h"
[1324]31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
[16445]48bool resultsorderer_t::operator()(const text_t &t1, const text_t &t2) const {
[1324]49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
[4193]87
88void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89 queryfilterclass::configure (key, cfgline);
90
[12314]91 if (key == "indexstem") {
[9937]92 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93 }
94
[4193]95}
96
[1324]97// loads up phrases data structure with any phrases (that's the quoted bits)
98// occuring in the querystring
99void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100 const termfreqclassarray &orgterms,
101 vector<termfreqclassarray> &phrases) {
102
103 text_t::const_iterator here = querystring.begin();
104 text_t::const_iterator end = querystring.end();
105
106 termfreqclassarray tmpterms;
107
108 int termcount = 0;
109 bool foundquote = false;
110 bool foundbreak = false;
111 bool start = true;
112 while (here != end) {
113 if (*here == '\"') {
114 if (foundquote) {
115 if (!foundbreak && !start) {
116 tmpterms.push_back (orgterms[termcount]);
[9620]117 ++termcount;
[1324]118 }
119 if (tmpterms.size() > 1) {
120 phrases.push_back (tmpterms);
121 }
[11002]122 tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
[1324]124 foundquote = false;
125 foundbreak = true;
126 } else foundquote = true;
127 } else if (!is_unicode_letdig(*here)) {
128 // found a break between terms
129 if (!foundbreak && !start) {
[11002]130 if (foundquote) {
[1324]131 tmpterms.push_back (orgterms[termcount]);
[11002]132 }
[9620]133 ++termcount;
[1324]134 }
135 foundbreak = true;
136 } else {
137 start = false;
138 foundbreak = false;
139 }
[9620]140 ++here;
[1324]141 }
142}
143
144// do aditional query processing
145void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146 queryresultsclass &queryresults) {
147
148 // post-process the results if needed
149 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151 // get the terms between quotes (if any)
152 vector<termfreqclassarray> phrases;
153 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155 num_phrases = phrases.size();
156 if (num_phrases > 0) {
157
158 // get the long version of the index
159 text_t longindex;
160 indexmap.to2from (queryparams.index, longindex);
161
162 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165 while (this_phrase != end_phrase) {
166
167 // process each of the matched documents
168 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169 docresultmap::iterator docs_end = queryresults.docs.docset.end();
170 while (docs_here != docs_end) {
[15558]171 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
[1324]172 queryparams.subcollection, queryparams.language,
173 longindex, queryparams.collection, *this_phrase,
174 (*docs_here).second.docnum)) {
[9620]175 ++docs_here->second.num_phrase_match;
[1324]176 }
177
[9620]178 ++docs_here;
[1324]179 }
[9620]180 ++this_phrase;
[1324]181 }
182 }
183 }
184}
185
186
187// do query that might involve multiple sub queries
[15595]188// textsearchptr and db_ptr are assumed to be valid
[1324]189void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190 const vector<queryparamclass> &query_params,
191 queryresultsclass &multiresults,
192 comerror_t &err, ostream &logout) {
193 outconvertclass text_t2ascii;
194
195 err = noError;
[8026]196 textsearchptr->setcollectdir (collectdir);
[16310]197
[1324]198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
[1662]204
[8026]205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
[1324]206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
[1721]223 } else {
224 num_phrases = 0;
[1324]225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
[9620]257 ++termvar_here;
[1324]258 }
259 }
260
[9620]261 ++query_here;
[1324]262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
[27064]287
288 FilterOption_t filtopt;
289 // -- onePerQuery PhraseMatch enumerated
290 filtopt.name = "PhraseMatch";
291 filtopt.type = FilterOption_t::enumeratedt;
292 filtopt.repeatable = FilterOption_t::onePerQuery;
293 filtopt.defaultValue = "some_phrases";
294 filtopt.validValues.push_back ("all_phrases");
295 filtopt.validValues.push_back ("some_phrases");
296 filtopt.validValues.push_back ("all_docs");
297 filterOptions["PhraseMatch"] = filtopt;
298
[1324]299}
300
301mgqueryfilterclass::~mgqueryfilterclass () {
302}
303
304void mgqueryfilterclass::filter (const FilterRequest_t &request,
305 FilterResponse_t &response,
306 comerror_t &err, ostream &logout) {
307 outconvertclass text_t2ascii;
308
309 response.clear ();
310 err = noError;
[15558]311 if (db_ptr == NULL) {
[1324]312 // most likely a configuration problem
313 logout << text_t2ascii
[15558]314 << "configuration error: mgqueryfilter contains a null dbclass\n\n";
[1324]315 err = configurationError;
316 return;
317 }
[8026]318 if (textsearchptr == NULL) {
[1324]319 // most likely a configuration problem
320 logout << text_t2ascii
[8026]321 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
[1324]322 err = configurationError;
323 return;
324 }
325
326 // open the database
[15558]327 db_ptr->setlogout(&logout);
328 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
329 // most likely a system problem (we have already checked that the database exists)
[1324]330 logout << text_t2ascii
[15558]331 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
[1324]332 err = systemProblem;
333 return;
334 }
335
336 // get the query parameters
337 int startresults = filterOptions["StartResults"].defaultValue.getint();
338 int endresults = filterOptions["EndResults"].defaultValue.getint();
339 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
340
341 vector<queryparamclass> queryfilterparams;
342 parse_query_params (request, queryfilterparams, startresults,
[1662]343 endresults, phrasematch, logout);
344 // do any mg specific diddling with query parameters that may be required
[27064]345 // mg_parse_query_params (request, queryfilterparams, startresults,
346 // endresults, phrasematch, logout);
[1662]347
348
[1324]349 // do query
350 queryresultsclass queryresults;
351 do_multi_query (request, queryfilterparams, queryresults, err, logout);
352 if (err != noError) return;
353
354 // assemble document results
355 if (need_matching_docs (request.filterResultOptions)) {
356 // sort the query results
[5850]357 // only want to sort the docs if we have done a ranked search or there were phrases
358 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
359 sort_doc_results (request, queryresults.docs);
360 }
[1324]361 int resultnum = 1;
362 ResultDocInfo_t resultdoc;
363 text_t trans_OID;
[16445]364 vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
365 vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();
[1324]366
[1662]367 // documents containing matching phrases will be sorted to the top so
368 // we can break out once we're past those that match the PhraseMatch
369 // option -- "all_phrases" = return only those documents containing all
370 // phrases in query string
371 // "some_phrases" = return only those documents containing
372 // at least 1 of the phrases in the document
373 // "all_docs" = return all documents regardless
374 if (num_phrases > 0) {
375 int numdocs = 0;
376 while (docorder_here != docorder_end) {
377 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
378
379 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
380 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
381 queryresults.docs_matched = numdocs;
382 break;
383 }
[9620]384 ++numdocs;
385 ++docorder_here;
[1662]386 }
387 }
388
[1324]389 if (endresults == -1) endresults = MAXNUMDOCS;
[1662]390 docorder_here = queryresults.docs.docorder.begin();
[1324]391 while (docorder_here != docorder_end) {
[1662]392 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
[1324]393
394 // translate the document number
[15558]395 if (!translate(db_ptr, *docorder_here, trans_OID)) {
[1324]396 logout << text_t2ascii
397 << "warning: could not translate mg document number \""
398 << *docorder_here << "\"to OID.\n\n";
399
400 } else {
401 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
402
403 // see if there is a result for this number,
404 // if it is in the request set (or the request set is empty)
405 if (docset_here != queryresults.docs.docset.end() &&
406 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
407 if (resultnum >= startresults) {
408 // add this document
409 resultdoc.OID = trans_OID;
410 resultdoc.result_num = resultnum;
411 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
412
413 // these next two are not available on all versions of mg
414 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
415 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
416
417 response.docInfo.push_back (resultdoc);
418 }
419
[9620]420 ++resultnum;
[1324]421 }
422 }
423
[9620]424 ++docorder_here;
[1324]425 }
426 }
427
428 // assemble the term results
429 if (need_term_info(request.filterResultOptions)) {
430 // note: the terms have already been sorted and uniqued
431
432 TermInfo_t terminfo;
433 bool terms_first = true;
434 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
435 termfreqclassarray::iterator terms_end = queryresults.terms.end();
436
437 while (terms_here != terms_end) {
438 terminfo.clear();
439 terminfo.term = (*terms_here).termstr;
440 terminfo.freq = (*terms_here).termfreq;
441 if (terms_first) {
442 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
443 text_tset::iterator termvariants_end = queryresults.termvariants.end();
444 while (termvariants_here != termvariants_end) {
445 terminfo.matchTerms.push_back (*termvariants_here);
[9620]446 ++termvariants_here;
[1324]447 }
448 }
449 terms_first = false;
450
451 response.termInfo.push_back (terminfo);
452
[9620]453 ++terms_here;
[1324]454 }
455 }
456
[15558]457 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
[1324]458 response.numDocs = queryresults.docs_matched;
459 response.isApprox = queryresults.is_approx;
460}
461
[27064]462void mgqueryfilterclass::parse_query_params (const FilterRequest_t &request,
[1662]463 vector<queryparamclass> &query_params,
[27064]464 int &startresults, int &endresults,
465 text_t &phrasematch, ostream &logout) {
[1662]466
[27064]467 queryfilterclass::parse_query_params (request, query_params,
468 startresults, endresults, logout);
469
470 phrasematch = filterOptions["PhraseMatch"].defaultValue;
471
472 // is there a better way to do this than iterate through all the options again??
473 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
474 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
475 while (options_here != options_end) {
476 if ((*options_here).name == "PhraseMatch") {
477 phrasematch = (*options_here).value;
478 break;
479 }
480 ++options_here;
481 }
[1662]482
483 vector<queryparamclass>::iterator query_here = query_params.begin();
484 vector<queryparamclass>::iterator query_end = query_params.end();
485 while (query_here != query_end) {
486
[2134]487 // if we're doing a phrase search we want to maximise hits by making it
488 // a boolean search on the index with the finest granularity - we'll
489 // also set maxdocs to "all" (realizing that this will cause searches
490 // like "and the" on a large collection to take a very very long time).
491
[1662]492 // we're deciding it's a phrase search based on if the querystring
493 // contains at least 2 double quotes (not very scientific but
494 // then neither is the rest of the mg phrase searching functionality :-)
[11002]495 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
496
497 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
498 text_tarray phrases;
499 get_phrases((*query_here).querystring, phrases);
500
501 if (phrases.size() > 0) {
[1662]502 (*query_here).search_type = 0;
503
[2134]504 // set maxdocs to "all"
505 (*query_here).maxdocs = -1;
506
[1662]507 // Get the long version of the index and test to see if any indexes with
508 // finer granularity exist. Indexes must be the same type (i.e. same metadata
509 // or "text").
510 text_t longindex; text_tarray splitindex;
511 indexmap.to2from ((*query_here).index, longindex);
512 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
513 text_t &granularity = splitindex[0];
514 text_t &indextype = splitindex[1];
515 bool found = false;
516 // currently supported granularity options are "document", "section" and "paragraph"
517 if (granularity == "document" || granularity == "section") {
518 text_t shortindex;
519 if (indexmap.fromexists ("paragraph:" + indextype)) {
520 indexmap.from2to ("paragraph:" + indextype, shortindex);
521 (*query_here).index = shortindex;
522 found = true;
523 }
524 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
525 indexmap.from2to ("section:" + indextype, shortindex);
526 (*query_here).index = shortindex;
527 }
528 }
529 }
[4507]530
531#ifdef GSDL_BBC_COLLECTION
532 // This is a special hack for the BBC collection's ProgNumber and zzabn
533 // indexes (they're built this way to prevent mg_perf_hash_build from
534 // dying at build time)
535
[4735]536 // if we're searching the ProgNumber index we want to
[4507]537 // remove all non-alphanumeric characters from the query string
538 text_t longindex; text_tarray splitindex;
539 indexmap.to2from ((*query_here).index, longindex);
540 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
541 text_t &indextype = splitindex[1];
[4735]542 if (indextype == "ProgNumber") {
[4507]543 text_t new_querystring;
544 text_t::const_iterator here = (*query_here).querystring.begin();
545 text_t::const_iterator end = (*query_here).querystring.end();
546 while (here != end) {
547 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
548 (*here >= '0' && *here <= '9')) {
549 new_querystring.push_back (*here);
550 }
[9620]551 ++here;
[4507]552 }
553 (*query_here).querystring = new_querystring;
554 }
555#endif
[9620]556 ++query_here;
[1662]557 }
558}
559
Note: See TracBrowser for help on using the repository browser.