source: main/trunk/greenstone2/runtime-src/src/colservr/mgqueryfilter.cpp@ 22452

Last change on this file since 22452 was 16445, checked in by mdewsnip, 16 years ago

Search result document numbers are now represented with a text_t rather than an int, in preparation for changing Lucene to return the Greenstone document OIDs directly rather than looking them up as a separate step. This is better for efficiency and is also required for incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include "mgsearch.h"
30#include "phrases.h"
31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
48bool resultsorderer_t::operator()(const text_t &t1, const text_t &t2) const {
49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
87
88void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89 queryfilterclass::configure (key, cfgline);
90
91 if (key == "indexstem") {
92 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93 }
94
95}
96
97// loads up phrases data structure with any phrases (that's the quoted bits)
98// occuring in the querystring
99void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100 const termfreqclassarray &orgterms,
101 vector<termfreqclassarray> &phrases) {
102
103 text_t::const_iterator here = querystring.begin();
104 text_t::const_iterator end = querystring.end();
105
106 termfreqclassarray tmpterms;
107
108 int termcount = 0;
109 bool foundquote = false;
110 bool foundbreak = false;
111 bool start = true;
112 while (here != end) {
113 if (*here == '\"') {
114 if (foundquote) {
115 if (!foundbreak && !start) {
116 tmpterms.push_back (orgterms[termcount]);
117 ++termcount;
118 }
119 if (tmpterms.size() > 1) {
120 phrases.push_back (tmpterms);
121 }
122 tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
124 foundquote = false;
125 foundbreak = true;
126 } else foundquote = true;
127 } else if (!is_unicode_letdig(*here)) {
128 // found a break between terms
129 if (!foundbreak && !start) {
130 if (foundquote) {
131 tmpterms.push_back (orgterms[termcount]);
132 }
133 ++termcount;
134 }
135 foundbreak = true;
136 } else {
137 start = false;
138 foundbreak = false;
139 }
140 ++here;
141 }
142}
143
144// do aditional query processing
145void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146 queryresultsclass &queryresults) {
147
148 // post-process the results if needed
149 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151 // get the terms between quotes (if any)
152 vector<termfreqclassarray> phrases;
153 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155 num_phrases = phrases.size();
156 if (num_phrases > 0) {
157
158 // get the long version of the index
159 text_t longindex;
160 indexmap.to2from (queryparams.index, longindex);
161
162 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165 while (this_phrase != end_phrase) {
166
167 // process each of the matched documents
168 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169 docresultmap::iterator docs_end = queryresults.docs.docset.end();
170 while (docs_here != docs_end) {
171 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
172 queryparams.subcollection, queryparams.language,
173 longindex, queryparams.collection, *this_phrase,
174 (*docs_here).second.docnum)) {
175 ++docs_here->second.num_phrase_match;
176 }
177
178 ++docs_here;
179 }
180 ++this_phrase;
181 }
182 }
183 }
184}
185
186
187// do query that might involve multiple sub queries
188// textsearchptr and db_ptr are assumed to be valid
189void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190 const vector<queryparamclass> &query_params,
191 queryresultsclass &multiresults,
192 comerror_t &err, ostream &logout) {
193 outconvertclass text_t2ascii;
194
195 err = noError;
196 textsearchptr->setcollectdir (collectdir);
197
198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
204
205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
223 } else {
224 num_phrases = 0;
225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
257 ++termvar_here;
258 }
259 }
260
261 ++query_here;
262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
287}
288
289mgqueryfilterclass::~mgqueryfilterclass () {
290}
291
292void mgqueryfilterclass::filter (const FilterRequest_t &request,
293 FilterResponse_t &response,
294 comerror_t &err, ostream &logout) {
295 outconvertclass text_t2ascii;
296
297 response.clear ();
298 err = noError;
299 if (db_ptr == NULL) {
300 // most likely a configuration problem
301 logout << text_t2ascii
302 << "configuration error: mgqueryfilter contains a null dbclass\n\n";
303 err = configurationError;
304 return;
305 }
306 if (textsearchptr == NULL) {
307 // most likely a configuration problem
308 logout << text_t2ascii
309 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
310 err = configurationError;
311 return;
312 }
313
314 // open the database
315 db_ptr->setlogout(&logout);
316 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
317 // most likely a system problem (we have already checked that the database exists)
318 logout << text_t2ascii
319 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
320 err = systemProblem;
321 return;
322 }
323
324 // get the query parameters
325 int startresults = filterOptions["StartResults"].defaultValue.getint();
326 int endresults = filterOptions["EndResults"].defaultValue.getint();
327 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
328
329 vector<queryparamclass> queryfilterparams;
330 parse_query_params (request, queryfilterparams, startresults,
331 endresults, phrasematch, logout);
332 // do any mg specific diddling with query parameters that may be required
333 mg_parse_query_params (request, queryfilterparams, startresults,
334 endresults, phrasematch, logout);
335
336
337 // do query
338 queryresultsclass queryresults;
339 do_multi_query (request, queryfilterparams, queryresults, err, logout);
340 if (err != noError) return;
341
342 // assemble document results
343 if (need_matching_docs (request.filterResultOptions)) {
344 // sort the query results
345 // only want to sort the docs if we have done a ranked search or there were phrases
346 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
347 sort_doc_results (request, queryresults.docs);
348 }
349 int resultnum = 1;
350 ResultDocInfo_t resultdoc;
351 text_t trans_OID;
352 vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
353 vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();
354
355 // documents containing matching phrases will be sorted to the top so
356 // we can break out once we're past those that match the PhraseMatch
357 // option -- "all_phrases" = return only those documents containing all
358 // phrases in query string
359 // "some_phrases" = return only those documents containing
360 // at least 1 of the phrases in the document
361 // "all_docs" = return all documents regardless
362 if (num_phrases > 0) {
363 int numdocs = 0;
364 while (docorder_here != docorder_end) {
365 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
366
367 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
368 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
369 queryresults.docs_matched = numdocs;
370 break;
371 }
372 ++numdocs;
373 ++docorder_here;
374 }
375 }
376
377 if (endresults == -1) endresults = MAXNUMDOCS;
378 docorder_here = queryresults.docs.docorder.begin();
379 while (docorder_here != docorder_end) {
380 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
381
382 // translate the document number
383 if (!translate(db_ptr, *docorder_here, trans_OID)) {
384 logout << text_t2ascii
385 << "warning: could not translate mg document number \""
386 << *docorder_here << "\"to OID.\n\n";
387
388 } else {
389 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
390
391 // see if there is a result for this number,
392 // if it is in the request set (or the request set is empty)
393 if (docset_here != queryresults.docs.docset.end() &&
394 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
395 if (resultnum >= startresults) {
396 // add this document
397 resultdoc.OID = trans_OID;
398 resultdoc.result_num = resultnum;
399 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
400
401 // these next two are not available on all versions of mg
402 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
403 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
404
405 response.docInfo.push_back (resultdoc);
406 }
407
408 ++resultnum;
409 }
410 }
411
412 ++docorder_here;
413 }
414 }
415
416 // assemble the term results
417 if (need_term_info(request.filterResultOptions)) {
418 // note: the terms have already been sorted and uniqued
419
420 TermInfo_t terminfo;
421 bool terms_first = true;
422 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
423 termfreqclassarray::iterator terms_end = queryresults.terms.end();
424
425 while (terms_here != terms_end) {
426 terminfo.clear();
427 terminfo.term = (*terms_here).termstr;
428 terminfo.freq = (*terms_here).termfreq;
429 if (terms_first) {
430 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
431 text_tset::iterator termvariants_end = queryresults.termvariants.end();
432 while (termvariants_here != termvariants_end) {
433 terminfo.matchTerms.push_back (*termvariants_here);
434 ++termvariants_here;
435 }
436 }
437 terms_first = false;
438
439 response.termInfo.push_back (terminfo);
440
441 ++terms_here;
442 }
443 }
444
445 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
446 response.numDocs = queryresults.docs_matched;
447 response.isApprox = queryresults.is_approx;
448}
449
450void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
451 vector<queryparamclass> &query_params,
452 int &/*startresults*/, int &/*endresults*/,
453 text_t &/*phrasematch*/, ostream &/*logout*/) {
454
455 // outconvertclass text_t2ascii;
456
457 vector<queryparamclass>::iterator query_here = query_params.begin();
458 vector<queryparamclass>::iterator query_end = query_params.end();
459 while (query_here != query_end) {
460
461 // if we're doing a phrase search we want to maximise hits by making it
462 // a boolean search on the index with the finest granularity - we'll
463 // also set maxdocs to "all" (realizing that this will cause searches
464 // like "and the" on a large collection to take a very very long time).
465
466 // we're deciding it's a phrase search based on if the querystring
467 // contains at least 2 double quotes (not very scientific but
468 // then neither is the rest of the mg phrase searching functionality :-)
469 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
470
471 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
472 text_tarray phrases;
473 get_phrases((*query_here).querystring, phrases);
474
475 if (phrases.size() > 0) {
476 (*query_here).search_type = 0;
477
478 // set maxdocs to "all"
479 (*query_here).maxdocs = -1;
480
481 // Get the long version of the index and test to see if any indexes with
482 // finer granularity exist. Indexes must be the same type (i.e. same metadata
483 // or "text").
484 text_t longindex; text_tarray splitindex;
485 indexmap.to2from ((*query_here).index, longindex);
486 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
487 text_t &granularity = splitindex[0];
488 text_t &indextype = splitindex[1];
489 bool found = false;
490 // currently supported granularity options are "document", "section" and "paragraph"
491 if (granularity == "document" || granularity == "section") {
492 text_t shortindex;
493 if (indexmap.fromexists ("paragraph:" + indextype)) {
494 indexmap.from2to ("paragraph:" + indextype, shortindex);
495 (*query_here).index = shortindex;
496 found = true;
497 }
498 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
499 indexmap.from2to ("section:" + indextype, shortindex);
500 (*query_here).index = shortindex;
501 }
502 }
503 }
504
505#ifdef GSDL_BBC_COLLECTION
506 // This is a special hack for the BBC collection's ProgNumber and zzabn
507 // indexes (they're built this way to prevent mg_perf_hash_build from
508 // dying at build time)
509
510 // if we're searching the ProgNumber index we want to
511 // remove all non-alphanumeric characters from the query string
512 text_t longindex; text_tarray splitindex;
513 indexmap.to2from ((*query_here).index, longindex);
514 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
515 text_t &indextype = splitindex[1];
516 if (indextype == "ProgNumber") {
517 text_t new_querystring;
518 text_t::const_iterator here = (*query_here).querystring.begin();
519 text_t::const_iterator end = (*query_here).querystring.end();
520 while (here != end) {
521 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
522 (*here >= '0' && *here <= '9')) {
523 new_querystring.push_back (*here);
524 }
525 ++here;
526 }
527 (*query_here).querystring = new_querystring;
528 }
529#endif
530 ++query_here;
531 }
532}
533
Note: See TracBrowser for help on using the repository browser.