source: main/tags/2.71/gsdl/src/colservr/mgqueryfilter.cpp@ 24179

Last change on this file since 24179 was 12314, checked in by kjdon, 18 years ago

maxnumeric moved from mgqueryfilterclass to queryfilterclass, because mgpp now uses it too. It is passed in as an argument to ParseQuery.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include <assert.h>
30#include "mgsearch.h"
31#include "phrases.h"
32
33///////////////////////////////
34// methods for resultsorderer_t
35///////////////////////////////
36
37resultsorderer_t::resultsorderer_t() {
38 clear ();
39}
40
41void resultsorderer_t::clear() {
42 compare_phrase_match = false;
43 compare_terms_match = false;
44 compare_doc_weight = true;
45
46 docset = NULL;
47}
48
49bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
50 if (docset == NULL) return t1>t2;
51
52 docresultmap::iterator t1_here = docset->find(t1);
53 docresultmap::iterator t2_here = docset->find(t2);
54 docresultmap::iterator end = docset->end();
55
56 // sort all the document numbers not in the document set to
57 // the end of the list
58 if (t1_here == end) {
59 if (t2_here == end) return t1>t2;
60 else return true;
61 } else if (t2_here == end) return false;
62
63 if (compare_phrase_match) {
64 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
65 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
66 }
67
68 if (compare_terms_match) {
69 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
70 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
71 }
72
73 if (compare_doc_weight) {
74 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
75 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
76 }
77
78 return t1>t2;
79}
80
81
82
83
84/////////////////////////////////
85// functions for mgqueryfilterclass
86/////////////////////////////////
87
88
89void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
90 queryfilterclass::configure (key, cfgline);
91
92 if (key == "indexstem") {
93 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
94 }
95
96}
97
98// loads up phrases data structure with any phrases (that's the quoted bits)
99// occuring in the querystring
100void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
101 const termfreqclassarray &orgterms,
102 vector<termfreqclassarray> &phrases) {
103
104 text_t::const_iterator here = querystring.begin();
105 text_t::const_iterator end = querystring.end();
106
107 termfreqclassarray tmpterms;
108
109 int termcount = 0;
110 bool foundquote = false;
111 bool foundbreak = false;
112 bool start = true;
113 while (here != end) {
114 if (*here == '\"') {
115 if (foundquote) {
116 if (!foundbreak && !start) {
117 tmpterms.push_back (orgterms[termcount]);
118 ++termcount;
119 }
120 if (tmpterms.size() > 1) {
121 phrases.push_back (tmpterms);
122 }
123 tmpterms.erase (tmpterms.begin(), tmpterms.end());
124
125 foundquote = false;
126 foundbreak = true;
127 } else foundquote = true;
128 } else if (!is_unicode_letdig(*here)) {
129 // found a break between terms
130 if (!foundbreak && !start) {
131 if (foundquote) {
132 tmpterms.push_back (orgterms[termcount]);
133 }
134 ++termcount;
135 }
136 foundbreak = true;
137 } else {
138 start = false;
139 foundbreak = false;
140 }
141 ++here;
142 }
143}
144
145// do aditional query processing
146void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
147 queryresultsclass &queryresults) {
148
149 // post-process the results if needed
150 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
151
152 // get the terms between quotes (if any)
153 vector<termfreqclassarray> phrases;
154 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
155
156 num_phrases = phrases.size();
157 if (num_phrases > 0) {
158
159 // get the long version of the index
160 text_t longindex;
161 indexmap.to2from (queryparams.index, longindex);
162
163 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
164 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
165
166 while (this_phrase != end_phrase) {
167
168 // process each of the matched documents
169 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
170 docresultmap::iterator docs_end = queryresults.docs.docset.end();
171 while (docs_here != docs_end) {
172 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *gdbmptr, queryparams.index,
173 queryparams.subcollection, queryparams.language,
174 longindex, queryparams.collection, *this_phrase,
175 (*docs_here).second.docnum)) {
176 ++docs_here->second.num_phrase_match;
177 }
178
179 ++docs_here;
180 }
181 ++this_phrase;
182 }
183 }
184 }
185}
186
187
188// do query that might involve multiple sub queries
189// mgsearchptr and gdbmptr are assumed to be valid
190void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
191 const vector<queryparamclass> &query_params,
192 queryresultsclass &multiresults,
193 comerror_t &err, ostream &logout) {
194 outconvertclass text_t2ascii;
195
196 err = noError;
197 textsearchptr->setcollectdir (collectdir);
198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
204
205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
223 } else {
224 num_phrases = 0;
225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
257 ++termvar_here;
258 }
259 }
260
261 ++query_here;
262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
287}
288
289mgqueryfilterclass::~mgqueryfilterclass () {
290}
291
292void mgqueryfilterclass::filter (const FilterRequest_t &request,
293 FilterResponse_t &response,
294 comerror_t &err, ostream &logout) {
295 outconvertclass text_t2ascii;
296
297 response.clear ();
298 err = noError;
299 if (gdbmptr == NULL) {
300 // most likely a configuration problem
301 logout << text_t2ascii
302 << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
303 err = configurationError;
304 return;
305 }
306 if (textsearchptr == NULL) {
307 // most likely a configuration problem
308 logout << text_t2ascii
309 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
310 err = configurationError;
311 return;
312 }
313
314 // open the database
315 gdbmptr->setlogout(&logout);
316 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
317 // most likely a system problem (we have already checked that the
318 // gdbm database exists)
319 logout << text_t2ascii
320 << "system problem: open on gdbm database \""
321 << gdbm_filename << "\" failed\n\n";
322 err = systemProblem;
323 return;
324 }
325
326 // get the query parameters
327 int startresults = filterOptions["StartResults"].defaultValue.getint();
328 int endresults = filterOptions["EndResults"].defaultValue.getint();
329 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
330
331 vector<queryparamclass> queryfilterparams;
332 parse_query_params (request, queryfilterparams, startresults,
333 endresults, phrasematch, logout);
334 // do any mg specific diddling with query parameters that may be required
335 mg_parse_query_params (request, queryfilterparams, startresults,
336 endresults, phrasematch, logout);
337
338
339 // do query
340 queryresultsclass queryresults;
341 do_multi_query (request, queryfilterparams, queryresults, err, logout);
342 if (err != noError) return;
343
344 // assemble document results
345 if (need_matching_docs (request.filterResultOptions)) {
346 // sort the query results
347 // only want to sort the docs if we have done a ranked search or there were phrases
348 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
349 sort_doc_results (request, queryresults.docs);
350 }
351 int resultnum = 1;
352 ResultDocInfo_t resultdoc;
353 text_t trans_OID;
354 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
355 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
356
357 // documents containing matching phrases will be sorted to the top so
358 // we can break out once we're past those that match the PhraseMatch
359 // option -- "all_phrases" = return only those documents containing all
360 // phrases in query string
361 // "some_phrases" = return only those documents containing
362 // at least 1 of the phrases in the document
363 // "all_docs" = return all documents regardless
364 if (num_phrases > 0) {
365 int numdocs = 0;
366 while (docorder_here != docorder_end) {
367 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
368
369 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
370 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
371 queryresults.docs_matched = numdocs;
372 break;
373 }
374 ++numdocs;
375 ++docorder_here;
376 }
377 }
378
379 if (endresults == -1) endresults = MAXNUMDOCS;
380 docorder_here = queryresults.docs.docorder.begin();
381 while (docorder_here != docorder_end) {
382 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
383
384 // translate the document number
385 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
386 logout << text_t2ascii
387 << "warning: could not translate mg document number \""
388 << *docorder_here << "\"to OID.\n\n";
389
390 } else {
391 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
392
393 // see if there is a result for this number,
394 // if it is in the request set (or the request set is empty)
395 if (docset_here != queryresults.docs.docset.end() &&
396 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
397 if (resultnum >= startresults) {
398 // add this document
399 resultdoc.OID = trans_OID;
400 resultdoc.result_num = resultnum;
401 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
402
403 // these next two are not available on all versions of mg
404 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
405 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
406
407 response.docInfo.push_back (resultdoc);
408 }
409
410 ++resultnum;
411 }
412 }
413
414 ++docorder_here;
415 }
416 }
417
418 // assemble the term results
419 if (need_term_info(request.filterResultOptions)) {
420 // note: the terms have already been sorted and uniqued
421
422 TermInfo_t terminfo;
423 bool terms_first = true;
424 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
425 termfreqclassarray::iterator terms_end = queryresults.terms.end();
426
427 while (terms_here != terms_end) {
428 terminfo.clear();
429 terminfo.term = (*terms_here).termstr;
430 terminfo.freq = (*terms_here).termfreq;
431 if (terms_first) {
432 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
433 text_tset::iterator termvariants_end = queryresults.termvariants.end();
434 while (termvariants_here != termvariants_end) {
435 terminfo.matchTerms.push_back (*termvariants_here);
436 ++termvariants_here;
437 }
438 }
439 terms_first = false;
440
441 response.termInfo.push_back (terminfo);
442
443 ++terms_here;
444 }
445 }
446
447 response.numDocs = queryresults.docs_matched;
448 response.isApprox = queryresults.is_approx;
449}
450
451void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
452 vector<queryparamclass> &query_params,
453 int &/*startresults*/, int &/*endresults*/,
454 text_t &/*phrasematch*/, ostream &/*logout*/) {
455
456 // outconvertclass text_t2ascii;
457
458 vector<queryparamclass>::iterator query_here = query_params.begin();
459 vector<queryparamclass>::iterator query_end = query_params.end();
460 while (query_here != query_end) {
461
462 // if we're doing a phrase search we want to maximise hits by making it
463 // a boolean search on the index with the finest granularity - we'll
464 // also set maxdocs to "all" (realizing that this will cause searches
465 // like "and the" on a large collection to take a very very long time).
466
467 // we're deciding it's a phrase search based on if the querystring
468 // contains at least 2 double quotes (not very scientific but
469 // then neither is the rest of the mg phrase searching functionality :-)
470 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
471
472 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
473 text_tarray phrases;
474 get_phrases((*query_here).querystring, phrases);
475
476 if (phrases.size() > 0) {
477 (*query_here).search_type = 0;
478
479 // set maxdocs to "all"
480 (*query_here).maxdocs = -1;
481
482 // Get the long version of the index and test to see if any indexes with
483 // finer granularity exist. Indexes must be the same type (i.e. same metadata
484 // or "text").
485 text_t longindex; text_tarray splitindex;
486 indexmap.to2from ((*query_here).index, longindex);
487 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
488 text_t &granularity = splitindex[0];
489 text_t &indextype = splitindex[1];
490 bool found = false;
491 // currently supported granularity options are "document", "section" and "paragraph"
492 if (granularity == "document" || granularity == "section") {
493 text_t shortindex;
494 if (indexmap.fromexists ("paragraph:" + indextype)) {
495 indexmap.from2to ("paragraph:" + indextype, shortindex);
496 (*query_here).index = shortindex;
497 found = true;
498 }
499 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
500 indexmap.from2to ("section:" + indextype, shortindex);
501 (*query_here).index = shortindex;
502 }
503 }
504 }
505
506#ifdef GSDL_BBC_COLLECTION
507 // This is a special hack for the BBC collection's ProgNumber and zzabn
508 // indexes (they're built this way to prevent mg_perf_hash_build from
509 // dying at build time)
510
511 // if we're searching the ProgNumber index we want to
512 // remove all non-alphanumeric characters from the query string
513 text_t longindex; text_tarray splitindex;
514 indexmap.to2from ((*query_here).index, longindex);
515 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
516 text_t &indextype = splitindex[1];
517 if (indextype == "ProgNumber") {
518 text_t new_querystring;
519 text_t::const_iterator here = (*query_here).querystring.begin();
520 text_t::const_iterator end = (*query_here).querystring.end();
521 while (here != end) {
522 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
523 (*here >= '0' && *here <= '9')) {
524 new_querystring.push_back (*here);
525 }
526 ++here;
527 }
528 (*query_here).querystring = new_querystring;
529 }
530#endif
531 ++query_here;
532 }
533}
534
Note: See TracBrowser for help on using the repository browser.