source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 12276

Last change on this file since 12276 was 11002, checked in by kjdon, 18 years ago

Fixed up mg phrase searching. There was a bug where, if you had two single-word phrases, it would join the terms together and try to find that. Also, we now don't do any phrase processing if all phrases are only single words.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include <assert.h>
30#include "mgsearch.h"
31#include "phrases.h"
32
33///////////////////////////////
34// methods for resultsorderer_t
35///////////////////////////////
36
37resultsorderer_t::resultsorderer_t() {
38 clear ();
39}
40
41void resultsorderer_t::clear() {
42 compare_phrase_match = false;
43 compare_terms_match = false;
44 compare_doc_weight = true;
45
46 docset = NULL;
47}
48
49bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
50 if (docset == NULL) return t1>t2;
51
52 docresultmap::iterator t1_here = docset->find(t1);
53 docresultmap::iterator t2_here = docset->find(t2);
54 docresultmap::iterator end = docset->end();
55
56 // sort all the document numbers not in the document set to
57 // the end of the list
58 if (t1_here == end) {
59 if (t2_here == end) return t1>t2;
60 else return true;
61 } else if (t2_here == end) return false;
62
63 if (compare_phrase_match) {
64 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
65 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
66 }
67
68 if (compare_terms_match) {
69 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
70 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
71 }
72
73 if (compare_doc_weight) {
74 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
75 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
76 }
77
78 return t1>t2;
79}
80
81
82
83
84/////////////////////////////////
85// functions for mgqueryfilterclass
86/////////////////////////////////
87
88
89void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
90 queryfilterclass::configure (key, cfgline);
91
92 if (key == "maxnumeric") {
93 maxnumeric = cfgline[0].getint();
94 }
95 else if (key == "indexstem") {
96 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
97 }
98
99}
100
101// loads up phrases data structure with any phrases (that's the quoted bits)
102// occuring in the querystring
103void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
104 const termfreqclassarray &orgterms,
105 vector<termfreqclassarray> &phrases) {
106
107 text_t::const_iterator here = querystring.begin();
108 text_t::const_iterator end = querystring.end();
109
110 termfreqclassarray tmpterms;
111
112 int termcount = 0;
113 bool foundquote = false;
114 bool foundbreak = false;
115 bool start = true;
116 while (here != end) {
117 if (*here == '\"') {
118 if (foundquote) {
119 if (!foundbreak && !start) {
120 tmpterms.push_back (orgterms[termcount]);
121 ++termcount;
122 }
123 if (tmpterms.size() > 1) {
124 phrases.push_back (tmpterms);
125 }
126 tmpterms.erase (tmpterms.begin(), tmpterms.end());
127
128 foundquote = false;
129 foundbreak = true;
130 } else foundquote = true;
131 } else if (!is_unicode_letdig(*here)) {
132 // found a break between terms
133 if (!foundbreak && !start) {
134 if (foundquote) {
135 tmpterms.push_back (orgterms[termcount]);
136 }
137 ++termcount;
138 }
139 foundbreak = true;
140 } else {
141 start = false;
142 foundbreak = false;
143 }
144 ++here;
145 }
146}
147
148// do aditional query processing
149void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
150 queryresultsclass &queryresults) {
151
152 // post-process the results if needed
153 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
154
155 // get the terms between quotes (if any)
156 vector<termfreqclassarray> phrases;
157 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
158
159 num_phrases = phrases.size();
160 if (num_phrases > 0) {
161
162 // get the long version of the index
163 text_t longindex;
164 indexmap.to2from (queryparams.index, longindex);
165
166 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
167 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
168
169 while (this_phrase != end_phrase) {
170
171 // process each of the matched documents
172 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
173 docresultmap::iterator docs_end = queryresults.docs.docset.end();
174 while (docs_here != docs_end) {
175 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *gdbmptr, queryparams.index,
176 queryparams.subcollection, queryparams.language,
177 longindex, queryparams.collection, *this_phrase,
178 (*docs_here).second.docnum)) {
179 ++docs_here->second.num_phrase_match;
180 }
181
182 ++docs_here;
183 }
184 ++this_phrase;
185 }
186 }
187 }
188}
189
190
191// do query that might involve multiple sub queries
192// mgsearchptr and gdbmptr are assumed to be valid
193void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
194 const vector<queryparamclass> &query_params,
195 queryresultsclass &multiresults,
196 comerror_t &err, ostream &logout) {
197 outconvertclass text_t2ascii;
198
199 err = noError;
200 textsearchptr->setcollectdir (collectdir);
201 multiresults.clear();
202
203 vector<queryparamclass>::const_iterator query_here = query_params.begin();
204 vector<queryparamclass>::const_iterator query_end = query_params.end();
205 while (query_here != query_end) {
206 queryresultsclass thisqueryresults;
207
208 if (!textsearchptr->search(*query_here, thisqueryresults)) {
209 // most likely a system problem
210 logout << text_t2ascii
211 << "system problem: could not do search with mg for index \""
212 << (*query_here).index << (*query_here).subcollection
213 << (*query_here).language << "\".\n\n";
214 err = systemProblem;
215 return;
216 }
217
218 // combine the results
219 if (need_matching_docs (request.filterResultOptions)) {
220 // post-process the results if needed
221 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
222 !thisqueryresults.docs.docset.empty()) {
223 post_process (*query_here, thisqueryresults);
224 thisqueryresults.postprocessed = true;
225 multiresults.postprocessed = true;
226 } else {
227 num_phrases = 0;
228 }
229
230 if (query_params.size() == 1) {
231 multiresults.docs = thisqueryresults.docs; // just one set of results
232 multiresults.docs_matched = thisqueryresults.docs_matched;
233 multiresults.is_approx = thisqueryresults.is_approx;
234
235 } else {
236 if ((*query_here).combinequery == "and") {
237 multiresults.docs.combine_and (thisqueryresults.docs);
238 } else if ((*query_here).combinequery == "or") {
239 multiresults.docs.combine_or (thisqueryresults.docs);
240 } else if ((*query_here).combinequery == "not") {
241 multiresults.docs.combine_not (thisqueryresults.docs);
242 }
243 multiresults.docs_matched = multiresults.docs.docset.size();
244 multiresults.is_approx = Exact;
245 }
246 }
247
248 // combine the term information
249 if (need_term_info (request.filterResultOptions)) {
250 // append the terms
251 multiresults.orgterms.insert(multiresults.orgterms.end(),
252 thisqueryresults.orgterms.begin(),
253 thisqueryresults.orgterms.end());
254
255 // add the term variants
256 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
257 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
258 while (termvar_here != termvar_end) {
259 multiresults.termvariants.insert(*termvar_here);
260 ++termvar_here;
261 }
262 }
263
264 ++query_here;
265 }
266
267 // sort and unique the query terms
268 multiresults.sortuniqqueryterms ();
269}
270
271
272void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
273 docresultsclass &docs) {
274 resultsorderer_t resultsorderer;
275 resultsorderer.compare_phrase_match = true;
276 resultsorderer.docset = &(docs.docset);
277
278 // first get a list of document numbers
279 docs.docnum_order();
280
281 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
282}
283
284
285
286mgqueryfilterclass::mgqueryfilterclass ()
287 :queryfilterclass() {
288
289 num_phrases = 0;
290 maxnumeric = 4;
291}
292
293mgqueryfilterclass::~mgqueryfilterclass () {
294}
295
296void mgqueryfilterclass::filter (const FilterRequest_t &request,
297 FilterResponse_t &response,
298 comerror_t &err, ostream &logout) {
299 outconvertclass text_t2ascii;
300
301 response.clear ();
302 err = noError;
303 if (gdbmptr == NULL) {
304 // most likely a configuration problem
305 logout << text_t2ascii
306 << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
307 err = configurationError;
308 return;
309 }
310 if (textsearchptr == NULL) {
311 // most likely a configuration problem
312 logout << text_t2ascii
313 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
314 err = configurationError;
315 return;
316 }
317
318 // open the database
319 gdbmptr->setlogout(&logout);
320 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
321 // most likely a system problem (we have already checked that the
322 // gdbm database exists)
323 logout << text_t2ascii
324 << "system problem: open on gdbm database \""
325 << gdbm_filename << "\" failed\n\n";
326 err = systemProblem;
327 return;
328 }
329
330 // get the query parameters
331 int startresults = filterOptions["StartResults"].defaultValue.getint();
332 int endresults = filterOptions["EndResults"].defaultValue.getint();
333 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
334
335 vector<queryparamclass> queryfilterparams;
336 parse_query_params (request, queryfilterparams, startresults,
337 endresults, phrasematch, logout);
338 // do any mg specific diddling with query parameters that may be required
339 mg_parse_query_params (request, queryfilterparams, startresults,
340 endresults, phrasematch, logout);
341
342
343 // do query
344 queryresultsclass queryresults;
345 do_multi_query (request, queryfilterparams, queryresults, err, logout);
346 if (err != noError) return;
347
348 // assemble document results
349 if (need_matching_docs (request.filterResultOptions)) {
350 // sort the query results
351 // only want to sort the docs if we have done a ranked search or there were phrases
352 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
353 sort_doc_results (request, queryresults.docs);
354 }
355 int resultnum = 1;
356 ResultDocInfo_t resultdoc;
357 text_t trans_OID;
358 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
359 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
360
361 // documents containing matching phrases will be sorted to the top so
362 // we can break out once we're past those that match the PhraseMatch
363 // option -- "all_phrases" = return only those documents containing all
364 // phrases in query string
365 // "some_phrases" = return only those documents containing
366 // at least 1 of the phrases in the document
367 // "all_docs" = return all documents regardless
368 if (num_phrases > 0) {
369 int numdocs = 0;
370 while (docorder_here != docorder_end) {
371 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
372
373 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
374 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
375 queryresults.docs_matched = numdocs;
376 break;
377 }
378 ++numdocs;
379 ++docorder_here;
380 }
381 }
382
383 if (endresults == -1) endresults = MAXNUMDOCS;
384 docorder_here = queryresults.docs.docorder.begin();
385 while (docorder_here != docorder_end) {
386 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
387
388 // translate the document number
389 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
390 logout << text_t2ascii
391 << "warning: could not translate mg document number \""
392 << *docorder_here << "\"to OID.\n\n";
393
394 } else {
395 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
396
397 // see if there is a result for this number,
398 // if it is in the request set (or the request set is empty)
399 if (docset_here != queryresults.docs.docset.end() &&
400 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
401 if (resultnum >= startresults) {
402 // add this document
403 resultdoc.OID = trans_OID;
404 resultdoc.result_num = resultnum;
405 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
406
407 // these next two are not available on all versions of mg
408 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
409 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
410
411 response.docInfo.push_back (resultdoc);
412 }
413
414 ++resultnum;
415 }
416 }
417
418 ++docorder_here;
419 }
420 }
421
422 // assemble the term results
423 if (need_term_info(request.filterResultOptions)) {
424 // note: the terms have already been sorted and uniqued
425
426 TermInfo_t terminfo;
427 bool terms_first = true;
428 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
429 termfreqclassarray::iterator terms_end = queryresults.terms.end();
430
431 while (terms_here != terms_end) {
432 terminfo.clear();
433 terminfo.term = (*terms_here).termstr;
434 terminfo.freq = (*terms_here).termfreq;
435 if (terms_first) {
436 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
437 text_tset::iterator termvariants_end = queryresults.termvariants.end();
438 while (termvariants_here != termvariants_end) {
439 terminfo.matchTerms.push_back (*termvariants_here);
440 ++termvariants_here;
441 }
442 }
443 terms_first = false;
444
445 response.termInfo.push_back (terminfo);
446
447 ++terms_here;
448 }
449 }
450
451 response.numDocs = queryresults.docs_matched;
452 response.isApprox = queryresults.is_approx;
453}
454
455void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
456 vector<queryparamclass> &query_params,
457 int &/*startresults*/, int &/*endresults*/,
458 text_t &/*phrasematch*/, ostream &/*logout*/) {
459
460 // outconvertclass text_t2ascii;
461
462 vector<queryparamclass>::iterator query_here = query_params.begin();
463 vector<queryparamclass>::iterator query_end = query_params.end();
464 while (query_here != query_end) {
465
466 // set maxnumeric
467 (*query_here).maxnumeric = maxnumeric;
468
469 // if we're doing a phrase search we want to maximise hits by making it
470 // a boolean search on the index with the finest granularity - we'll
471 // also set maxdocs to "all" (realizing that this will cause searches
472 // like "and the" on a large collection to take a very very long time).
473
474 // we're deciding it's a phrase search based on if the querystring
475 // contains at least 2 double quotes (not very scientific but
476 // then neither is the rest of the mg phrase searching functionality :-)
477 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
478
479 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
480 text_tarray phrases;
481 get_phrases((*query_here).querystring, phrases);
482
483 if (phrases.size() > 0) {
484 (*query_here).search_type = 0;
485
486 // set maxdocs to "all"
487 (*query_here).maxdocs = -1;
488
489 // Get the long version of the index and test to see if any indexes with
490 // finer granularity exist. Indexes must be the same type (i.e. same metadata
491 // or "text").
492 text_t longindex; text_tarray splitindex;
493 indexmap.to2from ((*query_here).index, longindex);
494 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
495 text_t &granularity = splitindex[0];
496 text_t &indextype = splitindex[1];
497 bool found = false;
498 // currently supported granularity options are "document", "section" and "paragraph"
499 if (granularity == "document" || granularity == "section") {
500 text_t shortindex;
501 if (indexmap.fromexists ("paragraph:" + indextype)) {
502 indexmap.from2to ("paragraph:" + indextype, shortindex);
503 (*query_here).index = shortindex;
504 found = true;
505 }
506 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
507 indexmap.from2to ("section:" + indextype, shortindex);
508 (*query_here).index = shortindex;
509 }
510 }
511 }
512
513#ifdef GSDL_BBC_COLLECTION
514 // This is a special hack for the BBC collection's ProgNumber and zzabn
515 // indexes (they're built this way to prevent mg_perf_hash_build from
516 // dying at build time)
517
518 // if we're searching the ProgNumber index we want to
519 // remove all non-alphanumeric characters from the query string
520 text_t longindex; text_tarray splitindex;
521 indexmap.to2from ((*query_here).index, longindex);
522 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
523 text_t &indextype = splitindex[1];
524 if (indextype == "ProgNumber") {
525 text_t new_querystring;
526 text_t::const_iterator here = (*query_here).querystring.begin();
527 text_t::const_iterator end = (*query_here).querystring.end();
528 while (here != end) {
529 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
530 (*here >= '0' && *here <= '9')) {
531 new_querystring.push_back (*here);
532 }
533 ++here;
534 }
535 (*query_here).querystring = new_querystring;
536 }
537#endif
538 ++query_here;
539 }
540}
541
Note: See TracBrowser for help on using the repository browser.