source: gsdl/trunk/src/colservr/mgqueryfilter.cpp@ 15757

Last change on this file since 15757 was 15681, checked in by mdewsnip, 16 years ago

Removed some unnecessary inclusions of "assert.h".

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include "mgsearch.h"
30#include "phrases.h"
31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
48bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
87
88void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89 queryfilterclass::configure (key, cfgline);
90
91 if (key == "indexstem") {
92 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93 }
94
95}
96
97// loads up phrases data structure with any phrases (that's the quoted bits)
98// occuring in the querystring
99void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100 const termfreqclassarray &orgterms,
101 vector<termfreqclassarray> &phrases) {
102
103 text_t::const_iterator here = querystring.begin();
104 text_t::const_iterator end = querystring.end();
105
106 termfreqclassarray tmpterms;
107
108 int termcount = 0;
109 bool foundquote = false;
110 bool foundbreak = false;
111 bool start = true;
112 while (here != end) {
113 if (*here == '\"') {
114 if (foundquote) {
115 if (!foundbreak && !start) {
116 tmpterms.push_back (orgterms[termcount]);
117 ++termcount;
118 }
119 if (tmpterms.size() > 1) {
120 phrases.push_back (tmpterms);
121 }
122 tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
124 foundquote = false;
125 foundbreak = true;
126 } else foundquote = true;
127 } else if (!is_unicode_letdig(*here)) {
128 // found a break between terms
129 if (!foundbreak && !start) {
130 if (foundquote) {
131 tmpterms.push_back (orgterms[termcount]);
132 }
133 ++termcount;
134 }
135 foundbreak = true;
136 } else {
137 start = false;
138 foundbreak = false;
139 }
140 ++here;
141 }
142}
143
144// do aditional query processing
145void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146 queryresultsclass &queryresults) {
147
148 // post-process the results if needed
149 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151 // get the terms between quotes (if any)
152 vector<termfreqclassarray> phrases;
153 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155 num_phrases = phrases.size();
156 if (num_phrases > 0) {
157
158 // get the long version of the index
159 text_t longindex;
160 indexmap.to2from (queryparams.index, longindex);
161
162 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165 while (this_phrase != end_phrase) {
166
167 // process each of the matched documents
168 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169 docresultmap::iterator docs_end = queryresults.docs.docset.end();
170 while (docs_here != docs_end) {
171 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
172 queryparams.subcollection, queryparams.language,
173 longindex, queryparams.collection, *this_phrase,
174 (*docs_here).second.docnum)) {
175 ++docs_here->second.num_phrase_match;
176 }
177
178 ++docs_here;
179 }
180 ++this_phrase;
181 }
182 }
183 }
184}
185
186
187// do query that might involve multiple sub queries
188// textsearchptr and db_ptr are assumed to be valid
189void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190 const vector<queryparamclass> &query_params,
191 queryresultsclass &multiresults,
192 comerror_t &err, ostream &logout) {
193 outconvertclass text_t2ascii;
194
195 err = noError;
196 textsearchptr->setcollectdir (collectdir);
197 multiresults.clear();
198
199 vector<queryparamclass>::const_iterator query_here = query_params.begin();
200 vector<queryparamclass>::const_iterator query_end = query_params.end();
201 while (query_here != query_end) {
202 queryresultsclass thisqueryresults;
203
204 if (!textsearchptr->search(*query_here, thisqueryresults)) {
205 // most likely a system problem
206 logout << text_t2ascii
207 << "system problem: could not do search with mg for index \""
208 << (*query_here).index << (*query_here).subcollection
209 << (*query_here).language << "\".\n\n";
210 err = systemProblem;
211 return;
212 }
213
214 // combine the results
215 if (need_matching_docs (request.filterResultOptions)) {
216 // post-process the results if needed
217 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
218 !thisqueryresults.docs.docset.empty()) {
219 post_process (*query_here, thisqueryresults);
220 thisqueryresults.postprocessed = true;
221 multiresults.postprocessed = true;
222 } else {
223 num_phrases = 0;
224 }
225
226 if (query_params.size() == 1) {
227 multiresults.docs = thisqueryresults.docs; // just one set of results
228 multiresults.docs_matched = thisqueryresults.docs_matched;
229 multiresults.is_approx = thisqueryresults.is_approx;
230
231 } else {
232 if ((*query_here).combinequery == "and") {
233 multiresults.docs.combine_and (thisqueryresults.docs);
234 } else if ((*query_here).combinequery == "or") {
235 multiresults.docs.combine_or (thisqueryresults.docs);
236 } else if ((*query_here).combinequery == "not") {
237 multiresults.docs.combine_not (thisqueryresults.docs);
238 }
239 multiresults.docs_matched = multiresults.docs.docset.size();
240 multiresults.is_approx = Exact;
241 }
242 }
243
244 // combine the term information
245 if (need_term_info (request.filterResultOptions)) {
246 // append the terms
247 multiresults.orgterms.insert(multiresults.orgterms.end(),
248 thisqueryresults.orgterms.begin(),
249 thisqueryresults.orgterms.end());
250
251 // add the term variants
252 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
253 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
254 while (termvar_here != termvar_end) {
255 multiresults.termvariants.insert(*termvar_here);
256 ++termvar_here;
257 }
258 }
259
260 ++query_here;
261 }
262
263 // sort and unique the query terms
264 multiresults.sortuniqqueryterms ();
265}
266
267
268void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
269 docresultsclass &docs) {
270 resultsorderer_t resultsorderer;
271 resultsorderer.compare_phrase_match = true;
272 resultsorderer.docset = &(docs.docset);
273
274 // first get a list of document numbers
275 docs.docnum_order();
276
277 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
278}
279
280
281
282mgqueryfilterclass::mgqueryfilterclass ()
283 :queryfilterclass() {
284
285 num_phrases = 0;
286}
287
288mgqueryfilterclass::~mgqueryfilterclass () {
289}
290
291void mgqueryfilterclass::filter (const FilterRequest_t &request,
292 FilterResponse_t &response,
293 comerror_t &err, ostream &logout) {
294 outconvertclass text_t2ascii;
295
296 response.clear ();
297 err = noError;
298 if (db_ptr == NULL) {
299 // most likely a configuration problem
300 logout << text_t2ascii
301 << "configuration error: mgqueryfilter contains a null dbclass\n\n";
302 err = configurationError;
303 return;
304 }
305 if (textsearchptr == NULL) {
306 // most likely a configuration problem
307 logout << text_t2ascii
308 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
309 err = configurationError;
310 return;
311 }
312
313 // open the database
314 db_ptr->setlogout(&logout);
315 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
316 // most likely a system problem (we have already checked that the database exists)
317 logout << text_t2ascii
318 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
319 err = systemProblem;
320 return;
321 }
322
323 // get the query parameters
324 int startresults = filterOptions["StartResults"].defaultValue.getint();
325 int endresults = filterOptions["EndResults"].defaultValue.getint();
326 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
327
328 vector<queryparamclass> queryfilterparams;
329 parse_query_params (request, queryfilterparams, startresults,
330 endresults, phrasematch, logout);
331 // do any mg specific diddling with query parameters that may be required
332 mg_parse_query_params (request, queryfilterparams, startresults,
333 endresults, phrasematch, logout);
334
335
336 // do query
337 queryresultsclass queryresults;
338 do_multi_query (request, queryfilterparams, queryresults, err, logout);
339 if (err != noError) return;
340
341 // assemble document results
342 if (need_matching_docs (request.filterResultOptions)) {
343 // sort the query results
344 // only want to sort the docs if we have done a ranked search or there were phrases
345 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
346 sort_doc_results (request, queryresults.docs);
347 }
348 int resultnum = 1;
349 ResultDocInfo_t resultdoc;
350 text_t trans_OID;
351 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
352 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
353
354 // documents containing matching phrases will be sorted to the top so
355 // we can break out once we're past those that match the PhraseMatch
356 // option -- "all_phrases" = return only those documents containing all
357 // phrases in query string
358 // "some_phrases" = return only those documents containing
359 // at least 1 of the phrases in the document
360 // "all_docs" = return all documents regardless
361 if (num_phrases > 0) {
362 int numdocs = 0;
363 while (docorder_here != docorder_end) {
364 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
365
366 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
367 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
368 queryresults.docs_matched = numdocs;
369 break;
370 }
371 ++numdocs;
372 ++docorder_here;
373 }
374 }
375
376 if (endresults == -1) endresults = MAXNUMDOCS;
377 docorder_here = queryresults.docs.docorder.begin();
378 while (docorder_here != docorder_end) {
379 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
380
381 // translate the document number
382 if (!translate(db_ptr, *docorder_here, trans_OID)) {
383 logout << text_t2ascii
384 << "warning: could not translate mg document number \""
385 << *docorder_here << "\"to OID.\n\n";
386
387 } else {
388 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
389
390 // see if there is a result for this number,
391 // if it is in the request set (or the request set is empty)
392 if (docset_here != queryresults.docs.docset.end() &&
393 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
394 if (resultnum >= startresults) {
395 // add this document
396 resultdoc.OID = trans_OID;
397 resultdoc.result_num = resultnum;
398 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
399
400 // these next two are not available on all versions of mg
401 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
402 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
403
404 response.docInfo.push_back (resultdoc);
405 }
406
407 ++resultnum;
408 }
409 }
410
411 ++docorder_here;
412 }
413 }
414
415 // assemble the term results
416 if (need_term_info(request.filterResultOptions)) {
417 // note: the terms have already been sorted and uniqued
418
419 TermInfo_t terminfo;
420 bool terms_first = true;
421 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
422 termfreqclassarray::iterator terms_end = queryresults.terms.end();
423
424 while (terms_here != terms_end) {
425 terminfo.clear();
426 terminfo.term = (*terms_here).termstr;
427 terminfo.freq = (*terms_here).termfreq;
428 if (terms_first) {
429 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
430 text_tset::iterator termvariants_end = queryresults.termvariants.end();
431 while (termvariants_here != termvariants_end) {
432 terminfo.matchTerms.push_back (*termvariants_here);
433 ++termvariants_here;
434 }
435 }
436 terms_first = false;
437
438 response.termInfo.push_back (terminfo);
439
440 ++terms_here;
441 }
442 }
443
444 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
445 response.numDocs = queryresults.docs_matched;
446 response.isApprox = queryresults.is_approx;
447}
448
449void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
450 vector<queryparamclass> &query_params,
451 int &/*startresults*/, int &/*endresults*/,
452 text_t &/*phrasematch*/, ostream &/*logout*/) {
453
454 // outconvertclass text_t2ascii;
455
456 vector<queryparamclass>::iterator query_here = query_params.begin();
457 vector<queryparamclass>::iterator query_end = query_params.end();
458 while (query_here != query_end) {
459
460 // if we're doing a phrase search we want to maximise hits by making it
461 // a boolean search on the index with the finest granularity - we'll
462 // also set maxdocs to "all" (realizing that this will cause searches
463 // like "and the" on a large collection to take a very very long time).
464
465 // we're deciding it's a phrase search based on if the querystring
466 // contains at least 2 double quotes (not very scientific but
467 // then neither is the rest of the mg phrase searching functionality :-)
468 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
469
470 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
471 text_tarray phrases;
472 get_phrases((*query_here).querystring, phrases);
473
474 if (phrases.size() > 0) {
475 (*query_here).search_type = 0;
476
477 // set maxdocs to "all"
478 (*query_here).maxdocs = -1;
479
480 // Get the long version of the index and test to see if any indexes with
481 // finer granularity exist. Indexes must be the same type (i.e. same metadata
482 // or "text").
483 text_t longindex; text_tarray splitindex;
484 indexmap.to2from ((*query_here).index, longindex);
485 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
486 text_t &granularity = splitindex[0];
487 text_t &indextype = splitindex[1];
488 bool found = false;
489 // currently supported granularity options are "document", "section" and "paragraph"
490 if (granularity == "document" || granularity == "section") {
491 text_t shortindex;
492 if (indexmap.fromexists ("paragraph:" + indextype)) {
493 indexmap.from2to ("paragraph:" + indextype, shortindex);
494 (*query_here).index = shortindex;
495 found = true;
496 }
497 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
498 indexmap.from2to ("section:" + indextype, shortindex);
499 (*query_here).index = shortindex;
500 }
501 }
502 }
503
504#ifdef GSDL_BBC_COLLECTION
505 // This is a special hack for the BBC collection's ProgNumber and zzabn
506 // indexes (they're built this way to prevent mg_perf_hash_build from
507 // dying at build time)
508
509 // if we're searching the ProgNumber index we want to
510 // remove all non-alphanumeric characters from the query string
511 text_t longindex; text_tarray splitindex;
512 indexmap.to2from ((*query_here).index, longindex);
513 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
514 text_t &indextype = splitindex[1];
515 if (indextype == "ProgNumber") {
516 text_t new_querystring;
517 text_t::const_iterator here = (*query_here).querystring.begin();
518 text_t::const_iterator end = (*query_here).querystring.end();
519 while (here != end) {
520 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
521 (*here >= '0' && *here <= '9')) {
522 new_querystring.push_back (*here);
523 }
524 ++here;
525 }
526 (*query_here).querystring = new_querystring;
527 }
528#endif
529 ++query_here;
530 }
531}
532
Note: See TracBrowser for help on using the repository browser.