source: gsdl/trunk/src/colservr/mgqueryfilter.cpp@ 14119

Last change on this file since 14119 was 13780, checked in by mdewsnip, 17 years ago

GLI/LOCAL LIBRARY: To prevent the problems with the GLI being unable to install newly built collections because the local library is holding files open, much more care needs to be taken to close files (typically the GDBM database and the MG/MGPP index files) after use. Fixed a lot of places where files were being left open.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include <assert.h>
30#include "mgsearch.h"
31#include "phrases.h"
32
33///////////////////////////////
34// methods for resultsorderer_t
35///////////////////////////////
36
37resultsorderer_t::resultsorderer_t() {
38 clear ();
39}
40
41void resultsorderer_t::clear() {
42 compare_phrase_match = false;
43 compare_terms_match = false;
44 compare_doc_weight = true;
45
46 docset = NULL;
47}
48
49bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
50 if (docset == NULL) return t1>t2;
51
52 docresultmap::iterator t1_here = docset->find(t1);
53 docresultmap::iterator t2_here = docset->find(t2);
54 docresultmap::iterator end = docset->end();
55
56 // sort all the document numbers not in the document set to
57 // the end of the list
58 if (t1_here == end) {
59 if (t2_here == end) return t1>t2;
60 else return true;
61 } else if (t2_here == end) return false;
62
63 if (compare_phrase_match) {
64 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
65 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
66 }
67
68 if (compare_terms_match) {
69 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
70 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
71 }
72
73 if (compare_doc_weight) {
74 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
75 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
76 }
77
78 return t1>t2;
79}
80
81
82
83
84/////////////////////////////////
85// functions for mgqueryfilterclass
86/////////////////////////////////
87
88
89void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
90 queryfilterclass::configure (key, cfgline);
91
92 if (key == "indexstem") {
93 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
94 }
95
96}
97
98// loads up phrases data structure with any phrases (that's the quoted bits)
99// occuring in the querystring
100void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
101 const termfreqclassarray &orgterms,
102 vector<termfreqclassarray> &phrases) {
103
104 text_t::const_iterator here = querystring.begin();
105 text_t::const_iterator end = querystring.end();
106
107 termfreqclassarray tmpterms;
108
109 int termcount = 0;
110 bool foundquote = false;
111 bool foundbreak = false;
112 bool start = true;
113 while (here != end) {
114 if (*here == '\"') {
115 if (foundquote) {
116 if (!foundbreak && !start) {
117 tmpterms.push_back (orgterms[termcount]);
118 ++termcount;
119 }
120 if (tmpterms.size() > 1) {
121 phrases.push_back (tmpterms);
122 }
123 tmpterms.erase (tmpterms.begin(), tmpterms.end());
124
125 foundquote = false;
126 foundbreak = true;
127 } else foundquote = true;
128 } else if (!is_unicode_letdig(*here)) {
129 // found a break between terms
130 if (!foundbreak && !start) {
131 if (foundquote) {
132 tmpterms.push_back (orgterms[termcount]);
133 }
134 ++termcount;
135 }
136 foundbreak = true;
137 } else {
138 start = false;
139 foundbreak = false;
140 }
141 ++here;
142 }
143}
144
145// do aditional query processing
146void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
147 queryresultsclass &queryresults) {
148
149 // post-process the results if needed
150 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
151
152 // get the terms between quotes (if any)
153 vector<termfreqclassarray> phrases;
154 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
155
156 num_phrases = phrases.size();
157 if (num_phrases > 0) {
158
159 // get the long version of the index
160 text_t longindex;
161 indexmap.to2from (queryparams.index, longindex);
162
163 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
164 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
165
166 while (this_phrase != end_phrase) {
167
168 // process each of the matched documents
169 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
170 docresultmap::iterator docs_end = queryresults.docs.docset.end();
171 while (docs_here != docs_end) {
172 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *gdbmptr, queryparams.index,
173 queryparams.subcollection, queryparams.language,
174 longindex, queryparams.collection, *this_phrase,
175 (*docs_here).second.docnum)) {
176 ++docs_here->second.num_phrase_match;
177 }
178
179 ++docs_here;
180 }
181 ++this_phrase;
182 }
183 }
184 }
185}
186
187
188// do query that might involve multiple sub queries
189// mgsearchptr and gdbmptr are assumed to be valid
190void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
191 const vector<queryparamclass> &query_params,
192 queryresultsclass &multiresults,
193 comerror_t &err, ostream &logout) {
194 outconvertclass text_t2ascii;
195
196 err = noError;
197 textsearchptr->setcollectdir (collectdir);
198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
204
205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
223 } else {
224 num_phrases = 0;
225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
257 ++termvar_here;
258 }
259 }
260
261 ++query_here;
262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
287}
288
289mgqueryfilterclass::~mgqueryfilterclass () {
290}
291
292void mgqueryfilterclass::filter (const FilterRequest_t &request,
293 FilterResponse_t &response,
294 comerror_t &err, ostream &logout) {
295 outconvertclass text_t2ascii;
296
297 response.clear ();
298 err = noError;
299 if (gdbmptr == NULL) {
300 // most likely a configuration problem
301 logout << text_t2ascii
302 << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
303 err = configurationError;
304 return;
305 }
306 if (textsearchptr == NULL) {
307 // most likely a configuration problem
308 logout << text_t2ascii
309 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
310 err = configurationError;
311 return;
312 }
313
314 // open the database
315 gdbmptr->setlogout(&logout);
316 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
317 // most likely a system problem (we have already checked that the
318 // gdbm database exists)
319 logout << text_t2ascii
320 << "system problem: open on gdbm database \""
321 << gdbm_filename << "\" failed\n\n";
322 err = systemProblem;
323 return;
324 }
325
326 // get the query parameters
327 int startresults = filterOptions["StartResults"].defaultValue.getint();
328 int endresults = filterOptions["EndResults"].defaultValue.getint();
329 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
330
331 vector<queryparamclass> queryfilterparams;
332 parse_query_params (request, queryfilterparams, startresults,
333 endresults, phrasematch, logout);
334 // do any mg specific diddling with query parameters that may be required
335 mg_parse_query_params (request, queryfilterparams, startresults,
336 endresults, phrasematch, logout);
337
338
339 // do query
340 queryresultsclass queryresults;
341 do_multi_query (request, queryfilterparams, queryresults, err, logout);
342 if (err != noError) return;
343
344 // assemble document results
345 if (need_matching_docs (request.filterResultOptions)) {
346 // sort the query results
347 // only want to sort the docs if we have done a ranked search or there were phrases
348 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
349 sort_doc_results (request, queryresults.docs);
350 }
351 int resultnum = 1;
352 ResultDocInfo_t resultdoc;
353 text_t trans_OID;
354 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
355 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
356
357 // documents containing matching phrases will be sorted to the top so
358 // we can break out once we're past those that match the PhraseMatch
359 // option -- "all_phrases" = return only those documents containing all
360 // phrases in query string
361 // "some_phrases" = return only those documents containing
362 // at least 1 of the phrases in the document
363 // "all_docs" = return all documents regardless
364 if (num_phrases > 0) {
365 int numdocs = 0;
366 while (docorder_here != docorder_end) {
367 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
368
369 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
370 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
371 queryresults.docs_matched = numdocs;
372 break;
373 }
374 ++numdocs;
375 ++docorder_here;
376 }
377 }
378
379 if (endresults == -1) endresults = MAXNUMDOCS;
380 docorder_here = queryresults.docs.docorder.begin();
381 while (docorder_here != docorder_end) {
382 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
383
384 // translate the document number
385 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
386 logout << text_t2ascii
387 << "warning: could not translate mg document number \""
388 << *docorder_here << "\"to OID.\n\n";
389
390 } else {
391 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
392
393 // see if there is a result for this number,
394 // if it is in the request set (or the request set is empty)
395 if (docset_here != queryresults.docs.docset.end() &&
396 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
397 if (resultnum >= startresults) {
398 // add this document
399 resultdoc.OID = trans_OID;
400 resultdoc.result_num = resultnum;
401 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
402
403 // these next two are not available on all versions of mg
404 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
405 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
406
407 response.docInfo.push_back (resultdoc);
408 }
409
410 ++resultnum;
411 }
412 }
413
414 ++docorder_here;
415 }
416 }
417
418 // assemble the term results
419 if (need_term_info(request.filterResultOptions)) {
420 // note: the terms have already been sorted and uniqued
421
422 TermInfo_t terminfo;
423 bool terms_first = true;
424 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
425 termfreqclassarray::iterator terms_end = queryresults.terms.end();
426
427 while (terms_here != terms_end) {
428 terminfo.clear();
429 terminfo.term = (*terms_here).termstr;
430 terminfo.freq = (*terms_here).termfreq;
431 if (terms_first) {
432 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
433 text_tset::iterator termvariants_end = queryresults.termvariants.end();
434 while (termvariants_here != termvariants_end) {
435 terminfo.matchTerms.push_back (*termvariants_here);
436 ++termvariants_here;
437 }
438 }
439 terms_first = false;
440
441 response.termInfo.push_back (terminfo);
442
443 ++terms_here;
444 }
445 }
446
447 gdbmptr->closedatabase(); // Important that local library doesn't leave any files open
448 response.numDocs = queryresults.docs_matched;
449 response.isApprox = queryresults.is_approx;
450}
451
452void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
453 vector<queryparamclass> &query_params,
454 int &/*startresults*/, int &/*endresults*/,
455 text_t &/*phrasematch*/, ostream &/*logout*/) {
456
457 // outconvertclass text_t2ascii;
458
459 vector<queryparamclass>::iterator query_here = query_params.begin();
460 vector<queryparamclass>::iterator query_end = query_params.end();
461 while (query_here != query_end) {
462
463 // if we're doing a phrase search we want to maximise hits by making it
464 // a boolean search on the index with the finest granularity - we'll
465 // also set maxdocs to "all" (realizing that this will cause searches
466 // like "and the" on a large collection to take a very very long time).
467
468 // we're deciding it's a phrase search based on if the querystring
469 // contains at least 2 double quotes (not very scientific but
470 // then neither is the rest of the mg phrase searching functionality :-)
471 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
472
473 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
474 text_tarray phrases;
475 get_phrases((*query_here).querystring, phrases);
476
477 if (phrases.size() > 0) {
478 (*query_here).search_type = 0;
479
480 // set maxdocs to "all"
481 (*query_here).maxdocs = -1;
482
483 // Get the long version of the index and test to see if any indexes with
484 // finer granularity exist. Indexes must be the same type (i.e. same metadata
485 // or "text").
486 text_t longindex; text_tarray splitindex;
487 indexmap.to2from ((*query_here).index, longindex);
488 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
489 text_t &granularity = splitindex[0];
490 text_t &indextype = splitindex[1];
491 bool found = false;
492 // currently supported granularity options are "document", "section" and "paragraph"
493 if (granularity == "document" || granularity == "section") {
494 text_t shortindex;
495 if (indexmap.fromexists ("paragraph:" + indextype)) {
496 indexmap.from2to ("paragraph:" + indextype, shortindex);
497 (*query_here).index = shortindex;
498 found = true;
499 }
500 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
501 indexmap.from2to ("section:" + indextype, shortindex);
502 (*query_here).index = shortindex;
503 }
504 }
505 }
506
507#ifdef GSDL_BBC_COLLECTION
508 // This is a special hack for the BBC collection's ProgNumber and zzabn
509 // indexes (they're built this way to prevent mg_perf_hash_build from
510 // dying at build time)
511
512 // if we're searching the ProgNumber index we want to
513 // remove all non-alphanumeric characters from the query string
514 text_t longindex; text_tarray splitindex;
515 indexmap.to2from ((*query_here).index, longindex);
516 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
517 text_t &indextype = splitindex[1];
518 if (indextype == "ProgNumber") {
519 text_t new_querystring;
520 text_t::const_iterator here = (*query_here).querystring.begin();
521 text_t::const_iterator end = (*query_here).querystring.end();
522 while (here != end) {
523 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
524 (*here >= '0' && *here <= '9')) {
525 new_querystring.push_back (*here);
526 }
527 ++here;
528 }
529 (*query_here).querystring = new_querystring;
530 }
531#endif
532 ++query_here;
533 }
534}
535
Note: See TracBrowser for help on using the repository browser.