source: gsdl/trunk/src/colservr/mgqueryfilter.cpp@ 16310

Last change on this file since 16310 was 16310, checked in by davidb, 16 years ago

Introduction of 'collecthome' which parallels 'gsdlhome' to allow the toplevel collect folder to be outside of the gsdlhome area

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include "mgsearch.h"
30#include "phrases.h"
31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
48bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
87
88void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89 queryfilterclass::configure (key, cfgline);
90
91 if (key == "indexstem") {
92 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93 }
94
95}
96
97// loads up phrases data structure with any phrases (that's the quoted bits)
98// occuring in the querystring
99void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100 const termfreqclassarray &orgterms,
101 vector<termfreqclassarray> &phrases) {
102
103 text_t::const_iterator here = querystring.begin();
104 text_t::const_iterator end = querystring.end();
105
106 termfreqclassarray tmpterms;
107
108 int termcount = 0;
109 bool foundquote = false;
110 bool foundbreak = false;
111 bool start = true;
112 while (here != end) {
113 if (*here == '\"') {
114 if (foundquote) {
115 if (!foundbreak && !start) {
116 tmpterms.push_back (orgterms[termcount]);
117 ++termcount;
118 }
119 if (tmpterms.size() > 1) {
120 phrases.push_back (tmpterms);
121 }
122 tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
124 foundquote = false;
125 foundbreak = true;
126 } else foundquote = true;
127 } else if (!is_unicode_letdig(*here)) {
128 // found a break between terms
129 if (!foundbreak && !start) {
130 if (foundquote) {
131 tmpterms.push_back (orgterms[termcount]);
132 }
133 ++termcount;
134 }
135 foundbreak = true;
136 } else {
137 start = false;
138 foundbreak = false;
139 }
140 ++here;
141 }
142}
143
144// do aditional query processing
145void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146 queryresultsclass &queryresults) {
147
148 // post-process the results if needed
149 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151 // get the terms between quotes (if any)
152 vector<termfreqclassarray> phrases;
153 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155 num_phrases = phrases.size();
156 if (num_phrases > 0) {
157
158 // get the long version of the index
159 text_t longindex;
160 indexmap.to2from (queryparams.index, longindex);
161
162 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165 while (this_phrase != end_phrase) {
166
167 // process each of the matched documents
168 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169 docresultmap::iterator docs_end = queryresults.docs.docset.end();
170 while (docs_here != docs_end) {
171 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
172 queryparams.subcollection, queryparams.language,
173 longindex, queryparams.collection, *this_phrase,
174 (*docs_here).second.docnum)) {
175 ++docs_here->second.num_phrase_match;
176 }
177
178 ++docs_here;
179 }
180 ++this_phrase;
181 }
182 }
183 }
184}
185
186
187// do query that might involve multiple sub queries
188// textsearchptr and db_ptr are assumed to be valid
189void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190 const vector<queryparamclass> &query_params,
191 queryresultsclass &multiresults,
192 comerror_t &err, ostream &logout) {
193 outconvertclass text_t2ascii;
194
195 err = noError;
196 textsearchptr->setcollectdir (collectdir);
197
198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
204
205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
223 } else {
224 num_phrases = 0;
225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
257 ++termvar_here;
258 }
259 }
260
261 ++query_here;
262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
287}
288
289mgqueryfilterclass::~mgqueryfilterclass () {
290}
291
292void mgqueryfilterclass::filter (const FilterRequest_t &request,
293 FilterResponse_t &response,
294 comerror_t &err, ostream &logout) {
295 outconvertclass text_t2ascii;
296
297 response.clear ();
298 err = noError;
299 if (db_ptr == NULL) {
300 // most likely a configuration problem
301 logout << text_t2ascii
302 << "configuration error: mgqueryfilter contains a null dbclass\n\n";
303 err = configurationError;
304 return;
305 }
306 if (textsearchptr == NULL) {
307 // most likely a configuration problem
308 logout << text_t2ascii
309 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
310 err = configurationError;
311 return;
312 }
313
314 // open the database
315 db_ptr->setlogout(&logout);
316 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
317 // most likely a system problem (we have already checked that the database exists)
318 logout << text_t2ascii
319 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
320 err = systemProblem;
321 return;
322 }
323
324 // get the query parameters
325 int startresults = filterOptions["StartResults"].defaultValue.getint();
326 int endresults = filterOptions["EndResults"].defaultValue.getint();
327 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
328
329 vector<queryparamclass> queryfilterparams;
330 parse_query_params (request, queryfilterparams, startresults,
331 endresults, phrasematch, logout);
332 // do any mg specific diddling with query parameters that may be required
333 mg_parse_query_params (request, queryfilterparams, startresults,
334 endresults, phrasematch, logout);
335
336
337 // do query
338 queryresultsclass queryresults;
339 do_multi_query (request, queryfilterparams, queryresults, err, logout);
340 if (err != noError) return;
341
342 // assemble document results
343 if (need_matching_docs (request.filterResultOptions)) {
344 // sort the query results
345 // only want to sort the docs if we have done a ranked search or there were phrases
346 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
347 sort_doc_results (request, queryresults.docs);
348 }
349 int resultnum = 1;
350 ResultDocInfo_t resultdoc;
351 text_t trans_OID;
352 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
353 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
354
355 // documents containing matching phrases will be sorted to the top so
356 // we can break out once we're past those that match the PhraseMatch
357 // option -- "all_phrases" = return only those documents containing all
358 // phrases in query string
359 // "some_phrases" = return only those documents containing
360 // at least 1 of the phrases in the document
361 // "all_docs" = return all documents regardless
362 if (num_phrases > 0) {
363 int numdocs = 0;
364 while (docorder_here != docorder_end) {
365 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
366
367 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
368 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
369 queryresults.docs_matched = numdocs;
370 break;
371 }
372 ++numdocs;
373 ++docorder_here;
374 }
375 }
376
377 if (endresults == -1) endresults = MAXNUMDOCS;
378 docorder_here = queryresults.docs.docorder.begin();
379 while (docorder_here != docorder_end) {
380 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
381
382 // translate the document number
383 if (!translate(db_ptr, *docorder_here, trans_OID)) {
384 logout << text_t2ascii
385 << "warning: could not translate mg document number \""
386 << *docorder_here << "\"to OID.\n\n";
387
388 } else {
389 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
390
391 // see if there is a result for this number,
392 // if it is in the request set (or the request set is empty)
393 if (docset_here != queryresults.docs.docset.end() &&
394 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
395 if (resultnum >= startresults) {
396 // add this document
397 resultdoc.OID = trans_OID;
398 resultdoc.result_num = resultnum;
399 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
400
401 // these next two are not available on all versions of mg
402 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
403 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
404
405 response.docInfo.push_back (resultdoc);
406 }
407
408 ++resultnum;
409 }
410 }
411
412 ++docorder_here;
413 }
414 }
415
416 // assemble the term results
417 if (need_term_info(request.filterResultOptions)) {
418 // note: the terms have already been sorted and uniqued
419
420 TermInfo_t terminfo;
421 bool terms_first = true;
422 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
423 termfreqclassarray::iterator terms_end = queryresults.terms.end();
424
425 while (terms_here != terms_end) {
426 terminfo.clear();
427 terminfo.term = (*terms_here).termstr;
428 terminfo.freq = (*terms_here).termfreq;
429 if (terms_first) {
430 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
431 text_tset::iterator termvariants_end = queryresults.termvariants.end();
432 while (termvariants_here != termvariants_end) {
433 terminfo.matchTerms.push_back (*termvariants_here);
434 ++termvariants_here;
435 }
436 }
437 terms_first = false;
438
439 response.termInfo.push_back (terminfo);
440
441 ++terms_here;
442 }
443 }
444
445 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
446 response.numDocs = queryresults.docs_matched;
447 response.isApprox = queryresults.is_approx;
448}
449
450void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
451 vector<queryparamclass> &query_params,
452 int &/*startresults*/, int &/*endresults*/,
453 text_t &/*phrasematch*/, ostream &/*logout*/) {
454
455 // outconvertclass text_t2ascii;
456
457 vector<queryparamclass>::iterator query_here = query_params.begin();
458 vector<queryparamclass>::iterator query_end = query_params.end();
459 while (query_here != query_end) {
460
461 // if we're doing a phrase search we want to maximise hits by making it
462 // a boolean search on the index with the finest granularity - we'll
463 // also set maxdocs to "all" (realizing that this will cause searches
464 // like "and the" on a large collection to take a very very long time).
465
466 // we're deciding it's a phrase search based on if the querystring
467 // contains at least 2 double quotes (not very scientific but
468 // then neither is the rest of the mg phrase searching functionality :-)
469 //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
470
471 // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
472 text_tarray phrases;
473 get_phrases((*query_here).querystring, phrases);
474
475 if (phrases.size() > 0) {
476 (*query_here).search_type = 0;
477
478 // set maxdocs to "all"
479 (*query_here).maxdocs = -1;
480
481 // Get the long version of the index and test to see if any indexes with
482 // finer granularity exist. Indexes must be the same type (i.e. same metadata
483 // or "text").
484 text_t longindex; text_tarray splitindex;
485 indexmap.to2from ((*query_here).index, longindex);
486 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
487 text_t &granularity = splitindex[0];
488 text_t &indextype = splitindex[1];
489 bool found = false;
490 // currently supported granularity options are "document", "section" and "paragraph"
491 if (granularity == "document" || granularity == "section") {
492 text_t shortindex;
493 if (indexmap.fromexists ("paragraph:" + indextype)) {
494 indexmap.from2to ("paragraph:" + indextype, shortindex);
495 (*query_here).index = shortindex;
496 found = true;
497 }
498 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
499 indexmap.from2to ("section:" + indextype, shortindex);
500 (*query_here).index = shortindex;
501 }
502 }
503 }
504
505#ifdef GSDL_BBC_COLLECTION
506 // This is a special hack for the BBC collection's ProgNumber and zzabn
507 // indexes (they're built this way to prevent mg_perf_hash_build from
508 // dying at build time)
509
510 // if we're searching the ProgNumber index we want to
511 // remove all non-alphanumeric characters from the query string
512 text_t longindex; text_tarray splitindex;
513 indexmap.to2from ((*query_here).index, longindex);
514 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
515 text_t &indextype = splitindex[1];
516 if (indextype == "ProgNumber") {
517 text_t new_querystring;
518 text_t::const_iterator here = (*query_here).querystring.begin();
519 text_t::const_iterator end = (*query_here).querystring.end();
520 while (here != end) {
521 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
522 (*here >= '0' && *here <= '9')) {
523 new_querystring.push_back (*here);
524 }
525 ++here;
526 }
527 (*query_here).querystring = new_querystring;
528 }
529#endif
530 ++query_here;
531 }
532}
533
Note: See TracBrowser for help on using the repository browser.