source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 10004

Last change on this file since 10004 was 9937, checked in by kjdon, 19 years ago

Modified the filters/sources etc. so that if an indexstem is specified in the build.cfg file, it is used as the root of the index/gdbm filenames instead of the collection name. The collection name is still used by default. This means that a collection directory can be renamed without rebuilding.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include <assert.h>
30#include "mgsearch.h"
31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
48bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
87
88void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89 queryfilterclass::configure (key, cfgline);
90
91 if (key == "maxnumeric") {
92 maxnumeric = cfgline[0].getint();
93 }
94 else if (key == "indexstem") {
95 ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
96 }
97
98}
99
100// loads up phrases data structure with any phrases (that's the quoted bits)
101// occuring in the querystring
102void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
103 const termfreqclassarray &orgterms,
104 vector<termfreqclassarray> &phrases) {
105
106 text_t::const_iterator here = querystring.begin();
107 text_t::const_iterator end = querystring.end();
108
109 termfreqclassarray tmpterms;
110
111 int termcount = 0;
112 bool foundquote = false;
113 bool foundbreak = false;
114 bool start = true;
115 while (here != end) {
116 if (*here == '\"') {
117 if (foundquote) {
118 if (!foundbreak && !start) {
119 tmpterms.push_back (orgterms[termcount]);
120 ++termcount;
121 }
122 if (tmpterms.size() > 1) {
123 phrases.push_back (tmpterms);
124 tmpterms.erase (tmpterms.begin(), tmpterms.end());
125 }
126 foundquote = false;
127 foundbreak = true;
128 } else foundquote = true;
129 } else if (!is_unicode_letdig(*here)) {
130 // found a break between terms
131 if (!foundbreak && !start) {
132 if (foundquote)
133 tmpterms.push_back (orgterms[termcount]);
134 ++termcount;
135 }
136 foundbreak = true;
137 } else {
138 start = false;
139 foundbreak = false;
140 }
141 ++here;
142 }
143}
144
145// do aditional query processing
146void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
147 queryresultsclass &queryresults) {
148
149 // post-process the results if needed
150 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
151
152 // get the terms between quotes (if any)
153 vector<termfreqclassarray> phrases;
154 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
155
156 num_phrases = phrases.size();
157 if (num_phrases > 0) {
158
159 // get the long version of the index
160 text_t longindex;
161 indexmap.to2from (queryparams.index, longindex);
162
163 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
164 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
165
166 while (this_phrase != end_phrase) {
167
168 // process each of the matched documents
169 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
170 docresultmap::iterator docs_end = queryresults.docs.docset.end();
171 while (docs_here != docs_end) {
172 if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *gdbmptr, queryparams.index,
173 queryparams.subcollection, queryparams.language,
174 longindex, queryparams.collection, *this_phrase,
175 (*docs_here).second.docnum)) {
176 ++docs_here->second.num_phrase_match;
177 }
178
179 ++docs_here;
180 }
181 ++this_phrase;
182 }
183 }
184 }
185}
186
187
188// do query that might involve multiple sub queries
189// mgsearchptr and gdbmptr are assumed to be valid
190void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
191 const vector<queryparamclass> &query_params,
192 queryresultsclass &multiresults,
193 comerror_t &err, ostream &logout) {
194 outconvertclass text_t2ascii;
195
196 err = noError;
197 textsearchptr->setcollectdir (collectdir);
198 multiresults.clear();
199
200 vector<queryparamclass>::const_iterator query_here = query_params.begin();
201 vector<queryparamclass>::const_iterator query_end = query_params.end();
202 while (query_here != query_end) {
203 queryresultsclass thisqueryresults;
204
205 if (!textsearchptr->search(*query_here, thisqueryresults)) {
206 // most likely a system problem
207 logout << text_t2ascii
208 << "system problem: could not do search with mg for index \""
209 << (*query_here).index << (*query_here).subcollection
210 << (*query_here).language << "\".\n\n";
211 err = systemProblem;
212 return;
213 }
214
215 // combine the results
216 if (need_matching_docs (request.filterResultOptions)) {
217 // post-process the results if needed
218 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219 !thisqueryresults.docs.docset.empty()) {
220 post_process (*query_here, thisqueryresults);
221 thisqueryresults.postprocessed = true;
222 multiresults.postprocessed = true;
223 } else {
224 num_phrases = 0;
225 }
226
227 if (query_params.size() == 1) {
228 multiresults.docs = thisqueryresults.docs; // just one set of results
229 multiresults.docs_matched = thisqueryresults.docs_matched;
230 multiresults.is_approx = thisqueryresults.is_approx;
231
232 } else {
233 if ((*query_here).combinequery == "and") {
234 multiresults.docs.combine_and (thisqueryresults.docs);
235 } else if ((*query_here).combinequery == "or") {
236 multiresults.docs.combine_or (thisqueryresults.docs);
237 } else if ((*query_here).combinequery == "not") {
238 multiresults.docs.combine_not (thisqueryresults.docs);
239 }
240 multiresults.docs_matched = multiresults.docs.docset.size();
241 multiresults.is_approx = Exact;
242 }
243 }
244
245 // combine the term information
246 if (need_term_info (request.filterResultOptions)) {
247 // append the terms
248 multiresults.orgterms.insert(multiresults.orgterms.end(),
249 thisqueryresults.orgterms.begin(),
250 thisqueryresults.orgterms.end());
251
252 // add the term variants
253 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255 while (termvar_here != termvar_end) {
256 multiresults.termvariants.insert(*termvar_here);
257 ++termvar_here;
258 }
259 }
260
261 ++query_here;
262 }
263
264 // sort and unique the query terms
265 multiresults.sortuniqqueryterms ();
266}
267
268
269void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
270 docresultsclass &docs) {
271 resultsorderer_t resultsorderer;
272 resultsorderer.compare_phrase_match = true;
273 resultsorderer.docset = &(docs.docset);
274
275 // first get a list of document numbers
276 docs.docnum_order();
277
278 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279}
280
281
282
283mgqueryfilterclass::mgqueryfilterclass ()
284 :queryfilterclass() {
285
286 num_phrases = 0;
287 maxnumeric = 4;
288}
289
290mgqueryfilterclass::~mgqueryfilterclass () {
291}
292
293void mgqueryfilterclass::filter (const FilterRequest_t &request,
294 FilterResponse_t &response,
295 comerror_t &err, ostream &logout) {
296 outconvertclass text_t2ascii;
297
298 response.clear ();
299 err = noError;
300 if (gdbmptr == NULL) {
301 // most likely a configuration problem
302 logout << text_t2ascii
303 << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
304 err = configurationError;
305 return;
306 }
307 if (textsearchptr == NULL) {
308 // most likely a configuration problem
309 logout << text_t2ascii
310 << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
311 err = configurationError;
312 return;
313 }
314
315 // open the database
316 gdbmptr->setlogout(&logout);
317 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
318 // most likely a system problem (we have already checked that the
319 // gdbm database exists)
320 logout << text_t2ascii
321 << "system problem: open on gdbm database \""
322 << gdbm_filename << "\" failed\n\n";
323 err = systemProblem;
324 return;
325 }
326
327 // get the query parameters
328 int startresults = filterOptions["StartResults"].defaultValue.getint();
329 int endresults = filterOptions["EndResults"].defaultValue.getint();
330 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
331
332 vector<queryparamclass> queryfilterparams;
333 parse_query_params (request, queryfilterparams, startresults,
334 endresults, phrasematch, logout);
335 // do any mg specific diddling with query parameters that may be required
336 mg_parse_query_params (request, queryfilterparams, startresults,
337 endresults, phrasematch, logout);
338
339
340 // do query
341 queryresultsclass queryresults;
342 do_multi_query (request, queryfilterparams, queryresults, err, logout);
343 if (err != noError) return;
344
345 // assemble document results
346 if (need_matching_docs (request.filterResultOptions)) {
347 // sort the query results
348 // only want to sort the docs if we have done a ranked search or there were phrases
349 if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
350 sort_doc_results (request, queryresults.docs);
351 }
352 int resultnum = 1;
353 ResultDocInfo_t resultdoc;
354 text_t trans_OID;
355 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
356 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
357
358 // documents containing matching phrases will be sorted to the top so
359 // we can break out once we're past those that match the PhraseMatch
360 // option -- "all_phrases" = return only those documents containing all
361 // phrases in query string
362 // "some_phrases" = return only those documents containing
363 // at least 1 of the phrases in the document
364 // "all_docs" = return all documents regardless
365 if (num_phrases > 0) {
366 int numdocs = 0;
367 while (docorder_here != docorder_end) {
368 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
369
370 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
371 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
372 queryresults.docs_matched = numdocs;
373 break;
374 }
375 ++numdocs;
376 ++docorder_here;
377 }
378 }
379
380 if (endresults == -1) endresults = MAXNUMDOCS;
381 docorder_here = queryresults.docs.docorder.begin();
382 while (docorder_here != docorder_end) {
383 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
384
385 // translate the document number
386 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
387 logout << text_t2ascii
388 << "warning: could not translate mg document number \""
389 << *docorder_here << "\"to OID.\n\n";
390
391 } else {
392 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
393
394 // see if there is a result for this number,
395 // if it is in the request set (or the request set is empty)
396 if (docset_here != queryresults.docs.docset.end() &&
397 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
398 if (resultnum >= startresults) {
399 // add this document
400 resultdoc.OID = trans_OID;
401 resultdoc.result_num = resultnum;
402 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
403
404 // these next two are not available on all versions of mg
405 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
406 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
407
408 response.docInfo.push_back (resultdoc);
409 }
410
411 ++resultnum;
412 }
413 }
414
415 ++docorder_here;
416 }
417 }
418
419 // assemble the term results
420 if (need_term_info(request.filterResultOptions)) {
421 // note: the terms have already been sorted and uniqued
422
423 TermInfo_t terminfo;
424 bool terms_first = true;
425 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
426 termfreqclassarray::iterator terms_end = queryresults.terms.end();
427
428 while (terms_here != terms_end) {
429 terminfo.clear();
430 terminfo.term = (*terms_here).termstr;
431 terminfo.freq = (*terms_here).termfreq;
432 if (terms_first) {
433 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
434 text_tset::iterator termvariants_end = queryresults.termvariants.end();
435 while (termvariants_here != termvariants_end) {
436 terminfo.matchTerms.push_back (*termvariants_here);
437 ++termvariants_here;
438 }
439 }
440 terms_first = false;
441
442 response.termInfo.push_back (terminfo);
443
444 ++terms_here;
445 }
446 }
447
448 response.numDocs = queryresults.docs_matched;
449 response.isApprox = queryresults.is_approx;
450}
451
452void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
453 vector<queryparamclass> &query_params,
454 int &/*startresults*/, int &/*endresults*/,
455 text_t &/*phrasematch*/, ostream &/*logout*/) {
456
457 // outconvertclass text_t2ascii;
458
459 vector<queryparamclass>::iterator query_here = query_params.begin();
460 vector<queryparamclass>::iterator query_end = query_params.end();
461 while (query_here != query_end) {
462
463 // set maxnumeric
464 (*query_here).maxnumeric = maxnumeric;
465
466 // if we're doing a phrase search we want to maximise hits by making it
467 // a boolean search on the index with the finest granularity - we'll
468 // also set maxdocs to "all" (realizing that this will cause searches
469 // like "and the" on a large collection to take a very very long time).
470
471 // we're deciding it's a phrase search based on if the querystring
472 // contains at least 2 double quotes (not very scientific but
473 // then neither is the rest of the mg phrase searching functionality :-)
474 if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
475 (*query_here).search_type = 0;
476
477 // set maxdocs to "all"
478 (*query_here).maxdocs = -1;
479
480 // Get the long version of the index and test to see if any indexes with
481 // finer granularity exist. Indexes must be the same type (i.e. same metadata
482 // or "text").
483 text_t longindex; text_tarray splitindex;
484 indexmap.to2from ((*query_here).index, longindex);
485 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
486 text_t &granularity = splitindex[0];
487 text_t &indextype = splitindex[1];
488 bool found = false;
489 // currently supported granularity options are "document", "section" and "paragraph"
490 if (granularity == "document" || granularity == "section") {
491 text_t shortindex;
492 if (indexmap.fromexists ("paragraph:" + indextype)) {
493 indexmap.from2to ("paragraph:" + indextype, shortindex);
494 (*query_here).index = shortindex;
495 found = true;
496 }
497 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
498 indexmap.from2to ("section:" + indextype, shortindex);
499 (*query_here).index = shortindex;
500 }
501 }
502 }
503
504#ifdef GSDL_BBC_COLLECTION
505 // This is a special hack for the BBC collection's ProgNumber and zzabn
506 // indexes (they're built this way to prevent mg_perf_hash_build from
507 // dying at build time)
508
509 // if we're searching the ProgNumber index we want to
510 // remove all non-alphanumeric characters from the query string
511 text_t longindex; text_tarray splitindex;
512 indexmap.to2from ((*query_here).index, longindex);
513 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
514 text_t &indextype = splitindex[1];
515 if (indextype == "ProgNumber") {
516 text_t new_querystring;
517 text_t::const_iterator here = (*query_here).querystring.begin();
518 text_t::const_iterator end = (*query_here).querystring.end();
519 while (here != end) {
520 if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
521 (*here >= '0' && *here <= '9')) {
522 new_querystring.push_back (*here);
523 }
524 ++here;
525 }
526 (*query_here).querystring = new_querystring;
527 }
528#endif
529 ++query_here;
530 }
531}
532
Note: See TracBrowser for help on using the repository browser.