source: trunk/gsdl/src/colservr/queryfilter.cpp@ 766

Last change on this file since 766 was 766, checked in by sjboddie, 25 years ago

Added filter option to remove documents not matching a phrase match.
This used to be done in the receptionist.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.5 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: queryfilter.cpp 766 1999-11-01 22:06:06Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.20 1999/11/01 22:06:06 sjboddie
31 Added filter option to remove documents not matching a phrase match.
32 This used to be done in the receptionist.
33
34 Revision 1.19 1999/10/19 03:23:40 davidb
35 Collection building support through web pages
36 and internal and external link handling for collection documents
37
38 Revision 1.18 1999/09/22 03:43:18 sjboddie
39 Endresults queryfilter option may now take '-1' for 'all'
40
41 Revision 1.17 1999/09/21 12:01:07 sjboddie
42 added Maxdocs queryfilter option (which may be -1 for 'all')
43
44 Revision 1.16 1999/09/07 04:57:24 sjboddie
45 added gpl notice
46
47 Revision 1.15 1999/08/31 22:47:09 rjmcnab
48 Added matchmode option for some and all.
49
50 Revision 1.14 1999/07/16 03:42:21 sjboddie
51 changed isApprox
52
53 Revision 1.13 1999/07/16 00:17:06 sjboddie
54 got using phrasesearch for post-processing
55
56 Revision 1.12 1999/07/09 02:19:43 rjmcnab
57 Fixed a couple of compiler conflicts
58
59 Revision 1.11 1999/07/08 20:49:44 rjmcnab
60 Added result_num to the ResultDocInto_t structure.
61
62 Revision 1.10 1999/07/07 06:19:46 rjmcnab
63 Added ability to combine two or more independant queries.
64
65 Revision 1.9 1999/07/01 09:29:20 rjmcnab
66 Changes for better reporting of number documents which match a query. Changes
67 should still work as before with older versions of mg.
68
69 Revision 1.8 1999/07/01 03:59:54 rjmcnab
70 reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
71 method for post-processing the query.
72
73 Revision 1.7 1999/06/30 04:04:13 rjmcnab
74 made stemming functions available from mgsearch and made the stems
75 for the query terms available in queryinfo
76
77 Revision 1.6 1999/06/29 22:06:23 rjmcnab
78 Added a couple of fields to queryinfo to handle a special version
79 of mg.
80
81 Revision 1.5 1999/06/27 22:08:48 sjboddie
82 now check for defaultindex, defaultsubcollection, and defaultlanguage
83 entries in config files
84
85 Revision 1.4 1999/06/16 02:03:25 sjboddie
86 fixed bug in isApprox and set MAXDOCS to always be 500
87
88 Revision 1.3 1999/04/19 23:56:09 rjmcnab
89 Finished the gdbm metadata stuff
90
91 Revision 1.2 1999/04/12 03:45:03 rjmcnab
92 Finished the query filter.
93
94 Revision 1.1 1999/04/06 22:22:09 rjmcnab
95 Initial revision.
96
97 */
98
99
100#include "queryfilter.h"
101#include "fileutil.h"
102#include "queryinfo.h"
103#include "phrasesearch.h"
104#include <assert.h>
105
106
107// some useful functions
108
109// translate will return true if successful
110static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
111 infodbclass info;
112
113 trans_OID.clear();
114
115 // get the info
116 if (gdbmptr == NULL) return false;
117 if (!gdbmptr->getinfo(docnum, info)) return false;
118
119 // translate
120 if (info["section"].empty()) return false;
121
122 trans_OID = info["section"];
123 return true;
124}
125
126
127// whether document results are needed
128static bool need_matching_docs (int filterResultOptions) {
129 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
130 (filterResultOptions & FRmetadata));
131}
132
133// whether term information is needed
134static bool need_term_info (int filterResultOptions) {
135 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
136}
137
138///////////////////////////////
139// methods for resultsorderer_t
140///////////////////////////////
141
142resultsorderer_t::resultsorderer_t() {
143 clear ();
144}
145
146void resultsorderer_t::clear() {
147 compare_phrase_match = false;
148 compare_terms_match = false;
149 compare_doc_weight = true;
150
151 docset = NULL;
152}
153
154bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
155 if (docset == NULL) return t1>t2;
156
157 docresultmap::iterator t1_here = docset->find(t1);
158 docresultmap::iterator t2_here = docset->find(t2);
159 docresultmap::iterator end = docset->end();
160
161 // sort all the document numbers not in the document set to
162 // the end of the list
163 if (t1_here == end) {
164 if (t2_here == end) return t1>t2;
165 else return true;
166 } else if (t2_here == end) return false;
167
168 if (compare_phrase_match) {
169 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
170 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
171 }
172
173 if (compare_terms_match) {
174 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
175 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
176 }
177
178 if (compare_doc_weight) {
179 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
180 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
181 }
182
183 return t1>t2;
184}
185
186
187
188
189/////////////////////////////////
190// functions for queryfilterclass
191/////////////////////////////////
192
193// loads up phrases data structure with any phrases (that's the quoted bits)
194// occuring in the querystring
195void queryfilterclass::get_phrase_terms (const text_t &querystring,
196 const termfreqclassarray &orgterms,
197 vector<termfreqclassarray> &phrases) {
198
199 text_t::const_iterator here = querystring.begin();
200 text_t::const_iterator end = querystring.end();
201
202 termfreqclassarray tmpterms;
203
204 int termcount = 0;
205 bool foundquote = false;
206 bool foundbreak = false;
207 bool start = true;
208 while (here != end) {
209 if (*here == '\"') {
210 if (foundquote) {
211 if (!foundbreak && !start) {
212 tmpterms.push_back (orgterms[termcount]);
213 termcount ++;
214 }
215 if (tmpterms.size() > 1) {
216 phrases.push_back (tmpterms);
217 tmpterms.erase (tmpterms.begin(), tmpterms.end());
218 }
219 foundquote = false;
220 foundbreak = true;
221 } else foundquote = true;
222 } else if (!is_unicode_letdig(*here)) {
223 // found a break between terms
224 if (!foundbreak && !start) {
225 if (foundquote)
226 tmpterms.push_back (orgterms[termcount]);
227 termcount ++;
228 }
229 foundbreak = true;
230 } else {
231 start = false;
232 foundbreak = false;
233 }
234 here++;
235 }
236}
237
238// do aditional query processing
239void queryfilterclass::post_process (const queryparamclass &queryparams,
240 queryresultsclass &queryresults) {
241
242 // post-process the results if needed
243 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
244
245 // get the terms between quotes (if any)
246 vector<termfreqclassarray> phrases;
247 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
248
249 num_phrases = phrases.size();
250 if (num_phrases > 0) {
251
252 // get the long version of the index
253 text_t longindex;
254 indexmap.to2from (queryparams.index, longindex);
255
256 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
257 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
258
259 while (this_phrase != end_phrase) {
260
261 // process each of the matched documents
262 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
263 docresultmap::iterator docs_end = queryresults.docs.docset.end();
264 while (docs_here != docs_end) {
265 if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,
266 queryparams.subcollection, queryparams.language,
267 longindex, queryparams.collection, *this_phrase,
268 (*docs_here).second.docnum)) {
269 (*docs_here).second.num_phrase_match++;
270 }
271
272 docs_here++;
273 }
274 this_phrase++;
275 }
276 }
277 }
278}
279
280// get the query parameters
281void queryfilterclass::parse_query_params (const FilterRequest_t &request,
282 vector<queryparamclass> &query_params,
283 int &startresults, int &endresults,
284 text_t &phrasematch, ostream &logout) {
285 outconvertclass text_t2ascii;
286
287 // set defaults for the return parameters
288 query_params.erase(query_params.begin(), query_params.end());
289 startresults = filterOptions["StartResults"].defaultValue.getint();
290 endresults = filterOptions["EndResults"].defaultValue.getint();
291 phrasematch = filterOptions["PhraseMatch"].defaultValue;
292
293 // set defaults for query parameters
294 queryparamclass query;
295 query.combinequery = "or"; // first one must be "or"
296 query.collection = collection;
297 query.index = filterOptions["Index"].defaultValue;
298 query.subcollection = filterOptions["Subcollection"].defaultValue;
299 query.language = filterOptions["Language"].defaultValue;
300 query.querystring.clear();
301 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
302 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
303 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
304 query.stemming = (filterOptions["Stem"].defaultValue == "true");
305 query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
306
307 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
308 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
309 while (options_here != options_end) {
310 if ((*options_here).name == "CombineQuery") {
311 // add this query
312
313 // "all", needed when combining queries where the document results are needed
314 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
315 query_params.push_back (query);
316
317 // start on next query
318 query.clear();
319 query.combinequery = (*options_here).value;
320
321 // set defaults for query parameters
322 query.collection = collection;
323 query.index = filterOptions["Index"].defaultValue;
324 query.subcollection = filterOptions["Subcollection"].defaultValue;
325 query.language = filterOptions["Language"].defaultValue;
326 query.querystring.clear();
327 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
328 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
329 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
330 query.stemming = (filterOptions["Stem"].defaultValue == "true");
331
332 // "all", needed when combining queries where the document results are needed
333 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
334 else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
335
336 } else if ((*options_here).name == "StartResults") {
337 startresults = (*options_here).value.getint();
338 } else if ((*options_here).name == "EndResults") {
339 endresults = (*options_here).value.getint();
340 } else if ((*options_here).name == "QueryType") {
341 query.search_type = ((*options_here).value == "ranked");
342 } else if ((*options_here).name == "MatchMode") {
343 query.match_mode = ((*options_here).value == "all");
344 if (query.match_mode == 1) query.maxdocs = -1;
345 } else if ((*options_here).name == "Term") {
346 query.querystring = (*options_here).value;
347 } else if ((*options_here).name == "Casefold") {
348 query.casefolding = ((*options_here).value == "true");
349 } else if ((*options_here).name == "Stem") {
350 query.stemming = ((*options_here).value == "true");
351 } else if ((*options_here).name == "Index") {
352 query.index = (*options_here).value;
353 } else if ((*options_here).name == "Subcollection") {
354 query.subcollection = (*options_here).value;
355 } else if ((*options_here).name == "Language") {
356 query.language = (*options_here).value;
357 } else if ((*options_here).name == "Maxdocs") {
358 query.maxdocs = (*options_here).value.getint();
359 } else if ((*options_here).name == "PhraseMatch") {
360 phrasematch = (*options_here).value;
361 } else {
362 logout << text_t2ascii
363 << "warning: unknown queryfilter option \""
364 << (*options_here).name
365 << "\" ignored.\n\n";
366 }
367
368 options_here++;
369 }
370
371 // add the last query
372 query_params.push_back (query);
373}
374
375
376
377// do query that might involve multiple sub queries
378// mgsearchptr and gdbmptr are assumed to be valid
379void queryfilterclass::do_multi_query (const FilterRequest_t &request,
380 const vector<queryparamclass> &query_params,
381 queryresultsclass &multiresults,
382 comerror_t &err, ostream &logout) {
383 outconvertclass text_t2ascii;
384
385 err = noError;
386 mgsearchptr->setcollectdir (collectdir);
387 multiresults.clear();
388
389 vector<queryparamclass>::const_iterator query_here = query_params.begin();
390 vector<queryparamclass>::const_iterator query_end = query_params.end();
391 while (query_here != query_end) {
392 queryresultsclass thisqueryresults;
393
394 if (!mgsearchptr->search(*query_here, thisqueryresults)) {
395 // most likely a system problem
396 logout << text_t2ascii
397 << "system problem: could not do search with mg for index \""
398 << (*query_here).index << (*query_here).subcollection
399 << (*query_here).language << "\".\n\n";
400 err = systemProblem;
401 return;
402 }
403
404 // combine the results
405 if (need_matching_docs (request.filterResultOptions)) {
406 // post-process the results if needed
407 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
408 !thisqueryresults.docs.docset.empty()) {
409 post_process (*query_here, thisqueryresults);
410 thisqueryresults.postprocessed = true;
411 multiresults.postprocessed = true;
412 }
413
414 if (query_params.size() == 1) {
415 multiresults.docs = thisqueryresults.docs; // just one set of results
416 multiresults.docs_matched = thisqueryresults.docs_matched;
417 multiresults.is_approx = thisqueryresults.is_approx;
418
419 } else {
420 if ((*query_here).combinequery == "and") {
421 multiresults.docs.combine_and (thisqueryresults.docs);
422 } else if ((*query_here).combinequery == "or") {
423 multiresults.docs.combine_or (thisqueryresults.docs);
424 } else if ((*query_here).combinequery == "not") {
425 multiresults.docs.combine_not (thisqueryresults.docs);
426 }
427 multiresults.docs_matched = multiresults.docs.docset.size();
428 multiresults.is_approx = Exact;
429 }
430 }
431
432 // combine the term information
433 if (need_term_info (request.filterResultOptions)) {
434 // append the terms
435 multiresults.orgterms.insert(multiresults.orgterms.end(),
436 thisqueryresults.orgterms.begin(),
437 thisqueryresults.orgterms.end());
438
439 // add the term variants
440 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
441 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
442 while (termvar_here != termvar_end) {
443 multiresults.termvariants.insert(*termvar_here);
444 termvar_here++;
445 }
446 }
447
448 query_here++;
449 }
450
451 // sort and unique the query terms
452 multiresults.sortuniqqueryterms ();
453}
454
455
456void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
457 docresultsclass &docs) {
458 resultsorderer_t resultsorderer;
459 resultsorderer.compare_phrase_match = true;
460 resultsorderer.docset = &(docs.docset);
461
462 // first get a list of document numbers
463 docs.docnum_order();
464
465 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
466}
467
468
469
470queryfilterclass::queryfilterclass () {
471 gdbmptr = NULL;
472 mgsearchptr = NULL;
473 num_phrases = 0;
474
475 FilterOption_t filtopt;
476 filtopt.name = "CombineQuery";
477 filtopt.type = FilterOption_t::enumeratedt;
478 filtopt.repeatable = FilterOption_t::onePerQuery;
479 filtopt.defaultValue = "and";
480 filtopt.validValues.push_back("and");
481 filtopt.validValues.push_back("or");
482 filtopt.validValues.push_back("not");
483 filterOptions["CombineQuery"] = filtopt;
484
485 // -- onePerQuery StartResults integer
486 filtopt.clear();
487 filtopt.name = "StartResults";
488 filtopt.type = FilterOption_t::integert;
489 filtopt.repeatable = FilterOption_t::onePerQuery;
490 filtopt.defaultValue = "1";
491 filtopt.validValues.push_back("1");
492 filtopt.validValues.push_back("1000");
493 filterOptions["StartResults"] = filtopt;
494
495 // -- onePerQuery EndResults integer
496 filtopt.clear();
497 filtopt.name = "EndResults";
498 filtopt.type = FilterOption_t::integert;
499 filtopt.repeatable = FilterOption_t::onePerQuery;
500 filtopt.defaultValue = "10";
501 filtopt.validValues.push_back("-1");
502 filtopt.validValues.push_back("1000");
503 filterOptions["EndResults"] = filtopt;
504
505 // -- onePerQuery QueryType enumerated (boolean, ranked)
506 filtopt.clear();
507 filtopt.name = "QueryType";
508 filtopt.type = FilterOption_t::enumeratedt;
509 filtopt.repeatable = FilterOption_t::onePerQuery;
510 filtopt.defaultValue = "ranked";
511 filtopt.validValues.push_back("boolean");
512 filtopt.validValues.push_back("ranked");
513 filterOptions["QueryType"] = filtopt;
514
515 // -- onePerQuery MatchMode enumerated (some, all)
516 filtopt.clear();
517 filtopt.name = "MatchMode";
518 filtopt.type = FilterOption_t::enumeratedt;
519 filtopt.repeatable = FilterOption_t::onePerQuery;
520 filtopt.defaultValue = "some";
521 filtopt.validValues.push_back("some");
522 filtopt.validValues.push_back("all");
523 filterOptions["MatchMode"] = filtopt;
524
525 // -- onePerTerm Term string ???
526 filtopt.clear();
527 filtopt.name = "Term";
528 filtopt.type = FilterOption_t::stringt;
529 filtopt.repeatable = FilterOption_t::onePerTerm;
530 filtopt.defaultValue = "";
531 filterOptions["Term"] = filtopt;
532
533 // -- onePerTerm Casefold boolean
534 filtopt.clear();
535 filtopt.name = "Casefold";
536 filtopt.type = FilterOption_t::booleant;
537 filtopt.repeatable = FilterOption_t::onePerTerm;
538 filtopt.defaultValue = "true";
539 filtopt.validValues.push_back("false");
540 filtopt.validValues.push_back("true");
541 filterOptions["Casefold"] = filtopt;
542
543 // -- onePerTerm Stem boolean
544 filtopt.clear();
545 filtopt.name = "Stem";
546 filtopt.type = FilterOption_t::booleant;
547 filtopt.repeatable = FilterOption_t::onePerTerm;
548 filtopt.defaultValue = "false";
549 filtopt.validValues.push_back("false");
550 filtopt.validValues.push_back("true");
551 filterOptions["Stem"] = filtopt;
552
553 // -- onePerTerm Index enumerated
554 filtopt.clear();
555 filtopt.name = "Index";
556 filtopt.type = FilterOption_t::enumeratedt;
557 filtopt.repeatable = FilterOption_t::onePerTerm;
558 filtopt.defaultValue = "";
559 filterOptions["Index"] = filtopt;
560
561 // -- onePerTerm Subcollection enumerated
562 filtopt.clear();
563 filtopt.name = "Subcollection";
564 filtopt.type = FilterOption_t::enumeratedt;
565 filtopt.repeatable = FilterOption_t::onePerTerm;
566 filtopt.defaultValue = "";
567 filterOptions["Subcollection"] = filtopt;
568
569 // -- onePerTerm Language enumerated
570 filtopt.clear();
571 filtopt.name = "Language";
572 filtopt.type = FilterOption_t::enumeratedt;
573 filtopt.repeatable = FilterOption_t::onePerTerm;
574 filtopt.defaultValue = "";
575 filterOptions["Language"] = filtopt;
576
577 // -- onePerQuery Maxdocs integer
578 filtopt.clear();
579 filtopt.name = "Maxdocs";
580 filtopt.type = FilterOption_t::integert;
581 filtopt.repeatable = FilterOption_t::onePerQuery;
582 filtopt.defaultValue = "200";
583 filtopt.validValues.push_back("-1");
584 filtopt.validValues.push_back("1000");
585 filterOptions["Maxdocs"] = filtopt;
586
587 // -- onePerQuery PhraseMatch enumerated
588 filtopt.clear();
589 filtopt.name = "PhraseMatch";
590 filtopt.type = FilterOption_t::enumeratedt;
591 filtopt.repeatable = FilterOption_t::onePerQuery;
592 filtopt.defaultValue = "some_phrases";
593 filtopt.validValues.push_back ("all_phrases");
594 filtopt.validValues.push_back ("some_phrases");
595 filtopt.validValues.push_back ("all_docs");
596 filterOptions["PhraseMatch"] = filtopt;
597}
598
599queryfilterclass::~queryfilterclass () {
600}
601
602void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
603 filterclass::configure (key, cfgline);
604
605 if (key == "indexmap") {
606 indexmap.importmap (cfgline);
607
608 // update the list of indexes in the filter information
609 text_tarray options;
610 indexmap.gettoarray (options);
611 filterOptions["Index"].validValues = options;
612
613 } else if (key == "defaultindex") {
614 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
615
616 } else if (key == "subcollectionmap") {
617 subcollectionmap.importmap (cfgline);
618
619 // update the list of subcollections in the filter information
620 text_tarray options;
621 subcollectionmap.gettoarray (options);
622 filterOptions["Subcollection"].validValues = options;
623
624 } else if (key == "defaultsubcollection") {
625 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
626
627 } else if (key == "languagemap") {
628 languagemap.importmap (cfgline);
629
630 // update the list of languages in the filter information
631 text_tarray options;
632 languagemap.gettoarray (options);
633 filterOptions["Language"].validValues = options;
634
635 } else if (key == "defaultlanguage")
636 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
637}
638
639bool queryfilterclass::init (ostream &logout) {
640 outconvertclass text_t2ascii;
641
642 if (!filterclass::init(logout)) return false;
643
644 // get the filename for the database and make sure it exists
645 gdbm_filename = filename_cat(collectdir,"index","text",collection);
646
647#ifdef _LITTLE_ENDIAN
648 gdbm_filename += ".ldb";
649#else
650 gdbm_filename += ".bdb";
651#endif
652 if (!file_exists(gdbm_filename)) {
653 logout << text_t2ascii
654 << "warning: gdbm database \"" //****
655 << gdbm_filename << "\" does not exist\n\n";
656 //return false; //****
657 }
658
659 return true;
660}
661
662void queryfilterclass::filter (const FilterRequest_t &request,
663 FilterResponse_t &response,
664 comerror_t &err, ostream &logout) {
665 outconvertclass text_t2ascii;
666
667 response.clear ();
668 err = noError;
669 if (gdbmptr == NULL) {
670 // most likely a configuration problem
671 logout << text_t2ascii
672 << "configuration error: queryfilter contains a null gdbmclass\n\n";
673 err = configurationError;
674 return;
675 }
676 if (mgsearchptr == NULL) {
677 // most likely a configuration problem
678 logout << text_t2ascii
679 << "configuration error: queryfilter contains a null mgsearchclass\n\n";
680 err = configurationError;
681 return;
682 }
683
684 // open the database
685 gdbmptr->setlogout(&logout);
686 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
687 // most likely a system problem (we have already checked that the
688 // gdbm database exists)
689 logout << text_t2ascii
690 << "system problem: open on gdbm database \""
691 << gdbm_filename << "\" failed\n\n";
692 err = systemProblem;
693 return;
694 }
695
696 // get the query parameters
697 int startresults = filterOptions["StartResults"].defaultValue.getint();
698 int endresults = filterOptions["EndResults"].defaultValue.getint();
699 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
700
701 vector<queryparamclass> queryfilterparams;
702 parse_query_params (request, queryfilterparams, startresults,
703 endresults, phrasematch, logout);
704
705 // do query
706 queryresultsclass queryresults;
707 do_multi_query (request, queryfilterparams, queryresults, err, logout);
708 if (err != noError) return;
709
710 // assemble document results
711 if (need_matching_docs (request.filterResultOptions)) {
712 // sort the query results
713 sort_doc_results (request, queryresults.docs);
714
715 int resultnum = 1;
716 ResultDocInfo_t resultdoc;
717 text_t trans_OID;
718 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
719 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
720
721 if (endresults == -1) endresults = MAXNUMDOCS;
722 while (docorder_here != docorder_end) {
723 if (resultnum > endresults) break;
724
725 // translate the document number
726 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
727 logout << text_t2ascii
728 << "warning: could not translate mg document number \""
729 << *docorder_here << "\"to OID.\n\n";
730
731 } else {
732 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
733
734 // documents containing matching phrases will be sorted to the top so
735 // we can break out once we're past those that match the PhraseMatch
736 // option -- "all_phrases" = return only those documents containing all
737 // phrases in query string
738 // "some_phrases" = return only those documents containing
739 // at least 1 of the phrases in the document
740 // "all_docs" = return all documents regardless
741 if (num_phrases > 0) {
742 if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases))
743 break;
744 if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))
745 break;
746 }
747
748 // see if there is a result for this number,
749 // if it is in the request set (or the request set is empty)
750 if (docset_here != queryresults.docs.docset.end() &&
751 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
752 if (resultnum >= startresults) {
753 // add this document
754 resultdoc.OID = trans_OID;
755 resultdoc.result_num = resultnum;
756 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
757
758 // these next two are not available on all versions of mg
759 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
760 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
761
762 response.docInfo.push_back (resultdoc);
763 }
764
765 resultnum++;
766 }
767 }
768
769 docorder_here++;
770 }
771 }
772
773 // assemble the term results
774 if (need_term_info(request.filterResultOptions)) {
775 // note: the terms have already been sorted and uniqued
776
777 TermInfo_t terminfo;
778 bool terms_first = true;
779 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
780 termfreqclassarray::iterator terms_end = queryresults.terms.end();
781
782 while (terms_here != terms_end) {
783 terminfo.clear();
784 terminfo.term = (*terms_here).termstr;
785 terminfo.freq = (*terms_here).termfreq;
786 if (terms_first) {
787 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
788 text_tset::iterator termvariants_end = queryresults.termvariants.end();
789 while (termvariants_here != termvariants_end) {
790 terminfo.matchTerms.push_back (*termvariants_here);
791 termvariants_here++;
792 }
793 }
794 terms_first = false;
795
796 response.termInfo.push_back (terminfo);
797
798 terms_here++;
799 }
800 }
801
802 response.numDocs = queryresults.docs_matched;
803 response.isApprox = queryresults.is_approx;
804}
Note: See TracBrowser for help on using the repository browser.