source: trunk/gsdl/src/colservr/queryfilter.cpp@ 990

Last change on this file since 990 was 990, checked in by sjboddie, 24 years ago

tidied up endianness and fastcgi

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.8 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: queryfilter.cpp 990 2000-02-29 01:35:56Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.22 2000/02/29 01:35:56 sjboddie
31 tidied up endianness and fastcgi
32
33 Revision 1.21 1999/11/25 02:21:13 sjboddie
34 fixed bug in phrasematch stuff
35
36 Revision 1.20 1999/11/01 22:06:06 sjboddie
37 Added filter option to remove documents not matching a phrase match.
38 This used to be done in the receptionist.
39
40 Revision 1.19 1999/10/19 03:23:40 davidb
41 Collection building support through web pages
42 and internal and external link handling for collection documents
43
44 Revision 1.18 1999/09/22 03:43:18 sjboddie
45 Endresults queryfilter option may now take '-1' for 'all'
46
47 Revision 1.17 1999/09/21 12:01:07 sjboddie
48 added Maxdocs queryfilter option (which may be -1 for 'all')
49
50 Revision 1.16 1999/09/07 04:57:24 sjboddie
51 added gpl notice
52
53 Revision 1.15 1999/08/31 22:47:09 rjmcnab
54 Added matchmode option for some and all.
55
56 Revision 1.14 1999/07/16 03:42:21 sjboddie
57 changed isApprox
58
59 Revision 1.13 1999/07/16 00:17:06 sjboddie
60 got using phrasesearch for post-processing
61
62 Revision 1.12 1999/07/09 02:19:43 rjmcnab
63 Fixed a couple of compiler conflicts
64
65 Revision 1.11 1999/07/08 20:49:44 rjmcnab
66 Added result_num to the ResultDocInto_t structure.
67
68 Revision 1.10 1999/07/07 06:19:46 rjmcnab
69 Added ability to combine two or more independant queries.
70
71 Revision 1.9 1999/07/01 09:29:20 rjmcnab
72 Changes for better reporting of number documents which match a query. Changes
73 should still work as before with older versions of mg.
74
75 Revision 1.8 1999/07/01 03:59:54 rjmcnab
76 reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
77 method for post-processing the query.
78
79 Revision 1.7 1999/06/30 04:04:13 rjmcnab
80 made stemming functions available from mgsearch and made the stems
81 for the query terms available in queryinfo
82
83 Revision 1.6 1999/06/29 22:06:23 rjmcnab
84 Added a couple of fields to queryinfo to handle a special version
85 of mg.
86
87 Revision 1.5 1999/06/27 22:08:48 sjboddie
88 now check for defaultindex, defaultsubcollection, and defaultlanguage
89 entries in config files
90
91 Revision 1.4 1999/06/16 02:03:25 sjboddie
92 fixed bug in isApprox and set MAXDOCS to always be 500
93
94 Revision 1.3 1999/04/19 23:56:09 rjmcnab
95 Finished the gdbm metadata stuff
96
97 Revision 1.2 1999/04/12 03:45:03 rjmcnab
98 Finished the query filter.
99
100 Revision 1.1 1999/04/06 22:22:09 rjmcnab
101 Initial revision.
102
103 */
104
105
106#include "queryfilter.h"
107#include "fileutil.h"
108#include "queryinfo.h"
109#include "phrasesearch.h"
110#include "gsdltools.h"
111#include <assert.h>
112
113
114// some useful functions
115
116// translate will return true if successful
117static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
118 infodbclass info;
119
120 trans_OID.clear();
121
122 // get the info
123 if (gdbmptr == NULL) return false;
124 if (!gdbmptr->getinfo(docnum, info)) return false;
125
126 // translate
127 if (info["section"].empty()) return false;
128
129 trans_OID = info["section"];
130 return true;
131}
132
133
134// whether document results are needed
135static bool need_matching_docs (int filterResultOptions) {
136 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
137 (filterResultOptions & FRmetadata));
138}
139
140// whether term information is needed
141static bool need_term_info (int filterResultOptions) {
142 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
143}
144
145///////////////////////////////
146// methods for resultsorderer_t
147///////////////////////////////
148
149resultsorderer_t::resultsorderer_t() {
150 clear ();
151}
152
153void resultsorderer_t::clear() {
154 compare_phrase_match = false;
155 compare_terms_match = false;
156 compare_doc_weight = true;
157
158 docset = NULL;
159}
160
161bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
162 if (docset == NULL) return t1>t2;
163
164 docresultmap::iterator t1_here = docset->find(t1);
165 docresultmap::iterator t2_here = docset->find(t2);
166 docresultmap::iterator end = docset->end();
167
168 // sort all the document numbers not in the document set to
169 // the end of the list
170 if (t1_here == end) {
171 if (t2_here == end) return t1>t2;
172 else return true;
173 } else if (t2_here == end) return false;
174
175 if (compare_phrase_match) {
176 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
177 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
178 }
179
180 if (compare_terms_match) {
181 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
182 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
183 }
184
185 if (compare_doc_weight) {
186 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
187 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
188 }
189
190 return t1>t2;
191}
192
193
194
195
196/////////////////////////////////
197// functions for queryfilterclass
198/////////////////////////////////
199
200// loads up phrases data structure with any phrases (that's the quoted bits)
201// occuring in the querystring
202void queryfilterclass::get_phrase_terms (const text_t &querystring,
203 const termfreqclassarray &orgterms,
204 vector<termfreqclassarray> &phrases) {
205
206 text_t::const_iterator here = querystring.begin();
207 text_t::const_iterator end = querystring.end();
208
209 termfreqclassarray tmpterms;
210
211 int termcount = 0;
212 bool foundquote = false;
213 bool foundbreak = false;
214 bool start = true;
215 while (here != end) {
216 if (*here == '\"') {
217 if (foundquote) {
218 if (!foundbreak && !start) {
219 tmpterms.push_back (orgterms[termcount]);
220 termcount ++;
221 }
222 if (tmpterms.size() > 1) {
223 phrases.push_back (tmpterms);
224 tmpterms.erase (tmpterms.begin(), tmpterms.end());
225 }
226 foundquote = false;
227 foundbreak = true;
228 } else foundquote = true;
229 } else if (!is_unicode_letdig(*here)) {
230 // found a break between terms
231 if (!foundbreak && !start) {
232 if (foundquote)
233 tmpterms.push_back (orgterms[termcount]);
234 termcount ++;
235 }
236 foundbreak = true;
237 } else {
238 start = false;
239 foundbreak = false;
240 }
241 here++;
242 }
243}
244
245// do aditional query processing
246void queryfilterclass::post_process (const queryparamclass &queryparams,
247 queryresultsclass &queryresults) {
248
249 // post-process the results if needed
250 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
251
252 // get the terms between quotes (if any)
253 vector<termfreqclassarray> phrases;
254 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
255
256 num_phrases = phrases.size();
257 if (num_phrases > 0) {
258
259 // get the long version of the index
260 text_t longindex;
261 indexmap.to2from (queryparams.index, longindex);
262
263 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
264 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
265
266 while (this_phrase != end_phrase) {
267
268 // process each of the matched documents
269 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
270 docresultmap::iterator docs_end = queryresults.docs.docset.end();
271 while (docs_here != docs_end) {
272 if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,
273 queryparams.subcollection, queryparams.language,
274 longindex, queryparams.collection, *this_phrase,
275 (*docs_here).second.docnum)) {
276 (*docs_here).second.num_phrase_match++;
277 }
278
279 docs_here++;
280 }
281 this_phrase++;
282 }
283 }
284 }
285}
286
287// get the query parameters
288void queryfilterclass::parse_query_params (const FilterRequest_t &request,
289 vector<queryparamclass> &query_params,
290 int &startresults, int &endresults,
291 text_t &phrasematch, ostream &logout) {
292 outconvertclass text_t2ascii;
293
294 // set defaults for the return parameters
295 query_params.erase(query_params.begin(), query_params.end());
296 startresults = filterOptions["StartResults"].defaultValue.getint();
297 endresults = filterOptions["EndResults"].defaultValue.getint();
298 phrasematch = filterOptions["PhraseMatch"].defaultValue;
299
300 // set defaults for query parameters
301 queryparamclass query;
302 query.combinequery = "or"; // first one must be "or"
303 query.collection = collection;
304 query.index = filterOptions["Index"].defaultValue;
305 query.subcollection = filterOptions["Subcollection"].defaultValue;
306 query.language = filterOptions["Language"].defaultValue;
307 query.querystring.clear();
308 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
309 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
310 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
311 query.stemming = (filterOptions["Stem"].defaultValue == "true");
312 query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
313
314 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
315 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
316 while (options_here != options_end) {
317 if ((*options_here).name == "CombineQuery") {
318 // add this query
319
320 // "all", needed when combining queries where the document results are needed
321 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
322 query_params.push_back (query);
323
324 // start on next query
325 query.clear();
326 query.combinequery = (*options_here).value;
327
328 // set defaults for query parameters
329 query.collection = collection;
330 query.index = filterOptions["Index"].defaultValue;
331 query.subcollection = filterOptions["Subcollection"].defaultValue;
332 query.language = filterOptions["Language"].defaultValue;
333 query.querystring.clear();
334 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
335 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
336 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
337 query.stemming = (filterOptions["Stem"].defaultValue == "true");
338
339 // "all", needed when combining queries where the document results are needed
340 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
341 else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
342
343 } else if ((*options_here).name == "StartResults") {
344 startresults = (*options_here).value.getint();
345 } else if ((*options_here).name == "EndResults") {
346 endresults = (*options_here).value.getint();
347 } else if ((*options_here).name == "QueryType") {
348 query.search_type = ((*options_here).value == "ranked");
349 } else if ((*options_here).name == "MatchMode") {
350 query.match_mode = ((*options_here).value == "all");
351 if (query.match_mode == 1) query.maxdocs = -1;
352 } else if ((*options_here).name == "Term") {
353 query.querystring = (*options_here).value;
354 } else if ((*options_here).name == "Casefold") {
355 query.casefolding = ((*options_here).value == "true");
356 } else if ((*options_here).name == "Stem") {
357 query.stemming = ((*options_here).value == "true");
358 } else if ((*options_here).name == "Index") {
359 query.index = (*options_here).value;
360 } else if ((*options_here).name == "Subcollection") {
361 query.subcollection = (*options_here).value;
362 } else if ((*options_here).name == "Language") {
363 query.language = (*options_here).value;
364 } else if ((*options_here).name == "Maxdocs") {
365 query.maxdocs = (*options_here).value.getint();
366 } else if ((*options_here).name == "PhraseMatch") {
367 phrasematch = (*options_here).value;
368 } else {
369 logout << text_t2ascii
370 << "warning: unknown queryfilter option \""
371 << (*options_here).name
372 << "\" ignored.\n\n";
373 }
374
375 options_here++;
376 }
377
378 // add the last query
379 query_params.push_back (query);
380}
381
382
383
384// do query that might involve multiple sub queries
385// mgsearchptr and gdbmptr are assumed to be valid
386void queryfilterclass::do_multi_query (const FilterRequest_t &request,
387 const vector<queryparamclass> &query_params,
388 queryresultsclass &multiresults,
389 comerror_t &err, ostream &logout) {
390 outconvertclass text_t2ascii;
391
392 err = noError;
393 mgsearchptr->setcollectdir (collectdir);
394 multiresults.clear();
395
396 vector<queryparamclass>::const_iterator query_here = query_params.begin();
397 vector<queryparamclass>::const_iterator query_end = query_params.end();
398 while (query_here != query_end) {
399 queryresultsclass thisqueryresults;
400
401 if (!mgsearchptr->search(*query_here, thisqueryresults)) {
402 // most likely a system problem
403 logout << text_t2ascii
404 << "system problem: could not do search with mg for index \""
405 << (*query_here).index << (*query_here).subcollection
406 << (*query_here).language << "\".\n\n";
407 err = systemProblem;
408 return;
409 }
410
411 // combine the results
412 if (need_matching_docs (request.filterResultOptions)) {
413 // post-process the results if needed
414 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
415 !thisqueryresults.docs.docset.empty()) {
416 post_process (*query_here, thisqueryresults);
417 thisqueryresults.postprocessed = true;
418 multiresults.postprocessed = true;
419 }
420
421 if (query_params.size() == 1) {
422 multiresults.docs = thisqueryresults.docs; // just one set of results
423 multiresults.docs_matched = thisqueryresults.docs_matched;
424 multiresults.is_approx = thisqueryresults.is_approx;
425
426 } else {
427 if ((*query_here).combinequery == "and") {
428 multiresults.docs.combine_and (thisqueryresults.docs);
429 } else if ((*query_here).combinequery == "or") {
430 multiresults.docs.combine_or (thisqueryresults.docs);
431 } else if ((*query_here).combinequery == "not") {
432 multiresults.docs.combine_not (thisqueryresults.docs);
433 }
434 multiresults.docs_matched = multiresults.docs.docset.size();
435 multiresults.is_approx = Exact;
436 }
437 }
438
439 // combine the term information
440 if (need_term_info (request.filterResultOptions)) {
441 // append the terms
442 multiresults.orgterms.insert(multiresults.orgterms.end(),
443 thisqueryresults.orgterms.begin(),
444 thisqueryresults.orgterms.end());
445
446 // add the term variants
447 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
448 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
449 while (termvar_here != termvar_end) {
450 multiresults.termvariants.insert(*termvar_here);
451 termvar_here++;
452 }
453 }
454
455 query_here++;
456 }
457
458 // sort and unique the query terms
459 multiresults.sortuniqqueryterms ();
460}
461
462
463void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
464 docresultsclass &docs) {
465 resultsorderer_t resultsorderer;
466 resultsorderer.compare_phrase_match = true;
467 resultsorderer.docset = &(docs.docset);
468
469 // first get a list of document numbers
470 docs.docnum_order();
471
472 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
473}
474
475
476
477queryfilterclass::queryfilterclass () {
478 gdbmptr = NULL;
479 mgsearchptr = NULL;
480 num_phrases = 0;
481
482 FilterOption_t filtopt;
483 filtopt.name = "CombineQuery";
484 filtopt.type = FilterOption_t::enumeratedt;
485 filtopt.repeatable = FilterOption_t::onePerQuery;
486 filtopt.defaultValue = "and";
487 filtopt.validValues.push_back("and");
488 filtopt.validValues.push_back("or");
489 filtopt.validValues.push_back("not");
490 filterOptions["CombineQuery"] = filtopt;
491
492 // -- onePerQuery StartResults integer
493 filtopt.clear();
494 filtopt.name = "StartResults";
495 filtopt.type = FilterOption_t::integert;
496 filtopt.repeatable = FilterOption_t::onePerQuery;
497 filtopt.defaultValue = "1";
498 filtopt.validValues.push_back("1");
499 filtopt.validValues.push_back("1000");
500 filterOptions["StartResults"] = filtopt;
501
502 // -- onePerQuery EndResults integer
503 filtopt.clear();
504 filtopt.name = "EndResults";
505 filtopt.type = FilterOption_t::integert;
506 filtopt.repeatable = FilterOption_t::onePerQuery;
507 filtopt.defaultValue = "10";
508 filtopt.validValues.push_back("-1");
509 filtopt.validValues.push_back("1000");
510 filterOptions["EndResults"] = filtopt;
511
512 // -- onePerQuery QueryType enumerated (boolean, ranked)
513 filtopt.clear();
514 filtopt.name = "QueryType";
515 filtopt.type = FilterOption_t::enumeratedt;
516 filtopt.repeatable = FilterOption_t::onePerQuery;
517 filtopt.defaultValue = "ranked";
518 filtopt.validValues.push_back("boolean");
519 filtopt.validValues.push_back("ranked");
520 filterOptions["QueryType"] = filtopt;
521
522 // -- onePerQuery MatchMode enumerated (some, all)
523 filtopt.clear();
524 filtopt.name = "MatchMode";
525 filtopt.type = FilterOption_t::enumeratedt;
526 filtopt.repeatable = FilterOption_t::onePerQuery;
527 filtopt.defaultValue = "some";
528 filtopt.validValues.push_back("some");
529 filtopt.validValues.push_back("all");
530 filterOptions["MatchMode"] = filtopt;
531
532 // -- onePerTerm Term string ???
533 filtopt.clear();
534 filtopt.name = "Term";
535 filtopt.type = FilterOption_t::stringt;
536 filtopt.repeatable = FilterOption_t::onePerTerm;
537 filtopt.defaultValue = "";
538 filterOptions["Term"] = filtopt;
539
540 // -- onePerTerm Casefold boolean
541 filtopt.clear();
542 filtopt.name = "Casefold";
543 filtopt.type = FilterOption_t::booleant;
544 filtopt.repeatable = FilterOption_t::onePerTerm;
545 filtopt.defaultValue = "true";
546 filtopt.validValues.push_back("false");
547 filtopt.validValues.push_back("true");
548 filterOptions["Casefold"] = filtopt;
549
550 // -- onePerTerm Stem boolean
551 filtopt.clear();
552 filtopt.name = "Stem";
553 filtopt.type = FilterOption_t::booleant;
554 filtopt.repeatable = FilterOption_t::onePerTerm;
555 filtopt.defaultValue = "false";
556 filtopt.validValues.push_back("false");
557 filtopt.validValues.push_back("true");
558 filterOptions["Stem"] = filtopt;
559
560 // -- onePerTerm Index enumerated
561 filtopt.clear();
562 filtopt.name = "Index";
563 filtopt.type = FilterOption_t::enumeratedt;
564 filtopt.repeatable = FilterOption_t::onePerTerm;
565 filtopt.defaultValue = "";
566 filterOptions["Index"] = filtopt;
567
568 // -- onePerTerm Subcollection enumerated
569 filtopt.clear();
570 filtopt.name = "Subcollection";
571 filtopt.type = FilterOption_t::enumeratedt;
572 filtopt.repeatable = FilterOption_t::onePerTerm;
573 filtopt.defaultValue = "";
574 filterOptions["Subcollection"] = filtopt;
575
576 // -- onePerTerm Language enumerated
577 filtopt.clear();
578 filtopt.name = "Language";
579 filtopt.type = FilterOption_t::enumeratedt;
580 filtopt.repeatable = FilterOption_t::onePerTerm;
581 filtopt.defaultValue = "";
582 filterOptions["Language"] = filtopt;
583
584 // -- onePerQuery Maxdocs integer
585 filtopt.clear();
586 filtopt.name = "Maxdocs";
587 filtopt.type = FilterOption_t::integert;
588 filtopt.repeatable = FilterOption_t::onePerQuery;
589 filtopt.defaultValue = "200";
590 filtopt.validValues.push_back("-1");
591 filtopt.validValues.push_back("1000");
592 filterOptions["Maxdocs"] = filtopt;
593
594 // -- onePerQuery PhraseMatch enumerated
595 filtopt.clear();
596 filtopt.name = "PhraseMatch";
597 filtopt.type = FilterOption_t::enumeratedt;
598 filtopt.repeatable = FilterOption_t::onePerQuery;
599 filtopt.defaultValue = "some_phrases";
600 filtopt.validValues.push_back ("all_phrases");
601 filtopt.validValues.push_back ("some_phrases");
602 filtopt.validValues.push_back ("all_docs");
603 filterOptions["PhraseMatch"] = filtopt;
604}
605
606queryfilterclass::~queryfilterclass () {
607}
608
609void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
610 filterclass::configure (key, cfgline);
611
612 if (key == "indexmap") {
613 indexmap.importmap (cfgline);
614
615 // update the list of indexes in the filter information
616 text_tarray options;
617 indexmap.gettoarray (options);
618 filterOptions["Index"].validValues = options;
619
620 } else if (key == "defaultindex") {
621 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
622
623 } else if (key == "subcollectionmap") {
624 subcollectionmap.importmap (cfgline);
625
626 // update the list of subcollections in the filter information
627 text_tarray options;
628 subcollectionmap.gettoarray (options);
629 filterOptions["Subcollection"].validValues = options;
630
631 } else if (key == "defaultsubcollection") {
632 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
633
634 } else if (key == "languagemap") {
635 languagemap.importmap (cfgline);
636
637 // update the list of languages in the filter information
638 text_tarray options;
639 languagemap.gettoarray (options);
640 filterOptions["Language"].validValues = options;
641
642 } else if (key == "defaultlanguage")
643 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
644}
645
646bool queryfilterclass::init (ostream &logout) {
647 outconvertclass text_t2ascii;
648
649 if (!filterclass::init(logout)) return false;
650
651 // get the filename for the database and make sure it exists
652 gdbm_filename = filename_cat(collectdir,"index","text",collection);
653
654 if (littleEndian()) gdbm_filename += ".ldb";
655 else gdbm_filename += ".bdb";
656
657 if (!file_exists(gdbm_filename)) {
658 logout << text_t2ascii
659 << "warning: gdbm database \"" //****
660 << gdbm_filename << "\" does not exist\n\n";
661 //return false; //****
662 }
663
664 return true;
665}
666
667void queryfilterclass::filter (const FilterRequest_t &request,
668 FilterResponse_t &response,
669 comerror_t &err, ostream &logout) {
670 outconvertclass text_t2ascii;
671
672 response.clear ();
673 err = noError;
674 if (gdbmptr == NULL) {
675 // most likely a configuration problem
676 logout << text_t2ascii
677 << "configuration error: queryfilter contains a null gdbmclass\n\n";
678 err = configurationError;
679 return;
680 }
681 if (mgsearchptr == NULL) {
682 // most likely a configuration problem
683 logout << text_t2ascii
684 << "configuration error: queryfilter contains a null mgsearchclass\n\n";
685 err = configurationError;
686 return;
687 }
688
689 // open the database
690 gdbmptr->setlogout(&logout);
691 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
692 // most likely a system problem (we have already checked that the
693 // gdbm database exists)
694 logout << text_t2ascii
695 << "system problem: open on gdbm database \""
696 << gdbm_filename << "\" failed\n\n";
697 err = systemProblem;
698 return;
699 }
700
701 // get the query parameters
702 int startresults = filterOptions["StartResults"].defaultValue.getint();
703 int endresults = filterOptions["EndResults"].defaultValue.getint();
704 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
705
706 vector<queryparamclass> queryfilterparams;
707 parse_query_params (request, queryfilterparams, startresults,
708 endresults, phrasematch, logout);
709
710 // do query
711 queryresultsclass queryresults;
712 do_multi_query (request, queryfilterparams, queryresults, err, logout);
713 if (err != noError) return;
714
715 // assemble document results
716 if (need_matching_docs (request.filterResultOptions)) {
717 // sort the query results
718 sort_doc_results (request, queryresults.docs);
719
720 int resultnum = 1;
721 ResultDocInfo_t resultdoc;
722 text_t trans_OID;
723 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
724 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
725
726 if (endresults == -1) endresults = MAXNUMDOCS;
727 while (docorder_here != docorder_end) {
728 if (resultnum > endresults) break;
729
730 // translate the document number
731 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
732 logout << text_t2ascii
733 << "warning: could not translate mg document number \""
734 << *docorder_here << "\"to OID.\n\n";
735
736 } else {
737 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
738
739 // documents containing matching phrases will be sorted to the top so
740 // we can break out once we're past those that match the PhraseMatch
741 // option -- "all_phrases" = return only those documents containing all
742 // phrases in query string
743 // "some_phrases" = return only those documents containing
744 // at least 1 of the phrases in the document
745 // "all_docs" = return all documents regardless
746 if (num_phrases > 0) {
747 if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
748 queryresults.docs_matched = response.docInfo.size();
749 break;
750 }
751 if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
752 queryresults.docs_matched = response.docInfo.size();
753 break;
754 }
755 }
756
757 // see if there is a result for this number,
758 // if it is in the request set (or the request set is empty)
759 if (docset_here != queryresults.docs.docset.end() &&
760 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
761 if (resultnum >= startresults) {
762 // add this document
763 resultdoc.OID = trans_OID;
764 resultdoc.result_num = resultnum;
765 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
766
767 // these next two are not available on all versions of mg
768 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
769 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
770
771 response.docInfo.push_back (resultdoc);
772 }
773
774 resultnum++;
775 }
776 }
777
778 docorder_here++;
779 }
780 }
781
782 // assemble the term results
783 if (need_term_info(request.filterResultOptions)) {
784 // note: the terms have already been sorted and uniqued
785
786 TermInfo_t terminfo;
787 bool terms_first = true;
788 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
789 termfreqclassarray::iterator terms_end = queryresults.terms.end();
790
791 while (terms_here != terms_end) {
792 terminfo.clear();
793 terminfo.term = (*terms_here).termstr;
794 terminfo.freq = (*terms_here).termfreq;
795 if (terms_first) {
796 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
797 text_tset::iterator termvariants_end = queryresults.termvariants.end();
798 while (termvariants_here != termvariants_end) {
799 terminfo.matchTerms.push_back (*termvariants_here);
800 termvariants_here++;
801 }
802 }
803 terms_first = false;
804
805 response.termInfo.push_back (terminfo);
806
807 terms_here++;
808 }
809 }
810
811 response.numDocs = queryresults.docs_matched;
812 response.isApprox = queryresults.is_approx;
813}
Note: See TracBrowser for help on using the repository browser.