source: branches/corba/gsdl/src/colservr/queryfilter.cpp@ 1074

Last change on this file since 1074 was 1074, checked in by cs025, 24 years ago

Corba improvements; tidied client initialisation in corbaproto and also
added a number of changes to the main trunk which somehow had not made
it into the corba branch via update before its instantiation.

Also the dated use of the GSDL_GSDLHOME macro was removed, at the expense
of some particularly poor code in corbaserver where log file creation is
now nowhere near so elegant.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.2 KB
/**********************************************************************
 *
 * queryfilter.cpp --
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: queryfilter.cpp 1074 2000-04-06 11:11:40Z cs025 $
 *
 *********************************************************************/

/*
 $Log$
 Revision 1.22.2.1 2000/04/06 11:11:37 cs025
 Corba improvements; tidied client initialisation in corbaproto and also
 added a number of changes to the main trunk which somehow had not made
 it into the corba branch via update before its instantiation.

 Also the dated use of the GSDL_GSDLHOME macro was removed, at the expense
 of some particularly poor code in corbaserver where log file creation is
 now nowhere near so elegant.

 Revision 1.22 2000/02/29 01:35:56 sjboddie
 tidied up endianness and fastcgi

 Revision 1.21 1999/11/25 02:21:13 sjboddie
 fixed bug in phrasematch stuff

 Revision 1.20 1999/11/01 22:06:06 sjboddie
 Added filter option to remove documents not matching a phrase match.
 This used to be done in the receptionist.

 Revision 1.19 1999/10/19 03:23:40 davidb
 Collection building support through web pages
 and internal and external link handling for collection documents

 Revision 1.18 1999/09/22 03:43:18 sjboddie
 Endresults queryfilter option may now take '-1' for 'all'

 Revision 1.17 1999/09/21 12:01:07 sjboddie
 added Maxdocs queryfilter option (which may be -1 for 'all')

 Revision 1.16 1999/09/07 04:57:24 sjboddie
 added gpl notice

 Revision 1.15 1999/08/31 22:47:09 rjmcnab
 Added matchmode option for some and all.

 Revision 1.14 1999/07/16 03:42:21 sjboddie
 changed isApprox

 Revision 1.13 1999/07/16 00:17:06 sjboddie
 got using phrasesearch for post-processing

 Revision 1.12 1999/07/09 02:19:43 rjmcnab
 Fixed a couple of compiler conflicts

 Revision 1.11 1999/07/08 20:49:44 rjmcnab
 Added result_num to the ResultDocInto_t structure.

 Revision 1.10 1999/07/07 06:19:46 rjmcnab
 Added ability to combine two or more independant queries.

 Revision 1.9 1999/07/01 09:29:20 rjmcnab
 Changes for better reporting of number documents which match a query. Changes
 should still work as before with older versions of mg.

 Revision 1.8 1999/07/01 03:59:54 rjmcnab
 reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
 method for post-processing the query.

 Revision 1.7 1999/06/30 04:04:13 rjmcnab
 made stemming functions available from mgsearch and made the stems
 for the query terms available in queryinfo

 Revision 1.6 1999/06/29 22:06:23 rjmcnab
 Added a couple of fields to queryinfo to handle a special version
 of mg.

 Revision 1.5 1999/06/27 22:08:48 sjboddie
 now check for defaultindex, defaultsubcollection, and defaultlanguage
 entries in config files

 Revision 1.4 1999/06/16 02:03:25 sjboddie
 fixed bug in isApprox and set MAXDOCS to always be 500

 Revision 1.3 1999/04/19 23:56:09 rjmcnab
 Finished the gdbm metadata stuff

 Revision 1.2 1999/04/12 03:45:03 rjmcnab
 Finished the query filter.

 Revision 1.1 1999/04/06 22:22:09 rjmcnab
 Initial revision.

 */


#include "queryfilter.h"
#include "fileutil.h"
#include "queryinfo.h"
#include "phrasesearch.h"
#include "gsdltools.h"
#include <assert.h>


// some useful functions

// translate will return true if successful
static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
  infodbclass info;

  trans_OID.clear();

  // get the info
  if (gdbmptr == NULL) return false;
  if (!gdbmptr->getinfo(docnum, info)) return false;

  // translate
  if (info["section"].empty()) return false;

  trans_OID = info["section"];
  return true;
}


// whether document results are needed
static bool need_matching_docs (int filterResultOptions) {
  return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
          (filterResultOptions & FRmetadata));
}

// whether term information is needed
static bool need_term_info (int filterResultOptions) {
  return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
}

///////////////////////////////
// methods for resultsorderer_t
///////////////////////////////

resultsorderer_t::resultsorderer_t() {
  clear ();
}

void resultsorderer_t::clear() {
  compare_phrase_match = false;
  compare_terms_match = false;
  compare_doc_weight = true;

  docset = NULL;
}

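// orders two mg document numbers by descending relevance: first by the
// number of quoted phrases matched, then by the number of query terms
// matched, then by document weight, with the document number itself as
// the final tie-break (which comparisons are applied depends on the
// compare_* flags set above)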
bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
  if (docset == NULL) return t1>t2;

  docresultmap::iterator t1_here = docset->find(t1);
  docresultmap::iterator t2_here = docset->find(t2);
  docresultmap::iterator end = docset->end();

  // sort all the document numbers not in the document set to
  // the end of the list
  if (t1_here == end) {
    if (t2_here == end) return t1>t2;
    else return true;
  } else if (t2_here == end) return false;

  if (compare_phrase_match) {
    if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
  }

  if (compare_terms_match) {
    if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
  }

  if (compare_doc_weight) {
    if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
  }

  return t1>t2;
}




/////////////////////////////////
// functions for queryfilterclass
/////////////////////////////////

// loads up phrases data structure with any phrases (that's the quoted bits)
// occurring in the querystring
void queryfilterclass::get_phrase_terms (const text_t &querystring,
                                         const termfreqclassarray &orgterms,
                                         vector<termfreqclassarray> &phrases) {

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  termfreqclassarray tmpterms;

  int termcount = 0;
  bool foundquote = false;
  bool foundbreak = false;
  bool start = true;
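  // scan the query string character by character: termcount steps through
  // orgterms in parallel as term boundaries are found, foundquote is true
  // while we are inside a pair of quotes, foundbreak is true while we are
  // in the gap between terms, and start is true until the first term
  // character has been seen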
  while (here != end) {
    if (*here == '\"') {
      if (foundquote) {
        if (!foundbreak && !start) {
          tmpterms.push_back (orgterms[termcount]);
          termcount ++;
        }
        if (tmpterms.size() > 1) {
          phrases.push_back (tmpterms);
          tmpterms.erase (tmpterms.begin(), tmpterms.end());
        }
        foundquote = false;
        foundbreak = true;
      } else foundquote = true;
    } else if (!is_unicode_letdig(*here)) {
      // found a break between terms
      if (!foundbreak && !start) {
        if (foundquote)
          tmpterms.push_back (orgterms[termcount]);
        termcount ++;
      }
      foundbreak = true;
    } else {
      start = false;
      foundbreak = false;
    }
    here++;
  }
}

// do additional query processing
void queryfilterclass::post_process (const queryparamclass &queryparams,
                                     queryresultsclass &queryresults) {

  // post-process the results if needed
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {

    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);

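    // remember how many quoted phrases the query contained; filter() uses
    // this count later when applying the PhraseMatch option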
    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();

      while (this_phrase != end_phrase) {

        // process each of the matched documents
        docresultmap::iterator docs_here = queryresults.docs.docset.begin();
        docresultmap::iterator docs_end = queryresults.docs.docset.end();
        while (docs_here != docs_end) {
          if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,
                                 queryparams.subcollection, queryparams.language,
                                 longindex, queryparams.collection, *this_phrase,
                                 (*docs_here).second.docnum)) {
            (*docs_here).second.num_phrase_match++;
          }

          docs_here++;
        }
        this_phrase++;
      }
    }
  }
}

// get the query parameters
void queryfilterclass::parse_query_params (const FilterRequest_t &request,
                                           vector<queryparamclass> &query_params,
                                           int &startresults, int &endresults,
                                           text_t &phrasematch, ostream &logout) {
  outconvertclass text_t2ascii;

  // set defaults for the return parameters
  query_params.erase(query_params.begin(), query_params.end());
  startresults = filterOptions["StartResults"].defaultValue.getint();
  endresults = filterOptions["EndResults"].defaultValue.getint();
  phrasematch = filterOptions["PhraseMatch"].defaultValue;

  // set defaults for query parameters
  queryparamclass query;
  query.combinequery = "or"; // first one must be "or"
  query.collection = collection;
  query.index = filterOptions["Index"].defaultValue;
  query.subcollection = filterOptions["Subcollection"].defaultValue;
  query.language = filterOptions["Language"].defaultValue;
  query.querystring.clear();
  query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
  query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
  query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
  query.stemming = (filterOptions["Stem"].defaultValue == "true");
  query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();

  OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
  OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
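  // walk the request's filter options in order: each "CombineQuery" option
  // closes off the query being built (pushing it onto query_params) and
  // starts a new one to be combined under the given mode, while the other
  // options fill in fields of the current query; the final query is pushed
  // after the loop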
  while (options_here != options_end) {
    if ((*options_here).name == "CombineQuery") {
      // add this query

      // "all", needed when combining queries where the document results are needed
      if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
      query_params.push_back (query);

      // start on next query
      query.clear();
      query.combinequery = (*options_here).value;

      // set defaults for query parameters
      query.collection = collection;
      query.index = filterOptions["Index"].defaultValue;
      query.subcollection = filterOptions["Subcollection"].defaultValue;
      query.language = filterOptions["Language"].defaultValue;
      query.querystring.clear();
      query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
      query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
      query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
      query.stemming = (filterOptions["Stem"].defaultValue == "true");

      // "all", needed when combining queries where the document results are needed
      if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
      else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();

    } else if ((*options_here).name == "StartResults") {
      startresults = (*options_here).value.getint();
    } else if ((*options_here).name == "EndResults") {
      endresults = (*options_here).value.getint();
    } else if ((*options_here).name == "QueryType") {
      query.search_type = ((*options_here).value == "ranked");
    } else if ((*options_here).name == "MatchMode") {
      query.match_mode = ((*options_here).value == "all");
      if (query.match_mode == 1) query.maxdocs = -1;
    } else if ((*options_here).name == "Term") {
      query.querystring = (*options_here).value;
    } else if ((*options_here).name == "Casefold") {
      query.casefolding = ((*options_here).value == "true");
    } else if ((*options_here).name == "Stem") {
      query.stemming = ((*options_here).value == "true");
    } else if ((*options_here).name == "Index") {
      query.index = (*options_here).value;
    } else if ((*options_here).name == "Subcollection") {
      query.subcollection = (*options_here).value;
    } else if ((*options_here).name == "Language") {
      query.language = (*options_here).value;
    } else if ((*options_here).name == "Maxdocs") {
      query.maxdocs = (*options_here).value.getint();
    } else if ((*options_here).name == "PhraseMatch") {
      phrasematch = (*options_here).value;
    } else {
      logout << text_t2ascii
             << "warning: unknown queryfilter option \""
             << (*options_here).name
             << "\" ignored.\n\n";
    }

    options_here++;
  }

  // add the last query
  query_params.push_back (query);
}



// do query that might involve multiple sub queries
// mgsearchptr and gdbmptr are assumed to be valid
void queryfilterclass::do_multi_query (const FilterRequest_t &request,
                                       const vector<queryparamclass> &query_params,
                                       queryresultsclass &multiresults,
                                       comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  mgsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!mgsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with mg for index \""
             << (*query_here).index << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
          !thisqueryresults.docs.docset.empty()) {
        post_process (*query_here, thisqueryresults);
        thisqueryresults.postprocessed = true;
        multiresults.postprocessed = true;
      }

      if (query_params.size() == 1) {
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
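        // fold this sub-query's documents into the running result set
        // according to its CombineQuery mode: "and" intersects, "or"
        // unions, "not" subtracts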
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        termvar_here++;
      }
    }

    query_here++;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}


void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
                                         docresultsclass &docs) {
  resultsorderer_t resultsorderer;
  resultsorderer.compare_phrase_match = true;
  resultsorderer.docset = &(docs.docset);

  // first get a list of document numbers
  docs.docnum_order();

  sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
}



queryfilterclass::queryfilterclass () {
  gdbmptr = NULL;
  mgsearchptr = NULL;
  num_phrases = 0;

  FilterOption_t filtopt;
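  // -- onePerQuery CombineQuery enumerated (and, or, not)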
  filtopt.name = "CombineQuery";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "and";
  filtopt.validValues.push_back("and");
  filtopt.validValues.push_back("or");
  filtopt.validValues.push_back("not");
  filterOptions["CombineQuery"] = filtopt;

  // -- onePerQuery StartResults integer
  filtopt.clear();
  filtopt.name = "StartResults";
  filtopt.type = FilterOption_t::integert;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "1";
  filtopt.validValues.push_back("1");
  filtopt.validValues.push_back("1000");
  filterOptions["StartResults"] = filtopt;

  // -- onePerQuery EndResults integer
  filtopt.clear();
  filtopt.name = "EndResults";
  filtopt.type = FilterOption_t::integert;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "10";
  filtopt.validValues.push_back("-1");
  filtopt.validValues.push_back("1000");
  filterOptions["EndResults"] = filtopt;

  // -- onePerQuery QueryType enumerated (boolean, ranked)
  filtopt.clear();
  filtopt.name = "QueryType";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "ranked";
  filtopt.validValues.push_back("boolean");
  filtopt.validValues.push_back("ranked");
  filterOptions["QueryType"] = filtopt;

  // -- onePerQuery MatchMode enumerated (some, all)
  filtopt.clear();
  filtopt.name = "MatchMode";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "some";
  filtopt.validValues.push_back("some");
  filtopt.validValues.push_back("all");
  filterOptions["MatchMode"] = filtopt;

  // -- onePerTerm Term string ???
  filtopt.clear();
  filtopt.name = "Term";
  filtopt.type = FilterOption_t::stringt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["Term"] = filtopt;

  // -- onePerTerm Casefold boolean
  filtopt.clear();
  filtopt.name = "Casefold";
  filtopt.type = FilterOption_t::booleant;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "true";
  filtopt.validValues.push_back("false");
  filtopt.validValues.push_back("true");
  filterOptions["Casefold"] = filtopt;

  // -- onePerTerm Stem boolean
  filtopt.clear();
  filtopt.name = "Stem";
  filtopt.type = FilterOption_t::booleant;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "false";
  filtopt.validValues.push_back("false");
  filtopt.validValues.push_back("true");
  filterOptions["Stem"] = filtopt;

  // -- onePerTerm Index enumerated
  filtopt.clear();
  filtopt.name = "Index";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["Index"] = filtopt;

  // -- onePerTerm Subcollection enumerated
  filtopt.clear();
  filtopt.name = "Subcollection";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["Subcollection"] = filtopt;

  // -- onePerTerm Language enumerated
  filtopt.clear();
  filtopt.name = "Language";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["Language"] = filtopt;

  // -- onePerQuery Maxdocs integer
  filtopt.clear();
  filtopt.name = "Maxdocs";
  filtopt.type = FilterOption_t::integert;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "200";
  filtopt.validValues.push_back("-1");
  filtopt.validValues.push_back("1000");
  filterOptions["Maxdocs"] = filtopt;

  // -- onePerQuery PhraseMatch enumerated
  filtopt.clear();
  filtopt.name = "PhraseMatch";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "some_phrases";
  filtopt.validValues.push_back ("all_phrases");
  filtopt.validValues.push_back ("some_phrases");
  filtopt.validValues.push_back ("all_docs");
  filterOptions["PhraseMatch"] = filtopt;
}

queryfilterclass::~queryfilterclass () {
}

void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  filterclass::configure (key, cfgline);

  if (key == "indexmap") {
    indexmap.importmap (cfgline);

    // update the list of indexes in the filter information
    text_tarray options;
    indexmap.gettoarray (options);
    filterOptions["Index"].validValues = options;

  } else if (key == "defaultindex") {
    indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);

  } else if (key == "subcollectionmap") {
    subcollectionmap.importmap (cfgline);

    // update the list of subcollections in the filter information
    text_tarray options;
    subcollectionmap.gettoarray (options);
    filterOptions["Subcollection"].validValues = options;

  } else if (key == "defaultsubcollection") {
    subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);

  } else if (key == "languagemap") {
    languagemap.importmap (cfgline);

    // update the list of languages in the filter information
    text_tarray options;
    languagemap.gettoarray (options);
    filterOptions["Language"].validValues = options;

  } else if (key == "defaultlanguage")
    languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
}

bool queryfilterclass::init (ostream &logout) {
  outconvertclass text_t2ascii;

  if (!filterclass::init(logout)) return false;

  // get the filename for the database and make sure it exists
  gdbm_filename = filename_cat(collectdir,"index","text",collection);

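  // little-endian machines use the .ldb database, big-endian machines use .bdb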
  if (littleEndian()) gdbm_filename += ".ldb";
  else gdbm_filename += ".bdb";

  if (!file_exists(gdbm_filename)) {
    logout << text_t2ascii
           << "warning: gdbm database \"" //****
           << gdbm_filename << "\" does not exist\n\n";
    //return false; //****
  }

  return true;
}

void queryfilterclass::filter (const FilterRequest_t &request,
                               FilterResponse_t &response,
                               comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (gdbmptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null gdbmclass\n\n";
    err = configurationError;
    return;
  }
  if (mgsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null mgsearchclass\n\n";
    err = configurationError;
    return;
  }

  // open the database
  gdbmptr->setlogout(&logout);
  if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
    // most likely a system problem (we have already checked that the
    // gdbm database exists)
    logout << text_t2ascii
           << "system problem: open on gdbm database \""
           << gdbm_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    sort_doc_results (request, queryresults.docs);

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    if (endresults == -1) endresults = MAXNUMDOCS;
    while (docorder_here != docorder_end) {
      if (resultnum > endresults) break;

      // translate the document number
      if (!translate(gdbmptr, *docorder_here, trans_OID)) {
        logout << text_t2ascii
               << "warning: could not translate mg document number \""
               << *docorder_here << "\" to OID.\n\n";

      } else {
        docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

        // documents containing matching phrases will be sorted to the top so
        // we can break out once we're past those that match the PhraseMatch
        // option -- "all_phrases" = return only those documents containing all
        //                           phrases in query string
        //           "some_phrases" = return only those documents containing
        //                            at least 1 of the phrases in the document
        //           "all_docs" = return all documents regardless
        if (num_phrases > 0) {
          if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
            queryresults.docs_matched = response.docInfo.size();
            break;
          }
          if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
            queryresults.docs_matched = response.docInfo.size();
            break;
          }
        }

        // see if there is a result for this number,
        // if it is in the request set (or the request set is empty)
        if (docset_here != queryresults.docs.docset.end() &&
            (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
          if (resultnum >= startresults) {
            // add this document
            resultdoc.OID = trans_OID;
            resultdoc.result_num = resultnum;
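            // scale the floating-point document weight up by 10000 and
            // round to the nearest integer to get the ranking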
            resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

            // these next two are not available on all versions of mg
            resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
            resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

            response.docInfo.push_back (resultdoc);
          }

          resultnum++;
        }
      }

      docorder_here++;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          termvariants_here++;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      terms_here++;
    }
  }

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}