source: trunk/gsdl/src/colservr/queryfilter.cpp@ 367

Last change on this file since 367 was 358, checked in by rjmcnab, 25 years ago

Fixed a couple of compiler conflicts

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.0 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: queryfilter.cpp 358 1999-07-09 02:19:44Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.12 1999/07/09 02:19:43 rjmcnab
15 Fixed a couple of compiler conflicts
16
17 Revision 1.11 1999/07/08 20:49:44 rjmcnab
18 Added result_num to the ResultDocInfo_t structure.
19
20 Revision 1.10 1999/07/07 06:19:46 rjmcnab
21 Added ability to combine two or more independent queries.
22
23 Revision 1.9 1999/07/01 09:29:20 rjmcnab
24 Changes for better reporting of the number of documents which match a query. Changes
25 should still work as before with older versions of mg.
26
27 Revision 1.8 1999/07/01 03:59:54 rjmcnab
28 reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
29 method for post-processing the query.
30
31 Revision 1.7 1999/06/30 04:04:13 rjmcnab
32 made stemming functions available from mgsearch and made the stems
33 for the query terms available in queryinfo
34
35 Revision 1.6 1999/06/29 22:06:23 rjmcnab
36 Added a couple of fields to queryinfo to handle a special version
37 of mg.
38
39 Revision 1.5 1999/06/27 22:08:48 sjboddie
40 now check for defaultindex, defaultsubcollection, and defaultlanguage
41 entries in config files
42
43 Revision 1.4 1999/06/16 02:03:25 sjboddie
44 fixed bug in isApprox and set MAXDOCS to always be 500
45
46 Revision 1.3 1999/04/19 23:56:09 rjmcnab
47 Finished the gdbm metadata stuff
48
49 Revision 1.2 1999/04/12 03:45:03 rjmcnab
50 Finished the query filter.
51
52 Revision 1.1 1999/04/06 22:22:09 rjmcnab
53 Initial revision.
54
55 */
56
57
58#include "queryfilter.h"
59#include "fileutil.h"
60#include "queryinfo.h"
61
62#define MAXDOCS 200 // note that maxdocs must be at least as large
63 // as the highest possible value of EndResults
64
65// some useful functions
66
67// translate will return true if successful
68static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
69 infodbclass info;
70
71 trans_OID.clear();
72
73 // get the info
74 if (gdbmptr == NULL) return false;
75 if (!gdbmptr->getinfo(docnum, info)) return false;
76
77 // translate
78 if (info["section"].empty()) return false;
79
80 trans_OID = info["section"];
81 return true;
82}
83
84
85// whether document results are needed
86static bool need_matching_docs (int filterResultOptions) {
87 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
88 (filterResultOptions & FRmetadata));
89}
90
91// whether term information is needed
92static bool need_term_info (int filterResultOptions) {
93 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
94}
95
96///////////////////////////////
97// methods for resultsorderer_t
98///////////////////////////////
99
100resultsorderer_t::resultsorderer_t() {
101 clear ();
102}
103
104void resultsorderer_t::clear() {
105 compare_phrase_match = false;
106 compare_terms_match = false;
107 compare_doc_weight = true;
108
109 docset = NULL;
110}
111
112bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
113 if (docset == NULL) return t1>t2;
114
115 docresultmap::iterator t1_here = docset->find(t1);
116 docresultmap::iterator t2_here = docset->find(t2);
117 docresultmap::iterator end = docset->end();
118
119 // sort all the document numbers not in the document set to
120 // the end of the list
121 if (t1_here == end) {
122 if (t2_here == end) return t1>t2;
123 else return true;
124 } else if (t2_here == end) return false;
125
126 if (compare_phrase_match) {
127 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
128 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
129 }
130
131 if (compare_terms_match) {
132 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
133 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
134 }
135
136 if (compare_doc_weight) {
137 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
138 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
139 }
140
141 return t1>t2;
142}
143
144
145
146
147/////////////////////////////////
148// functions for queryfilterclass
149/////////////////////////////////
150
151// do aditional query processing
152void queryfilterclass::post_process (const queryparamclass &/*queryparams*/,
153 queryresultsclass &/*queryresults*/) {
154}
155
156// get the query parameters
157void queryfilterclass::parse_query_params (const FilterRequest_t &request,
158 vector<queryparamclass> &query_params,
159 int &startresults,
160 int &endresults,
161 ostream &logout) {
162 outconvertclass text_t2ascii;
163
164 // set defaults for the return parameters
165 query_params.erase(query_params.begin(), query_params.end());
166 startresults = filterOptions["StartResults"].defaultValue.getint();
167 endresults = filterOptions["EndResults"].defaultValue.getint();
168
169 // set defaults for query parameters
170 queryparamclass query;
171 query.combinequery = "or"; // first one must be "or"
172 query.collection = collection;
173 query.index = filterOptions["Index"].defaultValue;
174 query.subcollection = filterOptions["Subcollection"].defaultValue;
175 query.language = filterOptions["Language"].defaultValue;
176 query.querystring.clear();
177 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
178 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
179 query.stemming = (filterOptions["Stem"].defaultValue == "true");
180 query.maxdocs = MAXDOCS; // default for single query
181
182 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
183 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
184 while (options_here != options_end) {
185 if ((*options_here).name == "CombineQuery") {
186 // add this query
187
188 // "all", needed when combining queries where the document results are needed
189 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
190 query_params.push_back (query);
191
192 // start on next query
193 query.clear();
194 query.combinequery = (*options_here).value;
195
196 // set defaults for query parameters
197 query.collection = collection;
198 query.index = filterOptions["Index"].defaultValue;
199 query.subcollection = filterOptions["Subcollection"].defaultValue;
200 query.language = filterOptions["Language"].defaultValue;
201 query.querystring.clear();
202 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
203 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
204 query.stemming = (filterOptions["Stem"].defaultValue == "true");
205
206 // "all", needed when combining queries where the document results are needed
207 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
208 else query.maxdocs = MAXDOCS; // "all"
209
210 } else if ((*options_here).name == "StartResults") {
211 startresults = (*options_here).value.getint();
212 } else if ((*options_here).name == "EndResults") {
213 endresults = (*options_here).value.getint();
214 } else if ((*options_here).name == "QueryType") {
215 query.search_type = ((*options_here).value == "ranked");
216 } else if ((*options_here).name == "Term") {
217 query.querystring = (*options_here).value;
218 } else if ((*options_here).name == "Casefold") {
219 query.casefolding = ((*options_here).value == "true");
220 } else if ((*options_here).name == "Stem") {
221 query.stemming = ((*options_here).value == "true");
222 } else if ((*options_here).name == "Index") {
223 query.index = (*options_here).value;
224 } else if ((*options_here).name == "Subcollection") {
225 query.subcollection = (*options_here).value;
226 } else if ((*options_here).name == "Language") {
227 query.language = (*options_here).value;
228 } else {
229 logout << text_t2ascii
230 << "warning: unknown queryfilter option \""
231 << (*options_here).name
232 << "\" ignored.\n\n";
233 }
234
235 options_here++;
236 }
237
238 // add the last query
239 query_params.push_back (query);
240}
241
242
243
244// do query that might involve multiple sub queries
245// mgsearchptr and gdbmptr are assumed to be valid
246void queryfilterclass::do_multi_query (const FilterRequest_t &request,
247 const vector<queryparamclass> &query_params,
248 queryresultsclass &multiresults,
249 comerror_t &err, ostream &logout) {
250 outconvertclass text_t2ascii;
251
252 err = noError;
253 mgsearchptr->setcollectdir (collectdir);
254 multiresults.clear();
255
256 vector<queryparamclass>::const_iterator query_here = query_params.begin();
257 vector<queryparamclass>::const_iterator query_end = query_params.end();
258 while (query_here != query_end) {
259 queryresultsclass thisqueryresults;
260
261 if (!mgsearchptr->search(*query_here, thisqueryresults)) {
262 // most likely a system problem
263 logout << text_t2ascii
264 << "system problem: could not do search with mg for index \""
265 << (*query_here).index << (*query_here).subcollection
266 << (*query_here).language << "\".\n\n";
267 err = systemProblem;
268 return;
269 }
270
271 // combine the results
272 if (need_matching_docs (request.filterResultOptions)) {
273 // post-process the results if needed
274 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
275 !thisqueryresults.docs.docset.empty()) {
276 post_process (*query_here, thisqueryresults);
277 thisqueryresults.postprocessed = true;
278 multiresults.postprocessed = true;
279 }
280
281 if (query_params.size() == 1) {
282 multiresults.docs = thisqueryresults.docs; // just one set of results
283 multiresults.docs_matched = thisqueryresults.docs_matched;
284 multiresults.is_approx = thisqueryresults.is_approx;
285
286 } else {
287 if ((*query_here).combinequery == "and") {
288 multiresults.docs.combine_and (thisqueryresults.docs);
289 } else if ((*query_here).combinequery == "or") {
290 multiresults.docs.combine_or (thisqueryresults.docs);
291 } else if ((*query_here).combinequery == "not") {
292 multiresults.docs.combine_not (thisqueryresults.docs);
293 }
294 multiresults.docs_matched = multiresults.docs.docset.size();
295 multiresults.is_approx = false;
296 }
297 }
298
299 // combine the term information
300 if (need_term_info (request.filterResultOptions)) {
301 // append the terms
302 multiresults.orgterms.insert(multiresults.orgterms.end(),
303 thisqueryresults.orgterms.begin(),
304 thisqueryresults.orgterms.end());
305
306 // add the term variants
307 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
308 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
309 while (termvar_here != termvar_end) {
310 multiresults.termvariants.insert(*termvar_here);
311 termvar_here++;
312 }
313 }
314
315 query_here++;
316 }
317
318 // sort and unique the query terms
319 multiresults.sortuniqqueryterms ();
320}
321
322
323void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
324 docresultsclass &docs) {
325 resultsorderer_t resultsorderer;
326 resultsorderer.docset = &(docs.docset);
327
328 // first get a list of document numbers
329 docs.docnum_order();
330
331 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
332}
333
334
335
336queryfilterclass::queryfilterclass () {
337 gdbmptr = NULL;
338 mgsearchptr = NULL;
339
340 FilterOption_t filtopt;
341 filtopt.name = "CombineQuery";
342 filtopt.type = FilterOption_t::enumeratedt;
343 filtopt.repeatable = FilterOption_t::onePerQuery;
344 filtopt.defaultValue = "and";
345 filtopt.validValues.push_back("and");
346 filtopt.validValues.push_back("or");
347 filtopt.validValues.push_back("not");
348 filterOptions["CombineQuery"] = filtopt;
349
350 // -- onePerQuery StartResults integer
351 filtopt.clear();
352 filtopt.name = "StartResults";
353 filtopt.type = FilterOption_t::integert;
354 filtopt.repeatable = FilterOption_t::onePerQuery;
355 filtopt.defaultValue = "1";
356 filtopt.validValues.push_back("1");
357 filtopt.validValues.push_back("1000");
358 filterOptions["StartResults"] = filtopt;
359
360 // -- onePerQuery EndResults integer
361 filtopt.clear();
362 filtopt.name = "EndResults";
363 filtopt.type = FilterOption_t::integert;
364 filtopt.repeatable = FilterOption_t::onePerQuery;
365 filtopt.defaultValue = "10";
366 filtopt.validValues.push_back("1");
367 filtopt.validValues.push_back("1000");
368 filterOptions["EndResults"] = filtopt;
369
370 // -- onePerQuery QueryType enumerated (boolean, ranked)
371 filtopt.clear();
372 filtopt.name = "QueryType";
373 filtopt.type = FilterOption_t::enumeratedt;
374 filtopt.repeatable = FilterOption_t::onePerQuery;
375 filtopt.defaultValue = "ranked";
376 filtopt.validValues.push_back("boolean");
377 filtopt.validValues.push_back("ranked");
378 filterOptions["QueryType"] = filtopt;
379
380 // -- onePerTerm Term string ???
381 filtopt.clear();
382 filtopt.name = "Term";
383 filtopt.type = FilterOption_t::stringt;
384 filtopt.repeatable = FilterOption_t::onePerTerm;
385 filtopt.defaultValue = "";
386 filterOptions["Term"] = filtopt;
387
388 // -- onePerTerm Casefold boolean
389 filtopt.clear();
390 filtopt.name = "Casefold";
391 filtopt.type = FilterOption_t::booleant;
392 filtopt.repeatable = FilterOption_t::onePerTerm;
393 filtopt.defaultValue = "true";
394 filtopt.validValues.push_back("false");
395 filtopt.validValues.push_back("true");
396 filterOptions["Casefold"] = filtopt;
397
398 // -- onePerTerm Stem boolean
399 filtopt.clear();
400 filtopt.name = "Stem";
401 filtopt.type = FilterOption_t::booleant;
402 filtopt.repeatable = FilterOption_t::onePerTerm;
403 filtopt.defaultValue = "false";
404 filtopt.validValues.push_back("false");
405 filtopt.validValues.push_back("true");
406 filterOptions["Stem"] = filtopt;
407
408 // -- onePerTerm Index enumerated
409 filtopt.clear();
410 filtopt.name = "Index";
411 filtopt.type = FilterOption_t::enumeratedt;
412 filtopt.repeatable = FilterOption_t::onePerTerm;
413 filtopt.defaultValue = "";
414 filterOptions["Index"] = filtopt;
415
416 // -- onePerTerm Subcollection enumerated
417 filtopt.clear();
418 filtopt.name = "Subcollection";
419 filtopt.type = FilterOption_t::enumeratedt;
420 filtopt.repeatable = FilterOption_t::onePerTerm;
421 filtopt.defaultValue = "";
422 filterOptions["Subcollection"] = filtopt;
423
424 // -- onePerTerm Language enumerated
425 filtopt.clear();
426 filtopt.name = "Language";
427 filtopt.type = FilterOption_t::enumeratedt;
428 filtopt.repeatable = FilterOption_t::onePerTerm;
429 filtopt.defaultValue = "";
430 filterOptions["Language"] = filtopt;
431}
432
433queryfilterclass::~queryfilterclass () {
434}
435
436void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
437 filterclass::configure (key, cfgline);
438
439 if (key == "indexmap") {
440 indexmap.importmap (cfgline);
441
442 // update the list of indexes in the filter information
443 text_tarray options;
444 indexmap.gettoarray (options);
445 filterOptions["Index"].validValues = options;
446
447 } else if (key == "defaultindex") {
448 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
449
450 } else if (key == "subcollectionmap") {
451 subcollectionmap.importmap (cfgline);
452
453 // update the list of subcollections in the filter information
454 text_tarray options;
455 subcollectionmap.gettoarray (options);
456 filterOptions["Subcollection"].validValues = options;
457
458 } else if (key == "defaultsubcollection") {
459 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
460
461 } else if (key == "languagemap") {
462 languagemap.importmap (cfgline);
463
464 // update the list of languages in the filter information
465 text_tarray options;
466 languagemap.gettoarray (options);
467 filterOptions["Language"].validValues = options;
468
469 } else if (key == "defaultlanguage")
470 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
471}
472
473bool queryfilterclass::init (ostream &logout) {
474 outconvertclass text_t2ascii;
475
476 if (!filterclass::init(logout)) return false;
477
478 // get the filename for the database and make sure it exists
479 gdbm_filename = filename_cat(collectdir,"index","text",collection);
480#ifdef _LITTLE_ENDIAN
481 gdbm_filename += ".ldb";
482#else
483 gdbm_filename += ".bdb";
484#endif
485 if (!file_exists(gdbm_filename)) {
486 logout << text_t2ascii
487 << "error: gdbm database \""
488 << gdbm_filename << "\" does not exist\n\n";
489 return false;
490 }
491
492 return true;
493}
494
495void queryfilterclass::filter (const FilterRequest_t &request,
496 FilterResponse_t &response,
497 comerror_t &err, ostream &logout) {
498 outconvertclass text_t2ascii;
499
500 response.clear ();
501 err = noError;
502 if (gdbmptr == NULL) {
503 // most likely a configuration problem
504 logout << text_t2ascii
505 << "configuration error: queryfilter contains a null gdbmclass\n\n";
506 err = configurationError;
507 return;
508 }
509 if (mgsearchptr == NULL) {
510 // most likely a configuration problem
511 logout << text_t2ascii
512 << "configuration error: queryfilter contains a null mgsearchclass\n\n";
513 err = configurationError;
514 return;
515 }
516
517 // open the database
518 gdbmptr->setlogout(&logout);
519 if (!gdbmptr->opendatabase (gdbm_filename)) {
520 // most likely a system problem (we have already checked that the
521 // gdbm database exists)
522 logout << text_t2ascii
523 << "system problem: open on gdbm database \""
524 << gdbm_filename << "\" failed\n\n";
525 err = systemProblem;
526 return;
527 }
528
529 // get the query parameters
530 int startresults = filterOptions["StartResults"].defaultValue.getint();
531 int endresults = filterOptions["EndResults"].defaultValue.getint();
532 vector<queryparamclass> queryfilterparams;
533 parse_query_params (request, queryfilterparams, startresults, endresults, logout);
534
535 // do query
536 queryresultsclass queryresults;
537 do_multi_query (request, queryfilterparams, queryresults, err, logout);
538 if (err != noError) return;
539
540 // assemble document results
541 if (need_matching_docs (request.filterResultOptions)) {
542 // sort the query results
543 sort_doc_results (request, queryresults.docs);
544
545 int resultnum = 1;
546 ResultDocInfo_t resultdoc;
547 text_t trans_OID;
548 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
549 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
550
551 while (docorder_here != docorder_end) {
552 if (resultnum > endresults) break;
553
554 // translate the document number
555 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
556 logout << text_t2ascii
557 << "warning: could not translate mg document number \""
558 << *docorder_here << "\"to OID.\n\n";
559
560 } else {
561 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
562
563 // see if there is a result for this number,
564 // if it is in the request set (or the request set is empty)
565 if (docset_here != queryresults.docs.docset.end() &&
566 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
567 if (resultnum >= startresults) {
568 // add this document
569 resultdoc.OID = trans_OID;
570 resultdoc.result_num = resultnum;
571 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
572
573 // these next two are not available on all versions of mg
574 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
575 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
576
577 response.docInfo.push_back (resultdoc);
578 }
579
580 resultnum++;
581 }
582 }
583
584 docorder_here++;
585 }
586 }
587
588 // assemble the term results
589 if (need_term_info(request.filterResultOptions)) {
590 // note: the terms have already been sorted and uniqued
591
592 TermInfo_t terminfo;
593 bool terms_first = true;
594 vector<termfreqclass>::iterator terms_here = queryresults.terms.begin();
595 vector<termfreqclass>::iterator terms_end = queryresults.terms.end();
596
597 while (terms_here != terms_end) {
598 terminfo.clear();
599 terminfo.term = (*terms_here).termstr;
600 terminfo.freq = (*terms_here).termfreq;
601 if (terms_first) {
602 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
603 text_tset::iterator termvariants_end = queryresults.termvariants.end();
604 while (termvariants_here != termvariants_end) {
605 terminfo.matchTerms.push_back (*termvariants_here);
606 termvariants_here++;
607 }
608 }
609 terms_first = false;
610
611 response.termInfo.push_back (terminfo);
612
613 terms_here++;
614 }
615 }
616
617 response.numDocs = queryresults.docs_matched;
618 response.isApprox = queryresults.is_approx;
619}
Note: See TracBrowser for help on using the repository browser.