source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 1721

Last change on this file since 1721 was 1721, checked in by sjboddie, 23 years ago

Fixed a small bug that was causing phrase searching to act erratically
when using either fastcgi or the windows local library

  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1/**********************************************************************
2 *
3 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "mgqueryfilter.h"
27#include "fileutil.h"
28#include "phrasesearch.h"
29#include <assert.h>
30#include "mgsearch.h"
31
32///////////////////////////////
33// methods for resultsorderer_t
34///////////////////////////////
35
36resultsorderer_t::resultsorderer_t() {
37 clear ();
38}
39
40void resultsorderer_t::clear() {
41 compare_phrase_match = false;
42 compare_terms_match = false;
43 compare_doc_weight = true;
44
45 docset = NULL;
46}
47
48bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49 if (docset == NULL) return t1>t2;
50
51 docresultmap::iterator t1_here = docset->find(t1);
52 docresultmap::iterator t2_here = docset->find(t2);
53 docresultmap::iterator end = docset->end();
54
55 // sort all the document numbers not in the document set to
56 // the end of the list
57 if (t1_here == end) {
58 if (t2_here == end) return t1>t2;
59 else return true;
60 } else if (t2_here == end) return false;
61
62 if (compare_phrase_match) {
63 if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
64 if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
65 }
66
67 if (compare_terms_match) {
68 if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
69 if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
70 }
71
72 if (compare_doc_weight) {
73 if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
74 if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
75 }
76
77 return t1>t2;
78}
79
80
81
82
83/////////////////////////////////
84// functions for mgqueryfilterclass
85/////////////////////////////////
86
87// loads up phrases data structure with any phrases (that's the quoted bits)
88// occuring in the querystring
89void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
90 const termfreqclassarray &orgterms,
91 vector<termfreqclassarray> &phrases) {
92
93 text_t::const_iterator here = querystring.begin();
94 text_t::const_iterator end = querystring.end();
95
96 termfreqclassarray tmpterms;
97
98 int termcount = 0;
99 bool foundquote = false;
100 bool foundbreak = false;
101 bool start = true;
102 while (here != end) {
103 if (*here == '\"') {
104 if (foundquote) {
105 if (!foundbreak && !start) {
106 tmpterms.push_back (orgterms[termcount]);
107 termcount ++;
108 }
109 if (tmpterms.size() > 1) {
110 phrases.push_back (tmpterms);
111 tmpterms.erase (tmpterms.begin(), tmpterms.end());
112 }
113 foundquote = false;
114 foundbreak = true;
115 } else foundquote = true;
116 } else if (!is_unicode_letdig(*here)) {
117 // found a break between terms
118 if (!foundbreak && !start) {
119 if (foundquote)
120 tmpterms.push_back (orgterms[termcount]);
121 termcount ++;
122 }
123 foundbreak = true;
124 } else {
125 start = false;
126 foundbreak = false;
127 }
128 here++;
129 }
130}
131
132// do aditional query processing
133void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
134 queryresultsclass &queryresults) {
135
136 // post-process the results if needed
137 if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
138
139 // get the terms between quotes (if any)
140 vector<termfreqclassarray> phrases;
141 get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
142
143 num_phrases = phrases.size();
144 if (num_phrases > 0) {
145
146 // get the long version of the index
147 text_t longindex;
148 indexmap.to2from (queryparams.index, longindex);
149
150 vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
151 vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
152
153 while (this_phrase != end_phrase) {
154
155 // process each of the matched documents
156 docresultmap::iterator docs_here = queryresults.docs.docset.begin();
157 docresultmap::iterator docs_end = queryresults.docs.docset.end();
158 while (docs_here != docs_end) {
159 if (OID_phrase_search (*((mgsearchclass*)mgsearchptr), *gdbmptr, queryparams.index,
160 queryparams.subcollection, queryparams.language,
161 longindex, queryparams.collection, *this_phrase,
162 (*docs_here).second.docnum)) {
163 (*docs_here).second.num_phrase_match++;
164 }
165
166 docs_here++;
167 }
168 this_phrase++;
169 }
170 }
171 }
172}
173
174
175// do query that might involve multiple sub queries
176// mgsearchptr and gdbmptr are assumed to be valid
177void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
178 const vector<queryparamclass> &query_params,
179 queryresultsclass &multiresults,
180 comerror_t &err, ostream &logout) {
181 outconvertclass text_t2ascii;
182
183 err = noError;
184 mgsearchptr->setcollectdir (collectdir);
185 multiresults.clear();
186
187 vector<queryparamclass>::const_iterator query_here = query_params.begin();
188 vector<queryparamclass>::const_iterator query_end = query_params.end();
189 while (query_here != query_end) {
190 queryresultsclass thisqueryresults;
191
192 if (!mgsearchptr->search(*query_here, thisqueryresults)) {
193 // most likely a system problem
194 logout << text_t2ascii
195 << "system problem: could not do search with mg for index \""
196 << (*query_here).index << (*query_here).subcollection
197 << (*query_here).language << "\".\n\n";
198 err = systemProblem;
199 return;
200 }
201
202 // combine the results
203 if (need_matching_docs (request.filterResultOptions)) {
204 // post-process the results if needed
205 if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
206 !thisqueryresults.docs.docset.empty()) {
207 post_process (*query_here, thisqueryresults);
208 thisqueryresults.postprocessed = true;
209 multiresults.postprocessed = true;
210 } else {
211 num_phrases = 0;
212 }
213
214 if (query_params.size() == 1) {
215 multiresults.docs = thisqueryresults.docs; // just one set of results
216 multiresults.docs_matched = thisqueryresults.docs_matched;
217 multiresults.is_approx = thisqueryresults.is_approx;
218
219 } else {
220 if ((*query_here).combinequery == "and") {
221 multiresults.docs.combine_and (thisqueryresults.docs);
222 } else if ((*query_here).combinequery == "or") {
223 multiresults.docs.combine_or (thisqueryresults.docs);
224 } else if ((*query_here).combinequery == "not") {
225 multiresults.docs.combine_not (thisqueryresults.docs);
226 }
227 multiresults.docs_matched = multiresults.docs.docset.size();
228 multiresults.is_approx = Exact;
229 }
230 }
231
232 // combine the term information
233 if (need_term_info (request.filterResultOptions)) {
234 // append the terms
235 multiresults.orgterms.insert(multiresults.orgterms.end(),
236 thisqueryresults.orgterms.begin(),
237 thisqueryresults.orgterms.end());
238
239 // add the term variants
240 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
241 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
242 while (termvar_here != termvar_end) {
243 multiresults.termvariants.insert(*termvar_here);
244 termvar_here++;
245 }
246 }
247
248 query_here++;
249 }
250
251 // sort and unique the query terms
252 multiresults.sortuniqqueryterms ();
253}
254
255
256void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
257 docresultsclass &docs) {
258 resultsorderer_t resultsorderer;
259 resultsorderer.compare_phrase_match = true;
260 resultsorderer.docset = &(docs.docset);
261
262 // first get a list of document numbers
263 docs.docnum_order();
264
265 sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
266}
267
268
269
270mgqueryfilterclass::mgqueryfilterclass ()
271 :queryfilterclass() {
272
273 num_phrases = 0;
274
275}
276
277mgqueryfilterclass::~mgqueryfilterclass () {
278}
279
280void mgqueryfilterclass::filter (const FilterRequest_t &request,
281 FilterResponse_t &response,
282 comerror_t &err, ostream &logout) {
283 outconvertclass text_t2ascii;
284
285 response.clear ();
286 err = noError;
287 if (gdbmptr == NULL) {
288 // most likely a configuration problem
289 logout << text_t2ascii
290 << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
291 err = configurationError;
292 return;
293 }
294 if (mgsearchptr == NULL) {
295 // most likely a configuration problem
296 logout << text_t2ascii
297 << "configuration error: mgqueryfilter contains a null mgsearchclass\n\n";
298 err = configurationError;
299 return;
300 }
301
302 // open the database
303 gdbmptr->setlogout(&logout);
304 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
305 // most likely a system problem (we have already checked that the
306 // gdbm database exists)
307 logout << text_t2ascii
308 << "system problem: open on gdbm database \""
309 << gdbm_filename << "\" failed\n\n";
310 err = systemProblem;
311 return;
312 }
313
314 // get the query parameters
315 int startresults = filterOptions["StartResults"].defaultValue.getint();
316 int endresults = filterOptions["EndResults"].defaultValue.getint();
317 text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
318
319 vector<queryparamclass> queryfilterparams;
320 parse_query_params (request, queryfilterparams, startresults,
321 endresults, phrasematch, logout);
322 // do any mg specific diddling with query parameters that may be required
323 mg_parse_query_params (request, queryfilterparams, startresults,
324 endresults, phrasematch, logout);
325
326
327 // do query
328 queryresultsclass queryresults;
329 do_multi_query (request, queryfilterparams, queryresults, err, logout);
330 if (err != noError) return;
331
332 // assemble document results
333 if (need_matching_docs (request.filterResultOptions)) {
334 // sort the query results
335 sort_doc_results (request, queryresults.docs);
336
337 int resultnum = 1;
338 ResultDocInfo_t resultdoc;
339 text_t trans_OID;
340 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
341 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
342
343 // documents containing matching phrases will be sorted to the top so
344 // we can break out once we're past those that match the PhraseMatch
345 // option -- "all_phrases" = return only those documents containing all
346 // phrases in query string
347 // "some_phrases" = return only those documents containing
348 // at least 1 of the phrases in the document
349 // "all_docs" = return all documents regardless
350 if (num_phrases > 0) {
351 int numdocs = 0;
352 while (docorder_here != docorder_end) {
353 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
354
355 if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
356 ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
357 queryresults.docs_matched = numdocs;
358 break;
359 }
360 numdocs ++;
361 docorder_here ++;
362 }
363 }
364
365 if (endresults == -1) endresults = MAXNUMDOCS;
366 docorder_here = queryresults.docs.docorder.begin();
367 while (docorder_here != docorder_end) {
368 if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
369
370 // translate the document number
371 if (!translate(gdbmptr, *docorder_here, trans_OID)) {
372 logout << text_t2ascii
373 << "warning: could not translate mg document number \""
374 << *docorder_here << "\"to OID.\n\n";
375
376 } else {
377 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
378
379 // see if there is a result for this number,
380 // if it is in the request set (or the request set is empty)
381 if (docset_here != queryresults.docs.docset.end() &&
382 (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
383 if (resultnum >= startresults) {
384 // add this document
385 resultdoc.OID = trans_OID;
386 resultdoc.result_num = resultnum;
387 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
388
389 // these next two are not available on all versions of mg
390 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
391 resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
392
393 response.docInfo.push_back (resultdoc);
394 }
395
396 resultnum++;
397 }
398 }
399
400 docorder_here++;
401 }
402 }
403
404 // assemble the term results
405 if (need_term_info(request.filterResultOptions)) {
406 // note: the terms have already been sorted and uniqued
407
408 TermInfo_t terminfo;
409 bool terms_first = true;
410 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
411 termfreqclassarray::iterator terms_end = queryresults.terms.end();
412
413 while (terms_here != terms_end) {
414 terminfo.clear();
415 terminfo.term = (*terms_here).termstr;
416 terminfo.freq = (*terms_here).termfreq;
417 if (terms_first) {
418 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
419 text_tset::iterator termvariants_end = queryresults.termvariants.end();
420 while (termvariants_here != termvariants_end) {
421 terminfo.matchTerms.push_back (*termvariants_here);
422 termvariants_here++;
423 }
424 }
425 terms_first = false;
426
427 response.termInfo.push_back (terminfo);
428
429 terms_here++;
430 }
431 }
432
433 response.numDocs = queryresults.docs_matched;
434 response.isApprox = queryresults.is_approx;
435}
436
437void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
438 vector<queryparamclass> &query_params,
439 int &/*startresults*/, int &/*endresults*/,
440 text_t &/*phrasematch*/, ostream &/*logout*/) {
441
442 // outconvertclass text_t2ascii;
443
444 vector<queryparamclass>::iterator query_here = query_params.begin();
445 vector<queryparamclass>::iterator query_end = query_params.end();
446 while (query_here != query_end) {
447
448 // if we're doing a phrase search we want to maximise hits by making it a boolean
449 // search on the index with the finest granularity
450 // we're deciding it's a phrase search based on if the querystring
451 // contains at least 2 double quotes (not very scientific but
452 // then neither is the rest of the mg phrase searching functionality :-)
453 if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
454 (*query_here).search_type = 0;
455
456 // Get the long version of the index and test to see if any indexes with
457 // finer granularity exist. Indexes must be the same type (i.e. same metadata
458 // or "text").
459 text_t longindex; text_tarray splitindex;
460 indexmap.to2from ((*query_here).index, longindex);
461 splitchar (longindex.begin(), longindex.end(), ':', splitindex);
462 text_t &granularity = splitindex[0];
463 text_t &indextype = splitindex[1];
464 bool found = false;
465 // currently supported granularity options are "document", "section" and "paragraph"
466 if (granularity == "document" || granularity == "section") {
467 text_t shortindex;
468 if (indexmap.fromexists ("paragraph:" + indextype)) {
469 // logout << text_t2ascii << "changing index from " << longindex << " to " << ("paragraph:" + indextype) << "\n";
470 indexmap.from2to ("paragraph:" + indextype, shortindex);
471 (*query_here).index = shortindex;
472 found = true;
473 }
474 if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
475 // logout << text_t2ascii << "changing index from " << longindex << " to " << ("section:" + indextype) << "\n";
476 indexmap.from2to ("section:" + indextype, shortindex);
477 (*query_here).index = shortindex;
478 }
479 }
480 }
481
482 query_here ++;
483 }
484}
485
Note: See TracBrowser for help on using the repository browser.