root/gsdl/trunk/src/colservr/mgqueryfilter.cpp @ 15757

Revision 15681, 17.8 KB (checked in by mdewsnip, 12 years ago)

Removed some unnecessary inclusions of "assert.h".

  • Property svn:keywords set to Author Date Id Revision
/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "mgqueryfilter.h"
#include "fileutil.h"
#include "phrasesearch.h"
#include "mgsearch.h"
#include "phrases.h"

///////////////////////////////
// methods for resultsorderer_t
///////////////////////////////

resultsorderer_t::resultsorderer_t() {
  clear ();
}

void resultsorderer_t::clear() {
  compare_phrase_match = false;
  compare_terms_match = false;
  compare_doc_weight = true;

  docset = NULL;
}

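// comparison operator used when sorting document numbers: when the
// corresponding compare_* flag is set, documents with more phrase matches
// rank first, then those matching more query terms, then those with a
// higher document weight; remaining ties fall back to the document number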
bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
  if (docset == NULL) return t1>t2;

  docresultmap::iterator t1_here = docset->find(t1);
  docresultmap::iterator t2_here = docset->find(t2);
  docresultmap::iterator end = docset->end();

  // sort all the document numbers not in the document set to
  // the end of the list
  if (t1_here == end) {
    if (t2_here == end) return t1>t2;
    else return true;
  } else if (t2_here == end) return false;

  if (compare_phrase_match) {
    if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
  }

  if (compare_terms_match) {
    if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
  }

  if (compare_doc_weight) {
    if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
  }

  return t1>t2;
}



/////////////////////////////////
// functions for mgqueryfilterclass
/////////////////////////////////

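// handles the configuration options understood by the base queryfilterclass,
// plus "indexstem", which is passed through to the underlying mgsearchclass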
void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure (key, cfgline);

  if (key == "indexstem") {
    ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
  }
}

// loads up phrases data structure with any phrases (that's the quoted bits)
// occurring in the querystring
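// for example, the query string: mg "digital library" "new zealand" text
// gives two phrases, {digital, library} and {new, zealand}; a quoted single
// word is not treated as a phrase (tmpterms must hold more than one term)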
void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
                                           const termfreqclassarray &orgterms,
                                           vector<termfreqclassarray> &phrases) {

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  termfreqclassarray tmpterms;

  int termcount = 0;
  bool foundquote = false;
  bool foundbreak = false;
  bool start = true;
  while (here != end) {
    if (*here == '\"') {
      if (foundquote) {
        if (!foundbreak && !start) {
          tmpterms.push_back (orgterms[termcount]);
          ++termcount;
        }
        if (tmpterms.size() > 1) {
          phrases.push_back (tmpterms);
        }
        tmpterms.erase (tmpterms.begin(), tmpterms.end());

        foundquote = false;
        foundbreak = true;
      } else foundquote = true;
    } else if (!is_unicode_letdig(*here)) {
      // found a break between terms
      if (!foundbreak && !start) {
        if (foundquote) {
          tmpterms.push_back (orgterms[termcount]);
        }
        ++termcount;
      }
      foundbreak = true;
    } else {
      start = false;
      foundbreak = false;
    }
    ++here;
  }
}

// do additional query processing
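// at present this means phrase matching: for each matched document, count how
// many of the quoted phrases in the query string occur in it (stored in
// num_phrase_match), using OID_phrase_search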
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
                                       queryresultsclass &queryresults) {

  // post-process the results if needed
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {

    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);

    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();

      while (this_phrase != end_phrase) {

        // process each of the matched documents
        docresultmap::iterator docs_here = queryresults.docs.docset.begin();
        docresultmap::iterator docs_end = queryresults.docs.docset.end();
        while (docs_here != docs_end) {
          if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
                                 queryparams.subcollection, queryparams.language,
                                 longindex, queryparams.collection, *this_phrase,
                                 (*docs_here).second.docnum)) {
            ++docs_here->second.num_phrase_match;
          }

          ++docs_here;
        }
        ++this_phrase;
      }
    }
  }
}

// do query that might involve multiple sub queries
// textsearchptr and db_ptr are assumed to be valid
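// each sub-query's results are merged into multiresults according to its
// combinequery setting ("and", "or" or "not"); with a single sub-query the
// results are used as-is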
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
                                         const vector<queryparamclass> &query_params,
                                         queryresultsclass &multiresults,
                                         comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with mg for index \""
             << (*query_here).index << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
          !thisqueryresults.docs.docset.empty()) {
        post_process (*query_here, thisqueryresults);
        thisqueryresults.postprocessed = true;
        multiresults.postprocessed = true;
      } else {
        num_phrases = 0;
      }

      if (query_params.size() == 1) {
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}

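// orders the matched documents for output: primarily by the number of
// phrases matched, then by document weight (see resultsorderer_t above)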
void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
                                           docresultsclass &docs) {
  resultsorderer_t resultsorderer;
  resultsorderer.compare_phrase_match = true;
  resultsorderer.docset = &(docs.docset);

  // first get a list of document numbers
  docs.docnum_order();

  sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
}


mgqueryfilterclass::mgqueryfilterclass ()
  :queryfilterclass() {

  num_phrases = 0;
}

mgqueryfilterclass::~mgqueryfilterclass () {
}

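// main entry point for the filter: runs the (possibly multi-part) query,
// assembles the matching documents (honouring the StartResults, EndResults
// and PhraseMatch options) and the term information requested, and fills
// in the response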
void mgqueryfilterclass::filter (const FilterRequest_t &request,
                                 FilterResponse_t &response,
                                 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: mgqueryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
    err = configurationError;
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
           << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);
  // do any mg specific diddling with query parameters that may be required
  mg_parse_query_params (request, queryfilterparams, startresults,
                         endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    // only want to sort the docs if we have done a ranked search or there were phrases
    if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
      sort_doc_results (request, queryresults.docs);
    }
    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // documents containing matching phrases will be sorted to the top so
    // we can break out once we're past those that match the PhraseMatch
    // option -- "all_phrases"  = return only those documents containing all
    //                            phrases in the query string
    //           "some_phrases" = return only those documents containing
    //                            at least one of the phrases in the query string
    //           "all_docs"     = return all documents regardless
    if (num_phrases > 0) {
      int numdocs = 0;
      while (docorder_here != docorder_end) {
        docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

        if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
            ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
          queryresults.docs_matched = numdocs;
          break;
        }
        ++numdocs;
        ++docorder_here;
      }
    }

    if (endresults == -1) endresults = MAXNUMDOCS;
    docorder_here = queryresults.docs.docorder.begin();
    while (docorder_here != docorder_end) {
      if (resultnum > endresults || resultnum > queryresults.docs_matched) break;

      // translate the document number
      if (!translate(db_ptr, *docorder_here, trans_OID)) {
        logout << text_t2ascii
               << "warning: could not translate mg document number \""
               << *docorder_here << "\" to OID.\n\n";

      } else {
        docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

        // see if there is a result for this number,
        // if it is in the request set (or the request set is empty)
        if (docset_here != queryresults.docs.docset.end() &&
            (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
          if (resultnum >= startresults) {
            // add this document
            resultdoc.OID = trans_OID;
            resultdoc.result_num = resultnum;
            // scale the floating-point document weight to an integer ranking
            resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

            // these next two are not available on all versions of mg
            resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
            resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

            response.docInfo.push_back (resultdoc);
          }

          ++resultnum;
        }
      }

      ++docorder_here;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          ++termvariants_here;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }
  }

  db_ptr->closedatabase();  // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

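// mg specific adjustment of the query parameters: if the query string contains
// genuine (multi-word) phrases, switch to a boolean search on the index with
// the finest available granularity and set maxdocs to "all", so that phrase
// post-processing sees as many candidate documents as possible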
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
                                                vector<queryparamclass> &query_params,
                                                int &/*startresults*/, int &/*endresults*/,
                                                text_t &/*phrasematch*/, ostream &/*logout*/) {

  //  outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on whether the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {

    // [kjdon 12/2005] we don't want to do a phrase search if the only phrases
    // are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);

    if (phrases.size() > 0) {
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex; text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;
      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
        text_t shortindex;
        if (indexmap.fromexists ("paragraph:" + indextype)) {
          indexmap.from2to ("paragraph:" + indextype, shortindex);
          (*query_here).index = shortindex;
          found = true;
        }
        if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
          indexmap.from2to ("section:" + indextype, shortindex);
          (*query_here).index = shortindex;
        }
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)

    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex; text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];
    if (indextype == "ProgNumber") {
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
        if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
            (*here >= '0' && *here <= '9')) {
          new_querystring.push_back (*here);
        }
        ++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif
    ++query_here;
  }
}