root/gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp @ 16915

Revision 16915, 13.0 KB (checked in by mdewsnip, 12 years ago)

Changes made by Richard Managh at DL Consulting Ltd for returning document-level term frequency totals.

  • Property svn:keywords set to Author Date Id Revision
/**********************************************************************
 *
 * lucenequeryfilter.cpp --
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "lucenequeryfilter.h"
#include "fileutil.h"
#include "lucenesearch.h"

/////////////////////////////////
// functions for queryfilterclass
/////////////////////////////////


lucenequeryfilterclass::lucenequeryfilterclass ()
  : queryfilterclass() {

  FilterOption_t filtopt;

  // -- onePerTerm  Level          enumerated
  // likely to be Doc, Sec, Para, but we don't assume anything now
  filtopt.clear();
  filtopt.name = "Level";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filterOptions["Level"] = filtopt;

  // -- IndexField, enumerated, used to list available fields
  filtopt.clear();
  filtopt.name = "IndexField";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["IndexField"] = filtopt;
}

lucenequeryfilterclass::~lucenequeryfilterclass () {
}


// whether a query is a full text browse
bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
  return (filterRequestOptions & FRfullTextBrowse);
}

void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure(key, cfgline);

  if (key == "indexfieldmap") {
    indexfieldmap.importmap (cfgline);

    // update the list of indexes in the filter information
    text_tarray options;
    indexfieldmap.gettoarray (options);

    text_tarray::const_iterator here = options.begin();
    text_tarray::const_iterator end = options.end();
    bool start = true;
    while (here != end) {
      if (!(*here).empty()) {
        filterOptions["IndexField"].validValues.push_back(*here);
        if (start) {
          filterOptions["IndexField"].defaultValue = *here;
          start = false;
        }
      }
      ++here;
    }
  } else if (key == "indexlevels") {
    text_tarray::const_iterator here = cfgline.begin();
    text_tarray::const_iterator end = cfgline.end();
    bool first = true;
    while (here != end) {
      if (!(*here).empty()) {
        if (first) {
          first = false;
          // the default is the first value
          filterOptions["Level"].defaultValue = *here;
        }
        filterOptions["Level"].validValues.push_back(*here);
      }
      ++here;
    }
  } else if (key == "textlevel") {
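    // pass the configured text level straight through to the lucene search object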
    ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
  }
}


void lucenequeryfilterclass::filter(const FilterRequest_t &request,
                                    FilterResponse_t &response,
                                    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
           << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();

    // Now handled by Lucene directly
    //if (endresults == -1) endresults = MAXNUMDOCS;

    while (docorder_here != docorder_end)
      {
        // Now handled by Lucene directly
        //if (resultnum > endresults) break;

        // translate the document number
        if (!translate(db_ptr, *docorder_here, trans_OID))
          {
            logout << text_t2ascii
                   << "warning: could not translate lucene document number \""
                   << *docorder_here << "\" to OID.\n\n";
          }
        else
          {
            docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

            // see if there is a result for this number,
            // if it is in the request set (or the request set is empty)
            if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
              {
                // Now handled by Lucene directly
                //if (resultnum >= startresults) {

                // add this document
                resultdoc.OID = trans_OID;
                resultdoc.result_num = resultnum;
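                // docweight is the document's floating-point search weight; scale it to an
                // integer ranking (x 10000, rounded to nearest) for the result structure.
                // num_query_terms_matched carries the per-document count of matched query terms.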
                resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
                resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;

                response.docInfo.push_back (resultdoc);
                //}
                ++resultnum;
              }
          } // else

        ++docorder_here;
      }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms, i.e. the equivalent (stem/casefold) terms
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          ++termvariants_here;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  db_ptr->closedatabase();  // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
                                          FilterResponse_t &response,
                                          comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more, just kept so we can use the
                      // parse_query_params function

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();

  // do query
  queryresultsclass queryresults;
  queryresults.clear();

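  // how many entries to return in the browse list, derived from the requested
  // start and end positions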
  int numDocs = endresults - startresults;
  textsearchptr->setcollectdir (collectdir);

  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
    // most likely a system problem
    logout << text_t2ascii
           << "system problem: could not do full text browse with lucene for index \""
           << (*query_here).index << (*query_here).subcollection
           << (*query_here).language << "\".\n\n";
    err = systemProblem;
    return;
  }

  // assemble the term results
  TermInfo_t terminfo;

  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
  termfreqclassarray::iterator terms_end = queryresults.terms.end();

  while (terms_here != terms_end) {
    terminfo.clear();
    terminfo.term = (*terms_here).termstr;
    terminfo.freq = (*terms_here).termfreq;

    response.termInfo.push_back (terminfo);

    ++terms_here;
  }
}

// lucenesearchptr and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error == true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring << "\".\n";
      err = syntaxError;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
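        // more than one set of query results: merge this set into the running
        // document set using the requested combine operator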
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}