root/gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp @ 16947

Revision 16947, 12.3 KB (checked in by mdewsnip, 12 years ago)

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space and speed efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30/////////////////////////////////
31// functions for queryfilterclass
32/////////////////////////////////
33
34
35lucenequeryfilterclass::lucenequeryfilterclass ()
36  : queryfilterclass() {
37
38 
39  FilterOption_t filtopt;
40 
41  // -- onePerTerm  Level          enumerated
42  // likely to be Doc, Sec, Para, but we dont assume anything now
43  filtopt.clear();
44  filtopt.name = "Level";
45  filtopt.type = FilterOption_t::enumeratedt;
46  filtopt.repeatable = FilterOption_t::onePerTerm;
47  filterOptions["Level"] = filtopt;
48
49  // --  IndexField, enumerated, used to list available fields
50  filtopt.clear();
51  filtopt.name = "IndexField";
52  filtopt.type = FilterOption_t::enumeratedt;
53  filtopt.repeatable = FilterOption_t::onePerTerm;
54  filtopt.defaultValue = "";
55  filterOptions["IndexField"] = filtopt;
56
57}
58
59lucenequeryfilterclass::~lucenequeryfilterclass () {
60}
61
62
63//whether a query is a full text browse
64bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
65  return (filterRequestOptions & FRfullTextBrowse);
66}
67
68void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
69  queryfilterclass::configure(key, cfgline);
70
71  if (key == "indexfieldmap") {
72    indexfieldmap.importmap (cfgline);
73   
74    // update the list of indexes in the filter information
75    text_tarray options;
76    indexfieldmap.gettoarray (options);
77
78    text_tarray::const_iterator here = options.begin();
79    text_tarray::const_iterator end = options.end();
80    bool start = true;
81    while (here !=end) {
82      if (!(*here).empty()) {
83    filterOptions["IndexField"].validValues.push_back(*here);
84    if (start) {
85      filterOptions["IndexField"].defaultValue = *here;
86      start = false;
87    }
88      }
89      ++here;
90    }
91  } else if (key == "indexlevels") {
92    text_tarray::const_iterator here = cfgline.begin();
93    text_tarray::const_iterator end = cfgline.end();
94    bool first=true;
95    while (here != end) {
96      if (!(*here).empty()) {
97    if (first) {
98      first = false;
99      // the default is the first value
100      filterOptions["Level"].defaultValue = *here;
101    }
102    filterOptions["Level"].validValues.push_back(*here);
103      }
104      ++here;
105    }
106  } else if (key == "textlevel") {
107      ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
108  }
109 
110}
111
112
113void lucenequeryfilterclass::filter(const FilterRequest_t &request,
114                  FilterResponse_t &response,
115                  comerror_t &err, ostream &logout) { 
116
117  outconvertclass text_t2ascii;
118
119  response.clear ();
120  err = noError;
121  if (db_ptr == NULL) {
122    // most likely a configuration problem
123    logout << text_t2ascii
124       << "configuration error: queryfilter contains a null dbclass\n\n";
125    err = configurationError;
126    return;
127  }
128  if (textsearchptr == NULL) {
129    // most likely a configuration problem
130    logout << text_t2ascii
131       << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
132    err = configurationError;
133    return;
134  }
135  if (full_text_browse(request.filterResultOptions)) {
136    browsefilter(request, response, err, logout);
137    return;
138  }
139  // open the database
140  db_ptr->setlogout(&logout);
141  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
142    // most likely a system problem (we have already checked that the database exists)
143    logout << text_t2ascii
144       << "system problem: open on database \"" << db_filename << "\" failed\n\n";
145    err = systemProblem;
146    return;
147  }
148
149
150  // get the query parameters
151  int startresults, endresults;
152  text_t phrasematch; // not used here any more
153  vector<queryparamclass> queryfilterparams;
154  parse_query_params (request, queryfilterparams, startresults,
155              endresults, phrasematch, logout); 
156 
157   
158  // do query
159  queryresultsclass queryresults;
160  do_multi_query (request, queryfilterparams, queryresults, err, logout);
161  response.error_message = queryresults.error_message;
162  if (err != noError) return;
163
164  // assemble document results
165  if (need_matching_docs (request.filterResultOptions))
166  {
167    // Loop through the query results (ordered by ranking)
168    int resultnum = 1;
169    vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
170    while (docorder_iterator != queryresults.docs.docorder.end())
171    {
172      text_t doc_OID = (*docorder_iterator);
173      // logout << "Matching doc OID: " << doc_OID << endl;
174
175      // Make sure this result is in the docset, and either in the request set or the request set is empty
176      docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
177      if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
178      {
179    // Add the matching document
180    ResultDocInfo_t resultdoc;
181    resultdoc.OID = doc_OID;
182    resultdoc.result_num = resultnum;
183    resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
184    resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
185    response.docInfo.push_back (resultdoc);
186
187    resultnum++;
188      }
189
190      docorder_iterator++;
191    }
192  }
193 
194  // assemble the term results
195  if (need_term_info(request.filterResultOptions)) {
196    // note: the terms have already been sorted and uniqued - ?? have they??
197
198    TermInfo_t terminfo;
199    bool terms_first = true;
200
201    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
202    termfreqclassarray::iterator terms_end = queryresults.terms.end();
203
204    while (terms_here != terms_end) {
205      terminfo.clear();
206      terminfo.term = (*terms_here).termstr;
207      terminfo.freq = (*terms_here).termfreq;
208      // lucene doesn't return any termvariants at this stage,
209      // so make sure the original term is set
210      terminfo.matchTerms.push_back(terminfo.term);
211     
212      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
213      if (terms_first) {
214    text_tset::iterator termvariants_here = queryresults.termvariants.begin();
215    text_tset::iterator termvariants_end = queryresults.termvariants.end();
216    while (termvariants_here != termvariants_end) {
217      terminfo.matchTerms.push_back (*termvariants_here);
218      ++termvariants_here;
219    }
220      }
221      terms_first = false;
222     
223      response.termInfo.push_back (terminfo);
224
225      ++terms_here;
226    }
227
228    // add the stop words
229    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
230    text_tset::iterator stopwords_end = queryresults.stopwords.end();
231    while (stopwords_here != stopwords_end) {
232      response.stopwords.insert(*stopwords_here);
233      ++stopwords_here;
234    }
235  }
236
237  db_ptr->closedatabase();  // Important that local library doesn't leave any files open
238  response.numDocs = queryresults.docs_matched;
239  response.isApprox = queryresults.is_approx;
240}
241
242void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
243                    FilterResponse_t &response,
244                    comerror_t &err, ostream &logout) { 
245
246  outconvertclass text_t2ascii;
247
248  // get the query parameters
249  int startresults, endresults;
250  text_t phrasematch; // not used here any more, just have it so can use
251                      // parse_query_params function
252 
253  vector<queryparamclass> queryfilterparams;
254  parse_query_params (request, queryfilterparams, startresults,
255              endresults, phrasematch, logout); 
256
257    vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
258   
259  // do query
260  queryresultsclass queryresults;
261  queryresults.clear();
262 
263  int numDocs = endresults-startresults;
264  textsearchptr->setcollectdir (collectdir);
265
266  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
267    // most likely a system problem
268    logout << text_t2ascii
269       << "system problem: could not do full text browse with lucene for index \""
270       << (*query_here).index << (*query_here).subcollection
271       << (*query_here).language << "\".\n\n";
272    err = systemProblem;
273    return;
274  }
275
276  // assemble the term results
277  TermInfo_t terminfo;
278 
279  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
280  termfreqclassarray::iterator terms_end = queryresults.terms.end();
281
282  while (terms_here != terms_end) {
283    terminfo.clear();
284    terminfo.term = (*terms_here).termstr;
285    terminfo.freq = (*terms_here).termfreq;
286   
287    response.termInfo.push_back (terminfo);
288
289    ++terms_here;
290  }
291 
292
293}
294
295// lucenesearchptr and db_ptr are assumed to be valid
296void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
297                       const vector<queryparamclass> &query_params,
298                       queryresultsclass &multiresults,
299                       comerror_t &err, ostream &logout) {
300  outconvertclass text_t2ascii;
301
302  err = noError;
303  textsearchptr->setcollectdir (collectdir);
304  multiresults.clear();
305 
306  vector<queryparamclass>::const_iterator query_here = query_params.begin();
307  vector<queryparamclass>::const_iterator query_end = query_params.end();
308  while (query_here != query_end) {
309    queryresultsclass thisqueryresults;
310    if (!textsearchptr->search((*query_here), thisqueryresults)) {
311      // most likely a system problem
312      logout << text_t2ascii
313         << "system problem: could not do search with lucene for index \""
314         << (*query_here).index << (*query_here).level
315         << (*query_here).subcollection
316         << (*query_here).language << "\".\n\n";
317      err = systemProblem;
318      return;
319    }
320
321    // check for syntax error
322    if (thisqueryresults.syntax_error==true) {
323      logout << text_t2ascii
324         << "syntax problem: invalid query string \""
325         << (*query_here).querystring<<"\".\n";
326      err = syntaxError;
327      return;
328    }
329    // combine the results
330    if (need_matching_docs (request.filterResultOptions)) {
331           
332      if (query_params.size() == 1) {
333    multiresults.error_message = thisqueryresults.error_message;
334    multiresults.docs = thisqueryresults.docs; // just one set of results
335    multiresults.docs_matched = thisqueryresults.docs_matched;
336    multiresults.is_approx = thisqueryresults.is_approx;
337   
338      } else {
339    if ((*query_here).combinequery == "and") {
340      multiresults.docs.combine_and (thisqueryresults.docs);
341    } else if ((*query_here).combinequery == "or") {
342      multiresults.docs.combine_or (thisqueryresults.docs);
343    } else if ((*query_here).combinequery == "not") {
344      multiresults.docs.combine_not (thisqueryresults.docs);
345    }
346    multiresults.docs_matched = multiresults.docs.docset.size();
347    multiresults.is_approx = Exact;
348      }
349    }
350
351    // combine the term information
352    if (need_term_info (request.filterResultOptions)) {
353      // append the terms
354      multiresults.orgterms.insert(multiresults.orgterms.end(),
355                   thisqueryresults.orgterms.begin(),
356                   thisqueryresults.orgterms.end());
357
358     
359      // add the term variants -
360      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
361      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
362      while (termvar_here != termvar_end) {
363    multiresults.termvariants.insert(*termvar_here);
364    ++termvar_here;
365      }
366
367      // add the stop words
368      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
369      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
370      while (stopwords_here != stopwords_end) {
371    multiresults.stopwords.insert(*stopwords_here);
372    ++stopwords_here;
373      }
374    }
375   
376    ++query_here;
377  }
378
379  // sort and unique the query terms
380  multiresults.sortuniqqueryterms ();
381}
382
383
384
Note: See TracBrowser for help on using the browser.