root/main/trunk/greenstone2/runtime-src/src/colservr/lucenequeryfilter.cpp @ 27084

Revision 27084, 12.8 KB (checked in by kjdon, 7 years ago)

more tidying up of queryfilters. moved some stuff to the lucene class, and added 'virtual's so the right method actually gets used

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30lucenequeryfilterclass::lucenequeryfilterclass ()
31  : fieldedqueryfilterclass() {
32
33 
34  FilterOption_t filtopt;
35 
36  // -- onePerQuery SortField, enumerated, used to list available sorting fields
37  filtopt.clear();
38  filtopt.name = "SortField";
39  filtopt.type = FilterOption_t::enumeratedt;
40  filtopt.repeatable = FilterOption_t::onePerQuery;
41  filtopt.defaultValue = "";
42  filterOptions["SortField"] = filtopt;
43
44  // -- onePerQuery SortOder      enumerated (0=ascending, 1=descending)
45  filtopt.clear();
46  filtopt.name = "SortOrder";
47  filtopt.type = FilterOption_t::enumeratedt;
48  filtopt.repeatable = FilterOption_t::onePerQuery;
49  filtopt.defaultValue = "ascending";
50  filtopt.validValues.push_back("ascending");
51  filtopt.validValues.push_back("descending");
52  filterOptions["SortOrder"] = filtopt;
53
54  // -- onePerQuery Fuzziness string 0.0-1.0
55  filtopt.clear();
56  filtopt.name = "Fuzziness";
57  filtopt.type = FilterOption_t::stringt;
58  filtopt.repeatable = FilterOption_t::onePerQuery;
59  filtopt.defaultValue = "";
60  filterOptions["Fuzziness"] = filtopt;
61
62 // -- onePerQuery FilterString  string
63  filtopt.clear();
64  filtopt.name = "FilterString";
65  filtopt.type = FilterOption_t::stringt;
66  filtopt.repeatable = FilterOption_t::onePerQuery;
67  filtopt.defaultValue = "";
68  filterOptions["FilterString"] = filtopt;
69}
70
71lucenequeryfilterclass::~lucenequeryfilterclass () {
72}
73
74
75
76void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
77  fieldedqueryfilterclass::configure(key, cfgline);
78
79  if (key == "textlevel") {
80    ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
81  }
82}
83
84bool lucenequeryfilterclass::init (ostream &logout) {
85 
86  if (!fieldedqueryfilterclass::init(logout)) {
87    return false;
88  }
89 
90  text_tarray field_array;
91  indexfieldmap.gettoarray(field_array);
92  for (int i=0; i<field_array.size(); i++) {
93    text_t field = field_array[i];
94    if (field!="ZZ" && field !="ZZ") {
95      filterOptions["SortField"].validValues.push_back("by"+field);
96    }
97  }
98  return true;
99}
100
101void lucenequeryfilterclass::set_queryparam_defaults(queryparamclass &query ) {
102
103  fieldedqueryfilterclass::set_queryparam_defaults(query);
104  query.filterstring = filterOptions["FilterString"].defaultValue; 
105  query.sortfield = filterOptions["SortField"].defaultValue;
106  query.sortorder = (filterOptions["SortOrder"].defaultValue == "descending");
107  query.fuzziness = filterOptions["Fuzziness"].defaultValue; 
108
109}
110
111bool lucenequeryfilterclass::set_queryparam_field(const OptionValue_t &option, queryparamclass &query) {
112
113  if (option.name == "FilterString") {
114    query.filterstring = option.value;
115    return true;
116  }
117  if (option.name == "SortField") {
118    query.sortfield = option.value;
119    return true;
120  }
121  if (option.name == "SortOrder") {
122    query.sortorder = (option.value == "descending");
123    return true;
124  }
125  if (option.name == "Fuzziness") {
126    query.fuzziness = option.value;
127    return true;
128  }
129  return fieldedqueryfilterclass::set_queryparam_field(option, query);
130}
131
132void lucenequeryfilterclass::filter(const FilterRequest_t &request,
133                  FilterResponse_t &response,
134                  comerror_t &err, ostream &logout) { 
135
136  outconvertclass text_t2ascii;
137
138  response.clear ();
139  err = noError;
140  if (db_ptr == NULL) {
141    // most likely a configuration problem
142    logout << text_t2ascii
143       << "configuration error: queryfilter contains a null dbclass\n\n";
144    err = configurationError;
145    return;
146  }
147  if (textsearchptr == NULL) {
148    // most likely a configuration problem
149    logout << text_t2ascii
150       << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
151    err = configurationError;
152    return;
153  }
154  if (full_text_browse(request.filterResultOptions)) {
155    browsefilter(request, response, err, logout);
156    return;
157  }
158  // open the database
159  db_ptr->setlogout(&logout);
160  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
161    // most likely a system problem (we have already checked that the database exists)
162    logout << text_t2ascii
163       << "system problem: open on database \"" << db_filename << "\" failed\n\n";
164    err = systemProblem;
165    return;
166  }
167
168
169  // get the query parameters
170  int startresults, endresults;
171  vector<queryparamclass> queryfilterparams;
172  parse_query_params (request, queryfilterparams, startresults,
173              endresults, logout); 
174 
175   
176  // do query
177  queryresultsclass queryresults;
178  do_multi_query (request, queryfilterparams, queryresults, err, logout);
179  response.error_message = queryresults.error_message;
180  if (err != noError) return;
181
182  // assemble document results
183  if (need_matching_docs (request.filterResultOptions))
184  {
185    // Loop through the query results (ordered by ranking)
186    int resultnum = 1;
187    vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
188    while (docorder_iterator != queryresults.docs.docorder.end())
189    {
190      text_t doc_OID = (*docorder_iterator);
191      // logout << "Matching doc OID: " << doc_OID << endl;
192
193      // Make sure this result is in the docset, and either in the request set or the request set is empty
194      docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
195      if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
196      {
197    // Add the matching document
198    ResultDocInfo_t resultdoc;
199    resultdoc.OID = doc_OID;
200    resultdoc.result_num = resultnum;
201    resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
202    resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
203    response.docInfo.push_back (resultdoc);
204
205    resultnum++;
206      }
207
208      docorder_iterator++;
209    }
210  }
211 
212  // assemble the term results
213  if (need_term_info(request.filterResultOptions)) {
214    // note: the terms have already been sorted and uniqued - ?? have they??
215
216    TermInfo_t terminfo;
217    bool terms_first = true;
218
219    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
220    termfreqclassarray::iterator terms_end = queryresults.terms.end();
221
222    while (terms_here != terms_end) {
223      terminfo.clear();
224      terminfo.term = (*terms_here).termstr;
225      terminfo.freq = (*terms_here).termfreq;
226      // lucene doesn't return any termvariants at this stage,
227      // so make sure the original term is set
228      terminfo.matchTerms.push_back(terminfo.term);
229     
230      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
231      if (terms_first) {
232    text_tset::iterator termvariants_here = queryresults.termvariants.begin();
233    text_tset::iterator termvariants_end = queryresults.termvariants.end();
234    while (termvariants_here != termvariants_end) {
235      terminfo.matchTerms.push_back (*termvariants_here);
236      ++termvariants_here;
237    }
238      }
239      terms_first = false;
240     
241      response.termInfo.push_back (terminfo);
242
243      ++terms_here;
244    }
245
246    // add the stop words
247    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
248    text_tset::iterator stopwords_end = queryresults.stopwords.end();
249    while (stopwords_here != stopwords_end) {
250      response.stopwords.insert(*stopwords_here);
251      ++stopwords_here;
252    }
253  }
254
255  db_ptr->closedatabase();  // Important that local library doesn't leave any files open
256  response.numDocs = queryresults.docs_matched;
257  response.isApprox = queryresults.is_approx;
258}
259
260void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
261                    FilterResponse_t &response,
262                    comerror_t &err, ostream &logout) { 
263
264  outconvertclass text_t2ascii;
265
266  // get the query parameters
267  int startresults, endresults;
268 
269  vector<queryparamclass> queryfilterparams;
270  parse_query_params (request, queryfilterparams, startresults,
271              endresults, logout); 
272
273  vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
274   
275  // do query
276  queryresultsclass queryresults;
277  queryresults.clear();
278 
279  int numDocs = endresults-startresults;
280  textsearchptr->setcollectdir (collectdir);
281
282  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
283    // most likely a system problem
284    logout << text_t2ascii
285       << "system problem: could not do full text browse with lucene for index \""
286       << (*query_here).index << (*query_here).subcollection
287       << (*query_here).language << "\".\n\n";
288    err = systemProblem;
289    return;
290  }
291
292  // assemble the term results
293  TermInfo_t terminfo;
294 
295  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
296  termfreqclassarray::iterator terms_end = queryresults.terms.end();
297
298  while (terms_here != terms_end) {
299    terminfo.clear();
300    terminfo.term = (*terms_here).termstr;
301    terminfo.freq = (*terms_here).termfreq;
302   
303    response.termInfo.push_back (terminfo);
304
305    ++terms_here;
306  }
307 
308
309}
310
311// lucenesearchptr and db_ptr are assumed to be valid
312void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
313                       const vector<queryparamclass> &query_params,
314                       queryresultsclass &multiresults,
315                       comerror_t &err, ostream &logout) {
316  outconvertclass text_t2ascii;
317
318  err = noError;
319  textsearchptr->setcollectdir (collectdir);
320  multiresults.clear();
321 
322  vector<queryparamclass>::const_iterator query_here = query_params.begin();
323  vector<queryparamclass>::const_iterator query_end = query_params.end();
324  while (query_here != query_end) {
325    queryresultsclass thisqueryresults;
326    if (!textsearchptr->search((*query_here), thisqueryresults)) {
327      // most likely a system problem
328      logout << text_t2ascii
329         << "system problem: could not do search with lucene for index \""
330         << (*query_here).index << (*query_here).level
331         << (*query_here).subcollection
332         << (*query_here).language << "\".\n\n";
333      err = systemProblem;
334      return;
335    }
336
337    // check for syntax error
338    if (thisqueryresults.syntax_error==true) {
339      logout << text_t2ascii
340         << "syntax problem: invalid query string \""
341         << (*query_here).querystring<<"\".\n";
342      err = syntaxError;
343      return;
344    }
345    // combine the results
346    if (need_matching_docs (request.filterResultOptions)) {
347           
348      if (query_params.size() == 1) {
349    multiresults.error_message = thisqueryresults.error_message;
350    multiresults.docs = thisqueryresults.docs; // just one set of results
351    multiresults.docs_matched = thisqueryresults.docs_matched;
352    multiresults.is_approx = thisqueryresults.is_approx;
353   
354      } else {
355    if ((*query_here).combinequery == "and") {
356      multiresults.docs.combine_and (thisqueryresults.docs);
357    } else if ((*query_here).combinequery == "or") {
358      multiresults.docs.combine_or (thisqueryresults.docs);
359    } else if ((*query_here).combinequery == "not") {
360      multiresults.docs.combine_not (thisqueryresults.docs);
361    }
362    multiresults.docs_matched = multiresults.docs.docset.size();
363    multiresults.is_approx = Exact;
364      }
365    }
366
367    // combine the term information
368    if (need_term_info (request.filterResultOptions)) {
369      // append the terms
370      multiresults.orgterms.insert(multiresults.orgterms.end(),
371                   thisqueryresults.orgterms.begin(),
372                   thisqueryresults.orgterms.end());
373
374     
375      // add the term variants -
376      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
377      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
378      while (termvar_here != termvar_end) {
379    multiresults.termvariants.insert(*termvar_here);
380    ++termvar_here;
381      }
382
383      // add the stop words
384      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
385      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
386      while (stopwords_here != stopwords_end) {
387    multiresults.stopwords.insert(*stopwords_here);
388    ++stopwords_here;
389      }
390    }
391   
392    ++query_here;
393  }
394
395  // sort and unique the query terms
396  multiresults.sortuniqqueryterms ();
397}
398
399
Note: See TracBrowser for help on using the browser.