source: main/trunk/greenstone2/runtime-src/src/colservr/lucenequeryfilter.cpp@ 27084

Last change on this file since 27084 was 27084, checked in by kjdon, 11 years ago

more tidying up of queryfilters. moved some stuff to the lucene class, and added 'virtual's so the right method actually gets used

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
RevLine 
[8027]1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30lucenequeryfilterclass::lucenequeryfilterclass ()
[27064]31 : fieldedqueryfilterclass() {
[8027]32
33
34 FilterOption_t filtopt;
35
[27064]36 // -- onePerQuery SortField, enumerated, used to list available sorting fields
[8027]37 filtopt.clear();
[27064]38 filtopt.name = "SortField";
[8027]39 filtopt.type = FilterOption_t::enumeratedt;
[27064]40 filtopt.repeatable = FilterOption_t::onePerQuery;
[20727]41 filtopt.defaultValue = "";
[27064]42 filterOptions["SortField"] = filtopt;
[8027]43
[27064]44 // -- onePerQuery SortOder enumerated (0=ascending, 1=descending)
[8027]45 filtopt.clear();
[27064]46 filtopt.name = "SortOrder";
[8027]47 filtopt.type = FilterOption_t::enumeratedt;
[27064]48 filtopt.repeatable = FilterOption_t::onePerQuery;
49 filtopt.defaultValue = "ascending";
50 filtopt.validValues.push_back("ascending");
51 filtopt.validValues.push_back("descending");
52 filterOptions["SortOrder"] = filtopt;
53
54 // -- onePerQuery Fuzziness string 0.0-1.0
55 filtopt.clear();
56 filtopt.name = "Fuzziness";
57 filtopt.type = FilterOption_t::stringt;
58 filtopt.repeatable = FilterOption_t::onePerQuery;
[8027]59 filtopt.defaultValue = "";
[27064]60 filterOptions["Fuzziness"] = filtopt;
[8027]61
[27064]62 // -- onePerQuery FilterString string
63 filtopt.clear();
64 filtopt.name = "FilterString";
65 filtopt.type = FilterOption_t::stringt;
66 filtopt.repeatable = FilterOption_t::onePerQuery;
67 filtopt.defaultValue = "";
68 filterOptions["FilterString"] = filtopt;
[8027]69}
70
71lucenequeryfilterclass::~lucenequeryfilterclass () {
72}
73
74
75
76void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
[27064]77 fieldedqueryfilterclass::configure(key, cfgline);
[8027]78
[27064]79 if (key == "textlevel") {
80 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
[8027]81 }
82}
83
[20727]84bool lucenequeryfilterclass::init (ostream &logout) {
85
[27064]86 if (!fieldedqueryfilterclass::init(logout)) {
[20727]87 return false;
88 }
89
[27064]90 text_tarray field_array;
91 indexfieldmap.gettoarray(field_array);
92 for (int i=0; i<field_array.size(); i++) {
93 text_t field = field_array[i];
94 if (field!="ZZ" && field !="ZZ") {
95 filterOptions["SortField"].validValues.push_back("by"+field);
[20727]96 }
97 }
98 return true;
99}
100
[27064]101void lucenequeryfilterclass::set_queryparam_defaults(queryparamclass &query ) {
102
103 fieldedqueryfilterclass::set_queryparam_defaults(query);
104 query.filterstring = filterOptions["FilterString"].defaultValue;
105 query.sortfield = filterOptions["SortField"].defaultValue;
106 query.sortorder = (filterOptions["SortOrder"].defaultValue == "descending");
107 query.fuzziness = filterOptions["Fuzziness"].defaultValue;
108
109}
110
[27084]111bool lucenequeryfilterclass::set_queryparam_field(const OptionValue_t &option, queryparamclass &query) {
[27064]112
[27084]113 if (option.name == "FilterString") {
114 query.filterstring = option.value;
115 return true;
116 }
117 if (option.name == "SortField") {
118 query.sortfield = option.value;
119 return true;
120 }
121 if (option.name == "SortOrder") {
122 query.sortorder = (option.value == "descending");
123 return true;
124 }
125 if (option.name == "Fuzziness") {
126 query.fuzziness = option.value;
127 return true;
128 }
129 return fieldedqueryfilterclass::set_queryparam_field(option, query);
130}
131
[8027]132void lucenequeryfilterclass::filter(const FilterRequest_t &request,
133 FilterResponse_t &response,
134 comerror_t &err, ostream &logout) {
135
136 outconvertclass text_t2ascii;
137
138 response.clear ();
139 err = noError;
[15558]140 if (db_ptr == NULL) {
[8027]141 // most likely a configuration problem
142 logout << text_t2ascii
[15558]143 << "configuration error: queryfilter contains a null dbclass\n\n";
[8027]144 err = configurationError;
145 return;
146 }
147 if (textsearchptr == NULL) {
148 // most likely a configuration problem
149 logout << text_t2ascii
[9110]150 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
[8027]151 err = configurationError;
152 return;
153 }
154 if (full_text_browse(request.filterResultOptions)) {
155 browsefilter(request, response, err, logout);
156 return;
157 }
158 // open the database
[15558]159 db_ptr->setlogout(&logout);
160 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
161 // most likely a system problem (we have already checked that the database exists)
[8027]162 logout << text_t2ascii
[15558]163 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
[8027]164 err = systemProblem;
165 return;
166 }
167
168
169 // get the query parameters
170 int startresults, endresults;
171 vector<queryparamclass> queryfilterparams;
172 parse_query_params (request, queryfilterparams, startresults,
[27064]173 endresults, logout);
[8027]174
175
176 // do query
177 queryresultsclass queryresults;
178 do_multi_query (request, queryfilterparams, queryresults, err, logout);
[12421]179 response.error_message = queryresults.error_message;
[8027]180 if (err != noError) return;
[12421]181
[8027]182 // assemble document results
[16947]183 if (need_matching_docs (request.filterResultOptions))
184 {
185 // Loop through the query results (ordered by ranking)
[8027]186 int resultnum = 1;
[16947]187 vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
188 while (docorder_iterator != queryresults.docs.docorder.end())
189 {
190 text_t doc_OID = (*docorder_iterator);
191 // logout << "Matching doc OID: " << doc_OID << endl;
[8027]192
[16947]193 // Make sure this result is in the docset, and either in the request set or the request set is empty
194 docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
195 if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
[12655]196 {
[16947]197 // Add the matching document
198 ResultDocInfo_t resultdoc;
199 resultdoc.OID = doc_OID;
200 resultdoc.result_num = resultnum;
201 resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
202 resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
203 response.docInfo.push_back (resultdoc);
[8027]204
[16947]205 resultnum++;
206 }
[8027]207
[16947]208 docorder_iterator++;
209 }
210 }
[12655]211
[8027]212 // assemble the term results
213 if (need_term_info(request.filterResultOptions)) {
214 // note: the terms have already been sorted and uniqued - ?? have they??
215
216 TermInfo_t terminfo;
217 bool terms_first = true;
218
219 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
220 termfreqclassarray::iterator terms_end = queryresults.terms.end();
221
222 while (terms_here != terms_end) {
223 terminfo.clear();
224 terminfo.term = (*terms_here).termstr;
225 terminfo.freq = (*terms_here).termfreq;
[13063]226 // lucene doesn't return any termvariants at this stage,
227 // so make sure the original term is set
228 terminfo.matchTerms.push_back(terminfo.term);
229
[8027]230 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
231 if (terms_first) {
232 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
233 text_tset::iterator termvariants_end = queryresults.termvariants.end();
234 while (termvariants_here != termvariants_end) {
235 terminfo.matchTerms.push_back (*termvariants_here);
[9620]236 ++termvariants_here;
[8027]237 }
238 }
239 terms_first = false;
240
241 response.termInfo.push_back (terminfo);
242
[9620]243 ++terms_here;
[8027]244 }
[12380]245
246 // add the stop words
247 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
248 text_tset::iterator stopwords_end = queryresults.stopwords.end();
249 while (stopwords_here != stopwords_end) {
250 response.stopwords.insert(*stopwords_here);
251 ++stopwords_here;
252 }
[8027]253 }
254
[15558]255 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
[8027]256 response.numDocs = queryresults.docs_matched;
257 response.isApprox = queryresults.is_approx;
258}
259
260void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
261 FilterResponse_t &response,
262 comerror_t &err, ostream &logout) {
263
264 outconvertclass text_t2ascii;
265
266 // get the query parameters
267 int startresults, endresults;
268
269 vector<queryparamclass> queryfilterparams;
270 parse_query_params (request, queryfilterparams, startresults,
[27064]271 endresults, logout);
[8027]272
[22050]273 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
[8027]274
275 // do query
276 queryresultsclass queryresults;
277 queryresults.clear();
278
279 int numDocs = endresults-startresults;
280 textsearchptr->setcollectdir (collectdir);
281
282 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
283 // most likely a system problem
284 logout << text_t2ascii
285 << "system problem: could not do full text browse with lucene for index \""
286 << (*query_here).index << (*query_here).subcollection
287 << (*query_here).language << "\".\n\n";
288 err = systemProblem;
289 return;
290 }
291
292 // assemble the term results
293 TermInfo_t terminfo;
294
295 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
296 termfreqclassarray::iterator terms_end = queryresults.terms.end();
297
298 while (terms_here != terms_end) {
299 terminfo.clear();
300 terminfo.term = (*terms_here).termstr;
301 terminfo.freq = (*terms_here).termfreq;
302
303 response.termInfo.push_back (terminfo);
304
[9620]305 ++terms_here;
[8027]306 }
307
308
309}
310
[15558]311// lucenesearchptr and db_ptr are assumed to be valid
[8027]312void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
313 const vector<queryparamclass> &query_params,
314 queryresultsclass &multiresults,
315 comerror_t &err, ostream &logout) {
316 outconvertclass text_t2ascii;
317
318 err = noError;
319 textsearchptr->setcollectdir (collectdir);
320 multiresults.clear();
321
322 vector<queryparamclass>::const_iterator query_here = query_params.begin();
323 vector<queryparamclass>::const_iterator query_end = query_params.end();
324 while (query_here != query_end) {
325 queryresultsclass thisqueryresults;
326 if (!textsearchptr->search((*query_here), thisqueryresults)) {
327 // most likely a system problem
328 logout << text_t2ascii
329 << "system problem: could not do search with lucene for index \""
[9090]330 << (*query_here).index << (*query_here).level
331 << (*query_here).subcollection
[8027]332 << (*query_here).language << "\".\n\n";
333 err = systemProblem;
334 return;
335 }
336
337 // check for syntax error
338 if (thisqueryresults.syntax_error==true) {
339 logout << text_t2ascii
340 << "syntax problem: invalid query string \""
341 << (*query_here).querystring<<"\".\n";
342 err = syntaxError;
343 return;
344 }
345 // combine the results
346 if (need_matching_docs (request.filterResultOptions)) {
347
348 if (query_params.size() == 1) {
[12421]349 multiresults.error_message = thisqueryresults.error_message;
[8027]350 multiresults.docs = thisqueryresults.docs; // just one set of results
351 multiresults.docs_matched = thisqueryresults.docs_matched;
352 multiresults.is_approx = thisqueryresults.is_approx;
353
354 } else {
355 if ((*query_here).combinequery == "and") {
356 multiresults.docs.combine_and (thisqueryresults.docs);
357 } else if ((*query_here).combinequery == "or") {
358 multiresults.docs.combine_or (thisqueryresults.docs);
359 } else if ((*query_here).combinequery == "not") {
360 multiresults.docs.combine_not (thisqueryresults.docs);
361 }
362 multiresults.docs_matched = multiresults.docs.docset.size();
363 multiresults.is_approx = Exact;
364 }
365 }
366
367 // combine the term information
368 if (need_term_info (request.filterResultOptions)) {
369 // append the terms
370 multiresults.orgterms.insert(multiresults.orgterms.end(),
371 thisqueryresults.orgterms.begin(),
372 thisqueryresults.orgterms.end());
373
374
375 // add the term variants -
376 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
377 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
378 while (termvar_here != termvar_end) {
379 multiresults.termvariants.insert(*termvar_here);
[9620]380 ++termvar_here;
[8027]381 }
[12380]382
383 // add the stop words
384 text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
385 text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
386 while (stopwords_here != stopwords_end) {
387 multiresults.stopwords.insert(*stopwords_here);
388 ++stopwords_here;
389 }
[8027]390 }
391
[9620]392 ++query_here;
[8027]393 }
394
395 // sort and unique the query terms
396 multiresults.sortuniqqueryterms ();
397}
398
399
Note: See TracBrowser for help on using the repository browser.