source: main/trunk/greenstone2/runtime-src/src/colservr/lucenequeryfilter.cpp@ 27064

Last change on this file since 27064 was 27064, checked in by kjdon, 11 years ago

adding reverse sort/sort order in for lucene search results sorting. reorganising code to avoid duplication, added fieldedqueryfilter in the chain of inheritance

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30lucenequeryfilterclass::lucenequeryfilterclass ()
31 : fieldedqueryfilterclass() {
32
33
34 FilterOption_t filtopt;
35
36 // -- onePerQuery SortField, enumerated, used to list available sorting fields
37 filtopt.clear();
38 filtopt.name = "SortField";
39 filtopt.type = FilterOption_t::enumeratedt;
40 filtopt.repeatable = FilterOption_t::onePerQuery;
41 filtopt.defaultValue = "";
42 filterOptions["SortField"] = filtopt;
43
44 // -- onePerQuery SortOder enumerated (0=ascending, 1=descending)
45 filtopt.clear();
46 filtopt.name = "SortOrder";
47 filtopt.type = FilterOption_t::enumeratedt;
48 filtopt.repeatable = FilterOption_t::onePerQuery;
49 filtopt.defaultValue = "ascending";
50 filtopt.validValues.push_back("ascending");
51 filtopt.validValues.push_back("descending");
52 filterOptions["SortOrder"] = filtopt;
53
54 // -- onePerQuery Fuzziness string 0.0-1.0
55 filtopt.clear();
56 filtopt.name = "Fuzziness";
57 filtopt.type = FilterOption_t::stringt;
58 filtopt.repeatable = FilterOption_t::onePerQuery;
59 filtopt.defaultValue = "";
60 filterOptions["Fuzziness"] = filtopt;
61
62 // -- onePerQuery FilterString string
63 filtopt.clear();
64 filtopt.name = "FilterString";
65 filtopt.type = FilterOption_t::stringt;
66 filtopt.repeatable = FilterOption_t::onePerQuery;
67 filtopt.defaultValue = "";
68 filterOptions["FilterString"] = filtopt;
69}
70
71lucenequeryfilterclass::~lucenequeryfilterclass () {
72}
73
74
75
76void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
77 fieldedqueryfilterclass::configure(key, cfgline);
78
79 if (key == "textlevel") {
80 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
81 }
82}
83
84bool lucenequeryfilterclass::init (ostream &logout) {
85
86 if (!fieldedqueryfilterclass::init(logout)) {
87 return false;
88 }
89
90 text_tarray field_array;
91 indexfieldmap.gettoarray(field_array);
92 for (int i=0; i<field_array.size(); i++) {
93 text_t field = field_array[i];
94 if (field!="ZZ" && field !="ZZ") {
95 filterOptions["SortField"].validValues.push_back("by"+field);
96 }
97 }
98 return true;
99}
100
101void lucenequeryfilterclass::set_queryparam_defaults(queryparamclass &query ) {
102
103 fieldedqueryfilterclass::set_queryparam_defaults(query);
104 query.filterstring = filterOptions["FilterString"].defaultValue;
105 query.sortfield = filterOptions["SortField"].defaultValue;
106 query.sortorder = (filterOptions["SortOrder"].defaultValue == "descending");
107 query.fuzziness = filterOptions["Fuzziness"].defaultValue;
108
109}
110
111
112void lucenequeryfilterclass::filter(const FilterRequest_t &request,
113 FilterResponse_t &response,
114 comerror_t &err, ostream &logout) {
115
116 outconvertclass text_t2ascii;
117
118 response.clear ();
119 err = noError;
120 if (db_ptr == NULL) {
121 // most likely a configuration problem
122 logout << text_t2ascii
123 << "configuration error: queryfilter contains a null dbclass\n\n";
124 err = configurationError;
125 return;
126 }
127 if (textsearchptr == NULL) {
128 // most likely a configuration problem
129 logout << text_t2ascii
130 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
131 err = configurationError;
132 return;
133 }
134 if (full_text_browse(request.filterResultOptions)) {
135 browsefilter(request, response, err, logout);
136 return;
137 }
138 // open the database
139 db_ptr->setlogout(&logout);
140 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
141 // most likely a system problem (we have already checked that the database exists)
142 logout << text_t2ascii
143 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
144 err = systemProblem;
145 return;
146 }
147
148
149 // get the query parameters
150 int startresults, endresults;
151 vector<queryparamclass> queryfilterparams;
152 parse_query_params (request, queryfilterparams, startresults,
153 endresults, logout);
154
155
156 // do query
157 queryresultsclass queryresults;
158 do_multi_query (request, queryfilterparams, queryresults, err, logout);
159 response.error_message = queryresults.error_message;
160 if (err != noError) return;
161
162 // assemble document results
163 if (need_matching_docs (request.filterResultOptions))
164 {
165 // Loop through the query results (ordered by ranking)
166 int resultnum = 1;
167 vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
168 while (docorder_iterator != queryresults.docs.docorder.end())
169 {
170 text_t doc_OID = (*docorder_iterator);
171 // logout << "Matching doc OID: " << doc_OID << endl;
172
173 // Make sure this result is in the docset, and either in the request set or the request set is empty
174 docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
175 if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
176 {
177 // Add the matching document
178 ResultDocInfo_t resultdoc;
179 resultdoc.OID = doc_OID;
180 resultdoc.result_num = resultnum;
181 resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
182 resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
183 response.docInfo.push_back (resultdoc);
184
185 resultnum++;
186 }
187
188 docorder_iterator++;
189 }
190 }
191
192 // assemble the term results
193 if (need_term_info(request.filterResultOptions)) {
194 // note: the terms have already been sorted and uniqued - ?? have they??
195
196 TermInfo_t terminfo;
197 bool terms_first = true;
198
199 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
200 termfreqclassarray::iterator terms_end = queryresults.terms.end();
201
202 while (terms_here != terms_end) {
203 terminfo.clear();
204 terminfo.term = (*terms_here).termstr;
205 terminfo.freq = (*terms_here).termfreq;
206 // lucene doesn't return any termvariants at this stage,
207 // so make sure the original term is set
208 terminfo.matchTerms.push_back(terminfo.term);
209
210 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
211 if (terms_first) {
212 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
213 text_tset::iterator termvariants_end = queryresults.termvariants.end();
214 while (termvariants_here != termvariants_end) {
215 terminfo.matchTerms.push_back (*termvariants_here);
216 ++termvariants_here;
217 }
218 }
219 terms_first = false;
220
221 response.termInfo.push_back (terminfo);
222
223 ++terms_here;
224 }
225
226 // add the stop words
227 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
228 text_tset::iterator stopwords_end = queryresults.stopwords.end();
229 while (stopwords_here != stopwords_end) {
230 response.stopwords.insert(*stopwords_here);
231 ++stopwords_here;
232 }
233 }
234
235 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
236 response.numDocs = queryresults.docs_matched;
237 response.isApprox = queryresults.is_approx;
238}
239
240void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
241 FilterResponse_t &response,
242 comerror_t &err, ostream &logout) {
243
244 outconvertclass text_t2ascii;
245
246 // get the query parameters
247 int startresults, endresults;
248
249 vector<queryparamclass> queryfilterparams;
250 parse_query_params (request, queryfilterparams, startresults,
251 endresults, logout);
252
253 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
254
255 // do query
256 queryresultsclass queryresults;
257 queryresults.clear();
258
259 int numDocs = endresults-startresults;
260 textsearchptr->setcollectdir (collectdir);
261
262 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
263 // most likely a system problem
264 logout << text_t2ascii
265 << "system problem: could not do full text browse with lucene for index \""
266 << (*query_here).index << (*query_here).subcollection
267 << (*query_here).language << "\".\n\n";
268 err = systemProblem;
269 return;
270 }
271
272 // assemble the term results
273 TermInfo_t terminfo;
274
275 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
276 termfreqclassarray::iterator terms_end = queryresults.terms.end();
277
278 while (terms_here != terms_end) {
279 terminfo.clear();
280 terminfo.term = (*terms_here).termstr;
281 terminfo.freq = (*terms_here).termfreq;
282
283 response.termInfo.push_back (terminfo);
284
285 ++terms_here;
286 }
287
288
289}
290
291// lucenesearchptr and db_ptr are assumed to be valid
292void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
293 const vector<queryparamclass> &query_params,
294 queryresultsclass &multiresults,
295 comerror_t &err, ostream &logout) {
296 outconvertclass text_t2ascii;
297
298 err = noError;
299 textsearchptr->setcollectdir (collectdir);
300 multiresults.clear();
301
302 vector<queryparamclass>::const_iterator query_here = query_params.begin();
303 vector<queryparamclass>::const_iterator query_end = query_params.end();
304 while (query_here != query_end) {
305 queryresultsclass thisqueryresults;
306 if (!textsearchptr->search((*query_here), thisqueryresults)) {
307 // most likely a system problem
308 logout << text_t2ascii
309 << "system problem: could not do search with lucene for index \""
310 << (*query_here).index << (*query_here).level
311 << (*query_here).subcollection
312 << (*query_here).language << "\".\n\n";
313 err = systemProblem;
314 return;
315 }
316
317 // check for syntax error
318 if (thisqueryresults.syntax_error==true) {
319 logout << text_t2ascii
320 << "syntax problem: invalid query string \""
321 << (*query_here).querystring<<"\".\n";
322 err = syntaxError;
323 return;
324 }
325 // combine the results
326 if (need_matching_docs (request.filterResultOptions)) {
327
328 if (query_params.size() == 1) {
329 multiresults.error_message = thisqueryresults.error_message;
330 multiresults.docs = thisqueryresults.docs; // just one set of results
331 multiresults.docs_matched = thisqueryresults.docs_matched;
332 multiresults.is_approx = thisqueryresults.is_approx;
333
334 } else {
335 if ((*query_here).combinequery == "and") {
336 multiresults.docs.combine_and (thisqueryresults.docs);
337 } else if ((*query_here).combinequery == "or") {
338 multiresults.docs.combine_or (thisqueryresults.docs);
339 } else if ((*query_here).combinequery == "not") {
340 multiresults.docs.combine_not (thisqueryresults.docs);
341 }
342 multiresults.docs_matched = multiresults.docs.docset.size();
343 multiresults.is_approx = Exact;
344 }
345 }
346
347 // combine the term information
348 if (need_term_info (request.filterResultOptions)) {
349 // append the terms
350 multiresults.orgterms.insert(multiresults.orgterms.end(),
351 thisqueryresults.orgterms.begin(),
352 thisqueryresults.orgterms.end());
353
354
355 // add the term variants -
356 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
357 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
358 while (termvar_here != termvar_end) {
359 multiresults.termvariants.insert(*termvar_here);
360 ++termvar_here;
361 }
362
363 // add the stop words
364 text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
365 text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
366 while (stopwords_here != stopwords_end) {
367 multiresults.stopwords.insert(*stopwords_here);
368 ++stopwords_here;
369 }
370 }
371
372 ++query_here;
373 }
374
375 // sort and unique the query terms
376 multiresults.sortuniqqueryterms ();
377}
378
379
Note: See TracBrowser for help on using the repository browser.