source: main/trunk/greenstone2/runtime-src/src/colservr/lucenequeryfilter.cpp@ 27061

Last change on this file since 27061 was 22050, checked in by davidb, 14 years ago

Updating of code to support sql-query filter

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
/**********************************************************************
 *
 * lucenequeryfilter.cpp --
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "lucenequeryfilter.h"
#include "fileutil.h"
#include "lucenesearch.h"

/////////////////////////////////
// functions for queryfilterclass
/////////////////////////////////

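// The constructor registers the two lucene-specific filter options:
// "Level" (the search granularity) and "IndexField" (the field to search in).
// Both are enumerated, one-per-term options; their valid values and defaults
// are filled in later from the collection configuration via configure().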
lucenequeryfilterclass::lucenequeryfilterclass ()
  : queryfilterclass() {

  FilterOption_t filtopt;

  // -- onePerTerm Level enumerated
  //    likely to be Doc, Sec, Para, but we don't assume anything yet
  filtopt.clear();
  filtopt.name = "Level";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["Level"] = filtopt;

  // -- IndexField, enumerated, used to list available fields
  filtopt.clear();
  filtopt.name = "IndexField";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["IndexField"] = filtopt;
}

lucenequeryfilterclass::~lucenequeryfilterclass () {
}


// whether a query is a full text browse
bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
  return (filterRequestOptions & FRfullTextBrowse);
}

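// configure() consumes collection-configuration lines handed down from the
// base queryfilterclass.  The keys handled here are "indexfieldmap",
// "levelmap", "indexlevels", "textlevel", "defaultindex" and "defaultlevel".
// As a rough illustration only (the exact collect.cfg syntax may differ),
// assuming importmap's usual "from->to" pair format, the input might look
// like:
//
//   indexfieldmap  text->TX  dc.Title->TI
//   indexlevels    document  section
//   defaultindex   text
//
// Each indexfieldmap "to" name becomes a valid value of the "IndexField"
// filter option, and each indexlevels entry a valid value of "Level".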
void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure(key, cfgline);

  if (key == "indexfieldmap") {
    indexfieldmap.importmap (cfgline);

    // update the list of indexes in the filter information
    text_tarray options;
    indexfieldmap.gettoarray (options);
    filterOptions["IndexField"].validValues = options;
  } else if (key == "levelmap") {
    levelmap.importmap (cfgline);
  } else if (key == "indexlevels") {
    text_tarray::const_iterator here = cfgline.begin();
    text_tarray::const_iterator end = cfgline.end();
    while (here != end) {
      if (!(*here).empty()) {
        filterOptions["Level"].validValues.push_back(*here);
      }
      ++here;
    }
  } else if (key == "textlevel") {
    ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
  } else if (key == "defaultindex") {
    indexfieldmap.from2to (cfgline[0], filterOptions["IndexField"].defaultValue);
  } else if (key == "defaultlevel") {
    levelmap.from2to (cfgline[0], filterOptions["Level"].defaultValue);
  }
}

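// init() runs once configuration is complete.  If no explicit default was
// configured, it falls back to the first available value: the first "from"
// entry of indexfieldmap for "IndexField", and the first configured level
// for "Level".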
bool lucenequeryfilterclass::init (ostream &logout) {

  if (!queryfilterclass::init(logout)) {
    return false;
  }

  if (filterOptions["IndexField"].defaultValue.empty()) {
    // use first index in map as default if no default is set explicitly
    text_tarray fromarray;
    indexfieldmap.getfromarray(fromarray);
    if (fromarray.size()) {
      filterOptions["IndexField"].defaultValue = fromarray[0];
    }
  }
  if (filterOptions["Level"].defaultValue.empty()) {
    // use first configured level as default if no default is set explicitly
    if (!filterOptions["Level"].validValues.empty()) {
      filterOptions["Level"].defaultValue = filterOptions["Level"].validValues[0];
    }
  }

  return true;
}

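// filter() is the main entry point for a query request.  It verifies that
// the database and the lucene search object are present, hands full text
// browse requests off to browsefilter(), and otherwise opens the database,
// parses the query parameters, runs the (possibly multi-part) query through
// do_multi_query(), and assembles the response: one ResultDocInfo_t per
// matching document (with its ranking and number of matched terms) plus,
// when requested, the term, term-variant and stopword information.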
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
                                    FilterResponse_t &response,
                                    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
           << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions))
    {
      // Loop through the query results (ordered by ranking)
      int resultnum = 1;
      vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
      while (docorder_iterator != queryresults.docs.docorder.end())
        {
          text_t doc_OID = (*docorder_iterator);
          // logout << "Matching doc OID: " << doc_OID << endl;

          // Make sure this result is in the docset, and either in the request set or the request set is empty
          docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
          if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
            {
              // Add the matching document
              ResultDocInfo_t resultdoc;
              resultdoc.OID = doc_OID;
              resultdoc.result_num = resultnum;
              // scale the floating-point document weight into an integer
              // ranking (four decimal places, rounded to nearest)
              resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
              resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
              response.docInfo.push_back (resultdoc);

              resultnum++;
            }

          docorder_iterator++;
        }
    }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          ++termvariants_here;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

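// browsefilter() services full text browse requests (FRfullTextBrowse).  It
// asks the lucene search object for a window of index terms, starting at
// startresults and spanning (endresults - startresults) entries, and reports
// each term with its frequency; no document matching is performed.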
void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
                                          FilterResponse_t &response,
                                          comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more, just have it so can use
                      // parse_query_params function

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();

  // do query
  queryresultsclass queryresults;
  queryresults.clear();

  int numDocs = endresults - startresults;
  textsearchptr->setcollectdir (collectdir);

  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
    // most likely a system problem
    logout << text_t2ascii
           << "system problem: could not do full text browse with lucene for index \""
           << (*query_here).index << (*query_here).subcollection
           << (*query_here).language << "\".\n\n";
    err = systemProblem;
    return;
  }

  // assemble the term results
  TermInfo_t terminfo;

  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
  termfreqclassarray::iterator terms_end = queryresults.terms.end();

  while (terms_here != terms_end) {
    terminfo.clear();
    terminfo.term = (*terms_here).termstr;
    terminfo.freq = (*terms_here).termfreq;

    response.termInfo.push_back (terminfo);

    ++terms_here;
  }
}

// lucenesearchptr and db_ptr are assumed to be valid
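// do_multi_query() runs each queryparamclass in query_params through the
// lucene search object and merges the per-query results into multiresults.
// With a single query the results are copied across unchanged; with several,
// the document sets are folded together according to each query's
// combinequery setting ("and", "or" or "not"), and the term, term-variant
// and stopword information is accumulated over all of them.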
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error == true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring << "\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}