source: gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp@ 20727

Last change on this file since 20727 was 20727, checked in by kjdon, 15 years ago

added support for defaultlevel in collect.cfg, for mgpp and lucene. also added defaultindex for lucene

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30/////////////////////////////////
31// functions for queryfilterclass
32/////////////////////////////////
33
34
35lucenequeryfilterclass::lucenequeryfilterclass ()
36 : queryfilterclass() {
37
38
39 FilterOption_t filtopt;
40
41 // -- onePerTerm Level enumerated
42 // likely to be Doc, Sec, Para, but we dont assume anything now
43 filtopt.clear();
44 filtopt.name = "Level";
45 filtopt.type = FilterOption_t::enumeratedt;
46 filtopt.repeatable = FilterOption_t::onePerTerm;
47 filtopt.defaultValue = "";
48 filterOptions["Level"] = filtopt;
49
50 // -- IndexField, enumerated, used to list available fields
51 filtopt.clear();
52 filtopt.name = "IndexField";
53 filtopt.type = FilterOption_t::enumeratedt;
54 filtopt.repeatable = FilterOption_t::onePerTerm;
55 filtopt.defaultValue = "";
56 filterOptions["IndexField"] = filtopt;
57
58}
59
60lucenequeryfilterclass::~lucenequeryfilterclass () {
61}
62
63
64//whether a query is a full text browse
65bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
66 return (filterRequestOptions & FRfullTextBrowse);
67}
68
69void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
70 queryfilterclass::configure(key, cfgline);
71
72 if (key == "indexfieldmap") {
73 indexfieldmap.importmap (cfgline);
74
75 // update the list of indexes in the filter information
76 text_tarray options;
77 indexfieldmap.gettoarray (options);
78 filterOptions["IndexField"].validValues = options;
79 } else if (key == "levelmap") {
80 levelmap.importmap (cfgline);
81 } else if (key == "indexlevels") {
82 text_tarray::const_iterator here = cfgline.begin();
83 text_tarray::const_iterator end = cfgline.end();
84 while (here != end) {
85 if (!(*here).empty()) {
86 filterOptions["Level"].validValues.push_back(*here);
87 }
88 ++here;
89 }
90 } else if (key == "textlevel") {
91 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
92 } else if (key == "defaultindex") {
93 indexfieldmap.from2to (cfgline[0], filterOptions["IndexField"].defaultValue);
94 } else if (key == "defaultlevel") {
95 levelmap.from2to (cfgline[0], filterOptions["Level"].defaultValue);
96 }
97
98}
99
100bool lucenequeryfilterclass::init (ostream &logout) {
101
102 if (!queryfilterclass::init(logout)) {
103 return false;
104 }
105
106 if (filterOptions["IndexField"].defaultValue.empty()) {
107 // use first index in map as default if no default is set explicitly
108 text_tarray fromarray;
109 indexfieldmap.getfromarray(fromarray);
110 if (fromarray.size()) {
111 filterOptions["IndexField"].defaultValue = fromarray[0];
112 }
113 }
114 if (filterOptions["Levels"].defaultValue.empty()) {
115 // use first level as default if no default is set explicitly
116 if (!filterOptions["Level"].validValues[0].empty())
117 filterOptions["Levels"].defaultValue = filterOptions["Level"].validValues[0];
118 }
119
120 return true;
121}
122
123void lucenequeryfilterclass::filter(const FilterRequest_t &request,
124 FilterResponse_t &response,
125 comerror_t &err, ostream &logout) {
126
127 outconvertclass text_t2ascii;
128
129 response.clear ();
130 err = noError;
131 if (db_ptr == NULL) {
132 // most likely a configuration problem
133 logout << text_t2ascii
134 << "configuration error: queryfilter contains a null dbclass\n\n";
135 err = configurationError;
136 return;
137 }
138 if (textsearchptr == NULL) {
139 // most likely a configuration problem
140 logout << text_t2ascii
141 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
142 err = configurationError;
143 return;
144 }
145 if (full_text_browse(request.filterResultOptions)) {
146 browsefilter(request, response, err, logout);
147 return;
148 }
149 // open the database
150 db_ptr->setlogout(&logout);
151 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
152 // most likely a system problem (we have already checked that the database exists)
153 logout << text_t2ascii
154 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
155 err = systemProblem;
156 return;
157 }
158
159
160 // get the query parameters
161 int startresults, endresults;
162 text_t phrasematch; // not used here any more
163 vector<queryparamclass> queryfilterparams;
164 parse_query_params (request, queryfilterparams, startresults,
165 endresults, phrasematch, logout);
166
167
168 // do query
169 queryresultsclass queryresults;
170 do_multi_query (request, queryfilterparams, queryresults, err, logout);
171 response.error_message = queryresults.error_message;
172 if (err != noError) return;
173
174 // assemble document results
175 if (need_matching_docs (request.filterResultOptions))
176 {
177 // Loop through the query results (ordered by ranking)
178 int resultnum = 1;
179 vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
180 while (docorder_iterator != queryresults.docs.docorder.end())
181 {
182 text_t doc_OID = (*docorder_iterator);
183 // logout << "Matching doc OID: " << doc_OID << endl;
184
185 // Make sure this result is in the docset, and either in the request set or the request set is empty
186 docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
187 if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
188 {
189 // Add the matching document
190 ResultDocInfo_t resultdoc;
191 resultdoc.OID = doc_OID;
192 resultdoc.result_num = resultnum;
193 resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
194 resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
195 response.docInfo.push_back (resultdoc);
196
197 resultnum++;
198 }
199
200 docorder_iterator++;
201 }
202 }
203
204 // assemble the term results
205 if (need_term_info(request.filterResultOptions)) {
206 // note: the terms have already been sorted and uniqued - ?? have they??
207
208 TermInfo_t terminfo;
209 bool terms_first = true;
210
211 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
212 termfreqclassarray::iterator terms_end = queryresults.terms.end();
213
214 while (terms_here != terms_end) {
215 terminfo.clear();
216 terminfo.term = (*terms_here).termstr;
217 terminfo.freq = (*terms_here).termfreq;
218 // lucene doesn't return any termvariants at this stage,
219 // so make sure the original term is set
220 terminfo.matchTerms.push_back(terminfo.term);
221
222 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
223 if (terms_first) {
224 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
225 text_tset::iterator termvariants_end = queryresults.termvariants.end();
226 while (termvariants_here != termvariants_end) {
227 terminfo.matchTerms.push_back (*termvariants_here);
228 ++termvariants_here;
229 }
230 }
231 terms_first = false;
232
233 response.termInfo.push_back (terminfo);
234
235 ++terms_here;
236 }
237
238 // add the stop words
239 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
240 text_tset::iterator stopwords_end = queryresults.stopwords.end();
241 while (stopwords_here != stopwords_end) {
242 response.stopwords.insert(*stopwords_here);
243 ++stopwords_here;
244 }
245 }
246
247 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
248 response.numDocs = queryresults.docs_matched;
249 response.isApprox = queryresults.is_approx;
250}
251
252void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
253 FilterResponse_t &response,
254 comerror_t &err, ostream &logout) {
255
256 outconvertclass text_t2ascii;
257
258 // get the query parameters
259 int startresults, endresults;
260 text_t phrasematch; // not used here any more, just have it so can use
261 // parse_query_params function
262
263 vector<queryparamclass> queryfilterparams;
264 parse_query_params (request, queryfilterparams, startresults,
265 endresults, phrasematch, logout);
266
267 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
268
269 // do query
270 queryresultsclass queryresults;
271 queryresults.clear();
272
273 int numDocs = endresults-startresults;
274 textsearchptr->setcollectdir (collectdir);
275
276 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
277 // most likely a system problem
278 logout << text_t2ascii
279 << "system problem: could not do full text browse with lucene for index \""
280 << (*query_here).index << (*query_here).subcollection
281 << (*query_here).language << "\".\n\n";
282 err = systemProblem;
283 return;
284 }
285
286 // assemble the term results
287 TermInfo_t terminfo;
288
289 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
290 termfreqclassarray::iterator terms_end = queryresults.terms.end();
291
292 while (terms_here != terms_end) {
293 terminfo.clear();
294 terminfo.term = (*terms_here).termstr;
295 terminfo.freq = (*terms_here).termfreq;
296
297 response.termInfo.push_back (terminfo);
298
299 ++terms_here;
300 }
301
302
303}
304
305// lucenesearchptr and db_ptr are assumed to be valid
306void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
307 const vector<queryparamclass> &query_params,
308 queryresultsclass &multiresults,
309 comerror_t &err, ostream &logout) {
310 outconvertclass text_t2ascii;
311
312 err = noError;
313 textsearchptr->setcollectdir (collectdir);
314 multiresults.clear();
315
316 vector<queryparamclass>::const_iterator query_here = query_params.begin();
317 vector<queryparamclass>::const_iterator query_end = query_params.end();
318 while (query_here != query_end) {
319 queryresultsclass thisqueryresults;
320 if (!textsearchptr->search((*query_here), thisqueryresults)) {
321 // most likely a system problem
322 logout << text_t2ascii
323 << "system problem: could not do search with lucene for index \""
324 << (*query_here).index << (*query_here).level
325 << (*query_here).subcollection
326 << (*query_here).language << "\".\n\n";
327 err = systemProblem;
328 return;
329 }
330
331 // check for syntax error
332 if (thisqueryresults.syntax_error==true) {
333 logout << text_t2ascii
334 << "syntax problem: invalid query string \""
335 << (*query_here).querystring<<"\".\n";
336 err = syntaxError;
337 return;
338 }
339 // combine the results
340 if (need_matching_docs (request.filterResultOptions)) {
341
342 if (query_params.size() == 1) {
343 multiresults.error_message = thisqueryresults.error_message;
344 multiresults.docs = thisqueryresults.docs; // just one set of results
345 multiresults.docs_matched = thisqueryresults.docs_matched;
346 multiresults.is_approx = thisqueryresults.is_approx;
347
348 } else {
349 if ((*query_here).combinequery == "and") {
350 multiresults.docs.combine_and (thisqueryresults.docs);
351 } else if ((*query_here).combinequery == "or") {
352 multiresults.docs.combine_or (thisqueryresults.docs);
353 } else if ((*query_here).combinequery == "not") {
354 multiresults.docs.combine_not (thisqueryresults.docs);
355 }
356 multiresults.docs_matched = multiresults.docs.docset.size();
357 multiresults.is_approx = Exact;
358 }
359 }
360
361 // combine the term information
362 if (need_term_info (request.filterResultOptions)) {
363 // append the terms
364 multiresults.orgterms.insert(multiresults.orgterms.end(),
365 thisqueryresults.orgterms.begin(),
366 thisqueryresults.orgterms.end());
367
368
369 // add the term variants -
370 text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
371 text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
372 while (termvar_here != termvar_end) {
373 multiresults.termvariants.insert(*termvar_here);
374 ++termvar_here;
375 }
376
377 // add the stop words
378 text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
379 text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
380 while (stopwords_here != stopwords_end) {
381 multiresults.stopwords.insert(*stopwords_here);
382 ++stopwords_here;
383 }
384 }
385
386 ++query_here;
387 }
388
389 // sort and unique the query terms
390 multiresults.sortuniqqueryterms ();
391}
392
393
394
Note: See TracBrowser for help on using the repository browser.