source: gsdl/trunk/src/colservr/lucenequeryfilter.cpp@ 15757

Last change on this file since 15757 was 15681, checked in by mdewsnip, 16 years ago

Removed some unnecessary inclusions of "assert.h".

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30/////////////////////////////////
31// functions for queryfilterclass
32/////////////////////////////////
33
34
35lucenequeryfilterclass::lucenequeryfilterclass ()
36 : queryfilterclass() {
37
38
39 FilterOption_t filtopt;
40
41 // -- onePerTerm Level enumerated
42 // likely to be Doc, Sec, Para, but we dont assume anything now
43 filtopt.clear();
44 filtopt.name = "Level";
45 filtopt.type = FilterOption_t::enumeratedt;
46 filtopt.repeatable = FilterOption_t::onePerTerm;
47 filterOptions["Level"] = filtopt;
48
49 // -- IndexField, enumerated, used to list available fields
50 filtopt.clear();
51 filtopt.name = "IndexField";
52 filtopt.type = FilterOption_t::enumeratedt;
53 filtopt.repeatable = FilterOption_t::onePerTerm;
54 filtopt.defaultValue = "";
55 filterOptions["IndexField"] = filtopt;
56
57}
58
// Destructor: nothing to release here — the constructor only populates
// filterOptions, which is cleaned up automatically.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
61
62
63//whether a query is a full text browse
64bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
65 return (filterRequestOptions & FRfullTextBrowse);
66}
67
68void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
69 queryfilterclass::configure(key, cfgline);
70
71 if (key == "indexfieldmap") {
72 indexfieldmap.importmap (cfgline);
73
74 // update the list of indexes in the filter information
75 text_tarray options;
76 indexfieldmap.gettoarray (options);
77
78 text_tarray::const_iterator here = options.begin();
79 text_tarray::const_iterator end = options.end();
80 bool start = true;
81 while (here !=end) {
82 if (!(*here).empty()) {
83 filterOptions["IndexField"].validValues.push_back(*here);
84 if (start) {
85 filterOptions["IndexField"].defaultValue = *here;
86 start = false;
87 }
88 }
89 ++here;
90 }
91 } else if (key == "indexlevels") {
92 text_tarray::const_iterator here = cfgline.begin();
93 text_tarray::const_iterator end = cfgline.end();
94 bool first=true;
95 while (here != end) {
96 if (!(*here).empty()) {
97 if (first) {
98 first = false;
99 // the default is the first value
100 filterOptions["Level"].defaultValue = *here;
101 }
102 filterOptions["Level"].validValues.push_back(*here);
103 }
104 ++here;
105 }
106 } else if (key == "textlevel") {
107 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
108 }
109
110}
111
112
// Main entry point for the protocol "filter" call.  Validates the
// configured collaborators, runs the (possibly multi-part) query via
// do_multi_query(), then assembles document results and/or term
// information into `response` according to request.filterResultOptions.
// On failure, sets `err` and writes a diagnostic to `logout`.
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
				    FilterResponse_t &response,
				    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  // Full-text browse requests are handled by a separate code path.
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
	   << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }


  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);


  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // Result-range truncation (start/end) is now handled by Lucene directly
    //if (endresults == -1) endresults = MAXNUMDOCS;

    while (docorder_here != docorder_end)
      {
	// Now handled by Lucene directly
	//if (resultnum > endresults) break;

	// translate the Lucene document number into a Greenstone OID
	if (!translate(db_ptr, *docorder_here, trans_OID))
	  {
	    // translation failure is a warning, not fatal: skip this doc
	    logout << text_t2ascii
		   << "warning: could not translate lucene document number \""
		   << *docorder_here << "\" to OID.\n\n";

	  }
	else
	  {
	    docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	    // see if there is a result for this number,
	    // if it is in the request set (or the request set is empty)
	    if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
	      {
		// Now handled by Lucene directly
		//if (resultnum >= startresults) {

		// add this document; ranking is the document weight scaled
		// to an integer (x10000, rounded)
		resultdoc.OID = trans_OID;
		resultdoc.result_num = resultnum;
		resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

		response.docInfo.push_back (resultdoc);
		//}
		++resultnum;
	      }
	  } // else

	++docorder_here;
      }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
      // NOTE: the variants are attached only to the FIRST term's entry
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
262
263void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
264 FilterResponse_t &response,
265 comerror_t &err, ostream &logout) {
266
267 outconvertclass text_t2ascii;
268
269 // get the query parameters
270 int startresults, endresults;
271 text_t phrasematch; // not used here any more, just have it so can use
272 // parse_query_params function
273
274 vector<queryparamclass> queryfilterparams;
275 parse_query_params (request, queryfilterparams, startresults,
276 endresults, phrasematch, logout);
277
278 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
279
280 // do query
281 queryresultsclass queryresults;
282 queryresults.clear();
283
284 int numDocs = endresults-startresults;
285 textsearchptr->setcollectdir (collectdir);
286
287 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
288 // most likely a system problem
289 logout << text_t2ascii
290 << "system problem: could not do full text browse with lucene for index \""
291 << (*query_here).index << (*query_here).subcollection
292 << (*query_here).language << "\".\n\n";
293 err = systemProblem;
294 return;
295 }
296
297 // assemble the term results
298 TermInfo_t terminfo;
299
300 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
301 termfreqclassarray::iterator terms_end = queryresults.terms.end();
302
303 while (terms_here != terms_end) {
304 terminfo.clear();
305 terminfo.term = (*terms_here).termstr;
306 terminfo.freq = (*terms_here).termfreq;
307
308 response.termInfo.push_back (terminfo);
309
310 ++terms_here;
311 }
312
313
314}
315
// Run each sub-query in `query_params` through the lucene search object
// and combine the per-query results into `multiresults`:
//   - document sets are merged with and/or/not set algebra (per the
//     combinequery field of each sub-query);
//   - term lists, term variants and stop words are accumulated.
// Sets `err` (and logs) on search or query-syntax failure.
// lucenesearchptr and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
					     const vector<queryparamclass> &query_params,
					     queryresultsclass &multiresults,
					     comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with lucene for index \""
	     << (*query_here).index << (*query_here).level
	     << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
	     << "syntax problem: invalid query string \""
	     << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
	// single sub-query: take its results (and doc count /
	// approximation flag) verbatim
	multiresults.error_message = thisqueryresults.error_message;
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// multiple sub-queries: fold this one into the accumulated set
	// using the requested boolean combiner
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// after explicit set combination the count is exact
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
	multiresults.stopwords.insert(*stopwords_here);
	++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
403
404
405
Note: See TracBrowser for help on using the repository browser.