source: gsdl/trunk/src/colservr/lucenequeryfilter.cpp@ 15580

Last change on this file since 15580 was 15558, checked in by mdewsnip, 16 years ago

(Adding new DB support) Changed lots of "gdbm"s to "db"s, in preparation for adding new DB types.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
/**********************************************************************
 *
 * lucenequeryfilter.cpp --
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/


#include "lucenequeryfilter.h"
#include "fileutil.h"
#include <assert.h>
#include "lucenesearch.h"

/////////////////////////////////
// functions for queryfilterclass
/////////////////////////////////

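// The constructor registers the filter options supported by this query filter:
// "Level" (the index level to search at) and "IndexField" (the field to search
// in), both enumerated options that may be given once per query term.  Their
// valid values are filled in later by configure().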
lucenequeryfilterclass::lucenequeryfilterclass ()
  : queryfilterclass() {

  FilterOption_t filtopt;

  // -- onePerTerm Level enumerated
  //    likely to be Doc, Sec, Para, but we don't assume anything now
  filtopt.clear();
  filtopt.name = "Level";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filterOptions["Level"] = filtopt;

  // -- IndexField, enumerated, used to list available fields
  filtopt.clear();
  filtopt.name = "IndexField";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerTerm;
  filtopt.defaultValue = "";
  filterOptions["IndexField"] = filtopt;
}

lucenequeryfilterclass::~lucenequeryfilterclass () {
}


// whether a query is a full-text browse
bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
  return (filterRequestOptions & FRfullTextBrowse);
}

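// configure() picks up collection configuration lines relevant to this filter:
// "indexfieldmap" populates the valid values for the IndexField option,
// "indexlevels" populates the valid values for the Level option (the first
// entry becomes the default), and "textlevel" passes the configured text level
// through to the underlying lucene search object.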
void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure(key, cfgline);

  if (key == "indexfieldmap") {
    indexfieldmap.importmap (cfgline);

    // update the list of indexes in the filter information
    text_tarray options;
    indexfieldmap.gettoarray (options);

    text_tarray::const_iterator here = options.begin();
    text_tarray::const_iterator end = options.end();
    bool start = true;
    while (here != end) {
      if (!(*here).empty()) {
        filterOptions["IndexField"].validValues.push_back(*here);
        if (start) {
          filterOptions["IndexField"].defaultValue = *here;
          start = false;
        }
      }
      ++here;
    }
  } else if (key == "indexlevels") {
    text_tarray::const_iterator here = cfgline.begin();
    text_tarray::const_iterator end = cfgline.end();
    bool first = true;
    while (here != end) {
      if (!(*here).empty()) {
        if (first) {
          first = false;
          // the default is the first value
          filterOptions["Level"].defaultValue = *here;
        }
        filterOptions["Level"].validValues.push_back(*here);
      }
      ++here;
    }
  } else if (key == "textlevel") {
    ((lucenesearchclass *)textsearchptr)->set_gdbm_level(cfgline[0]);
  }
}


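// Main entry point for a filter request.  Full-text browse requests are handed
// off to browsefilter(); otherwise the collection database is opened, the query
// parameters are parsed, do_multi_query() is run, and the matching documents
// (translated from lucene document numbers to OIDs) and term information are
// copied into the response.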
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
                                    FilterResponse_t &response,
                                    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
           << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // Now handled by Lucene directly
    //if (endresults == -1) endresults = MAXNUMDOCS;

    while (docorder_here != docorder_end)
      {
        // Now handled by Lucene directly
        //if (resultnum > endresults) break;

        // translate the lucene document number to a Greenstone OID
        if (!translate(db_ptr, *docorder_here, trans_OID))
          {
            logout << text_t2ascii
                   << "warning: could not translate lucene document number \""
                   << *docorder_here << "\" to OID.\n\n";
          }
        else
          {
            docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

            // see if there is a result for this number,
            // and if it is in the request set (or the request set is empty)
            if (docset_here != queryresults.docs.docset.end() &&
                (request.docSet.empty() || in_set(request.docSet, trans_OID)))
              {
                // Now handled by Lucene directly
                //if (resultnum >= startresults) {

                // add this document, scaling its floating-point weight to an
                // integer ranking (weight * 10000, rounded to the nearest integer)
                resultdoc.OID = trans_OID;
                resultdoc.result_num = resultnum;
                resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

                response.docInfo.push_back (resultdoc);
                //}
                ++resultnum;
              }
          } // else

        ++docorder_here;
      }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms, i.e. the equivalent (stemmed/casefolded) terms
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          ++termvariants_here;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

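// Handles full-text browse requests: parses the query parameters, asks the
// lucene search object to do a browse (browse_search) over the requested index,
// and returns the resulting terms, with their frequencies, as term information.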
void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
                                          FilterResponse_t &response,
                                          comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more; kept only so that the
                      // parse_query_params function can still be used

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
                      endresults, phrasematch, logout);

  vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();

  // do the browse query
  queryresultsclass queryresults;
  queryresults.clear();

  int numDocs = endresults - startresults;
  textsearchptr->setcollectdir (collectdir);

  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
    // most likely a system problem
    logout << text_t2ascii
           << "system problem: could not do full text browse with lucene for index \""
           << (*query_here).index << (*query_here).subcollection
           << (*query_here).language << "\".\n\n";
    err = systemProblem;
    return;
  }

  // assemble the term results
  TermInfo_t terminfo;

  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
  termfreqclassarray::iterator terms_end = queryresults.terms.end();

  while (terms_here != terms_end) {
    terminfo.clear();
    terminfo.term = (*terms_here).termstr;
    terminfo.freq = (*terms_here).termfreq;

    response.termInfo.push_back (terminfo);

    ++terms_here;
  }
}

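// Runs each query in query_params through the lucene search object and merges
// the per-query results into multiresults, combining document sets with the
// "and"/"or"/"not" operator carried by each query, and accumulating term,
// term-variant and stopword information.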
// lucenesearchptr and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error == true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring << "\".\n";
      err = syntaxError;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}