source: trunk/gsdl/src/colservr/lucenequeryfilter.cpp@ 13780

Last change on this file since 13780 was 13780, checked in by mdewsnip, 17 years ago

GLI/LOCAL LIBRARY: To prevent the problems with the GLI being unable to install newly built collections because the local library is holding files open, much more care needs to be taken to close files (typically the GDBM database and the MG/MGPP index files) after use. Fixed a lot of places where files were being left open.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27
28#include "lucenequeryfilter.h"
29#include "fileutil.h"
30#include <assert.h>
31#include "lucenesearch.h"
32
33/////////////////////////////////
34// functions for queryfilterclass
35/////////////////////////////////
36
37
38lucenequeryfilterclass::lucenequeryfilterclass ()
39 : queryfilterclass() {
40
41
42 FilterOption_t filtopt;
43
44 // -- onePerTerm Level enumerated
45 // likely to be Doc, Sec, Para, but we dont assume anything now
46 filtopt.clear();
47 filtopt.name = "Level";
48 filtopt.type = FilterOption_t::enumeratedt;
49 filtopt.repeatable = FilterOption_t::onePerTerm;
50 filterOptions["Level"] = filtopt;
51
52 // -- IndexField, enumerated, used to list available fields
53 filtopt.clear();
54 filtopt.name = "IndexField";
55 filtopt.type = FilterOption_t::enumeratedt;
56 filtopt.repeatable = FilterOption_t::onePerTerm;
57 filtopt.defaultValue = "";
58 filterOptions["IndexField"] = filtopt;
59
60}
61
// Destructor: no lucene-specific resources to release.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
64
65
66//whether a query is a full text browse
67bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
68 return (filterRequestOptions & FRfullTextBrowse);
69}
70
71void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
72 queryfilterclass::configure(key, cfgline);
73
74 if (key == "indexfieldmap") {
75 indexfieldmap.importmap (cfgline);
76
77 // update the list of indexes in the filter information
78 text_tarray options;
79 indexfieldmap.gettoarray (options);
80
81 text_tarray::const_iterator here = options.begin();
82 text_tarray::const_iterator end = options.end();
83 bool start = true;
84 while (here !=end) {
85 if (!(*here).empty()) {
86 filterOptions["IndexField"].validValues.push_back(*here);
87 if (start) {
88 filterOptions["IndexField"].defaultValue = *here;
89 start = false;
90 }
91 }
92 ++here;
93 }
94 } else if (key == "indexlevels") {
95 text_tarray::const_iterator here = cfgline.begin();
96 text_tarray::const_iterator end = cfgline.end();
97 bool first=true;
98 while (here != end) {
99 if (!(*here).empty()) {
100 if (first) {
101 first = false;
102 // the default is the first value
103 filterOptions["Level"].defaultValue = *here;
104 }
105 filterOptions["Level"].validValues.push_back(*here);
106 }
107 ++here;
108 }
109 } else if (key == "textlevel") {
110 ((lucenesearchclass *)textsearchptr)->set_gdbm_level( cfgline[0]);
111 }
112
113}
114
115
116void lucenequeryfilterclass::filter(const FilterRequest_t &request,
117 FilterResponse_t &response,
118 comerror_t &err, ostream &logout) {
119
120 outconvertclass text_t2ascii;
121
122 response.clear ();
123 err = noError;
124 if (gdbmptr == NULL) {
125 // most likely a configuration problem
126 logout << text_t2ascii
127 << "configuration error: queryfilter contains a null gdbmclass\n\n";
128 err = configurationError;
129 return;
130 }
131 if (textsearchptr == NULL) {
132 // most likely a configuration problem
133 logout << text_t2ascii
134 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
135 err = configurationError;
136 return;
137 }
138 if (full_text_browse(request.filterResultOptions)) {
139 browsefilter(request, response, err, logout);
140 return;
141 }
142 // open the database
143 gdbmptr->setlogout(&logout);
144 if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
145 // most likely a system problem (we have already checked that the
146 // gdbm database exists)
147 logout << text_t2ascii
148 << "system problem: open on gdbm database \""
149 << gdbm_filename << "\" failed\n\n";
150 err = systemProblem;
151 return;
152 }
153
154
155 // get the query parameters
156 int startresults, endresults;
157 text_t phrasematch; // not used here any more
158 vector<queryparamclass> queryfilterparams;
159 parse_query_params (request, queryfilterparams, startresults,
160 endresults, phrasematch, logout);
161
162
163 // do query
164 queryresultsclass queryresults;
165 do_multi_query (request, queryfilterparams, queryresults, err, logout);
166 response.error_message = queryresults.error_message;
167 if (err != noError) return;
168
169 // assemble document results
170 if (need_matching_docs (request.filterResultOptions)) {
171
172 int resultnum = 1;
173 ResultDocInfo_t resultdoc;
174 text_t trans_OID;
175 vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
176 vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
177
178 // Now handled by Lucene directly
179 //if (endresults == -1) endresults = MAXNUMDOCS;
180
181 while (docorder_here != docorder_end)
182 {
183 // Now handled by Lucene directly
184 //if (resultnum > endresults) break;
185
186 // translate the document number
187 if (!translate(gdbmptr, *docorder_here, trans_OID))
188 {
189 logout << text_t2ascii
190 << "warning: could not translate lucene document number \""
191 << *docorder_here << "\" to OID.\n\n";
192
193 }
194 else
195 {
196 docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
197
198 // see if there is a result for this number,
199 // if it is in the request set (or the request set is empty)
200 if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
201 {
202 // Now handled by Lucene directly
203 //if (resultnum >= startresults) {
204
205 // add this document
206 resultdoc.OID = trans_OID;
207 resultdoc.result_num = resultnum;
208 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
209
210 response.docInfo.push_back (resultdoc);
211 //}
212 ++resultnum;
213 }
214 } // else
215
216 ++docorder_here;
217 }
218 } // if need matching docs
219
220 // assemble the term results
221 if (need_term_info(request.filterResultOptions)) {
222 // note: the terms have already been sorted and uniqued - ?? have they??
223
224 TermInfo_t terminfo;
225 bool terms_first = true;
226
227 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
228 termfreqclassarray::iterator terms_end = queryresults.terms.end();
229
230 while (terms_here != terms_end) {
231 terminfo.clear();
232 terminfo.term = (*terms_here).termstr;
233 terminfo.freq = (*terms_here).termfreq;
234 // lucene doesn't return any termvariants at this stage,
235 // so make sure the original term is set
236 terminfo.matchTerms.push_back(terminfo.term);
237
238 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
239 if (terms_first) {
240 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
241 text_tset::iterator termvariants_end = queryresults.termvariants.end();
242 while (termvariants_here != termvariants_end) {
243 terminfo.matchTerms.push_back (*termvariants_here);
244 ++termvariants_here;
245 }
246 }
247 terms_first = false;
248
249 response.termInfo.push_back (terminfo);
250
251 ++terms_here;
252 }
253
254 // add the stop words
255 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
256 text_tset::iterator stopwords_end = queryresults.stopwords.end();
257 while (stopwords_here != stopwords_end) {
258 response.stopwords.insert(*stopwords_here);
259 ++stopwords_here;
260 }
261 }
262
263 gdbmptr->closedatabase(); // Important that local library doesn't leave any files open
264 response.numDocs = queryresults.docs_matched;
265 response.isApprox = queryresults.is_approx;
266}
267
268void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
269 FilterResponse_t &response,
270 comerror_t &err, ostream &logout) {
271
272 outconvertclass text_t2ascii;
273
274 // get the query parameters
275 int startresults, endresults;
276 text_t phrasematch; // not used here any more, just have it so can use
277 // parse_query_params function
278
279 vector<queryparamclass> queryfilterparams;
280 parse_query_params (request, queryfilterparams, startresults,
281 endresults, phrasematch, logout);
282
283 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
284
285 // do query
286 queryresultsclass queryresults;
287 queryresults.clear();
288
289 int numDocs = endresults-startresults;
290 textsearchptr->setcollectdir (collectdir);
291
292 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
293 // most likely a system problem
294 logout << text_t2ascii
295 << "system problem: could not do full text browse with lucene for index \""
296 << (*query_here).index << (*query_here).subcollection
297 << (*query_here).language << "\".\n\n";
298 err = systemProblem;
299 return;
300 }
301
302 // assemble the term results
303 TermInfo_t terminfo;
304
305 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
306 termfreqclassarray::iterator terms_end = queryresults.terms.end();
307
308 while (terms_here != terms_end) {
309 terminfo.clear();
310 terminfo.term = (*terms_here).termstr;
311 terminfo.freq = (*terms_here).termfreq;
312
313 response.termInfo.push_back (terminfo);
314
315 ++terms_here;
316 }
317
318
319}
320
// Run each query in query_params through lucene and accumulate the
// combined results in multiresults.
// lucenesearchptr and gdbmptr are assumed to be valid
//
// request      - consulted only for its filterResultOptions flags
// query_params - one entry per sub-query; for multi-query requests
//                each entry's combinequery field ("and"/"or"/"not")
//                says how its documents merge with the running set
// multiresults - receives documents, terms, term variants, stopwords
// err          - noError, systemProblem or syntaxError
// logout       - stream for error messages
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
					     const vector<queryparamclass> &query_params,
					     queryresultsclass &multiresults,
					     comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with lucene for index \""
	     << (*query_here).index << (*query_here).level
	     << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
	     << "syntax problem: invalid query string \""
	     << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
	// single query: take lucene's results as-is, including its
	// matched-document count and approximation flag
	multiresults.error_message = thisqueryresults.error_message;
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// multiple queries: merge this query's document set into the
	// running set using the requested boolean combiner
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// the combined set is held in full, so the count is exact
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
	multiresults.stopwords.insert(*stopwords_here);
	++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
408
409
410
Note: See TracBrowser for help on using the repository browser.