source: trunk/gsdl/src/colservr/lucenequeryfilter.cpp@ 13063

Last change on this file since 13063 was 13063, checked in by kjdon, 18 years ago

make sure some matchTerms are set in terminfo - no termvariants are passed back, so set the original term as a MatchTerm

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27
28#include "lucenequeryfilter.h"
29#include "fileutil.h"
30#include <assert.h>
31#include "lucenesearch.h"
32
33/////////////////////////////////
34// functions for queryfilterclass
35/////////////////////////////////
36
37
38lucenequeryfilterclass::lucenequeryfilterclass ()
39 : queryfilterclass() {
40
41
42 FilterOption_t filtopt;
43
44 // -- onePerTerm Level enumerated
45 // likely to be Doc, Sec, Para, but we dont assume anything now
46 filtopt.clear();
47 filtopt.name = "Level";
48 filtopt.type = FilterOption_t::enumeratedt;
49 filtopt.repeatable = FilterOption_t::onePerTerm;
50 filterOptions["Level"] = filtopt;
51
52 // -- IndexField, enumerated, used to list available fields
53 filtopt.clear();
54 filtopt.name = "IndexField";
55 filtopt.type = FilterOption_t::enumeratedt;
56 filtopt.repeatable = FilterOption_t::onePerTerm;
57 filtopt.defaultValue = "";
58 filterOptions["IndexField"] = filtopt;
59
60}
61
// Destructor: nothing extra to release here; any shared resources
// (gdbmptr, textsearchptr) are presumably managed by the base class
// or its owner - confirm against queryfilterclass.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
64
65
66//whether a query is a full text browse
67bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
68 return (filterRequestOptions & FRfullTextBrowse);
69}
70
71void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
72 queryfilterclass::configure(key, cfgline);
73
74 if (key == "indexfieldmap") {
75 indexfieldmap.importmap (cfgline);
76
77 // update the list of indexes in the filter information
78 text_tarray options;
79 indexfieldmap.gettoarray (options);
80
81 text_tarray::const_iterator here = options.begin();
82 text_tarray::const_iterator end = options.end();
83 bool start = true;
84 while (here !=end) {
85 if (!(*here).empty()) {
86 filterOptions["IndexField"].validValues.push_back(*here);
87 if (start) {
88 filterOptions["IndexField"].defaultValue = *here;
89 start = false;
90 }
91 }
92 ++here;
93 }
94 } else if (key == "indexlevels") {
95 text_tarray::const_iterator here = cfgline.begin();
96 text_tarray::const_iterator end = cfgline.end();
97 bool first=true;
98 while (here != end) {
99 if (!(*here).empty()) {
100 if (first) {
101 first = false;
102 // the default is the first value
103 filterOptions["Level"].defaultValue = *here;
104 }
105 filterOptions["Level"].validValues.push_back(*here);
106 }
107 ++here;
108 }
109 } else if (key == "textlevel") {
110 ((lucenesearchclass *)textsearchptr)->set_gdbm_level( cfgline[0]);
111 }
112
113}
114
115
// Main entry point for a lucene query filter request.
// request  - the incoming filter request (query params, result options)
// response - filled in with matching documents and/or term information
// err      - set to noError on success, or a specific error code
// logout   - stream for diagnostic/warning messages
// Flow: validate configuration, dispatch full-text browses to
// browsefilter(), otherwise open the gdbm database, run the query via
// do_multi_query(), then assemble document results and term info as
// requested by the result-option bits.
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
				    FilterResponse_t &response,
				    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (gdbmptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null gdbmclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  // full-text browse requests take a completely separate path
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database (read-only; needed to map lucene doc numbers to OIDs)
  gdbmptr->setlogout(&logout);
  if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
    // most likely a system problem (we have already checked that the
    // gdbm database exists)
    logout << text_t2ascii
	   << "system problem: open on gdbm database \""
	   << gdbm_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }


  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);


  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // Now handled by Lucene directly
    //if (endresults == -1) endresults = MAXNUMDOCS;

    while (docorder_here != docorder_end)
      {
	// Now handled by Lucene directly
	//if (resultnum > endresults) break;

	// translate the lucene document number to a Greenstone OID
	if (!translate(gdbmptr, *docorder_here, trans_OID))
	  {
	    // translation failure is logged but non-fatal; the
	    // document is simply skipped
	    logout << text_t2ascii
		   << "warning: could not translate lucene document number \""
		   << *docorder_here << "\" to OID.\n\n";

	  }
	else
	  {
	    docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	    // see if there is a result for this number,
	    // if it is in the request set (or the request set is empty)
	    if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
	      {
		// Now handled by Lucene directly
		//if (resultnum >= startresults) {

		// add this document; ranking is the docweight scaled
		// to an int (x10000, rounded)
		resultdoc.OID = trans_OID;
		resultdoc.result_num = resultnum;
		resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

		response.docInfo.push_back (resultdoc);
		//}
		++resultnum;
	      }
	  } // else

	++docorder_here;
      }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
      // NOTE: all variants are attached to the FIRST term only
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
266
267void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
268 FilterResponse_t &response,
269 comerror_t &err, ostream &logout) {
270
271 outconvertclass text_t2ascii;
272
273 // get the query parameters
274 int startresults, endresults;
275 text_t phrasematch; // not used here any more, just have it so can use
276 // parse_query_params function
277
278 vector<queryparamclass> queryfilterparams;
279 parse_query_params (request, queryfilterparams, startresults,
280 endresults, phrasematch, logout);
281
282 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
283
284 // do query
285 queryresultsclass queryresults;
286 queryresults.clear();
287
288 int numDocs = endresults-startresults;
289 textsearchptr->setcollectdir (collectdir);
290
291 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
292 // most likely a system problem
293 logout << text_t2ascii
294 << "system problem: could not do full text browse with lucene for index \""
295 << (*query_here).index << (*query_here).subcollection
296 << (*query_here).language << "\".\n\n";
297 err = systemProblem;
298 return;
299 }
300
301 // assemble the term results
302 TermInfo_t terminfo;
303
304 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
305 termfreqclassarray::iterator terms_end = queryresults.terms.end();
306
307 while (terms_here != terms_end) {
308 terminfo.clear();
309 terminfo.term = (*terms_here).termstr;
310 terminfo.freq = (*terms_here).termfreq;
311
312 response.termInfo.push_back (terminfo);
313
314 ++terms_here;
315 }
316
317
318}
319
// lucenesearchptr and gdbmptr are assumed to be valid
// Runs each query in query_params against lucene and combines the
// per-query results into multiresults.
// request      - used only for its result-option bits (docs/terms needed)
// query_params - one entry per sub-query; each carries its own
//                combinequery operator ("and"/"or"/"not")
// multiresults - accumulated document set, terms, term variants and
//                stopwords across all sub-queries
// err          - noError on success, systemProblem or syntaxError otherwise
// logout       - stream for diagnostic messages
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
					     const vector<queryparamclass> &query_params,
					     queryresultsclass &multiresults,
					     comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with lucene for index \""
	     << (*query_here).index << (*query_here).level
	     << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
	     << "syntax problem: invalid query string \""
	     << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
	// single query: take its results wholesale, including the
	// matched-count and approximation flag reported by lucene
	multiresults.error_message = thisqueryresults.error_message;
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// multiple queries: fold this query's doc set into the
	// accumulated set using the per-query combine operator
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// combined counts are exact (computed here, not estimated)
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
	multiresults.stopwords.insert(*stopwords_here);
	++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
407
408
409
Note: See TracBrowser for help on using the repository browser.