source: trunk/gsdl/src/colservr/lucenequeryfilter.cpp@ 12380

Last change on this file since 12380 was 12380, checked in by mdewsnip, 18 years ago

Now shows the stopwords removed by Lucene, many thanks to Me and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27
28#include "lucenequeryfilter.h"
29#include "fileutil.h"
30#include <assert.h>
31#include "lucenesearch.h"
32
33/////////////////////////////////
34// functions for queryfilterclass
35/////////////////////////////////
36
37
38lucenequeryfilterclass::lucenequeryfilterclass ()
39 : queryfilterclass() {
40
41
42 FilterOption_t filtopt;
43
44 // -- onePerTerm Level enumerated
45 // likely to be Doc, Sec, Para, but we dont assume anything now
46 filtopt.clear();
47 filtopt.name = "Level";
48 filtopt.type = FilterOption_t::enumeratedt;
49 filtopt.repeatable = FilterOption_t::onePerTerm;
50 filterOptions["Level"] = filtopt;
51
52 // -- IndexField, enumerated, used to list available fields
53 filtopt.clear();
54 filtopt.name = "IndexField";
55 filtopt.type = FilterOption_t::enumeratedt;
56 filtopt.repeatable = FilterOption_t::onePerTerm;
57 filtopt.defaultValue = "";
58 filterOptions["IndexField"] = filtopt;
59
60}
61
// Destructor: nothing lucene-specific to release; members are cleaned
// up by the base queryfilterclass destructor.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
64
65
66//whether a query is a full text browse
67bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
68 return (filterRequestOptions & FRfullTextBrowse);
69}
70
71void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
72 queryfilterclass::configure(key, cfgline);
73
74 if (key == "indexfieldmap") {
75 indexfieldmap.importmap (cfgline);
76
77 // update the list of indexes in the filter information
78 text_tarray options;
79 indexfieldmap.gettoarray (options);
80
81 text_tarray::const_iterator here = options.begin();
82 text_tarray::const_iterator end = options.end();
83 bool start = true;
84 while (here !=end) {
85 if (!(*here).empty()) {
86 filterOptions["IndexField"].validValues.push_back(*here);
87 if (start) {
88 filterOptions["IndexField"].defaultValue = *here;
89 start = false;
90 }
91 }
92 ++here;
93 }
94 } else if (key == "indexlevels") {
95 text_tarray::const_iterator here = cfgline.begin();
96 text_tarray::const_iterator end = cfgline.end();
97 bool first=true;
98 while (here != end) {
99 if (!(*here).empty()) {
100 if (first) {
101 first = false;
102 // the default is the first value
103 filterOptions["Level"].defaultValue = *here;
104 }
105 filterOptions["Level"].validValues.push_back(*here);
106 }
107 ++here;
108 }
109 } else if (key == "textlevel") {
110 ((lucenesearchclass *)textsearchptr)->set_gdbm_level( cfgline[0]);
111 }
112
113}
114
115
// Main filter entry point: validates configuration, dispatches full-text
// browse requests to browsefilter(), otherwise runs the query via
// do_multi_query() and assembles the requested document and/or term
// information into `response`.  Errors are reported through `err` and
// human-readable detail is written to `logout`.
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
				    FilterResponse_t &response,
				    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (gdbmptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null gdbmclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  // full-text browse requests take a completely separate path
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database (needed below to translate lucene doc numbers to OIDs)
  gdbmptr->setlogout(&logout);
  if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
    // most likely a system problem (we have already checked that the
    // gdbm database exists)
    logout << text_t2ascii
	   << "system problem: open on gdbm database \""
	   << gdbm_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }


  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);


  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;
  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // endresults == -1 means "no upper bound requested"
    if (endresults == -1) endresults = MAXNUMDOCS;
    while (docorder_here != docorder_end) {
      if (resultnum > endresults) break;

      // translate the lucene document number into a Greenstone OID
      if (!translate(gdbmptr, *docorder_here, trans_OID)) {
	logout << text_t2ascii
	       << "warning: could not translate lucene document number \""
	       << *docorder_here << "\" to OID.\n\n";

      } else {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// see if there is a result for this number,
	// if it is in the request set (or the request set is empty)
	if (docset_here != queryresults.docs.docset.end() &&
	    (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
	  // resultnum counts all matches; only those in the
	  // [startresults, endresults] page are copied into the response
	  if (resultnum >= startresults) {
	    // add this document
	    resultdoc.OID = trans_OID;
	    resultdoc.result_num = resultnum;
	    // docweight is a float; scale to an int ranking (+0.5 rounds)
	    resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

	    response.docInfo.push_back (resultdoc);
	  }

	  ++resultnum;
	}
      } // else

      ++docorder_here;
    }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
      // NOTE(review): all variants are attached to the FIRST term only --
      // presumably deliberate, since they are not tracked per term here
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words removed by lucene
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
253
254void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
255 FilterResponse_t &response,
256 comerror_t &err, ostream &logout) {
257
258 outconvertclass text_t2ascii;
259
260 // get the query parameters
261 int startresults, endresults;
262 text_t phrasematch; // not used here any more, just have it so can use
263 // parse_query_params function
264
265 vector<queryparamclass> queryfilterparams;
266 parse_query_params (request, queryfilterparams, startresults,
267 endresults, phrasematch, logout);
268
269 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
270
271 // do query
272 queryresultsclass queryresults;
273 queryresults.clear();
274
275 int numDocs = endresults-startresults;
276 textsearchptr->setcollectdir (collectdir);
277
278 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
279 // most likely a system problem
280 logout << text_t2ascii
281 << "system problem: could not do full text browse with lucene for index \""
282 << (*query_here).index << (*query_here).subcollection
283 << (*query_here).language << "\".\n\n";
284 err = systemProblem;
285 return;
286 }
287
288 // assemble the term results
289 TermInfo_t terminfo;
290
291 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
292 termfreqclassarray::iterator terms_end = queryresults.terms.end();
293
294 while (terms_here != terms_end) {
295 terminfo.clear();
296 terminfo.term = (*terms_here).termstr;
297 terminfo.freq = (*terms_here).termfreq;
298
299 response.termInfo.push_back (terminfo);
300
301 ++terms_here;
302 }
303
304
305}
306
// lucenesearchptr and gdbmptr are assumed to be valid
//
// Runs each query in query_params through lucene and combines the
// per-query results into `multiresults`:
//   - document sets are merged with and/or/not set operations according
//     to each query's combinequery value
//   - term lists, term variants and stopwords are accumulated across
//     all queries
// On search failure sets err = systemProblem; on a bad query string
// sets err = syntaxError; otherwise err = noError.
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
					     const vector<queryparamclass> &query_params,
					     queryresultsclass &multiresults,
					     comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with lucene for index \""
	     << (*query_here).index << (*query_here).level
	     << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error reported by the lucene query parser
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
	     << "syntax problem: invalid query string \""
	     << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
	// just one set of results: copy it straight through, keeping
	// lucene's own match count and approximation flag
	multiresults.docs = thisqueryresults.docs;
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// multiple queries: fold this query's doc set into the running
	// result with the requested boolean operation
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// combined set is exact by construction
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms (sorted/uniqued at the end of this function)
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());


      // add the term variants (stem/casefold equivalents)
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }

      // add the stop words removed by lucene
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
	multiresults.stopwords.insert(*stopwords_here);
	++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
393
394
395
Note: See TracBrowser for help on using the repository browser.