source: gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp@ 16947

Last change on this file since 16947 was 16947, checked in by mdewsnip, 16 years ago

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space and speed efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30/////////////////////////////////
31// functions for queryfilterclass
32/////////////////////////////////
33
34
35lucenequeryfilterclass::lucenequeryfilterclass ()
36 : queryfilterclass() {
37
38
39 FilterOption_t filtopt;
40
41 // -- onePerTerm Level enumerated
42 // likely to be Doc, Sec, Para, but we dont assume anything now
43 filtopt.clear();
44 filtopt.name = "Level";
45 filtopt.type = FilterOption_t::enumeratedt;
46 filtopt.repeatable = FilterOption_t::onePerTerm;
47 filterOptions["Level"] = filtopt;
48
49 // -- IndexField, enumerated, used to list available fields
50 filtopt.clear();
51 filtopt.name = "IndexField";
52 filtopt.type = FilterOption_t::enumeratedt;
53 filtopt.repeatable = FilterOption_t::onePerTerm;
54 filtopt.defaultValue = "";
55 filterOptions["IndexField"] = filtopt;
56
57}
58
// Nothing to release here beyond what the base queryfilterclass
// destructor already handles.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
61
62
63//whether a query is a full text browse
64bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
65 return (filterRequestOptions & FRfullTextBrowse);
66}
67
68void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
69 queryfilterclass::configure(key, cfgline);
70
71 if (key == "indexfieldmap") {
72 indexfieldmap.importmap (cfgline);
73
74 // update the list of indexes in the filter information
75 text_tarray options;
76 indexfieldmap.gettoarray (options);
77
78 text_tarray::const_iterator here = options.begin();
79 text_tarray::const_iterator end = options.end();
80 bool start = true;
81 while (here !=end) {
82 if (!(*here).empty()) {
83 filterOptions["IndexField"].validValues.push_back(*here);
84 if (start) {
85 filterOptions["IndexField"].defaultValue = *here;
86 start = false;
87 }
88 }
89 ++here;
90 }
91 } else if (key == "indexlevels") {
92 text_tarray::const_iterator here = cfgline.begin();
93 text_tarray::const_iterator end = cfgline.end();
94 bool first=true;
95 while (here != end) {
96 if (!(*here).empty()) {
97 if (first) {
98 first = false;
99 // the default is the first value
100 filterOptions["Level"].defaultValue = *here;
101 }
102 filterOptions["Level"].validValues.push_back(*here);
103 }
104 ++here;
105 }
106 } else if (key == "textlevel") {
107 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
108 }
109
110}
111
112
113void lucenequeryfilterclass::filter(const FilterRequest_t &request,
114 FilterResponse_t &response,
115 comerror_t &err, ostream &logout) {
116
117 outconvertclass text_t2ascii;
118
119 response.clear ();
120 err = noError;
121 if (db_ptr == NULL) {
122 // most likely a configuration problem
123 logout << text_t2ascii
124 << "configuration error: queryfilter contains a null dbclass\n\n";
125 err = configurationError;
126 return;
127 }
128 if (textsearchptr == NULL) {
129 // most likely a configuration problem
130 logout << text_t2ascii
131 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
132 err = configurationError;
133 return;
134 }
135 if (full_text_browse(request.filterResultOptions)) {
136 browsefilter(request, response, err, logout);
137 return;
138 }
139 // open the database
140 db_ptr->setlogout(&logout);
141 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
142 // most likely a system problem (we have already checked that the database exists)
143 logout << text_t2ascii
144 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
145 err = systemProblem;
146 return;
147 }
148
149
150 // get the query parameters
151 int startresults, endresults;
152 text_t phrasematch; // not used here any more
153 vector<queryparamclass> queryfilterparams;
154 parse_query_params (request, queryfilterparams, startresults,
155 endresults, phrasematch, logout);
156
157
158 // do query
159 queryresultsclass queryresults;
160 do_multi_query (request, queryfilterparams, queryresults, err, logout);
161 response.error_message = queryresults.error_message;
162 if (err != noError) return;
163
164 // assemble document results
165 if (need_matching_docs (request.filterResultOptions))
166 {
167 // Loop through the query results (ordered by ranking)
168 int resultnum = 1;
169 vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
170 while (docorder_iterator != queryresults.docs.docorder.end())
171 {
172 text_t doc_OID = (*docorder_iterator);
173 // logout << "Matching doc OID: " << doc_OID << endl;
174
175 // Make sure this result is in the docset, and either in the request set or the request set is empty
176 docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
177 if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
178 {
179 // Add the matching document
180 ResultDocInfo_t resultdoc;
181 resultdoc.OID = doc_OID;
182 resultdoc.result_num = resultnum;
183 resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
184 resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
185 response.docInfo.push_back (resultdoc);
186
187 resultnum++;
188 }
189
190 docorder_iterator++;
191 }
192 }
193
194 // assemble the term results
195 if (need_term_info(request.filterResultOptions)) {
196 // note: the terms have already been sorted and uniqued - ?? have they??
197
198 TermInfo_t terminfo;
199 bool terms_first = true;
200
201 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
202 termfreqclassarray::iterator terms_end = queryresults.terms.end();
203
204 while (terms_here != terms_end) {
205 terminfo.clear();
206 terminfo.term = (*terms_here).termstr;
207 terminfo.freq = (*terms_here).termfreq;
208 // lucene doesn't return any termvariants at this stage,
209 // so make sure the original term is set
210 terminfo.matchTerms.push_back(terminfo.term);
211
212 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
213 if (terms_first) {
214 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
215 text_tset::iterator termvariants_end = queryresults.termvariants.end();
216 while (termvariants_here != termvariants_end) {
217 terminfo.matchTerms.push_back (*termvariants_here);
218 ++termvariants_here;
219 }
220 }
221 terms_first = false;
222
223 response.termInfo.push_back (terminfo);
224
225 ++terms_here;
226 }
227
228 // add the stop words
229 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
230 text_tset::iterator stopwords_end = queryresults.stopwords.end();
231 while (stopwords_here != stopwords_end) {
232 response.stopwords.insert(*stopwords_here);
233 ++stopwords_here;
234 }
235 }
236
237 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
238 response.numDocs = queryresults.docs_matched;
239 response.isApprox = queryresults.is_approx;
240}
241
242void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
243 FilterResponse_t &response,
244 comerror_t &err, ostream &logout) {
245
246 outconvertclass text_t2ascii;
247
248 // get the query parameters
249 int startresults, endresults;
250 text_t phrasematch; // not used here any more, just have it so can use
251 // parse_query_params function
252
253 vector<queryparamclass> queryfilterparams;
254 parse_query_params (request, queryfilterparams, startresults,
255 endresults, phrasematch, logout);
256
257 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
258
259 // do query
260 queryresultsclass queryresults;
261 queryresults.clear();
262
263 int numDocs = endresults-startresults;
264 textsearchptr->setcollectdir (collectdir);
265
266 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
267 // most likely a system problem
268 logout << text_t2ascii
269 << "system problem: could not do full text browse with lucene for index \""
270 << (*query_here).index << (*query_here).subcollection
271 << (*query_here).language << "\".\n\n";
272 err = systemProblem;
273 return;
274 }
275
276 // assemble the term results
277 TermInfo_t terminfo;
278
279 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
280 termfreqclassarray::iterator terms_end = queryresults.terms.end();
281
282 while (terms_here != terms_end) {
283 terminfo.clear();
284 terminfo.term = (*terms_here).termstr;
285 terminfo.freq = (*terms_here).termfreq;
286
287 response.termInfo.push_back (terminfo);
288
289 ++terms_here;
290 }
291
292
293}
294
// Run each query in query_params through lucene and combine the results
// into `multiresults`.  With a single query the results are copied across
// verbatim; with multiple queries the document sets are merged according
// to each query's combinequery operator ("and"/"or"/"not").  Term and
// stopword information is accumulated across all queries when requested.
// lucenesearchptr and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        // single query: take the results (and counts/approximation flag)
        // straight from the one search
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        // multiple queries: merge this query's document set into the
        // running result according to its combine operator
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        // a combined result set is exact by construction
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
382
383
384
Note: See TracBrowser for help on using the repository browser.