source: trunk/gsdl/src/colservr/lucenequeryfilter.cpp@ 13063

Last change on this file since 13063 was 13063, checked in by kjdon, 18 years ago

make sure some matchTerms are set in terminfo - no termvariants are passed back, so set the original term as a MatchTerm

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27
28#include "lucenequeryfilter.h"
29#include "fileutil.h"
30#include <assert.h>
31#include "lucenesearch.h"
32
33/////////////////////////////////
34// functions for queryfilterclass
35/////////////////////////////////
36
37
38lucenequeryfilterclass::lucenequeryfilterclass ()
39 : queryfilterclass() {
40
41
42 FilterOption_t filtopt;
43
44 // -- onePerTerm Level enumerated
45 // likely to be Doc, Sec, Para, but we dont assume anything now
46 filtopt.clear();
47 filtopt.name = "Level";
48 filtopt.type = FilterOption_t::enumeratedt;
49 filtopt.repeatable = FilterOption_t::onePerTerm;
50 filterOptions["Level"] = filtopt;
51
52 // -- IndexField, enumerated, used to list available fields
53 filtopt.clear();
54 filtopt.name = "IndexField";
55 filtopt.type = FilterOption_t::enumeratedt;
56 filtopt.repeatable = FilterOption_t::onePerTerm;
57 filtopt.defaultValue = "";
58 filterOptions["IndexField"] = filtopt;
59
60}
61
// Destructor: nothing extra to release here; any shared resources
// (gdbmptr, textsearchptr) are presumably managed by the base class
// or its owner - confirm against queryfilterclass.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
64
65
66//whether a query is a full text browse
67bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
68 return (filterRequestOptions & FRfullTextBrowse);
69}
70
71void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
72 queryfilterclass::configure(key, cfgline);
73
74 if (key == "indexfieldmap") {
75 indexfieldmap.importmap (cfgline);
76
77 // update the list of indexes in the filter information
78 text_tarray options;
79 indexfieldmap.gettoarray (options);
80
81 text_tarray::const_iterator here = options.begin();
82 text_tarray::const_iterator end = options.end();
83 bool start = true;
84 while (here !=end) {
85 if (!(*here).empty()) {
86 filterOptions["IndexField"].validValues.push_back(*here);
87 if (start) {
88 filterOptions["IndexField"].defaultValue = *here;
89 start = false;
90 }
91 }
92 ++here;
93 }
94 } else if (key == "indexlevels") {
95 text_tarray::const_iterator here = cfgline.begin();
96 text_tarray::const_iterator end = cfgline.end();
97 bool first=true;
98 while (here != end) {
99 if (!(*here).empty()) {
100 if (first) {
101 first = false;
102 // the default is the first value
103 filterOptions["Level"].defaultValue = *here;
104 }
105 filterOptions["Level"].validValues.push_back(*here);
106 }
107 ++here;
108 }
109 } else if (key == "textlevel") {
110 ((lucenesearchclass *)textsearchptr)->set_gdbm_level( cfgline[0]);
111 }
112
113}
114
115
// Main entry point for a lucene query filter request.
// request  - the incoming filter request (query params, result options)
// response - filled in with matching documents and/or term information
// err      - set to noError on success, or a specific error code
// logout   - stream for diagnostic/warning messages
// Flow: validate configuration, dispatch full-text browses to
// browsefilter(), otherwise open the gdbm database, run the query via
// do_multi_query(), then assemble document results and term info as
// requested by the result-option bits.
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
				    FilterResponse_t &response,
				    comerror_t &err, ostream &logout) {

  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (gdbmptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null gdbmclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }
  // full-text browse requests take a completely separate path
  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }
  // open the database (read-only; needed to map lucene doc numbers to OIDs)
  gdbmptr->setlogout(&logout);
  if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
    // most likely a system problem (we have already checked that the
    // gdbm database exists)
    logout << text_t2ascii
	   << "system problem: open on gdbm database \""
	   << gdbm_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }


  // get the query parameters
  int startresults, endresults;
  text_t phrasematch; // not used here any more
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);


  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // Now handled by Lucene directly
    //if (endresults == -1) endresults = MAXNUMDOCS;

    while (docorder_here != docorder_end)
      {
	// Now handled by Lucene directly
	//if (resultnum > endresults) break;

	// translate the lucene document number to a Greenstone OID
	if (!translate(gdbmptr, *docorder_here, trans_OID))
	  {
	    // translation failure is logged but non-fatal; the
	    // document is simply skipped
	    logout << text_t2ascii
		   << "warning: could not translate lucene document number \""
		   << *docorder_here << "\" to OID.\n\n";

	  }
	else
	  {
	    docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	    // see if there is a result for this number,
	    // if it is in the request set (or the request set is empty)
	    if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
	      {
		// Now handled by Lucene directly
		//if (resultnum >= startresults) {

		// add this document; ranking is the docweight scaled
		// to an int (x10000, rounded)
		resultdoc.OID = trans_OID;
		resultdoc.result_num = resultnum;
		resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

		response.docInfo.push_back (resultdoc);
		//}
		++resultnum;
	      }
	  } // else

	++docorder_here;
      }
  } // if need matching docs

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
      // NOTE: all variants are attached to the FIRST term only
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
266
267void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
268 FilterResponse_t &response,
269 comerror_t &err, ostream &logout) {
270
271 outconvertclass text_t2ascii;
272
273 // get the query parameters
274 int startresults, endresults;
275 text_t phrasematch; // not used here any more, just have it so can use
276 // parse_query_params function
277
278 vector<queryparamclass> queryfilterparams;
279 parse_query_params (request, queryfilterparams, startresults,
280 endresults, phrasematch, logout);
281
282 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
283
284 // do query
285 queryresultsclass queryresults;
286 queryresults.clear();
287
288 int numDocs = endresults-startresults;
289 textsearchptr->setcollectdir (collectdir);
290
291 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
292 // most likely a system problem
293 logout << text_t2ascii
294 << "system problem: could not do full text browse with lucene for index \""
295 << (*query_here).index << (*query_here).subcollection
296 << (*query_here).language << "\".\n\n";
297 err = systemProblem;
298 return;
299 }
300
301 // assemble the term results
302 TermInfo_t terminfo;
303
304 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
305 termfreqclassarray::iterator terms_end = queryresults.terms.end();
306
307 while (terms_here != terms_end) {
308 terminfo.clear();
309 terminfo.term = (*terms_here).termstr;
310 terminfo.freq = (*terms_here).termfreq;
311
312 response.termInfo.push_back (terminfo);
313
314 ++terms_here;
315 }
316
317
318}
319
// lucenesearchptr and gdbmptr are assumed to be valid
// Runs each query in query_params against lucene and combines the
// per-query results into multiresults.
// request      - used only for its result-option bits (docs/terms needed)
// query_params - one entry per sub-query; each carries its own
//                combinequery operator ("and"/"or"/"not")
// multiresults - accumulated document set, terms, term variants and
//                stopwords across all sub-queries
// err          - noError on success, systemProblem or syntaxError otherwise
// logout       - stream for diagnostic messages
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
					     const vector<queryparamclass> &query_params,
					     queryresultsclass &multiresults,
					     comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with lucene for index \""
	     << (*query_here).index << (*query_here).level
	     << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
	     << "syntax problem: invalid query string \""
	     << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
	// single query: take its results wholesale, including the
	// matched-count and approximation flag reported by lucene
	multiresults.error_message = thisqueryresults.error_message;
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// multiple queries: fold this query's doc set into the
	// accumulated set using the per-query combine operator
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// combined counts are exact (computed here, not estimated)
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
	multiresults.stopwords.insert(*stopwords_here);
	++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
407
408
409
Note: See TracBrowser for help on using the repository browser.