source: gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp@ 16947

Last change on this file since 16947 was 16947, checked in by mdewsnip, 16 years ago

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space and speed efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1/**********************************************************************
2 *
3 * lucenequeryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "lucenequeryfilter.h"
27#include "fileutil.h"
28#include "lucenesearch.h"
29
30/////////////////////////////////
31// functions for queryfilterclass
32/////////////////////////////////
33
34
35lucenequeryfilterclass::lucenequeryfilterclass ()
36 : queryfilterclass() {
37
38
39 FilterOption_t filtopt;
40
41 // -- onePerTerm Level enumerated
42 // likely to be Doc, Sec, Para, but we dont assume anything now
43 filtopt.clear();
44 filtopt.name = "Level";
45 filtopt.type = FilterOption_t::enumeratedt;
46 filtopt.repeatable = FilterOption_t::onePerTerm;
47 filterOptions["Level"] = filtopt;
48
49 // -- IndexField, enumerated, used to list available fields
50 filtopt.clear();
51 filtopt.name = "IndexField";
52 filtopt.type = FilterOption_t::enumeratedt;
53 filtopt.repeatable = FilterOption_t::onePerTerm;
54 filtopt.defaultValue = "";
55 filterOptions["IndexField"] = filtopt;
56
57}
58
// Nothing to release here beyond what the base queryfilterclass
// destructor already handles.
lucenequeryfilterclass::~lucenequeryfilterclass () {
}
61
62
63//whether a query is a full text browse
64bool lucenequeryfilterclass::full_text_browse (int filterRequestOptions) {
65 return (filterRequestOptions & FRfullTextBrowse);
66}
67
68void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
69 queryfilterclass::configure(key, cfgline);
70
71 if (key == "indexfieldmap") {
72 indexfieldmap.importmap (cfgline);
73
74 // update the list of indexes in the filter information
75 text_tarray options;
76 indexfieldmap.gettoarray (options);
77
78 text_tarray::const_iterator here = options.begin();
79 text_tarray::const_iterator end = options.end();
80 bool start = true;
81 while (here !=end) {
82 if (!(*here).empty()) {
83 filterOptions["IndexField"].validValues.push_back(*here);
84 if (start) {
85 filterOptions["IndexField"].defaultValue = *here;
86 start = false;
87 }
88 }
89 ++here;
90 }
91 } else if (key == "indexlevels") {
92 text_tarray::const_iterator here = cfgline.begin();
93 text_tarray::const_iterator end = cfgline.end();
94 bool first=true;
95 while (here != end) {
96 if (!(*here).empty()) {
97 if (first) {
98 first = false;
99 // the default is the first value
100 filterOptions["Level"].defaultValue = *here;
101 }
102 filterOptions["Level"].validValues.push_back(*here);
103 }
104 ++here;
105 }
106 } else if (key == "textlevel") {
107 ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
108 }
109
110}
111
112
113void lucenequeryfilterclass::filter(const FilterRequest_t &request,
114 FilterResponse_t &response,
115 comerror_t &err, ostream &logout) {
116
117 outconvertclass text_t2ascii;
118
119 response.clear ();
120 err = noError;
121 if (db_ptr == NULL) {
122 // most likely a configuration problem
123 logout << text_t2ascii
124 << "configuration error: queryfilter contains a null dbclass\n\n";
125 err = configurationError;
126 return;
127 }
128 if (textsearchptr == NULL) {
129 // most likely a configuration problem
130 logout << text_t2ascii
131 << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
132 err = configurationError;
133 return;
134 }
135 if (full_text_browse(request.filterResultOptions)) {
136 browsefilter(request, response, err, logout);
137 return;
138 }
139 // open the database
140 db_ptr->setlogout(&logout);
141 if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
142 // most likely a system problem (we have already checked that the database exists)
143 logout << text_t2ascii
144 << "system problem: open on database \"" << db_filename << "\" failed\n\n";
145 err = systemProblem;
146 return;
147 }
148
149
150 // get the query parameters
151 int startresults, endresults;
152 text_t phrasematch; // not used here any more
153 vector<queryparamclass> queryfilterparams;
154 parse_query_params (request, queryfilterparams, startresults,
155 endresults, phrasematch, logout);
156
157
158 // do query
159 queryresultsclass queryresults;
160 do_multi_query (request, queryfilterparams, queryresults, err, logout);
161 response.error_message = queryresults.error_message;
162 if (err != noError) return;
163
164 // assemble document results
165 if (need_matching_docs (request.filterResultOptions))
166 {
167 // Loop through the query results (ordered by ranking)
168 int resultnum = 1;
169 vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
170 while (docorder_iterator != queryresults.docs.docorder.end())
171 {
172 text_t doc_OID = (*docorder_iterator);
173 // logout << "Matching doc OID: " << doc_OID << endl;
174
175 // Make sure this result is in the docset, and either in the request set or the request set is empty
176 docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
177 if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
178 {
179 // Add the matching document
180 ResultDocInfo_t resultdoc;
181 resultdoc.OID = doc_OID;
182 resultdoc.result_num = resultnum;
183 resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
184 resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
185 response.docInfo.push_back (resultdoc);
186
187 resultnum++;
188 }
189
190 docorder_iterator++;
191 }
192 }
193
194 // assemble the term results
195 if (need_term_info(request.filterResultOptions)) {
196 // note: the terms have already been sorted and uniqued - ?? have they??
197
198 TermInfo_t terminfo;
199 bool terms_first = true;
200
201 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
202 termfreqclassarray::iterator terms_end = queryresults.terms.end();
203
204 while (terms_here != terms_end) {
205 terminfo.clear();
206 terminfo.term = (*terms_here).termstr;
207 terminfo.freq = (*terms_here).termfreq;
208 // lucene doesn't return any termvariants at this stage,
209 // so make sure the original term is set
210 terminfo.matchTerms.push_back(terminfo.term);
211
212 // this bit gets the matchTerms ie the equivalent (stem/casefold) terms
213 if (terms_first) {
214 text_tset::iterator termvariants_here = queryresults.termvariants.begin();
215 text_tset::iterator termvariants_end = queryresults.termvariants.end();
216 while (termvariants_here != termvariants_end) {
217 terminfo.matchTerms.push_back (*termvariants_here);
218 ++termvariants_here;
219 }
220 }
221 terms_first = false;
222
223 response.termInfo.push_back (terminfo);
224
225 ++terms_here;
226 }
227
228 // add the stop words
229 text_tset::iterator stopwords_here = queryresults.stopwords.begin();
230 text_tset::iterator stopwords_end = queryresults.stopwords.end();
231 while (stopwords_here != stopwords_end) {
232 response.stopwords.insert(*stopwords_here);
233 ++stopwords_here;
234 }
235 }
236
237 db_ptr->closedatabase(); // Important that local library doesn't leave any files open
238 response.numDocs = queryresults.docs_matched;
239 response.isApprox = queryresults.is_approx;
240}
241
242void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
243 FilterResponse_t &response,
244 comerror_t &err, ostream &logout) {
245
246 outconvertclass text_t2ascii;
247
248 // get the query parameters
249 int startresults, endresults;
250 text_t phrasematch; // not used here any more, just have it so can use
251 // parse_query_params function
252
253 vector<queryparamclass> queryfilterparams;
254 parse_query_params (request, queryfilterparams, startresults,
255 endresults, phrasematch, logout);
256
257 vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();
258
259 // do query
260 queryresultsclass queryresults;
261 queryresults.clear();
262
263 int numDocs = endresults-startresults;
264 textsearchptr->setcollectdir (collectdir);
265
266 if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
267 // most likely a system problem
268 logout << text_t2ascii
269 << "system problem: could not do full text browse with lucene for index \""
270 << (*query_here).index << (*query_here).subcollection
271 << (*query_here).language << "\".\n\n";
272 err = systemProblem;
273 return;
274 }
275
276 // assemble the term results
277 TermInfo_t terminfo;
278
279 termfreqclassarray::iterator terms_here = queryresults.terms.begin();
280 termfreqclassarray::iterator terms_end = queryresults.terms.end();
281
282 while (terms_here != terms_end) {
283 terminfo.clear();
284 terminfo.term = (*terms_here).termstr;
285 terminfo.freq = (*terms_here).termfreq;
286
287 response.termInfo.push_back (terminfo);
288
289 ++terms_here;
290 }
291
292
293}
294
// Run each query in query_params through lucene and combine the results
// into `multiresults`.  With a single query the results are copied across
// verbatim; with multiple queries the document sets are merged according
// to each query's combinequery operator ("and"/"or"/"not").  Term and
// stopword information is accumulated across all queries when requested.
// lucenesearchptr and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }
    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        // single query: take the results (and counts/approximation flag)
        // straight from the one search
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs; // just one set of results
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        // multiple queries: merge this query's document set into the
        // running result according to its combine operator
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        // a combined result set is exact by construction
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());


      // add the term variants -
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
382
383
384
Note: See TracBrowser for help on using the repository browser.