Context Navigation

source: main/trunk/greenstone2/runtime-src/src/colservr/mgqueryfilter.cpp@ 22452

Last change on this file since 22452 was 16445, checked in by mdewsnip, 16 years ago
Search result document numbers are now represented with a text_t rather than an int, in preparation for changing Lucene to return the Greenstone document OIDs directly rather than looking them up as a separate step. This is better for efficiency and is also required for incremental building.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Line
1	/**********************************************************************
2	*
3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "mgqueryfilter.h"
27	#include "fileutil.h"
28	#include "phrasesearch.h"
29	#include "mgsearch.h"
30	#include "phrases.h"
31
32	///////////////////////////////
33	// methods for resultsorderer_t
34	///////////////////////////////
35
36	resultsorderer_t::resultsorderer_t() {
37	clear ();
38	}
39
40	void resultsorderer_t::clear() {
41	compare_phrase_match = false;
42	compare_terms_match = false;
43	compare_doc_weight = true;
44
45	docset = NULL;
46	}
47
48	bool resultsorderer_t::operator()(const text_t &t1, const text_t &t2) const {
49	if (docset == NULL) return t1>t2;
50
51	docresultmap::iterator t1_here = docset->find(t1);
52	docresultmap::iterator t2_here = docset->find(t2);
53	docresultmap::iterator end = docset->end();
54
55	// sort all the document numbers not in the document set to
56	// the end of the list
57	if (t1_here == end) {
58	if (t2_here == end) return t1>t2;
59	else return true;
60	} else if (t2_here == end) return false;
61
62	if (compare_phrase_match) {
63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
65	}
66
67	if (compare_terms_match) {
68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
70	}
71
72	if (compare_doc_weight) {
73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
75	}
76
77	return t1>t2;
78	}
79
80
81
82
83	/////////////////////////////////
84	// functions for mgqueryfilterclass
85	/////////////////////////////////
86
87
88	void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89	queryfilterclass::configure (key, cfgline);
90
91	if (key == "indexstem") {
92	((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93	}
94
95	}
96
97	// loads up phrases data structure with any phrases (that's the quoted bits)
98	// occuring in the querystring
99	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100	const termfreqclassarray &orgterms,
101	vector<termfreqclassarray> &phrases) {
102
103	text_t::const_iterator here = querystring.begin();
104	text_t::const_iterator end = querystring.end();
105
106	termfreqclassarray tmpterms;
107
108	int termcount = 0;
109	bool foundquote = false;
110	bool foundbreak = false;
111	bool start = true;
112	while (here != end) {
113	if (*here == '\"') {
114	if (foundquote) {
115	if (!foundbreak && !start) {
116	tmpterms.push_back (orgterms[termcount]);
117	++termcount;
118	}
119	if (tmpterms.size() > 1) {
120	phrases.push_back (tmpterms);
121	}
122	tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
124	foundquote = false;
125	foundbreak = true;
126	} else foundquote = true;
127	} else if (!is_unicode_letdig(*here)) {
128	// found a break between terms
129	if (!foundbreak && !start) {
130	if (foundquote) {
131	tmpterms.push_back (orgterms[termcount]);
132	}
133	++termcount;
134	}
135	foundbreak = true;
136	} else {
137	start = false;
138	foundbreak = false;
139	}
140	++here;
141	}
142	}
143
144	// do aditional query processing
145	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146	queryresultsclass &queryresults) {
147
148	// post-process the results if needed
149	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151	// get the terms between quotes (if any)
152	vector<termfreqclassarray> phrases;
153	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155	num_phrases = phrases.size();
156	if (num_phrases > 0) {
157
158	// get the long version of the index
159	text_t longindex;
160	indexmap.to2from (queryparams.index, longindex);
161
162	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165	while (this_phrase != end_phrase) {
166
167	// process each of the matched documents
168	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169	docresultmap::iterator docs_end = queryresults.docs.docset.end();
170	while (docs_here != docs_end) {
171	if (OID_phrase_search (((mgsearchclass)textsearchptr), *db_ptr, queryparams.index,
172	queryparams.subcollection, queryparams.language,
173	longindex, queryparams.collection, *this_phrase,
174	(*docs_here).second.docnum)) {
175	++docs_here->second.num_phrase_match;
176	}
177
178	++docs_here;
179	}
180	++this_phrase;
181	}
182	}
183	}
184	}
185
186
187	// do query that might involve multiple sub queries
188	// textsearchptr and db_ptr are assumed to be valid
189	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190	const vector<queryparamclass> &query_params,
191	queryresultsclass &multiresults,
192	comerror_t &err, ostream &logout) {
193	outconvertclass text_t2ascii;
194
195	err = noError;
196	textsearchptr->setcollectdir (collectdir);
197
198	multiresults.clear();
199
200	vector<queryparamclass>::const_iterator query_here = query_params.begin();
201	vector<queryparamclass>::const_iterator query_end = query_params.end();
202	while (query_here != query_end) {
203	queryresultsclass thisqueryresults;
204
205	if (!textsearchptr->search(*query_here, thisqueryresults)) {
206	// most likely a system problem
207	logout << text_t2ascii
208	<< "system problem: could not do search with mg for index \""
209	<< (query_here).index << (query_here).subcollection
210	<< (*query_here).language << "\".\n\n";
211	err = systemProblem;
212	return;
213	}
214
215	// combine the results
216	if (need_matching_docs (request.filterResultOptions)) {
217	// post-process the results if needed
218	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219	!thisqueryresults.docs.docset.empty()) {
220	post_process (*query_here, thisqueryresults);
221	thisqueryresults.postprocessed = true;
222	multiresults.postprocessed = true;
223	} else {
224	num_phrases = 0;
225	}
226
227	if (query_params.size() == 1) {
228	multiresults.docs = thisqueryresults.docs; // just one set of results
229	multiresults.docs_matched = thisqueryresults.docs_matched;
230	multiresults.is_approx = thisqueryresults.is_approx;
231
232	} else {
233	if ((*query_here).combinequery == "and") {
234	multiresults.docs.combine_and (thisqueryresults.docs);
235	} else if ((*query_here).combinequery == "or") {
236	multiresults.docs.combine_or (thisqueryresults.docs);
237	} else if ((*query_here).combinequery == "not") {
238	multiresults.docs.combine_not (thisqueryresults.docs);
239	}
240	multiresults.docs_matched = multiresults.docs.docset.size();
241	multiresults.is_approx = Exact;
242	}
243	}
244
245	// combine the term information
246	if (need_term_info (request.filterResultOptions)) {
247	// append the terms
248	multiresults.orgterms.insert(multiresults.orgterms.end(),
249	thisqueryresults.orgterms.begin(),
250	thisqueryresults.orgterms.end());
251
252	// add the term variants
253	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255	while (termvar_here != termvar_end) {
256	multiresults.termvariants.insert(*termvar_here);
257	++termvar_here;
258	}
259	}
260
261	++query_here;
262	}
263
264	// sort and unique the query terms
265	multiresults.sortuniqqueryterms ();
266	}
267
268
269	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
270	docresultsclass &docs) {
271	resultsorderer_t resultsorderer;
272	resultsorderer.compare_phrase_match = true;
273	resultsorderer.docset = &(docs.docset);
274
275	// first get a list of document numbers
276	docs.docnum_order();
277
278	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279	}
280
281
282
283	mgqueryfilterclass::mgqueryfilterclass ()
284	:queryfilterclass() {
285
286	num_phrases = 0;
287	}
288
289	mgqueryfilterclass::~mgqueryfilterclass () {
290	}
291
292	void mgqueryfilterclass::filter (const FilterRequest_t &request,
293	FilterResponse_t &response,
294	comerror_t &err, ostream &logout) {
295	outconvertclass text_t2ascii;
296
297	response.clear ();
298	err = noError;
299	if (db_ptr == NULL) {
300	// most likely a configuration problem
301	logout << text_t2ascii
302	<< "configuration error: mgqueryfilter contains a null dbclass\n\n";
303	err = configurationError;
304	return;
305	}
306	if (textsearchptr == NULL) {
307	// most likely a configuration problem
308	logout << text_t2ascii
309	<< "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
310	err = configurationError;
311	return;
312	}
313
314	// open the database
315	db_ptr->setlogout(&logout);
316	if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
317	// most likely a system problem (we have already checked that the database exists)
318	logout << text_t2ascii
319	<< "system problem: open on database \"" << db_filename << "\" failed\n\n";
320	err = systemProblem;
321	return;
322	}
323
324	// get the query parameters
325	int startresults = filterOptions["StartResults"].defaultValue.getint();
326	int endresults = filterOptions["EndResults"].defaultValue.getint();
327	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
328
329	vector<queryparamclass> queryfilterparams;
330	parse_query_params (request, queryfilterparams, startresults,
331	endresults, phrasematch, logout);
332	// do any mg specific diddling with query parameters that may be required
333	mg_parse_query_params (request, queryfilterparams, startresults,
334	endresults, phrasematch, logout);
335
336
337	// do query
338	queryresultsclass queryresults;
339	do_multi_query (request, queryfilterparams, queryresults, err, logout);
340	if (err != noError) return;
341
342	// assemble document results
343	if (need_matching_docs (request.filterResultOptions)) {
344	// sort the query results
345	// only want to sort the docs if we have done a ranked search or there were phrases
346	if (num_phrases > 0 \|\| (request.filterResultOptions & FRranking)) {
347	sort_doc_results (request, queryresults.docs);
348	}
349	int resultnum = 1;
350	ResultDocInfo_t resultdoc;
351	text_t trans_OID;
352	vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
353	vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();
354
355	// documents containing matching phrases will be sorted to the top so
356	// we can break out once we're past those that match the PhraseMatch
357	// option -- "all_phrases" = return only those documents containing all
358	// phrases in query string
359	// "some_phrases" = return only those documents containing
360	// at least 1 of the phrases in the document
361	// "all_docs" = return all documents regardless
362	if (num_phrases > 0) {
363	int numdocs = 0;
364	while (docorder_here != docorder_end) {
365	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
366
367	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
368	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
369	queryresults.docs_matched = numdocs;
370	break;
371	}
372	++numdocs;
373	++docorder_here;
374	}
375	}
376
377	if (endresults == -1) endresults = MAXNUMDOCS;
378	docorder_here = queryresults.docs.docorder.begin();
379	while (docorder_here != docorder_end) {
380	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
381
382	// translate the document number
383	if (!translate(db_ptr, *docorder_here, trans_OID)) {
384	logout << text_t2ascii
385	<< "warning: could not translate mg document number \""
386	<< *docorder_here << "\"to OID.\n\n";
387
388	} else {
389	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
390
391	// see if there is a result for this number,
392	// if it is in the request set (or the request set is empty)
393	if (docset_here != queryresults.docs.docset.end() &&
394	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
395	if (resultnum >= startresults) {
396	// add this document
397	resultdoc.OID = trans_OID;
398	resultdoc.result_num = resultnum;
399	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
400
401	// these next two are not available on all versions of mg
402	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
403	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
404
405	response.docInfo.push_back (resultdoc);
406	}
407
408	++resultnum;
409	}
410	}
411
412	++docorder_here;
413	}
414	}
415
416	// assemble the term results
417	if (need_term_info(request.filterResultOptions)) {
418	// note: the terms have already been sorted and uniqued
419
420	TermInfo_t terminfo;
421	bool terms_first = true;
422	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
423	termfreqclassarray::iterator terms_end = queryresults.terms.end();
424
425	while (terms_here != terms_end) {
426	terminfo.clear();
427	terminfo.term = (*terms_here).termstr;
428	terminfo.freq = (*terms_here).termfreq;
429	if (terms_first) {
430	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
431	text_tset::iterator termvariants_end = queryresults.termvariants.end();
432	while (termvariants_here != termvariants_end) {
433	terminfo.matchTerms.push_back (*termvariants_here);
434	++termvariants_here;
435	}
436	}
437	terms_first = false;
438
439	response.termInfo.push_back (terminfo);
440
441	++terms_here;
442	}
443	}
444
445	db_ptr->closedatabase(); // Important that local library doesn't leave any files open
446	response.numDocs = queryresults.docs_matched;
447	response.isApprox = queryresults.is_approx;
448	}
449
450	void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/request/,
451	vector<queryparamclass> &query_params,
452	int &/startresults/, int &/endresults/,
453	text_t &/phrasematch/, ostream &/logout/) {
454
455	// outconvertclass text_t2ascii;
456
457	vector<queryparamclass>::iterator query_here = query_params.begin();
458	vector<queryparamclass>::iterator query_end = query_params.end();
459	while (query_here != query_end) {
460
461	// if we're doing a phrase search we want to maximise hits by making it
462	// a boolean search on the index with the finest granularity - we'll
463	// also set maxdocs to "all" (realizing that this will cause searches
464	// like "and the" on a large collection to take a very very long time).
465
466	// we're deciding it's a phrase search based on if the querystring
467	// contains at least 2 double quotes (not very scientific but
468	// then neither is the rest of the mg phrase searching functionality :-)
469	//if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
470
471	// [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
472	text_tarray phrases;
473	get_phrases((*query_here).querystring, phrases);
474
475	if (phrases.size() > 0) {
476	(*query_here).search_type = 0;
477
478	// set maxdocs to "all"
479	(*query_here).maxdocs = -1;
480
481	// Get the long version of the index and test to see if any indexes with
482	// finer granularity exist. Indexes must be the same type (i.e. same metadata
483	// or "text").
484	text_t longindex; text_tarray splitindex;
485	indexmap.to2from ((*query_here).index, longindex);
486	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
487	text_t &granularity = splitindex[0];
488	text_t &indextype = splitindex[1];
489	bool found = false;
490	// currently supported granularity options are "document", "section" and "paragraph"
491	if (granularity == "document" \|\| granularity == "section") {
492	text_t shortindex;
493	if (indexmap.fromexists ("paragraph:" + indextype)) {
494	indexmap.from2to ("paragraph:" + indextype, shortindex);
495	(*query_here).index = shortindex;
496	found = true;
497	}
498	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
499	indexmap.from2to ("section:" + indextype, shortindex);
500	(*query_here).index = shortindex;
501	}
502	}
503	}
504
505	#ifdef GSDL_BBC_COLLECTION
506	// This is a special hack for the BBC collection's ProgNumber and zzabn
507	// indexes (they're built this way to prevent mg_perf_hash_build from
508	// dying at build time)
509
510	// if we're searching the ProgNumber index we want to
511	// remove all non-alphanumeric characters from the query string
512	text_t longindex; text_tarray splitindex;
513	indexmap.to2from ((*query_here).index, longindex);
514	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
515	text_t &indextype = splitindex[1];
516	if (indextype == "ProgNumber") {
517	text_t new_querystring;
518	text_t::const_iterator here = (*query_here).querystring.begin();
519	text_t::const_iterator end = (*query_here).querystring.end();
520	while (here != end) {
521	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
522	(here >= '0' && here <= '9')) {
523	new_querystring.push_back (*here);
524	}
525	++here;
526	}
527	(*query_here).querystring = new_querystring;
528	}
529	#endif
530	++query_here;
531	}
532	}
533

Note: See TracBrowser for help on using the repository browser.

Download in other formats: