Context Navigation

source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 2545

Last change on this file since 2545 was 2134, checked in by sjboddie, 23 years ago

mg phrase searching now always sets maxdocs to -1 (all) - this means that
a phrase search is guaranteed always to hit any document that contains the
phrase but also means that bad phrase searches (like "and the") will take
a very long time, especially on a large collection.

also added a bit of a hack to handle program number indexes for various
bbc collections.

Property svn:keywords set to Author Date Id Revision

File size: 17.1 KB

Line
1	/**********************************************************************
2	*
3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "mgqueryfilter.h"
27	#include "fileutil.h"
28	#include "phrasesearch.h"
29	#include <assert.h>
30	#include "mgsearch.h"
31
32	///////////////////////////////
33	// methods for resultsorderer_t
34	///////////////////////////////
35
36	resultsorderer_t::resultsorderer_t() {
37	clear ();
38	}
39
40	void resultsorderer_t::clear() {
41	compare_phrase_match = false;
42	compare_terms_match = false;
43	compare_doc_weight = true;
44
45	docset = NULL;
46	}
47
48	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49	if (docset == NULL) return t1>t2;
50
51	docresultmap::iterator t1_here = docset->find(t1);
52	docresultmap::iterator t2_here = docset->find(t2);
53	docresultmap::iterator end = docset->end();
54
55	// sort all the document numbers not in the document set to
56	// the end of the list
57	if (t1_here == end) {
58	if (t2_here == end) return t1>t2;
59	else return true;
60	} else if (t2_here == end) return false;
61
62	if (compare_phrase_match) {
63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
65	}
66
67	if (compare_terms_match) {
68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
70	}
71
72	if (compare_doc_weight) {
73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
75	}
76
77	return t1>t2;
78	}
79
80
81
82
83	/////////////////////////////////
84	// functions for mgqueryfilterclass
85	/////////////////////////////////
86
87	// loads up phrases data structure with any phrases (that's the quoted bits)
88	// occuring in the querystring
89	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
90	const termfreqclassarray &orgterms,
91	vector<termfreqclassarray> &phrases) {
92
93	text_t::const_iterator here = querystring.begin();
94	text_t::const_iterator end = querystring.end();
95
96	termfreqclassarray tmpterms;
97
98	int termcount = 0;
99	bool foundquote = false;
100	bool foundbreak = false;
101	bool start = true;
102	while (here != end) {
103	if (*here == '\"') {
104	if (foundquote) {
105	if (!foundbreak && !start) {
106	tmpterms.push_back (orgterms[termcount]);
107	termcount ++;
108	}
109	if (tmpterms.size() > 1) {
110	phrases.push_back (tmpterms);
111	tmpterms.erase (tmpterms.begin(), tmpterms.end());
112	}
113	foundquote = false;
114	foundbreak = true;
115	} else foundquote = true;
116	} else if (!is_unicode_letdig(*here)) {
117	// found a break between terms
118	if (!foundbreak && !start) {
119	if (foundquote)
120	tmpterms.push_back (orgterms[termcount]);
121	termcount ++;
122	}
123	foundbreak = true;
124	} else {
125	start = false;
126	foundbreak = false;
127	}
128	here++;
129	}
130	}
131
132	// do aditional query processing
133	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
134	queryresultsclass &queryresults) {
135
136	// post-process the results if needed
137	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
138
139	// get the terms between quotes (if any)
140	vector<termfreqclassarray> phrases;
141	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
142
143	num_phrases = phrases.size();
144	if (num_phrases > 0) {
145
146	// get the long version of the index
147	text_t longindex;
148	indexmap.to2from (queryparams.index, longindex);
149
150	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
151	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
152
153	while (this_phrase != end_phrase) {
154
155	// process each of the matched documents
156	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
157	docresultmap::iterator docs_end = queryresults.docs.docset.end();
158	while (docs_here != docs_end) {
159	if (OID_phrase_search (((mgsearchclass)mgsearchptr), *gdbmptr, queryparams.index,
160	queryparams.subcollection, queryparams.language,
161	longindex, queryparams.collection, *this_phrase,
162	(*docs_here).second.docnum)) {
163	(*docs_here).second.num_phrase_match++;
164	}
165
166	docs_here++;
167	}
168	this_phrase++;
169	}
170	}
171	}
172	}
173
174
175	// do query that might involve multiple sub queries
176	// mgsearchptr and gdbmptr are assumed to be valid
177	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
178	const vector<queryparamclass> &query_params,
179	queryresultsclass &multiresults,
180	comerror_t &err, ostream &logout) {
181	outconvertclass text_t2ascii;
182
183	err = noError;
184	mgsearchptr->setcollectdir (collectdir);
185	multiresults.clear();
186
187	vector<queryparamclass>::const_iterator query_here = query_params.begin();
188	vector<queryparamclass>::const_iterator query_end = query_params.end();
189	while (query_here != query_end) {
190	queryresultsclass thisqueryresults;
191
192	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
193	// most likely a system problem
194	logout << text_t2ascii
195	<< "system problem: could not do search with mg for index \""
196	<< (query_here).index << (query_here).subcollection
197	<< (*query_here).language << "\".\n\n";
198	err = systemProblem;
199	return;
200	}
201
202	// combine the results
203	if (need_matching_docs (request.filterResultOptions)) {
204	// post-process the results if needed
205	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
206	!thisqueryresults.docs.docset.empty()) {
207	post_process (*query_here, thisqueryresults);
208	thisqueryresults.postprocessed = true;
209	multiresults.postprocessed = true;
210	} else {
211	num_phrases = 0;
212	}
213
214	if (query_params.size() == 1) {
215	multiresults.docs = thisqueryresults.docs; // just one set of results
216	multiresults.docs_matched = thisqueryresults.docs_matched;
217	multiresults.is_approx = thisqueryresults.is_approx;
218
219	} else {
220	if ((*query_here).combinequery == "and") {
221	multiresults.docs.combine_and (thisqueryresults.docs);
222	} else if ((*query_here).combinequery == "or") {
223	multiresults.docs.combine_or (thisqueryresults.docs);
224	} else if ((*query_here).combinequery == "not") {
225	multiresults.docs.combine_not (thisqueryresults.docs);
226	}
227	multiresults.docs_matched = multiresults.docs.docset.size();
228	multiresults.is_approx = Exact;
229	}
230	}
231
232	// combine the term information
233	if (need_term_info (request.filterResultOptions)) {
234	// append the terms
235	multiresults.orgterms.insert(multiresults.orgterms.end(),
236	thisqueryresults.orgterms.begin(),
237	thisqueryresults.orgterms.end());
238
239	// add the term variants
240	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
241	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
242	while (termvar_here != termvar_end) {
243	multiresults.termvariants.insert(*termvar_here);
244	termvar_here++;
245	}
246	}
247
248	query_here++;
249	}
250
251	// sort and unique the query terms
252	multiresults.sortuniqqueryterms ();
253	}
254
255
256	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
257	docresultsclass &docs) {
258	resultsorderer_t resultsorderer;
259	resultsorderer.compare_phrase_match = true;
260	resultsorderer.docset = &(docs.docset);
261
262	// first get a list of document numbers
263	docs.docnum_order();
264
265	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
266	}
267
268
269
270	mgqueryfilterclass::mgqueryfilterclass ()
271	:queryfilterclass() {
272
273	num_phrases = 0;
274
275	}
276
277	mgqueryfilterclass::~mgqueryfilterclass () {
278	}
279
280	void mgqueryfilterclass::filter (const FilterRequest_t &request,
281	FilterResponse_t &response,
282	comerror_t &err, ostream &logout) {
283	outconvertclass text_t2ascii;
284
285	response.clear ();
286	err = noError;
287	if (gdbmptr == NULL) {
288	// most likely a configuration problem
289	logout << text_t2ascii
290	<< "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
291	err = configurationError;
292	return;
293	}
294	if (mgsearchptr == NULL) {
295	// most likely a configuration problem
296	logout << text_t2ascii
297	<< "configuration error: mgqueryfilter contains a null mgsearchclass\n\n";
298	err = configurationError;
299	return;
300	}
301
302	// open the database
303	gdbmptr->setlogout(&logout);
304	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
305	// most likely a system problem (we have already checked that the
306	// gdbm database exists)
307	logout << text_t2ascii
308	<< "system problem: open on gdbm database \""
309	<< gdbm_filename << "\" failed\n\n";
310	err = systemProblem;
311	return;
312	}
313
314	// get the query parameters
315	int startresults = filterOptions["StartResults"].defaultValue.getint();
316	int endresults = filterOptions["EndResults"].defaultValue.getint();
317	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
318
319	vector<queryparamclass> queryfilterparams;
320	parse_query_params (request, queryfilterparams, startresults,
321	endresults, phrasematch, logout);
322	// do any mg specific diddling with query parameters that may be required
323	mg_parse_query_params (request, queryfilterparams, startresults,
324	endresults, phrasematch, logout);
325
326
327	// do query
328	queryresultsclass queryresults;
329	do_multi_query (request, queryfilterparams, queryresults, err, logout);
330	if (err != noError) return;
331
332	// assemble document results
333	if (need_matching_docs (request.filterResultOptions)) {
334	// sort the query results
335	sort_doc_results (request, queryresults.docs);
336
337	int resultnum = 1;
338	ResultDocInfo_t resultdoc;
339	text_t trans_OID;
340	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
341	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
342
343	// documents containing matching phrases will be sorted to the top so
344	// we can break out once we're past those that match the PhraseMatch
345	// option -- "all_phrases" = return only those documents containing all
346	// phrases in query string
347	// "some_phrases" = return only those documents containing
348	// at least 1 of the phrases in the document
349	// "all_docs" = return all documents regardless
350	if (num_phrases > 0) {
351	int numdocs = 0;
352	while (docorder_here != docorder_end) {
353	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
354
355	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
356	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
357	queryresults.docs_matched = numdocs;
358	break;
359	}
360	numdocs ++;
361	docorder_here ++;
362	}
363	}
364
365	if (endresults == -1) endresults = MAXNUMDOCS;
366	docorder_here = queryresults.docs.docorder.begin();
367	while (docorder_here != docorder_end) {
368	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
369
370	// translate the document number
371	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
372	logout << text_t2ascii
373	<< "warning: could not translate mg document number \""
374	<< *docorder_here << "\"to OID.\n\n";
375
376	} else {
377	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
378
379	// see if there is a result for this number,
380	// if it is in the request set (or the request set is empty)
381	if (docset_here != queryresults.docs.docset.end() &&
382	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
383	if (resultnum >= startresults) {
384	// add this document
385	resultdoc.OID = trans_OID;
386	resultdoc.result_num = resultnum;
387	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
388
389	// these next two are not available on all versions of mg
390	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
391	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
392
393	response.docInfo.push_back (resultdoc);
394	}
395
396	resultnum++;
397	}
398	}
399
400	docorder_here++;
401	}
402	}
403
404	// assemble the term results
405	if (need_term_info(request.filterResultOptions)) {
406	// note: the terms have already been sorted and uniqued
407
408	TermInfo_t terminfo;
409	bool terms_first = true;
410	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
411	termfreqclassarray::iterator terms_end = queryresults.terms.end();
412
413	while (terms_here != terms_end) {
414	terminfo.clear();
415	terminfo.term = (*terms_here).termstr;
416	terminfo.freq = (*terms_here).termfreq;
417	if (terms_first) {
418	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
419	text_tset::iterator termvariants_end = queryresults.termvariants.end();
420	while (termvariants_here != termvariants_end) {
421	terminfo.matchTerms.push_back (*termvariants_here);
422	termvariants_here++;
423	}
424	}
425	terms_first = false;
426
427	response.termInfo.push_back (terminfo);
428
429	terms_here++;
430	}
431	}
432
433	response.numDocs = queryresults.docs_matched;
434	response.isApprox = queryresults.is_approx;
435	}
436
437	void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/request/,
438	vector<queryparamclass> &query_params,
439	int &/startresults/, int &/endresults/,
440	text_t &/phrasematch/, ostream &/logout/) {
441
442	// outconvertclass text_t2ascii;
443
444	vector<queryparamclass>::iterator query_here = query_params.begin();
445	vector<queryparamclass>::iterator query_end = query_params.end();
446	while (query_here != query_end) {
447
448	// if we're doing a phrase search we want to maximise hits by making it
449	// a boolean search on the index with the finest granularity - we'll
450	// also set maxdocs to "all" (realizing that this will cause searches
451	// like "and the" on a large collection to take a very very long time).
452
453	// we're deciding it's a phrase search based on if the querystring
454	// contains at least 2 double quotes (not very scientific but
455	// then neither is the rest of the mg phrase searching functionality :-)
456	if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
457	(*query_here).search_type = 0;
458
459	// set maxdocs to "all"
460	(*query_here).maxdocs = -1;
461
462	// Get the long version of the index and test to see if any indexes with
463	// finer granularity exist. Indexes must be the same type (i.e. same metadata
464	// or "text").
465	text_t longindex; text_tarray splitindex;
466	indexmap.to2from ((*query_here).index, longindex);
467	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
468	text_t &granularity = splitindex[0];
469	text_t &indextype = splitindex[1];
470	bool found = false;
471	// currently supported granularity options are "document", "section" and "paragraph"
472	if (granularity == "document" \|\| granularity == "section") {
473	text_t shortindex;
474	if (indexmap.fromexists ("paragraph:" + indextype)) {
475	indexmap.from2to ("paragraph:" + indextype, shortindex);
476	(*query_here).index = shortindex;
477	found = true;
478	}
479	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
480	indexmap.from2to ("section:" + indextype, shortindex);
481	(*query_here).index = shortindex;
482	}
483	}
484	}
485
486	#ifdef GSDL_BBC_COLLECTION
487	// This is a special hack for the BBC collection's ProgNumber index
488
489	// if we're searching a ProgNumber index we want to:
490	// 1. Remove all non-alphanumeric characters from the query string
491	// 2. Make it a boolean search
492	// 3. Turn off case-folding
493	text_t longindex; text_tarray splitindex;
494	indexmap.to2from ((*query_here).index, longindex);
495	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
496	text_t &indextype = splitindex[1];
497	if (indextype == "ProgNumber") {
498	(*query_here).search_type = 0;
499	(*query_here).casefolding = 0;
500	text_t new_querystring;
501	text_t::const_iterator here = (*query_here).querystring.begin();
502	text_t::const_iterator end = (*query_here).querystring.end();
503	while (here != end) {
504	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
505	(here >= '0' && here <= '9')) {
506	new_querystring.push_back (*here);
507	}
508	here ++;
509	}
510	(*query_here).querystring = new_querystring;
511	}
512	#endif
513
514	query_here ++;
515	}
516	}
517

Note: See TracBrowser for help on using the repository browser.

Download in other formats: