Context Navigation

source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 10004

Last change on this file since 10004 was 9937, checked in by kjdon, 19 years ago
Modified the filters/sources etc. so that if an indexstem is specified in the build.cfg file, it is used as the root of the index/gdbm filenames instead of the collection name. The collection name is still used by default. This means that we can rename a coll directory without rebuilding.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.7 KB

Line
1	/**********************************************************************
2	*
3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "mgqueryfilter.h"
27	#include "fileutil.h"
28	#include "phrasesearch.h"
29	#include <assert.h>
30	#include "mgsearch.h"
31
32	///////////////////////////////
33	// methods for resultsorderer_t
34	///////////////////////////////
35
36	resultsorderer_t::resultsorderer_t() {
37	clear ();
38	}
39
40	void resultsorderer_t::clear() {
41	compare_phrase_match = false;
42	compare_terms_match = false;
43	compare_doc_weight = true;
44
45	docset = NULL;
46	}
47
48	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49	if (docset == NULL) return t1>t2;
50
51	docresultmap::iterator t1_here = docset->find(t1);
52	docresultmap::iterator t2_here = docset->find(t2);
53	docresultmap::iterator end = docset->end();
54
55	// sort all the document numbers not in the document set to
56	// the end of the list
57	if (t1_here == end) {
58	if (t2_here == end) return t1>t2;
59	else return true;
60	} else if (t2_here == end) return false;
61
62	if (compare_phrase_match) {
63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
65	}
66
67	if (compare_terms_match) {
68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
70	}
71
72	if (compare_doc_weight) {
73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
75	}
76
77	return t1>t2;
78	}
79
80
81
82
83	/////////////////////////////////
84	// functions for mgqueryfilterclass
85	/////////////////////////////////
86
87
88	void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89	queryfilterclass::configure (key, cfgline);
90
91	if (key == "maxnumeric") {
92	maxnumeric = cfgline[0].getint();
93	}
94	else if (key == "indexstem") {
95	((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
96	}
97
98	}
99
100	// loads up phrases data structure with any phrases (that's the quoted bits)
101	// occuring in the querystring
102	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
103	const termfreqclassarray &orgterms,
104	vector<termfreqclassarray> &phrases) {
105
106	text_t::const_iterator here = querystring.begin();
107	text_t::const_iterator end = querystring.end();
108
109	termfreqclassarray tmpterms;
110
111	int termcount = 0;
112	bool foundquote = false;
113	bool foundbreak = false;
114	bool start = true;
115	while (here != end) {
116	if (*here == '\"') {
117	if (foundquote) {
118	if (!foundbreak && !start) {
119	tmpterms.push_back (orgterms[termcount]);
120	++termcount;
121	}
122	if (tmpterms.size() > 1) {
123	phrases.push_back (tmpterms);
124	tmpterms.erase (tmpterms.begin(), tmpterms.end());
125	}
126	foundquote = false;
127	foundbreak = true;
128	} else foundquote = true;
129	} else if (!is_unicode_letdig(*here)) {
130	// found a break between terms
131	if (!foundbreak && !start) {
132	if (foundquote)
133	tmpterms.push_back (orgterms[termcount]);
134	++termcount;
135	}
136	foundbreak = true;
137	} else {
138	start = false;
139	foundbreak = false;
140	}
141	++here;
142	}
143	}
144
145	// do aditional query processing
146	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
147	queryresultsclass &queryresults) {
148
149	// post-process the results if needed
150	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
151
152	// get the terms between quotes (if any)
153	vector<termfreqclassarray> phrases;
154	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
155
156	num_phrases = phrases.size();
157	if (num_phrases > 0) {
158
159	// get the long version of the index
160	text_t longindex;
161	indexmap.to2from (queryparams.index, longindex);
162
163	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
164	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
165
166	while (this_phrase != end_phrase) {
167
168	// process each of the matched documents
169	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
170	docresultmap::iterator docs_end = queryresults.docs.docset.end();
171	while (docs_here != docs_end) {
172	if (OID_phrase_search (((mgsearchclass)textsearchptr), *gdbmptr, queryparams.index,
173	queryparams.subcollection, queryparams.language,
174	longindex, queryparams.collection, *this_phrase,
175	(*docs_here).second.docnum)) {
176	++docs_here->second.num_phrase_match;
177	}
178
179	++docs_here;
180	}
181	++this_phrase;
182	}
183	}
184	}
185	}
186
187
188	// do query that might involve multiple sub queries
189	// mgsearchptr and gdbmptr are assumed to be valid
190	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
191	const vector<queryparamclass> &query_params,
192	queryresultsclass &multiresults,
193	comerror_t &err, ostream &logout) {
194	outconvertclass text_t2ascii;
195
196	err = noError;
197	textsearchptr->setcollectdir (collectdir);
198	multiresults.clear();
199
200	vector<queryparamclass>::const_iterator query_here = query_params.begin();
201	vector<queryparamclass>::const_iterator query_end = query_params.end();
202	while (query_here != query_end) {
203	queryresultsclass thisqueryresults;
204
205	if (!textsearchptr->search(*query_here, thisqueryresults)) {
206	// most likely a system problem
207	logout << text_t2ascii
208	<< "system problem: could not do search with mg for index \""
209	<< (query_here).index << (query_here).subcollection
210	<< (*query_here).language << "\".\n\n";
211	err = systemProblem;
212	return;
213	}
214
215	// combine the results
216	if (need_matching_docs (request.filterResultOptions)) {
217	// post-process the results if needed
218	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
219	!thisqueryresults.docs.docset.empty()) {
220	post_process (*query_here, thisqueryresults);
221	thisqueryresults.postprocessed = true;
222	multiresults.postprocessed = true;
223	} else {
224	num_phrases = 0;
225	}
226
227	if (query_params.size() == 1) {
228	multiresults.docs = thisqueryresults.docs; // just one set of results
229	multiresults.docs_matched = thisqueryresults.docs_matched;
230	multiresults.is_approx = thisqueryresults.is_approx;
231
232	} else {
233	if ((*query_here).combinequery == "and") {
234	multiresults.docs.combine_and (thisqueryresults.docs);
235	} else if ((*query_here).combinequery == "or") {
236	multiresults.docs.combine_or (thisqueryresults.docs);
237	} else if ((*query_here).combinequery == "not") {
238	multiresults.docs.combine_not (thisqueryresults.docs);
239	}
240	multiresults.docs_matched = multiresults.docs.docset.size();
241	multiresults.is_approx = Exact;
242	}
243	}
244
245	// combine the term information
246	if (need_term_info (request.filterResultOptions)) {
247	// append the terms
248	multiresults.orgterms.insert(multiresults.orgterms.end(),
249	thisqueryresults.orgterms.begin(),
250	thisqueryresults.orgterms.end());
251
252	// add the term variants
253	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
254	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
255	while (termvar_here != termvar_end) {
256	multiresults.termvariants.insert(*termvar_here);
257	++termvar_here;
258	}
259	}
260
261	++query_here;
262	}
263
264	// sort and unique the query terms
265	multiresults.sortuniqqueryterms ();
266	}
267
268
269	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
270	docresultsclass &docs) {
271	resultsorderer_t resultsorderer;
272	resultsorderer.compare_phrase_match = true;
273	resultsorderer.docset = &(docs.docset);
274
275	// first get a list of document numbers
276	docs.docnum_order();
277
278	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
279	}
280
281
282
283	mgqueryfilterclass::mgqueryfilterclass ()
284	:queryfilterclass() {
285
286	num_phrases = 0;
287	maxnumeric = 4;
288	}
289
290	mgqueryfilterclass::~mgqueryfilterclass () {
291	}
292
293	void mgqueryfilterclass::filter (const FilterRequest_t &request,
294	FilterResponse_t &response,
295	comerror_t &err, ostream &logout) {
296	outconvertclass text_t2ascii;
297
298	response.clear ();
299	err = noError;
300	if (gdbmptr == NULL) {
301	// most likely a configuration problem
302	logout << text_t2ascii
303	<< "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
304	err = configurationError;
305	return;
306	}
307	if (textsearchptr == NULL) {
308	// most likely a configuration problem
309	logout << text_t2ascii
310	<< "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
311	err = configurationError;
312	return;
313	}
314
315	// open the database
316	gdbmptr->setlogout(&logout);
317	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
318	// most likely a system problem (we have already checked that the
319	// gdbm database exists)
320	logout << text_t2ascii
321	<< "system problem: open on gdbm database \""
322	<< gdbm_filename << "\" failed\n\n";
323	err = systemProblem;
324	return;
325	}
326
327	// get the query parameters
328	int startresults = filterOptions["StartResults"].defaultValue.getint();
329	int endresults = filterOptions["EndResults"].defaultValue.getint();
330	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
331
332	vector<queryparamclass> queryfilterparams;
333	parse_query_params (request, queryfilterparams, startresults,
334	endresults, phrasematch, logout);
335	// do any mg specific diddling with query parameters that may be required
336	mg_parse_query_params (request, queryfilterparams, startresults,
337	endresults, phrasematch, logout);
338
339
340	// do query
341	queryresultsclass queryresults;
342	do_multi_query (request, queryfilterparams, queryresults, err, logout);
343	if (err != noError) return;
344
345	// assemble document results
346	if (need_matching_docs (request.filterResultOptions)) {
347	// sort the query results
348	// only want to sort the docs if we have done a ranked search or there were phrases
349	if (num_phrases > 0 \|\| (request.filterResultOptions & FRranking)) {
350	sort_doc_results (request, queryresults.docs);
351	}
352	int resultnum = 1;
353	ResultDocInfo_t resultdoc;
354	text_t trans_OID;
355	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
356	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
357
358	// documents containing matching phrases will be sorted to the top so
359	// we can break out once we're past those that match the PhraseMatch
360	// option -- "all_phrases" = return only those documents containing all
361	// phrases in query string
362	// "some_phrases" = return only those documents containing
363	// at least 1 of the phrases in the document
364	// "all_docs" = return all documents regardless
365	if (num_phrases > 0) {
366	int numdocs = 0;
367	while (docorder_here != docorder_end) {
368	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
369
370	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
371	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
372	queryresults.docs_matched = numdocs;
373	break;
374	}
375	++numdocs;
376	++docorder_here;
377	}
378	}
379
380	if (endresults == -1) endresults = MAXNUMDOCS;
381	docorder_here = queryresults.docs.docorder.begin();
382	while (docorder_here != docorder_end) {
383	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
384
385	// translate the document number
386	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
387	logout << text_t2ascii
388	<< "warning: could not translate mg document number \""
389	<< *docorder_here << "\"to OID.\n\n";
390
391	} else {
392	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
393
394	// see if there is a result for this number,
395	// if it is in the request set (or the request set is empty)
396	if (docset_here != queryresults.docs.docset.end() &&
397	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
398	if (resultnum >= startresults) {
399	// add this document
400	resultdoc.OID = trans_OID;
401	resultdoc.result_num = resultnum;
402	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
403
404	// these next two are not available on all versions of mg
405	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
406	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
407
408	response.docInfo.push_back (resultdoc);
409	}
410
411	++resultnum;
412	}
413	}
414
415	++docorder_here;
416	}
417	}
418
419	// assemble the term results
420	if (need_term_info(request.filterResultOptions)) {
421	// note: the terms have already been sorted and uniqued
422
423	TermInfo_t terminfo;
424	bool terms_first = true;
425	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
426	termfreqclassarray::iterator terms_end = queryresults.terms.end();
427
428	while (terms_here != terms_end) {
429	terminfo.clear();
430	terminfo.term = (*terms_here).termstr;
431	terminfo.freq = (*terms_here).termfreq;
432	if (terms_first) {
433	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
434	text_tset::iterator termvariants_end = queryresults.termvariants.end();
435	while (termvariants_here != termvariants_end) {
436	terminfo.matchTerms.push_back (*termvariants_here);
437	++termvariants_here;
438	}
439	}
440	terms_first = false;
441
442	response.termInfo.push_back (terminfo);
443
444	++terms_here;
445	}
446	}
447
448	response.numDocs = queryresults.docs_matched;
449	response.isApprox = queryresults.is_approx;
450	}
451
452	void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/request/,
453	vector<queryparamclass> &query_params,
454	int &/startresults/, int &/endresults/,
455	text_t &/phrasematch/, ostream &/logout/) {
456
457	// outconvertclass text_t2ascii;
458
459	vector<queryparamclass>::iterator query_here = query_params.begin();
460	vector<queryparamclass>::iterator query_end = query_params.end();
461	while (query_here != query_end) {
462
463	// set maxnumeric
464	(*query_here).maxnumeric = maxnumeric;
465
466	// if we're doing a phrase search we want to maximise hits by making it
467	// a boolean search on the index with the finest granularity - we'll
468	// also set maxdocs to "all" (realizing that this will cause searches
469	// like "and the" on a large collection to take a very very long time).
470
471	// we're deciding it's a phrase search based on if the querystring
472	// contains at least 2 double quotes (not very scientific but
473	// then neither is the rest of the mg phrase searching functionality :-)
474	if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
475	(*query_here).search_type = 0;
476
477	// set maxdocs to "all"
478	(*query_here).maxdocs = -1;
479
480	// Get the long version of the index and test to see if any indexes with
481	// finer granularity exist. Indexes must be the same type (i.e. same metadata
482	// or "text").
483	text_t longindex; text_tarray splitindex;
484	indexmap.to2from ((*query_here).index, longindex);
485	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
486	text_t &granularity = splitindex[0];
487	text_t &indextype = splitindex[1];
488	bool found = false;
489	// currently supported granularity options are "document", "section" and "paragraph"
490	if (granularity == "document" \|\| granularity == "section") {
491	text_t shortindex;
492	if (indexmap.fromexists ("paragraph:" + indextype)) {
493	indexmap.from2to ("paragraph:" + indextype, shortindex);
494	(*query_here).index = shortindex;
495	found = true;
496	}
497	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
498	indexmap.from2to ("section:" + indextype, shortindex);
499	(*query_here).index = shortindex;
500	}
501	}
502	}
503
504	#ifdef GSDL_BBC_COLLECTION
505	// This is a special hack for the BBC collection's ProgNumber and zzabn
506	// indexes (they're built this way to prevent mg_perf_hash_build from
507	// dying at build time)
508
509	// if we're searching the ProgNumber index we want to
510	// remove all non-alphanumeric characters from the query string
511	text_t longindex; text_tarray splitindex;
512	indexmap.to2from ((*query_here).index, longindex);
513	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
514	text_t &indextype = splitindex[1];
515	if (indextype == "ProgNumber") {
516	text_t new_querystring;
517	text_t::const_iterator here = (*query_here).querystring.begin();
518	text_t::const_iterator end = (*query_here).querystring.end();
519	while (here != end) {
520	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
521	(here >= '0' && here <= '9')) {
522	new_querystring.push_back (*here);
523	}
524	++here;
525	}
526	(*query_here).querystring = new_querystring;
527	}
528	#endif
529	++query_here;
530	}
531	}
532

Note: See TracBrowser for help on using the repository browser.

Download in other formats: