Context Navigation

source: gsdl/trunk/src/colservr/mgqueryfilter.cpp@ 15757

Last change on this file since 15757 was 15681, checked in by mdewsnip, 16 years ago
Removed some unnecessary inclusions of "assert.h".
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Line
1	/**********************************************************************
2	*
3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "mgqueryfilter.h"
27	#include "fileutil.h"
28	#include "phrasesearch.h"
29	#include "mgsearch.h"
30	#include "phrases.h"
31
32	///////////////////////////////
33	// methods for resultsorderer_t
34	///////////////////////////////
35
36	resultsorderer_t::resultsorderer_t() {
37	clear ();
38	}
39
40	void resultsorderer_t::clear() {
41	compare_phrase_match = false;
42	compare_terms_match = false;
43	compare_doc_weight = true;
44
45	docset = NULL;
46	}
47
48	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
49	if (docset == NULL) return t1>t2;
50
51	docresultmap::iterator t1_here = docset->find(t1);
52	docresultmap::iterator t2_here = docset->find(t2);
53	docresultmap::iterator end = docset->end();
54
55	// sort all the document numbers not in the document set to
56	// the end of the list
57	if (t1_here == end) {
58	if (t2_here == end) return t1>t2;
59	else return true;
60	} else if (t2_here == end) return false;
61
62	if (compare_phrase_match) {
63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
65	}
66
67	if (compare_terms_match) {
68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
70	}
71
72	if (compare_doc_weight) {
73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
75	}
76
77	return t1>t2;
78	}
79
80
81
82
83	/////////////////////////////////
84	// functions for mgqueryfilterclass
85	/////////////////////////////////
86
87
88	void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
89	queryfilterclass::configure (key, cfgline);
90
91	if (key == "indexstem") {
92	((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
93	}
94
95	}
96
97	// loads up phrases data structure with any phrases (that's the quoted bits)
98	// occuring in the querystring
99	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
100	const termfreqclassarray &orgterms,
101	vector<termfreqclassarray> &phrases) {
102
103	text_t::const_iterator here = querystring.begin();
104	text_t::const_iterator end = querystring.end();
105
106	termfreqclassarray tmpterms;
107
108	int termcount = 0;
109	bool foundquote = false;
110	bool foundbreak = false;
111	bool start = true;
112	while (here != end) {
113	if (*here == '\"') {
114	if (foundquote) {
115	if (!foundbreak && !start) {
116	tmpterms.push_back (orgterms[termcount]);
117	++termcount;
118	}
119	if (tmpterms.size() > 1) {
120	phrases.push_back (tmpterms);
121	}
122	tmpterms.erase (tmpterms.begin(), tmpterms.end());
123
124	foundquote = false;
125	foundbreak = true;
126	} else foundquote = true;
127	} else if (!is_unicode_letdig(*here)) {
128	// found a break between terms
129	if (!foundbreak && !start) {
130	if (foundquote) {
131	tmpterms.push_back (orgterms[termcount]);
132	}
133	++termcount;
134	}
135	foundbreak = true;
136	} else {
137	start = false;
138	foundbreak = false;
139	}
140	++here;
141	}
142	}
143
144	// do aditional query processing
145	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
146	queryresultsclass &queryresults) {
147
148	// post-process the results if needed
149	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
150
151	// get the terms between quotes (if any)
152	vector<termfreqclassarray> phrases;
153	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
154
155	num_phrases = phrases.size();
156	if (num_phrases > 0) {
157
158	// get the long version of the index
159	text_t longindex;
160	indexmap.to2from (queryparams.index, longindex);
161
162	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
163	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
164
165	while (this_phrase != end_phrase) {
166
167	// process each of the matched documents
168	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
169	docresultmap::iterator docs_end = queryresults.docs.docset.end();
170	while (docs_here != docs_end) {
171	if (OID_phrase_search (((mgsearchclass)textsearchptr), *db_ptr, queryparams.index,
172	queryparams.subcollection, queryparams.language,
173	longindex, queryparams.collection, *this_phrase,
174	(*docs_here).second.docnum)) {
175	++docs_here->second.num_phrase_match;
176	}
177
178	++docs_here;
179	}
180	++this_phrase;
181	}
182	}
183	}
184	}
185
186
187	// do query that might involve multiple sub queries
188	// textsearchptr and db_ptr are assumed to be valid
189	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
190	const vector<queryparamclass> &query_params,
191	queryresultsclass &multiresults,
192	comerror_t &err, ostream &logout) {
193	outconvertclass text_t2ascii;
194
195	err = noError;
196	textsearchptr->setcollectdir (collectdir);
197	multiresults.clear();
198
199	vector<queryparamclass>::const_iterator query_here = query_params.begin();
200	vector<queryparamclass>::const_iterator query_end = query_params.end();
201	while (query_here != query_end) {
202	queryresultsclass thisqueryresults;
203
204	if (!textsearchptr->search(*query_here, thisqueryresults)) {
205	// most likely a system problem
206	logout << text_t2ascii
207	<< "system problem: could not do search with mg for index \""
208	<< (query_here).index << (query_here).subcollection
209	<< (*query_here).language << "\".\n\n";
210	err = systemProblem;
211	return;
212	}
213
214	// combine the results
215	if (need_matching_docs (request.filterResultOptions)) {
216	// post-process the results if needed
217	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
218	!thisqueryresults.docs.docset.empty()) {
219	post_process (*query_here, thisqueryresults);
220	thisqueryresults.postprocessed = true;
221	multiresults.postprocessed = true;
222	} else {
223	num_phrases = 0;
224	}
225
226	if (query_params.size() == 1) {
227	multiresults.docs = thisqueryresults.docs; // just one set of results
228	multiresults.docs_matched = thisqueryresults.docs_matched;
229	multiresults.is_approx = thisqueryresults.is_approx;
230
231	} else {
232	if ((*query_here).combinequery == "and") {
233	multiresults.docs.combine_and (thisqueryresults.docs);
234	} else if ((*query_here).combinequery == "or") {
235	multiresults.docs.combine_or (thisqueryresults.docs);
236	} else if ((*query_here).combinequery == "not") {
237	multiresults.docs.combine_not (thisqueryresults.docs);
238	}
239	multiresults.docs_matched = multiresults.docs.docset.size();
240	multiresults.is_approx = Exact;
241	}
242	}
243
244	// combine the term information
245	if (need_term_info (request.filterResultOptions)) {
246	// append the terms
247	multiresults.orgterms.insert(multiresults.orgterms.end(),
248	thisqueryresults.orgterms.begin(),
249	thisqueryresults.orgterms.end());
250
251	// add the term variants
252	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
253	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
254	while (termvar_here != termvar_end) {
255	multiresults.termvariants.insert(*termvar_here);
256	++termvar_here;
257	}
258	}
259
260	++query_here;
261	}
262
263	// sort and unique the query terms
264	multiresults.sortuniqqueryterms ();
265	}
266
267
268	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
269	docresultsclass &docs) {
270	resultsorderer_t resultsorderer;
271	resultsorderer.compare_phrase_match = true;
272	resultsorderer.docset = &(docs.docset);
273
274	// first get a list of document numbers
275	docs.docnum_order();
276
277	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
278	}
279
280
281
282	mgqueryfilterclass::mgqueryfilterclass ()
283	:queryfilterclass() {
284
285	num_phrases = 0;
286	}
287
288	mgqueryfilterclass::~mgqueryfilterclass () {
289	}
290
291	void mgqueryfilterclass::filter (const FilterRequest_t &request,
292	FilterResponse_t &response,
293	comerror_t &err, ostream &logout) {
294	outconvertclass text_t2ascii;
295
296	response.clear ();
297	err = noError;
298	if (db_ptr == NULL) {
299	// most likely a configuration problem
300	logout << text_t2ascii
301	<< "configuration error: mgqueryfilter contains a null dbclass\n\n";
302	err = configurationError;
303	return;
304	}
305	if (textsearchptr == NULL) {
306	// most likely a configuration problem
307	logout << text_t2ascii
308	<< "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
309	err = configurationError;
310	return;
311	}
312
313	// open the database
314	db_ptr->setlogout(&logout);
315	if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
316	// most likely a system problem (we have already checked that the database exists)
317	logout << text_t2ascii
318	<< "system problem: open on database \"" << db_filename << "\" failed\n\n";
319	err = systemProblem;
320	return;
321	}
322
323	// get the query parameters
324	int startresults = filterOptions["StartResults"].defaultValue.getint();
325	int endresults = filterOptions["EndResults"].defaultValue.getint();
326	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
327
328	vector<queryparamclass> queryfilterparams;
329	parse_query_params (request, queryfilterparams, startresults,
330	endresults, phrasematch, logout);
331	// do any mg specific diddling with query parameters that may be required
332	mg_parse_query_params (request, queryfilterparams, startresults,
333	endresults, phrasematch, logout);
334
335
336	// do query
337	queryresultsclass queryresults;
338	do_multi_query (request, queryfilterparams, queryresults, err, logout);
339	if (err != noError) return;
340
341	// assemble document results
342	if (need_matching_docs (request.filterResultOptions)) {
343	// sort the query results
344	// only want to sort the docs if we have done a ranked search or there were phrases
345	if (num_phrases > 0 \|\| (request.filterResultOptions & FRranking)) {
346	sort_doc_results (request, queryresults.docs);
347	}
348	int resultnum = 1;
349	ResultDocInfo_t resultdoc;
350	text_t trans_OID;
351	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
352	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
353
354	// documents containing matching phrases will be sorted to the top so
355	// we can break out once we're past those that match the PhraseMatch
356	// option -- "all_phrases" = return only those documents containing all
357	// phrases in query string
358	// "some_phrases" = return only those documents containing
359	// at least 1 of the phrases in the document
360	// "all_docs" = return all documents regardless
361	if (num_phrases > 0) {
362	int numdocs = 0;
363	while (docorder_here != docorder_end) {
364	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
365
366	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
367	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
368	queryresults.docs_matched = numdocs;
369	break;
370	}
371	++numdocs;
372	++docorder_here;
373	}
374	}
375
376	if (endresults == -1) endresults = MAXNUMDOCS;
377	docorder_here = queryresults.docs.docorder.begin();
378	while (docorder_here != docorder_end) {
379	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
380
381	// translate the document number
382	if (!translate(db_ptr, *docorder_here, trans_OID)) {
383	logout << text_t2ascii
384	<< "warning: could not translate mg document number \""
385	<< *docorder_here << "\"to OID.\n\n";
386
387	} else {
388	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
389
390	// see if there is a result for this number,
391	// if it is in the request set (or the request set is empty)
392	if (docset_here != queryresults.docs.docset.end() &&
393	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
394	if (resultnum >= startresults) {
395	// add this document
396	resultdoc.OID = trans_OID;
397	resultdoc.result_num = resultnum;
398	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
399
400	// these next two are not available on all versions of mg
401	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
402	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
403
404	response.docInfo.push_back (resultdoc);
405	}
406
407	++resultnum;
408	}
409	}
410
411	++docorder_here;
412	}
413	}
414
415	// assemble the term results
416	if (need_term_info(request.filterResultOptions)) {
417	// note: the terms have already been sorted and uniqued
418
419	TermInfo_t terminfo;
420	bool terms_first = true;
421	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
422	termfreqclassarray::iterator terms_end = queryresults.terms.end();
423
424	while (terms_here != terms_end) {
425	terminfo.clear();
426	terminfo.term = (*terms_here).termstr;
427	terminfo.freq = (*terms_here).termfreq;
428	if (terms_first) {
429	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
430	text_tset::iterator termvariants_end = queryresults.termvariants.end();
431	while (termvariants_here != termvariants_end) {
432	terminfo.matchTerms.push_back (*termvariants_here);
433	++termvariants_here;
434	}
435	}
436	terms_first = false;
437
438	response.termInfo.push_back (terminfo);
439
440	++terms_here;
441	}
442	}
443
444	db_ptr->closedatabase(); // Important that local library doesn't leave any files open
445	response.numDocs = queryresults.docs_matched;
446	response.isApprox = queryresults.is_approx;
447	}
448
449	void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/request/,
450	vector<queryparamclass> &query_params,
451	int &/startresults/, int &/endresults/,
452	text_t &/phrasematch/, ostream &/logout/) {
453
454	// outconvertclass text_t2ascii;
455
456	vector<queryparamclass>::iterator query_here = query_params.begin();
457	vector<queryparamclass>::iterator query_end = query_params.end();
458	while (query_here != query_end) {
459
460	// if we're doing a phrase search we want to maximise hits by making it
461	// a boolean search on the index with the finest granularity - we'll
462	// also set maxdocs to "all" (realizing that this will cause searches
463	// like "and the" on a large collection to take a very very long time).
464
465	// we're deciding it's a phrase search based on if the querystring
466	// contains at least 2 double quotes (not very scientific but
467	// then neither is the rest of the mg phrase searching functionality :-)
468	//if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
469
470	// [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
471	text_tarray phrases;
472	get_phrases((*query_here).querystring, phrases);
473
474	if (phrases.size() > 0) {
475	(*query_here).search_type = 0;
476
477	// set maxdocs to "all"
478	(*query_here).maxdocs = -1;
479
480	// Get the long version of the index and test to see if any indexes with
481	// finer granularity exist. Indexes must be the same type (i.e. same metadata
482	// or "text").
483	text_t longindex; text_tarray splitindex;
484	indexmap.to2from ((*query_here).index, longindex);
485	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
486	text_t &granularity = splitindex[0];
487	text_t &indextype = splitindex[1];
488	bool found = false;
489	// currently supported granularity options are "document", "section" and "paragraph"
490	if (granularity == "document" \|\| granularity == "section") {
491	text_t shortindex;
492	if (indexmap.fromexists ("paragraph:" + indextype)) {
493	indexmap.from2to ("paragraph:" + indextype, shortindex);
494	(*query_here).index = shortindex;
495	found = true;
496	}
497	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
498	indexmap.from2to ("section:" + indextype, shortindex);
499	(*query_here).index = shortindex;
500	}
501	}
502	}
503
504	#ifdef GSDL_BBC_COLLECTION
505	// This is a special hack for the BBC collection's ProgNumber and zzabn
506	// indexes (they're built this way to prevent mg_perf_hash_build from
507	// dying at build time)
508
509	// if we're searching the ProgNumber index we want to
510	// remove all non-alphanumeric characters from the query string
511	text_t longindex; text_tarray splitindex;
512	indexmap.to2from ((*query_here).index, longindex);
513	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
514	text_t &indextype = splitindex[1];
515	if (indextype == "ProgNumber") {
516	text_t new_querystring;
517	text_t::const_iterator here = (*query_here).querystring.begin();
518	text_t::const_iterator end = (*query_here).querystring.end();
519	while (here != end) {
520	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
521	(here >= '0' && here <= '9')) {
522	new_querystring.push_back (*here);
523	}
524	++here;
525	}
526	(*query_here).querystring = new_querystring;
527	}
528	#endif
529	++query_here;
530	}
531	}
532

Note: See TracBrowser for help on using the repository browser.

Download in other formats: