Context Navigation

source: trunk/gsdl/src/colservr/queryfilter.cpp@ 990

Last change on this file since 990 was 990, checked in by sjboddie, 24 years ago
tidied up endianness and fastcgi
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 26.8 KB

Line
1	/**********************************************************************
2	*
3	* queryfilter.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: queryfilter.cpp 990 2000-02-29 01:35:56Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.22 2000/02/29 01:35:56 sjboddie
31	tidied up endianness and fastcgi
32
33	Revision 1.21 1999/11/25 02:21:13 sjboddie
34	fixed bug in phrasematch stuff
35
36	Revision 1.20 1999/11/01 22:06:06 sjboddie
37	Added filter option to remove documents not matching a phrase match.
38	This used to be done in the receptionist.
39
40	Revision 1.19 1999/10/19 03:23:40 davidb
41	Collection building support through web pages
42	and internal and external link handling for collection documents
43
44	Revision 1.18 1999/09/22 03:43:18 sjboddie
45	Endresults queryfilter option may now take '-1' for 'all'
46
47	Revision 1.17 1999/09/21 12:01:07 sjboddie
48	added Maxdocs queryfilter option (which may be -1 for 'all')
49
50	Revision 1.16 1999/09/07 04:57:24 sjboddie
51	added gpl notice
52
53	Revision 1.15 1999/08/31 22:47:09 rjmcnab
54	Added matchmode option for some and all.
55
56	Revision 1.14 1999/07/16 03:42:21 sjboddie
57	changed isApprox
58
59	Revision 1.13 1999/07/16 00:17:06 sjboddie
60	got using phrasesearch for post-processing
61
62	Revision 1.12 1999/07/09 02:19:43 rjmcnab
63	Fixed a couple of compiler conflicts
64
65	Revision 1.11 1999/07/08 20:49:44 rjmcnab
66	Added result_num to the ResultDocInfo_t structure.
67
68	Revision 1.10 1999/07/07 06:19:46 rjmcnab
69	Added ability to combine two or more independent queries.
70
71	Revision 1.9 1999/07/01 09:29:20 rjmcnab
72	Changes for better reporting of number documents which match a query. Changes
73	should still work as before with older versions of mg.
74
75	Revision 1.8 1999/07/01 03:59:54 rjmcnab
76	reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
77	method for post-processing the query.
78
79	Revision 1.7 1999/06/30 04:04:13 rjmcnab
80	made stemming functions available from mgsearch and made the stems
81	for the query terms available in queryinfo
82
83	Revision 1.6 1999/06/29 22:06:23 rjmcnab
84	Added a couple of fields to queryinfo to handle a special version
85	of mg.
86
87	Revision 1.5 1999/06/27 22:08:48 sjboddie
88	now check for defaultindex, defaultsubcollection, and defaultlanguage
89	entries in config files
90
91	Revision 1.4 1999/06/16 02:03:25 sjboddie
92	fixed bug in isApprox and set MAXDOCS to always be 500
93
94	Revision 1.3 1999/04/19 23:56:09 rjmcnab
95	Finished the gdbm metadata stuff
96
97	Revision 1.2 1999/04/12 03:45:03 rjmcnab
98	Finished the query filter.
99
100	Revision 1.1 1999/04/06 22:22:09 rjmcnab
101	Initial revision.
102
103	*/
104
105
106	#include "queryfilter.h"
107	#include "fileutil.h"
108	#include "queryinfo.h"
109	#include "phrasesearch.h"
110	#include "gsdltools.h"
111	#include <assert.h>
112
113
114	// some useful functions
115
116	// translate will return true if successful
117	static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
118	infodbclass info;
119
120	trans_OID.clear();
121
122	// get the info
123	if (gdbmptr == NULL) return false;
124	if (!gdbmptr->getinfo(docnum, info)) return false;
125
126	// translate
127	if (info["section"].empty()) return false;
128
129	trans_OID = info["section"];
130	return true;
131	}
132
133
134	// whether document results are needed
135	static bool need_matching_docs (int filterResultOptions) {
136	return ((filterResultOptions & FROID) \|\| (filterResultOptions & FRranking) \|\|
137	(filterResultOptions & FRmetadata));
138	}
139
140	// whether term information is needed
141	static bool need_term_info (int filterResultOptions) {
142	return ((filterResultOptions & FRtermFreq) \|\| (filterResultOptions & FRmatchTerms));
143	}
144
145	///////////////////////////////
146	// methods for resultsorderer_t
147	///////////////////////////////
148
149	resultsorderer_t::resultsorderer_t() {
150	clear ();
151	}
152
153	void resultsorderer_t::clear() {
154	compare_phrase_match = false;
155	compare_terms_match = false;
156	compare_doc_weight = true;
157
158	docset = NULL;
159	}
160
161	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
162	if (docset == NULL) return t1>t2;
163
164	docresultmap::iterator t1_here = docset->find(t1);
165	docresultmap::iterator t2_here = docset->find(t2);
166	docresultmap::iterator end = docset->end();
167
168	// sort all the document numbers not in the document set to
169	// the end of the list
170	if (t1_here == end) {
171	if (t2_here == end) return t1>t2;
172	else return true;
173	} else if (t2_here == end) return false;
174
175	if (compare_phrase_match) {
176	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
177	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
178	}
179
180	if (compare_terms_match) {
181	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
182	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
183	}
184
185	if (compare_doc_weight) {
186	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
187	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
188	}
189
190	return t1>t2;
191	}
192
193
194
195
196	/////////////////////////////////
197	// functions for queryfilterclass
198	/////////////////////////////////
199
200	// loads up phrases data structure with any phrases (that's the quoted bits)
201	// occuring in the querystring
202	void queryfilterclass::get_phrase_terms (const text_t &querystring,
203	const termfreqclassarray &orgterms,
204	vector<termfreqclassarray> &phrases) {
205
206	text_t::const_iterator here = querystring.begin();
207	text_t::const_iterator end = querystring.end();
208
209	termfreqclassarray tmpterms;
210
211	int termcount = 0;
212	bool foundquote = false;
213	bool foundbreak = false;
214	bool start = true;
215	while (here != end) {
216	if (*here == '\"') {
217	if (foundquote) {
218	if (!foundbreak && !start) {
219	tmpterms.push_back (orgterms[termcount]);
220	termcount ++;
221	}
222	if (tmpterms.size() > 1) {
223	phrases.push_back (tmpterms);
224	tmpterms.erase (tmpterms.begin(), tmpterms.end());
225	}
226	foundquote = false;
227	foundbreak = true;
228	} else foundquote = true;
229	} else if (!is_unicode_letdig(*here)) {
230	// found a break between terms
231	if (!foundbreak && !start) {
232	if (foundquote)
233	tmpterms.push_back (orgterms[termcount]);
234	termcount ++;
235	}
236	foundbreak = true;
237	} else {
238	start = false;
239	foundbreak = false;
240	}
241	here++;
242	}
243	}
244
245	// do aditional query processing
246	void queryfilterclass::post_process (const queryparamclass &queryparams,
247	queryresultsclass &queryresults) {
248
249	// post-process the results if needed
250	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
251
252	// get the terms between quotes (if any)
253	vector<termfreqclassarray> phrases;
254	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
255
256	num_phrases = phrases.size();
257	if (num_phrases > 0) {
258
259	// get the long version of the index
260	text_t longindex;
261	indexmap.to2from (queryparams.index, longindex);
262
263	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
264	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
265
266	while (this_phrase != end_phrase) {
267
268	// process each of the matched documents
269	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
270	docresultmap::iterator docs_end = queryresults.docs.docset.end();
271	while (docs_here != docs_end) {
272	if (OID_phrase_search (mgsearchptr, gdbmptr, queryparams.index,
273	queryparams.subcollection, queryparams.language,
274	longindex, queryparams.collection, *this_phrase,
275	(*docs_here).second.docnum)) {
276	(*docs_here).second.num_phrase_match++;
277	}
278
279	docs_here++;
280	}
281	this_phrase++;
282	}
283	}
284	}
285	}
286
287	// get the query parameters
288	void queryfilterclass::parse_query_params (const FilterRequest_t &request,
289	vector<queryparamclass> &query_params,
290	int &startresults, int &endresults,
291	text_t &phrasematch, ostream &logout) {
292	outconvertclass text_t2ascii;
293
294	// set defaults for the return parameters
295	query_params.erase(query_params.begin(), query_params.end());
296	startresults = filterOptions["StartResults"].defaultValue.getint();
297	endresults = filterOptions["EndResults"].defaultValue.getint();
298	phrasematch = filterOptions["PhraseMatch"].defaultValue;
299
300	// set defaults for query parameters
301	queryparamclass query;
302	query.combinequery = "or"; // first one must be "or"
303	query.collection = collection;
304	query.index = filterOptions["Index"].defaultValue;
305	query.subcollection = filterOptions["Subcollection"].defaultValue;
306	query.language = filterOptions["Language"].defaultValue;
307	query.querystring.clear();
308	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
309	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
310	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
311	query.stemming = (filterOptions["Stem"].defaultValue == "true");
312	query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
313
314	OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
315	OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
316	while (options_here != options_end) {
317	if ((*options_here).name == "CombineQuery") {
318	// add this query
319
320	// "all", needed when combining queries where the document results are needed
321	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
322	query_params.push_back (query);
323
324	// start on next query
325	query.clear();
326	query.combinequery = (*options_here).value;
327
328	// set defaults for query parameters
329	query.collection = collection;
330	query.index = filterOptions["Index"].defaultValue;
331	query.subcollection = filterOptions["Subcollection"].defaultValue;
332	query.language = filterOptions["Language"].defaultValue;
333	query.querystring.clear();
334	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
335	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
336	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
337	query.stemming = (filterOptions["Stem"].defaultValue == "true");
338
339	// "all", needed when combining queries where the document results are needed
340	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
341	else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
342
343	} else if ((*options_here).name == "StartResults") {
344	startresults = (*options_here).value.getint();
345	} else if ((*options_here).name == "EndResults") {
346	endresults = (*options_here).value.getint();
347	} else if ((*options_here).name == "QueryType") {
348	query.search_type = ((*options_here).value == "ranked");
349	} else if ((*options_here).name == "MatchMode") {
350	query.match_mode = ((*options_here).value == "all");
351	if (query.match_mode == 1) query.maxdocs = -1;
352	} else if ((*options_here).name == "Term") {
353	query.querystring = (*options_here).value;
354	} else if ((*options_here).name == "Casefold") {
355	query.casefolding = ((*options_here).value == "true");
356	} else if ((*options_here).name == "Stem") {
357	query.stemming = ((*options_here).value == "true");
358	} else if ((*options_here).name == "Index") {
359	query.index = (*options_here).value;
360	} else if ((*options_here).name == "Subcollection") {
361	query.subcollection = (*options_here).value;
362	} else if ((*options_here).name == "Language") {
363	query.language = (*options_here).value;
364	} else if ((*options_here).name == "Maxdocs") {
365	query.maxdocs = (*options_here).value.getint();
366	} else if ((*options_here).name == "PhraseMatch") {
367	phrasematch = (*options_here).value;
368	} else {
369	logout << text_t2ascii
370	<< "warning: unknown queryfilter option \""
371	<< (*options_here).name
372	<< "\" ignored.\n\n";
373	}
374
375	options_here++;
376	}
377
378	// add the last query
379	query_params.push_back (query);
380	}
381
382
383
384	// do query that might involve multiple sub queries
385	// mgsearchptr and gdbmptr are assumed to be valid
386	void queryfilterclass::do_multi_query (const FilterRequest_t &request,
387	const vector<queryparamclass> &query_params,
388	queryresultsclass &multiresults,
389	comerror_t &err, ostream &logout) {
390	outconvertclass text_t2ascii;
391
392	err = noError;
393	mgsearchptr->setcollectdir (collectdir);
394	multiresults.clear();
395
396	vector<queryparamclass>::const_iterator query_here = query_params.begin();
397	vector<queryparamclass>::const_iterator query_end = query_params.end();
398	while (query_here != query_end) {
399	queryresultsclass thisqueryresults;
400
401	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
402	// most likely a system problem
403	logout << text_t2ascii
404	<< "system problem: could not do search with mg for index \""
405	<< (query_here).index << (query_here).subcollection
406	<< (*query_here).language << "\".\n\n";
407	err = systemProblem;
408	return;
409	}
410
411	// combine the results
412	if (need_matching_docs (request.filterResultOptions)) {
413	// post-process the results if needed
414	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
415	!thisqueryresults.docs.docset.empty()) {
416	post_process (*query_here, thisqueryresults);
417	thisqueryresults.postprocessed = true;
418	multiresults.postprocessed = true;
419	}
420
421	if (query_params.size() == 1) {
422	multiresults.docs = thisqueryresults.docs; // just one set of results
423	multiresults.docs_matched = thisqueryresults.docs_matched;
424	multiresults.is_approx = thisqueryresults.is_approx;
425
426	} else {
427	if ((*query_here).combinequery == "and") {
428	multiresults.docs.combine_and (thisqueryresults.docs);
429	} else if ((*query_here).combinequery == "or") {
430	multiresults.docs.combine_or (thisqueryresults.docs);
431	} else if ((*query_here).combinequery == "not") {
432	multiresults.docs.combine_not (thisqueryresults.docs);
433	}
434	multiresults.docs_matched = multiresults.docs.docset.size();
435	multiresults.is_approx = Exact;
436	}
437	}
438
439	// combine the term information
440	if (need_term_info (request.filterResultOptions)) {
441	// append the terms
442	multiresults.orgterms.insert(multiresults.orgterms.end(),
443	thisqueryresults.orgterms.begin(),
444	thisqueryresults.orgterms.end());
445
446	// add the term variants
447	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
448	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
449	while (termvar_here != termvar_end) {
450	multiresults.termvariants.insert(*termvar_here);
451	termvar_here++;
452	}
453	}
454
455	query_here++;
456	}
457
458	// sort and unique the query terms
459	multiresults.sortuniqqueryterms ();
460	}
461
462
463	void queryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
464	docresultsclass &docs) {
465	resultsorderer_t resultsorderer;
466	resultsorderer.compare_phrase_match = true;
467	resultsorderer.docset = &(docs.docset);
468
469	// first get a list of document numbers
470	docs.docnum_order();
471
472	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
473	}
474
475
476
477	queryfilterclass::queryfilterclass () {
478	gdbmptr = NULL;
479	mgsearchptr = NULL;
480	num_phrases = 0;
481
482	FilterOption_t filtopt;
483	filtopt.name = "CombineQuery";
484	filtopt.type = FilterOption_t::enumeratedt;
485	filtopt.repeatable = FilterOption_t::onePerQuery;
486	filtopt.defaultValue = "and";
487	filtopt.validValues.push_back("and");
488	filtopt.validValues.push_back("or");
489	filtopt.validValues.push_back("not");
490	filterOptions["CombineQuery"] = filtopt;
491
492	// -- onePerQuery StartResults integer
493	filtopt.clear();
494	filtopt.name = "StartResults";
495	filtopt.type = FilterOption_t::integert;
496	filtopt.repeatable = FilterOption_t::onePerQuery;
497	filtopt.defaultValue = "1";
498	filtopt.validValues.push_back("1");
499	filtopt.validValues.push_back("1000");
500	filterOptions["StartResults"] = filtopt;
501
502	// -- onePerQuery EndResults integer
503	filtopt.clear();
504	filtopt.name = "EndResults";
505	filtopt.type = FilterOption_t::integert;
506	filtopt.repeatable = FilterOption_t::onePerQuery;
507	filtopt.defaultValue = "10";
508	filtopt.validValues.push_back("-1");
509	filtopt.validValues.push_back("1000");
510	filterOptions["EndResults"] = filtopt;
511
512	// -- onePerQuery QueryType enumerated (boolean, ranked)
513	filtopt.clear();
514	filtopt.name = "QueryType";
515	filtopt.type = FilterOption_t::enumeratedt;
516	filtopt.repeatable = FilterOption_t::onePerQuery;
517	filtopt.defaultValue = "ranked";
518	filtopt.validValues.push_back("boolean");
519	filtopt.validValues.push_back("ranked");
520	filterOptions["QueryType"] = filtopt;
521
522	// -- onePerQuery MatchMode enumerated (some, all)
523	filtopt.clear();
524	filtopt.name = "MatchMode";
525	filtopt.type = FilterOption_t::enumeratedt;
526	filtopt.repeatable = FilterOption_t::onePerQuery;
527	filtopt.defaultValue = "some";
528	filtopt.validValues.push_back("some");
529	filtopt.validValues.push_back("all");
530	filterOptions["MatchMode"] = filtopt;
531
532	// -- onePerTerm Term string ???
533	filtopt.clear();
534	filtopt.name = "Term";
535	filtopt.type = FilterOption_t::stringt;
536	filtopt.repeatable = FilterOption_t::onePerTerm;
537	filtopt.defaultValue = "";
538	filterOptions["Term"] = filtopt;
539
540	// -- onePerTerm Casefold boolean
541	filtopt.clear();
542	filtopt.name = "Casefold";
543	filtopt.type = FilterOption_t::booleant;
544	filtopt.repeatable = FilterOption_t::onePerTerm;
545	filtopt.defaultValue = "true";
546	filtopt.validValues.push_back("false");
547	filtopt.validValues.push_back("true");
548	filterOptions["Casefold"] = filtopt;
549
550	// -- onePerTerm Stem boolean
551	filtopt.clear();
552	filtopt.name = "Stem";
553	filtopt.type = FilterOption_t::booleant;
554	filtopt.repeatable = FilterOption_t::onePerTerm;
555	filtopt.defaultValue = "false";
556	filtopt.validValues.push_back("false");
557	filtopt.validValues.push_back("true");
558	filterOptions["Stem"] = filtopt;
559
560	// -- onePerTerm Index enumerated
561	filtopt.clear();
562	filtopt.name = "Index";
563	filtopt.type = FilterOption_t::enumeratedt;
564	filtopt.repeatable = FilterOption_t::onePerTerm;
565	filtopt.defaultValue = "";
566	filterOptions["Index"] = filtopt;
567
568	// -- onePerTerm Subcollection enumerated
569	filtopt.clear();
570	filtopt.name = "Subcollection";
571	filtopt.type = FilterOption_t::enumeratedt;
572	filtopt.repeatable = FilterOption_t::onePerTerm;
573	filtopt.defaultValue = "";
574	filterOptions["Subcollection"] = filtopt;
575
576	// -- onePerTerm Language enumerated
577	filtopt.clear();
578	filtopt.name = "Language";
579	filtopt.type = FilterOption_t::enumeratedt;
580	filtopt.repeatable = FilterOption_t::onePerTerm;
581	filtopt.defaultValue = "";
582	filterOptions["Language"] = filtopt;
583
584	// -- onePerQuery Maxdocs integer
585	filtopt.clear();
586	filtopt.name = "Maxdocs";
587	filtopt.type = FilterOption_t::integert;
588	filtopt.repeatable = FilterOption_t::onePerQuery;
589	filtopt.defaultValue = "200";
590	filtopt.validValues.push_back("-1");
591	filtopt.validValues.push_back("1000");
592	filterOptions["Maxdocs"] = filtopt;
593
594	// -- onePerQuery PhraseMatch enumerated
595	filtopt.clear();
596	filtopt.name = "PhraseMatch";
597	filtopt.type = FilterOption_t::enumeratedt;
598	filtopt.repeatable = FilterOption_t::onePerQuery;
599	filtopt.defaultValue = "some_phrases";
600	filtopt.validValues.push_back ("all_phrases");
601	filtopt.validValues.push_back ("some_phrases");
602	filtopt.validValues.push_back ("all_docs");
603	filterOptions["PhraseMatch"] = filtopt;
604	}
605
606	queryfilterclass::~queryfilterclass () {
607	}
608
609	void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
610	filterclass::configure (key, cfgline);
611
612	if (key == "indexmap") {
613	indexmap.importmap (cfgline);
614
615	// update the list of indexes in the filter information
616	text_tarray options;
617	indexmap.gettoarray (options);
618	filterOptions["Index"].validValues = options;
619
620	} else if (key == "defaultindex") {
621	indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
622
623	} else if (key == "subcollectionmap") {
624	subcollectionmap.importmap (cfgline);
625
626	// update the list of subcollections in the filter information
627	text_tarray options;
628	subcollectionmap.gettoarray (options);
629	filterOptions["Subcollection"].validValues = options;
630
631	} else if (key == "defaultsubcollection") {
632	subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
633
634	} else if (key == "languagemap") {
635	languagemap.importmap (cfgline);
636
637	// update the list of languages in the filter information
638	text_tarray options;
639	languagemap.gettoarray (options);
640	filterOptions["Language"].validValues = options;
641
642	} else if (key == "defaultlanguage")
643	languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
644	}
645
646	bool queryfilterclass::init (ostream &logout) {
647	outconvertclass text_t2ascii;
648
649	if (!filterclass::init(logout)) return false;
650
651	// get the filename for the database and make sure it exists
652	gdbm_filename = filename_cat(collectdir,"index","text",collection);
653
654	if (littleEndian()) gdbm_filename += ".ldb";
655	else gdbm_filename += ".bdb";
656
657	if (!file_exists(gdbm_filename)) {
658	logout << text_t2ascii
659	<< "warning: gdbm database \"" //****
660	<< gdbm_filename << "\" does not exist\n\n";
661	//return false; //****
662	}
663
664	return true;
665	}
666
667	void queryfilterclass::filter (const FilterRequest_t &request,
668	FilterResponse_t &response,
669	comerror_t &err, ostream &logout) {
670	outconvertclass text_t2ascii;
671
672	response.clear ();
673	err = noError;
674	if (gdbmptr == NULL) {
675	// most likely a configuration problem
676	logout << text_t2ascii
677	<< "configuration error: queryfilter contains a null gdbmclass\n\n";
678	err = configurationError;
679	return;
680	}
681	if (mgsearchptr == NULL) {
682	// most likely a configuration problem
683	logout << text_t2ascii
684	<< "configuration error: queryfilter contains a null mgsearchclass\n\n";
685	err = configurationError;
686	return;
687	}
688
689	// open the database
690	gdbmptr->setlogout(&logout);
691	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
692	// most likely a system problem (we have already checked that the
693	// gdbm database exists)
694	logout << text_t2ascii
695	<< "system problem: open on gdbm database \""
696	<< gdbm_filename << "\" failed\n\n";
697	err = systemProblem;
698	return;
699	}
700
701	// get the query parameters
702	int startresults = filterOptions["StartResults"].defaultValue.getint();
703	int endresults = filterOptions["EndResults"].defaultValue.getint();
704	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
705
706	vector<queryparamclass> queryfilterparams;
707	parse_query_params (request, queryfilterparams, startresults,
708	endresults, phrasematch, logout);
709
710	// do query
711	queryresultsclass queryresults;
712	do_multi_query (request, queryfilterparams, queryresults, err, logout);
713	if (err != noError) return;
714
715	// assemble document results
716	if (need_matching_docs (request.filterResultOptions)) {
717	// sort the query results
718	sort_doc_results (request, queryresults.docs);
719
720	int resultnum = 1;
721	ResultDocInfo_t resultdoc;
722	text_t trans_OID;
723	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
724	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
725
726	if (endresults == -1) endresults = MAXNUMDOCS;
727	while (docorder_here != docorder_end) {
728	if (resultnum > endresults) break;
729
730	// translate the document number
731	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
732	logout << text_t2ascii
733	<< "warning: could not translate mg document number \""
734	<< *docorder_here << "\"to OID.\n\n";
735
736	} else {
737	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
738
739	// documents containing matching phrases will be sorted to the top so
740	// we can break out once we're past those that match the PhraseMatch
741	// option -- "all_phrases" = return only those documents containing all
742	// phrases in query string
743	// "some_phrases" = return only those documents containing
744	// at least 1 of the phrases in the document
745	// "all_docs" = return all documents regardless
746	if (num_phrases > 0) {
747	if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
748	queryresults.docs_matched = response.docInfo.size();
749	break;
750	}
751	if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
752	queryresults.docs_matched = response.docInfo.size();
753	break;
754	}
755	}
756
757	// see if there is a result for this number,
758	// if it is in the request set (or the request set is empty)
759	if (docset_here != queryresults.docs.docset.end() &&
760	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
761	if (resultnum >= startresults) {
762	// add this document
763	resultdoc.OID = trans_OID;
764	resultdoc.result_num = resultnum;
765	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
766
767	// these next two are not available on all versions of mg
768	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
769	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
770
771	response.docInfo.push_back (resultdoc);
772	}
773
774	resultnum++;
775	}
776	}
777
778	docorder_here++;
779	}
780	}
781
782	// assemble the term results
783	if (need_term_info(request.filterResultOptions)) {
784	// note: the terms have already been sorted and uniqued
785
786	TermInfo_t terminfo;
787	bool terms_first = true;
788	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
789	termfreqclassarray::iterator terms_end = queryresults.terms.end();
790
791	while (terms_here != terms_end) {
792	terminfo.clear();
793	terminfo.term = (*terms_here).termstr;
794	terminfo.freq = (*terms_here).termfreq;
795	if (terms_first) {
796	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
797	text_tset::iterator termvariants_end = queryresults.termvariants.end();
798	while (termvariants_here != termvariants_end) {
799	terminfo.matchTerms.push_back (*termvariants_here);
800	termvariants_here++;
801	}
802	}
803	terms_first = false;
804
805	response.termInfo.push_back (terminfo);
806
807	terms_here++;
808	}
809	}
810
811	response.numDocs = queryresults.docs_matched;
812	response.isApprox = queryresults.is_approx;
813	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: