Context Navigation

source: trunk/gsdl/src/colservr/queryfilter.cpp@ 766

Last change on this file since 766 was 766, checked in by sjboddie, 25 years ago
Added filter option to remove documents not matching a phrase match. This used to be done in the receptionist.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 26.5 KB

Line
1	/**********************************************************************
2	*
3	* queryfilter.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: queryfilter.cpp 766 1999-11-01 22:06:06Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.20 1999/11/01 22:06:06 sjboddie
31	Added filter option to remove documents not matching a phrase match.
32	This used to be done in the receptionist.
33
34	Revision 1.19 1999/10/19 03:23:40 davidb
35	Collection building support through web pages
36	and internal and external link handling for collection documents
37
38	Revision 1.18 1999/09/22 03:43:18 sjboddie
39	Endresults queryfilter option may now take '-1' for 'all'
40
41	Revision 1.17 1999/09/21 12:01:07 sjboddie
42	added Maxdocs queryfilter option (which may be -1 for 'all')
43
44	Revision 1.16 1999/09/07 04:57:24 sjboddie
45	added gpl notice
46
47	Revision 1.15 1999/08/31 22:47:09 rjmcnab
48	Added matchmode option for some and all.
49
50	Revision 1.14 1999/07/16 03:42:21 sjboddie
51	changed isApprox
52
53	Revision 1.13 1999/07/16 00:17:06 sjboddie
54	got using phrasesearch for post-processing
55
56	Revision 1.12 1999/07/09 02:19:43 rjmcnab
57	Fixed a couple of compiler conflicts
58
59	Revision 1.11 1999/07/08 20:49:44 rjmcnab
60	Added result_num to the ResultDocInto_t structure.
61
62	Revision 1.10 1999/07/07 06:19:46 rjmcnab
63	Added ability to combine two or more independant queries.
64
65	Revision 1.9 1999/07/01 09:29:20 rjmcnab
66	Changes for better reporting of number documents which match a query. Changes
67	should still work as before with older versions of mg.
68
69	Revision 1.8 1999/07/01 03:59:54 rjmcnab
70	reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
71	method for post-processing the query.
72
73	Revision 1.7 1999/06/30 04:04:13 rjmcnab
74	made stemming functions available from mgsearch and made the stems
75	for the query terms available in queryinfo
76
77	Revision 1.6 1999/06/29 22:06:23 rjmcnab
78	Added a couple of fields to queryinfo to handle a special version
79	of mg.
80
81	Revision 1.5 1999/06/27 22:08:48 sjboddie
82	now check for defaultindex, defaultsubcollection, and defaultlanguage
83	entries in config files
84
85	Revision 1.4 1999/06/16 02:03:25 sjboddie
86	fixed bug in isApprox and set MAXDOCS to always be 500
87
88	Revision 1.3 1999/04/19 23:56:09 rjmcnab
89	Finished the gdbm metadata stuff
90
91	Revision 1.2 1999/04/12 03:45:03 rjmcnab
92	Finished the query filter.
93
94	Revision 1.1 1999/04/06 22:22:09 rjmcnab
95	Initial revision.
96
97	*/
98
99
100	#include "queryfilter.h"
101	#include "fileutil.h"
102	#include "queryinfo.h"
103	#include "phrasesearch.h"
104	#include <assert.h>
105
106
107	// some useful functions
108
109	// translate will return true if successful
110	static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
111	infodbclass info;
112
113	trans_OID.clear();
114
115	// get the info
116	if (gdbmptr == NULL) return false;
117	if (!gdbmptr->getinfo(docnum, info)) return false;
118
119	// translate
120	if (info["section"].empty()) return false;
121
122	trans_OID = info["section"];
123	return true;
124	}
125
126
127	// whether document results are needed
128	static bool need_matching_docs (int filterResultOptions) {
129	return ((filterResultOptions & FROID) \|\| (filterResultOptions & FRranking) \|\|
130	(filterResultOptions & FRmetadata));
131	}
132
133	// whether term information is needed
134	static bool need_term_info (int filterResultOptions) {
135	return ((filterResultOptions & FRtermFreq) \|\| (filterResultOptions & FRmatchTerms));
136	}
137
138	///////////////////////////////
139	// methods for resultsorderer_t
140	///////////////////////////////
141
142	resultsorderer_t::resultsorderer_t() {
143	clear ();
144	}
145
146	void resultsorderer_t::clear() {
147	compare_phrase_match = false;
148	compare_terms_match = false;
149	compare_doc_weight = true;
150
151	docset = NULL;
152	}
153
154	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
155	if (docset == NULL) return t1>t2;
156
157	docresultmap::iterator t1_here = docset->find(t1);
158	docresultmap::iterator t2_here = docset->find(t2);
159	docresultmap::iterator end = docset->end();
160
161	// sort all the document numbers not in the document set to
162	// the end of the list
163	if (t1_here == end) {
164	if (t2_here == end) return t1>t2;
165	else return true;
166	} else if (t2_here == end) return false;
167
168	if (compare_phrase_match) {
169	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
170	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
171	}
172
173	if (compare_terms_match) {
174	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
175	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
176	}
177
178	if (compare_doc_weight) {
179	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
180	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
181	}
182
183	return t1>t2;
184	}
185
186
187
188
189	/////////////////////////////////
190	// functions for queryfilterclass
191	/////////////////////////////////
192
193	// loads up phrases data structure with any phrases (that's the quoted bits)
194	// occuring in the querystring
195	void queryfilterclass::get_phrase_terms (const text_t &querystring,
196	const termfreqclassarray &orgterms,
197	vector<termfreqclassarray> &phrases) {
198
199	text_t::const_iterator here = querystring.begin();
200	text_t::const_iterator end = querystring.end();
201
202	termfreqclassarray tmpterms;
203
204	int termcount = 0;
205	bool foundquote = false;
206	bool foundbreak = false;
207	bool start = true;
208	while (here != end) {
209	if (*here == '\"') {
210	if (foundquote) {
211	if (!foundbreak && !start) {
212	tmpterms.push_back (orgterms[termcount]);
213	termcount ++;
214	}
215	if (tmpterms.size() > 1) {
216	phrases.push_back (tmpterms);
217	tmpterms.erase (tmpterms.begin(), tmpterms.end());
218	}
219	foundquote = false;
220	foundbreak = true;
221	} else foundquote = true;
222	} else if (!is_unicode_letdig(*here)) {
223	// found a break between terms
224	if (!foundbreak && !start) {
225	if (foundquote)
226	tmpterms.push_back (orgterms[termcount]);
227	termcount ++;
228	}
229	foundbreak = true;
230	} else {
231	start = false;
232	foundbreak = false;
233	}
234	here++;
235	}
236	}
237
238	// do aditional query processing
239	void queryfilterclass::post_process (const queryparamclass &queryparams,
240	queryresultsclass &queryresults) {
241
242	// post-process the results if needed
243	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
244
245	// get the terms between quotes (if any)
246	vector<termfreqclassarray> phrases;
247	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
248
249	num_phrases = phrases.size();
250	if (num_phrases > 0) {
251
252	// get the long version of the index
253	text_t longindex;
254	indexmap.to2from (queryparams.index, longindex);
255
256	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
257	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
258
259	while (this_phrase != end_phrase) {
260
261	// process each of the matched documents
262	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
263	docresultmap::iterator docs_end = queryresults.docs.docset.end();
264	while (docs_here != docs_end) {
265	if (OID_phrase_search (mgsearchptr, gdbmptr, queryparams.index,
266	queryparams.subcollection, queryparams.language,
267	longindex, queryparams.collection, *this_phrase,
268	(*docs_here).second.docnum)) {
269	(*docs_here).second.num_phrase_match++;
270	}
271
272	docs_here++;
273	}
274	this_phrase++;
275	}
276	}
277	}
278	}
279
280	// get the query parameters
281	void queryfilterclass::parse_query_params (const FilterRequest_t &request,
282	vector<queryparamclass> &query_params,
283	int &startresults, int &endresults,
284	text_t &phrasematch, ostream &logout) {
285	outconvertclass text_t2ascii;
286
287	// set defaults for the return parameters
288	query_params.erase(query_params.begin(), query_params.end());
289	startresults = filterOptions["StartResults"].defaultValue.getint();
290	endresults = filterOptions["EndResults"].defaultValue.getint();
291	phrasematch = filterOptions["PhraseMatch"].defaultValue;
292
293	// set defaults for query parameters
294	queryparamclass query;
295	query.combinequery = "or"; // first one must be "or"
296	query.collection = collection;
297	query.index = filterOptions["Index"].defaultValue;
298	query.subcollection = filterOptions["Subcollection"].defaultValue;
299	query.language = filterOptions["Language"].defaultValue;
300	query.querystring.clear();
301	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
302	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
303	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
304	query.stemming = (filterOptions["Stem"].defaultValue == "true");
305	query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
306
307	OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
308	OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
309	while (options_here != options_end) {
310	if ((*options_here).name == "CombineQuery") {
311	// add this query
312
313	// "all", needed when combining queries where the document results are needed
314	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
315	query_params.push_back (query);
316
317	// start on next query
318	query.clear();
319	query.combinequery = (*options_here).value;
320
321	// set defaults for query parameters
322	query.collection = collection;
323	query.index = filterOptions["Index"].defaultValue;
324	query.subcollection = filterOptions["Subcollection"].defaultValue;
325	query.language = filterOptions["Language"].defaultValue;
326	query.querystring.clear();
327	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
328	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
329	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
330	query.stemming = (filterOptions["Stem"].defaultValue == "true");
331
332	// "all", needed when combining queries where the document results are needed
333	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
334	else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
335
336	} else if ((*options_here).name == "StartResults") {
337	startresults = (*options_here).value.getint();
338	} else if ((*options_here).name == "EndResults") {
339	endresults = (*options_here).value.getint();
340	} else if ((*options_here).name == "QueryType") {
341	query.search_type = ((*options_here).value == "ranked");
342	} else if ((*options_here).name == "MatchMode") {
343	query.match_mode = ((*options_here).value == "all");
344	if (query.match_mode == 1) query.maxdocs = -1;
345	} else if ((*options_here).name == "Term") {
346	query.querystring = (*options_here).value;
347	} else if ((*options_here).name == "Casefold") {
348	query.casefolding = ((*options_here).value == "true");
349	} else if ((*options_here).name == "Stem") {
350	query.stemming = ((*options_here).value == "true");
351	} else if ((*options_here).name == "Index") {
352	query.index = (*options_here).value;
353	} else if ((*options_here).name == "Subcollection") {
354	query.subcollection = (*options_here).value;
355	} else if ((*options_here).name == "Language") {
356	query.language = (*options_here).value;
357	} else if ((*options_here).name == "Maxdocs") {
358	query.maxdocs = (*options_here).value.getint();
359	} else if ((*options_here).name == "PhraseMatch") {
360	phrasematch = (*options_here).value;
361	} else {
362	logout << text_t2ascii
363	<< "warning: unknown queryfilter option \""
364	<< (*options_here).name
365	<< "\" ignored.\n\n";
366	}
367
368	options_here++;
369	}
370
371	// add the last query
372	query_params.push_back (query);
373	}
374
375
376
377	// do query that might involve multiple sub queries
378	// mgsearchptr and gdbmptr are assumed to be valid
379	void queryfilterclass::do_multi_query (const FilterRequest_t &request,
380	const vector<queryparamclass> &query_params,
381	queryresultsclass &multiresults,
382	comerror_t &err, ostream &logout) {
383	outconvertclass text_t2ascii;
384
385	err = noError;
386	mgsearchptr->setcollectdir (collectdir);
387	multiresults.clear();
388
389	vector<queryparamclass>::const_iterator query_here = query_params.begin();
390	vector<queryparamclass>::const_iterator query_end = query_params.end();
391	while (query_here != query_end) {
392	queryresultsclass thisqueryresults;
393
394	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
395	// most likely a system problem
396	logout << text_t2ascii
397	<< "system problem: could not do search with mg for index \""
398	<< (query_here).index << (query_here).subcollection
399	<< (*query_here).language << "\".\n\n";
400	err = systemProblem;
401	return;
402	}
403
404	// combine the results
405	if (need_matching_docs (request.filterResultOptions)) {
406	// post-process the results if needed
407	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
408	!thisqueryresults.docs.docset.empty()) {
409	post_process (*query_here, thisqueryresults);
410	thisqueryresults.postprocessed = true;
411	multiresults.postprocessed = true;
412	}
413
414	if (query_params.size() == 1) {
415	multiresults.docs = thisqueryresults.docs; // just one set of results
416	multiresults.docs_matched = thisqueryresults.docs_matched;
417	multiresults.is_approx = thisqueryresults.is_approx;
418
419	} else {
420	if ((*query_here).combinequery == "and") {
421	multiresults.docs.combine_and (thisqueryresults.docs);
422	} else if ((*query_here).combinequery == "or") {
423	multiresults.docs.combine_or (thisqueryresults.docs);
424	} else if ((*query_here).combinequery == "not") {
425	multiresults.docs.combine_not (thisqueryresults.docs);
426	}
427	multiresults.docs_matched = multiresults.docs.docset.size();
428	multiresults.is_approx = Exact;
429	}
430	}
431
432	// combine the term information
433	if (need_term_info (request.filterResultOptions)) {
434	// append the terms
435	multiresults.orgterms.insert(multiresults.orgterms.end(),
436	thisqueryresults.orgterms.begin(),
437	thisqueryresults.orgterms.end());
438
439	// add the term variants
440	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
441	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
442	while (termvar_here != termvar_end) {
443	multiresults.termvariants.insert(*termvar_here);
444	termvar_here++;
445	}
446	}
447
448	query_here++;
449	}
450
451	// sort and unique the query terms
452	multiresults.sortuniqqueryterms ();
453	}
454
455
456	void queryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
457	docresultsclass &docs) {
458	resultsorderer_t resultsorderer;
459	resultsorderer.compare_phrase_match = true;
460	resultsorderer.docset = &(docs.docset);
461
462	// first get a list of document numbers
463	docs.docnum_order();
464
465	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
466	}
467
468
469
470	queryfilterclass::queryfilterclass () {
471	gdbmptr = NULL;
472	mgsearchptr = NULL;
473	num_phrases = 0;
474
475	FilterOption_t filtopt;
476	filtopt.name = "CombineQuery";
477	filtopt.type = FilterOption_t::enumeratedt;
478	filtopt.repeatable = FilterOption_t::onePerQuery;
479	filtopt.defaultValue = "and";
480	filtopt.validValues.push_back("and");
481	filtopt.validValues.push_back("or");
482	filtopt.validValues.push_back("not");
483	filterOptions["CombineQuery"] = filtopt;
484
485	// -- onePerQuery StartResults integer
486	filtopt.clear();
487	filtopt.name = "StartResults";
488	filtopt.type = FilterOption_t::integert;
489	filtopt.repeatable = FilterOption_t::onePerQuery;
490	filtopt.defaultValue = "1";
491	filtopt.validValues.push_back("1");
492	filtopt.validValues.push_back("1000");
493	filterOptions["StartResults"] = filtopt;
494
495	// -- onePerQuery EndResults integer
496	filtopt.clear();
497	filtopt.name = "EndResults";
498	filtopt.type = FilterOption_t::integert;
499	filtopt.repeatable = FilterOption_t::onePerQuery;
500	filtopt.defaultValue = "10";
501	filtopt.validValues.push_back("-1");
502	filtopt.validValues.push_back("1000");
503	filterOptions["EndResults"] = filtopt;
504
505	// -- onePerQuery QueryType enumerated (boolean, ranked)
506	filtopt.clear();
507	filtopt.name = "QueryType";
508	filtopt.type = FilterOption_t::enumeratedt;
509	filtopt.repeatable = FilterOption_t::onePerQuery;
510	filtopt.defaultValue = "ranked";
511	filtopt.validValues.push_back("boolean");
512	filtopt.validValues.push_back("ranked");
513	filterOptions["QueryType"] = filtopt;
514
515	// -- onePerQuery MatchMode enumerated (some, all)
516	filtopt.clear();
517	filtopt.name = "MatchMode";
518	filtopt.type = FilterOption_t::enumeratedt;
519	filtopt.repeatable = FilterOption_t::onePerQuery;
520	filtopt.defaultValue = "some";
521	filtopt.validValues.push_back("some");
522	filtopt.validValues.push_back("all");
523	filterOptions["MatchMode"] = filtopt;
524
525	// -- onePerTerm Term string ???
526	filtopt.clear();
527	filtopt.name = "Term";
528	filtopt.type = FilterOption_t::stringt;
529	filtopt.repeatable = FilterOption_t::onePerTerm;
530	filtopt.defaultValue = "";
531	filterOptions["Term"] = filtopt;
532
533	// -- onePerTerm Casefold boolean
534	filtopt.clear();
535	filtopt.name = "Casefold";
536	filtopt.type = FilterOption_t::booleant;
537	filtopt.repeatable = FilterOption_t::onePerTerm;
538	filtopt.defaultValue = "true";
539	filtopt.validValues.push_back("false");
540	filtopt.validValues.push_back("true");
541	filterOptions["Casefold"] = filtopt;
542
543	// -- onePerTerm Stem boolean
544	filtopt.clear();
545	filtopt.name = "Stem";
546	filtopt.type = FilterOption_t::booleant;
547	filtopt.repeatable = FilterOption_t::onePerTerm;
548	filtopt.defaultValue = "false";
549	filtopt.validValues.push_back("false");
550	filtopt.validValues.push_back("true");
551	filterOptions["Stem"] = filtopt;
552
553	// -- onePerTerm Index enumerated
554	filtopt.clear();
555	filtopt.name = "Index";
556	filtopt.type = FilterOption_t::enumeratedt;
557	filtopt.repeatable = FilterOption_t::onePerTerm;
558	filtopt.defaultValue = "";
559	filterOptions["Index"] = filtopt;
560
561	// -- onePerTerm Subcollection enumerated
562	filtopt.clear();
563	filtopt.name = "Subcollection";
564	filtopt.type = FilterOption_t::enumeratedt;
565	filtopt.repeatable = FilterOption_t::onePerTerm;
566	filtopt.defaultValue = "";
567	filterOptions["Subcollection"] = filtopt;
568
569	// -- onePerTerm Language enumerated
570	filtopt.clear();
571	filtopt.name = "Language";
572	filtopt.type = FilterOption_t::enumeratedt;
573	filtopt.repeatable = FilterOption_t::onePerTerm;
574	filtopt.defaultValue = "";
575	filterOptions["Language"] = filtopt;
576
577	// -- onePerQuery Maxdocs integer
578	filtopt.clear();
579	filtopt.name = "Maxdocs";
580	filtopt.type = FilterOption_t::integert;
581	filtopt.repeatable = FilterOption_t::onePerQuery;
582	filtopt.defaultValue = "200";
583	filtopt.validValues.push_back("-1");
584	filtopt.validValues.push_back("1000");
585	filterOptions["Maxdocs"] = filtopt;
586
587	// -- onePerQuery PhraseMatch enumerated
588	filtopt.clear();
589	filtopt.name = "PhraseMatch";
590	filtopt.type = FilterOption_t::enumeratedt;
591	filtopt.repeatable = FilterOption_t::onePerQuery;
592	filtopt.defaultValue = "some_phrases";
593	filtopt.validValues.push_back ("all_phrases");
594	filtopt.validValues.push_back ("some_phrases");
595	filtopt.validValues.push_back ("all_docs");
596	filterOptions["PhraseMatch"] = filtopt;
597	}
598
599	queryfilterclass::~queryfilterclass () {
600	}
601
602	void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
603	filterclass::configure (key, cfgline);
604
605	if (key == "indexmap") {
606	indexmap.importmap (cfgline);
607
608	// update the list of indexes in the filter information
609	text_tarray options;
610	indexmap.gettoarray (options);
611	filterOptions["Index"].validValues = options;
612
613	} else if (key == "defaultindex") {
614	indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
615
616	} else if (key == "subcollectionmap") {
617	subcollectionmap.importmap (cfgline);
618
619	// update the list of subcollections in the filter information
620	text_tarray options;
621	subcollectionmap.gettoarray (options);
622	filterOptions["Subcollection"].validValues = options;
623
624	} else if (key == "defaultsubcollection") {
625	subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
626
627	} else if (key == "languagemap") {
628	languagemap.importmap (cfgline);
629
630	// update the list of languages in the filter information
631	text_tarray options;
632	languagemap.gettoarray (options);
633	filterOptions["Language"].validValues = options;
634
635	} else if (key == "defaultlanguage")
636	languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
637	}
638
639	bool queryfilterclass::init (ostream &logout) {
640	outconvertclass text_t2ascii;
641
642	if (!filterclass::init(logout)) return false;
643
644	// get the filename for the database and make sure it exists
645	gdbm_filename = filename_cat(collectdir,"index","text",collection);
646
647	#ifdef _LITTLE_ENDIAN
648	gdbm_filename += ".ldb";
649	#else
650	gdbm_filename += ".bdb";
651	#endif
652	if (!file_exists(gdbm_filename)) {
653	logout << text_t2ascii
654	<< "warning: gdbm database \"" //****
655	<< gdbm_filename << "\" does not exist\n\n";
656	//return false; //****
657	}
658
659	return true;
660	}
661
662	void queryfilterclass::filter (const FilterRequest_t &request,
663	FilterResponse_t &response,
664	comerror_t &err, ostream &logout) {
665	outconvertclass text_t2ascii;
666
667	response.clear ();
668	err = noError;
669	if (gdbmptr == NULL) {
670	// most likely a configuration problem
671	logout << text_t2ascii
672	<< "configuration error: queryfilter contains a null gdbmclass\n\n";
673	err = configurationError;
674	return;
675	}
676	if (mgsearchptr == NULL) {
677	// most likely a configuration problem
678	logout << text_t2ascii
679	<< "configuration error: queryfilter contains a null mgsearchclass\n\n";
680	err = configurationError;
681	return;
682	}
683
684	// open the database
685	gdbmptr->setlogout(&logout);
686	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
687	// most likely a system problem (we have already checked that the
688	// gdbm database exists)
689	logout << text_t2ascii
690	<< "system problem: open on gdbm database \""
691	<< gdbm_filename << "\" failed\n\n";
692	err = systemProblem;
693	return;
694	}
695
696	// get the query parameters
697	int startresults = filterOptions["StartResults"].defaultValue.getint();
698	int endresults = filterOptions["EndResults"].defaultValue.getint();
699	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
700
701	vector<queryparamclass> queryfilterparams;
702	parse_query_params (request, queryfilterparams, startresults,
703	endresults, phrasematch, logout);
704
705	// do query
706	queryresultsclass queryresults;
707	do_multi_query (request, queryfilterparams, queryresults, err, logout);
708	if (err != noError) return;
709
710	// assemble document results
711	if (need_matching_docs (request.filterResultOptions)) {
712	// sort the query results
713	sort_doc_results (request, queryresults.docs);
714
715	int resultnum = 1;
716	ResultDocInfo_t resultdoc;
717	text_t trans_OID;
718	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
719	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
720
721	if (endresults == -1) endresults = MAXNUMDOCS;
722	while (docorder_here != docorder_end) {
723	if (resultnum > endresults) break;
724
725	// translate the document number
726	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
727	logout << text_t2ascii
728	<< "warning: could not translate mg document number \""
729	<< *docorder_here << "\"to OID.\n\n";
730
731	} else {
732	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
733
734	// documents containing matching phrases will be sorted to the top so
735	// we can break out once we're past those that match the PhraseMatch
736	// option -- "all_phrases" = return only those documents containing all
737	// phrases in query string
738	// "some_phrases" = return only those documents containing
739	// at least 1 of the phrases in the document
740	// "all_docs" = return all documents regardless
741	if (num_phrases > 0) {
742	if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases))
743	break;
744	if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))
745	break;
746	}
747
748	// see if there is a result for this number,
749	// if it is in the request set (or the request set is empty)
750	if (docset_here != queryresults.docs.docset.end() &&
751	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
752	if (resultnum >= startresults) {
753	// add this document
754	resultdoc.OID = trans_OID;
755	resultdoc.result_num = resultnum;
756	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
757
758	// these next two are not available on all versions of mg
759	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
760	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
761
762	response.docInfo.push_back (resultdoc);
763	}
764
765	resultnum++;
766	}
767	}
768
769	docorder_here++;
770	}
771	}
772
773	// assemble the term results
774	if (need_term_info(request.filterResultOptions)) {
775	// note: the terms have already been sorted and uniqued
776
777	TermInfo_t terminfo;
778	bool terms_first = true;
779	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
780	termfreqclassarray::iterator terms_end = queryresults.terms.end();
781
782	while (terms_here != terms_end) {
783	terminfo.clear();
784	terminfo.term = (*terms_here).termstr;
785	terminfo.freq = (*terms_here).termfreq;
786	if (terms_first) {
787	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
788	text_tset::iterator termvariants_end = queryresults.termvariants.end();
789	while (termvariants_here != termvariants_end) {
790	terminfo.matchTerms.push_back (*termvariants_here);
791	termvariants_here++;
792	}
793	}
794	terms_first = false;
795
796	response.termInfo.push_back (terminfo);
797
798	terms_here++;
799	}
800	}
801
802	response.numDocs = queryresults.docs_matched;
803	response.isApprox = queryresults.is_approx;
804	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: