Context Navigation

source: branches/corba/gsdl/src/colservr/queryfilter.cpp@ 1074

Last change on this file since 1074 was 1074, checked in by cs025, 24 years ago

Corba improvements; tidied client initialisation in corbaproto and also
added a number of changes to the main trunk which somehow had not made
it into the corba branch via update before its instantiation.

Also the dated use of the GSDL_GSDLHOME macro was removed, at the expense
of some particularly poor code in corbaserver where log file creation is
now nowhere near so elegant.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 27.2 KB

Line
1	/**********************************************************************
2	*
3	* queryfilter.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: queryfilter.cpp 1074 2000-04-06 11:11:40Z cs025 $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.22.2.1 2000/04/06 11:11:37 cs025
31	Corba improvements; tidied client initialisation in corbaproto and also
32	added a number of changes to the main trunk which somehow had not made
33	it into the corba branch via update before its instantiation.
34
35	Also the dated use of the GSDL_GSDLHOME macro was removed, at the expense
36	of some particularly poor code in corbaserver where log file creation is
37	now nowhere near so elegant.
38
39	Revision 1.22 2000/02/29 01:35:56 sjboddie
40	tidied up endianness and fastcgi
41
42	Revision 1.21 1999/11/25 02:21:13 sjboddie
43	fixed bug in phrasematch stuff
44
45	Revision 1.20 1999/11/01 22:06:06 sjboddie
46	Added filter option to remove documents not matching a phrase match.
47	This used to be done in the receptionist.
48
49	Revision 1.19 1999/10/19 03:23:40 davidb
50	Collection building support through web pages
51	and internal and external link handling for collection documents
52
53	Revision 1.18 1999/09/22 03:43:18 sjboddie
54	Endresults queryfilter option may now take '-1' for 'all'
55
56	Revision 1.17 1999/09/21 12:01:07 sjboddie
57	added Maxdocs queryfilter option (which may be -1 for 'all')
58
59	Revision 1.16 1999/09/07 04:57:24 sjboddie
60	added gpl notice
61
62	Revision 1.15 1999/08/31 22:47:09 rjmcnab
63	Added matchmode option for some and all.
64
65	Revision 1.14 1999/07/16 03:42:21 sjboddie
66	changed isApprox
67
68	Revision 1.13 1999/07/16 00:17:06 sjboddie
69	got using phrasesearch for post-processing
70
71	Revision 1.12 1999/07/09 02:19:43 rjmcnab
72	Fixed a couple of compiler conflicts
73
74	Revision 1.11 1999/07/08 20:49:44 rjmcnab
75	Added result_num to the ResultDocInfo_t structure.
76
77	Revision 1.10 1999/07/07 06:19:46 rjmcnab
78	Added ability to combine two or more independent queries.
79
80	Revision 1.9 1999/07/01 09:29:20 rjmcnab
81	Changes for better reporting of number documents which match a query. Changes
82	should still work as before with older versions of mg.
83
84	Revision 1.8 1999/07/01 03:59:54 rjmcnab
85	reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
86	method for post-processing the query.
87
88	Revision 1.7 1999/06/30 04:04:13 rjmcnab
89	made stemming functions available from mgsearch and made the stems
90	for the query terms available in queryinfo
91
92	Revision 1.6 1999/06/29 22:06:23 rjmcnab
93	Added a couple of fields to queryinfo to handle a special version
94	of mg.
95
96	Revision 1.5 1999/06/27 22:08:48 sjboddie
97	now check for defaultindex, defaultsubcollection, and defaultlanguage
98	entries in config files
99
100	Revision 1.4 1999/06/16 02:03:25 sjboddie
101	fixed bug in isApprox and set MAXDOCS to always be 500
102
103	Revision 1.3 1999/04/19 23:56:09 rjmcnab
104	Finished the gdbm metadata stuff
105
106	Revision 1.2 1999/04/12 03:45:03 rjmcnab
107	Finished the query filter.
108
109	Revision 1.1 1999/04/06 22:22:09 rjmcnab
110	Initial revision.
111
112	*/
113
114
115	#include "queryfilter.h"
116	#include "fileutil.h"
117	#include "queryinfo.h"
118	#include "phrasesearch.h"
119	#include "gsdltools.h"
120	#include <assert.h>
121
122
123	// some useful functions
124
125	// translate will return true if successful
126	static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
127	infodbclass info;
128
129	trans_OID.clear();
130
131	// get the info
132	if (gdbmptr == NULL) return false;
133	if (!gdbmptr->getinfo(docnum, info)) return false;
134
135	// translate
136	if (info["section"].empty()) return false;
137
138	trans_OID = info["section"];
139	return true;
140	}
141
142
143	// whether document results are needed
144	static bool need_matching_docs (int filterResultOptions) {
145	return ((filterResultOptions & FROID) \|\| (filterResultOptions & FRranking) \|\|
146	(filterResultOptions & FRmetadata));
147	}
148
149	// whether term information is needed
150	static bool need_term_info (int filterResultOptions) {
151	return ((filterResultOptions & FRtermFreq) \|\| (filterResultOptions & FRmatchTerms));
152	}
153
154	///////////////////////////////
155	// methods for resultsorderer_t
156	///////////////////////////////
157
158	resultsorderer_t::resultsorderer_t() {
159	clear ();
160	}
161
162	void resultsorderer_t::clear() {
163	compare_phrase_match = false;
164	compare_terms_match = false;
165	compare_doc_weight = true;
166
167	docset = NULL;
168	}
169
170	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
171	if (docset == NULL) return t1>t2;
172
173	docresultmap::iterator t1_here = docset->find(t1);
174	docresultmap::iterator t2_here = docset->find(t2);
175	docresultmap::iterator end = docset->end();
176
177	// sort all the document numbers not in the document set to
178	// the end of the list
179	if (t1_here == end) {
180	if (t2_here == end) return t1>t2;
181	else return true;
182	} else if (t2_here == end) return false;
183
184	if (compare_phrase_match) {
185	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
186	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
187	}
188
189	if (compare_terms_match) {
190	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
191	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
192	}
193
194	if (compare_doc_weight) {
195	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
196	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
197	}
198
199	return t1>t2;
200	}
201
202
203
204
205	/////////////////////////////////
206	// functions for queryfilterclass
207	/////////////////////////////////
208
209	// loads up phrases data structure with any phrases (that's the quoted bits)
210	// occuring in the querystring
211	void queryfilterclass::get_phrase_terms (const text_t &querystring,
212	const termfreqclassarray &orgterms,
213	vector<termfreqclassarray> &phrases) {
214
215	text_t::const_iterator here = querystring.begin();
216	text_t::const_iterator end = querystring.end();
217
218	termfreqclassarray tmpterms;
219
220	int termcount = 0;
221	bool foundquote = false;
222	bool foundbreak = false;
223	bool start = true;
224	while (here != end) {
225	if (*here == '\"') {
226	if (foundquote) {
227	if (!foundbreak && !start) {
228	tmpterms.push_back (orgterms[termcount]);
229	termcount ++;
230	}
231	if (tmpterms.size() > 1) {
232	phrases.push_back (tmpterms);
233	tmpterms.erase (tmpterms.begin(), tmpterms.end());
234	}
235	foundquote = false;
236	foundbreak = true;
237	} else foundquote = true;
238	} else if (!is_unicode_letdig(*here)) {
239	// found a break between terms
240	if (!foundbreak && !start) {
241	if (foundquote)
242	tmpterms.push_back (orgterms[termcount]);
243	termcount ++;
244	}
245	foundbreak = true;
246	} else {
247	start = false;
248	foundbreak = false;
249	}
250	here++;
251	}
252	}
253
254	// do aditional query processing
255	void queryfilterclass::post_process (const queryparamclass &queryparams,
256	queryresultsclass &queryresults) {
257
258	// post-process the results if needed
259	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
260
261	// get the terms between quotes (if any)
262	vector<termfreqclassarray> phrases;
263	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
264
265	num_phrases = phrases.size();
266	if (num_phrases > 0) {
267
268	// get the long version of the index
269	text_t longindex;
270	indexmap.to2from (queryparams.index, longindex);
271
272	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
273	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
274
275	while (this_phrase != end_phrase) {
276
277	// process each of the matched documents
278	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
279	docresultmap::iterator docs_end = queryresults.docs.docset.end();
280	while (docs_here != docs_end) {
281	if (OID_phrase_search (mgsearchptr, gdbmptr, queryparams.index,
282	queryparams.subcollection, queryparams.language,
283	longindex, queryparams.collection, *this_phrase,
284	(*docs_here).second.docnum)) {
285	(*docs_here).second.num_phrase_match++;
286	}
287
288	docs_here++;
289	}
290	this_phrase++;
291	}
292	}
293	}
294	}
295
296	// get the query parameters
297	void queryfilterclass::parse_query_params (const FilterRequest_t &request,
298	vector<queryparamclass> &query_params,
299	int &startresults, int &endresults,
300	text_t &phrasematch, ostream &logout) {
301	outconvertclass text_t2ascii;
302
303	// set defaults for the return parameters
304	query_params.erase(query_params.begin(), query_params.end());
305	startresults = filterOptions["StartResults"].defaultValue.getint();
306	endresults = filterOptions["EndResults"].defaultValue.getint();
307	phrasematch = filterOptions["PhraseMatch"].defaultValue;
308
309	// set defaults for query parameters
310	queryparamclass query;
311	query.combinequery = "or"; // first one must be "or"
312	query.collection = collection;
313	query.index = filterOptions["Index"].defaultValue;
314	query.subcollection = filterOptions["Subcollection"].defaultValue;
315	query.language = filterOptions["Language"].defaultValue;
316	query.querystring.clear();
317	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
318	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
319	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
320	query.stemming = (filterOptions["Stem"].defaultValue == "true");
321	query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
322
323	OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
324	OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
325	while (options_here != options_end) {
326	if ((*options_here).name == "CombineQuery") {
327	// add this query
328
329	// "all", needed when combining queries where the document results are needed
330	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
331	query_params.push_back (query);
332
333	// start on next query
334	query.clear();
335	query.combinequery = (*options_here).value;
336
337	// set defaults for query parameters
338	query.collection = collection;
339	query.index = filterOptions["Index"].defaultValue;
340	query.subcollection = filterOptions["Subcollection"].defaultValue;
341	query.language = filterOptions["Language"].defaultValue;
342	query.querystring.clear();
343	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
344	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
345	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
346	query.stemming = (filterOptions["Stem"].defaultValue == "true");
347
348	// "all", needed when combining queries where the document results are needed
349	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
350	else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
351
352	} else if ((*options_here).name == "StartResults") {
353	startresults = (*options_here).value.getint();
354	} else if ((*options_here).name == "EndResults") {
355	endresults = (*options_here).value.getint();
356	} else if ((*options_here).name == "QueryType") {
357	query.search_type = ((*options_here).value == "ranked");
358	} else if ((*options_here).name == "MatchMode") {
359	query.match_mode = ((*options_here).value == "all");
360	if (query.match_mode == 1) query.maxdocs = -1;
361	} else if ((*options_here).name == "Term") {
362	query.querystring = (*options_here).value;
363	} else if ((*options_here).name == "Casefold") {
364	query.casefolding = ((*options_here).value == "true");
365	} else if ((*options_here).name == "Stem") {
366	query.stemming = ((*options_here).value == "true");
367	} else if ((*options_here).name == "Index") {
368	query.index = (*options_here).value;
369	} else if ((*options_here).name == "Subcollection") {
370	query.subcollection = (*options_here).value;
371	} else if ((*options_here).name == "Language") {
372	query.language = (*options_here).value;
373	} else if ((*options_here).name == "Maxdocs") {
374	query.maxdocs = (*options_here).value.getint();
375	} else if ((*options_here).name == "PhraseMatch") {
376	phrasematch = (*options_here).value;
377	} else {
378	logout << text_t2ascii
379	<< "warning: unknown queryfilter option \""
380	<< (*options_here).name
381	<< "\" ignored.\n\n";
382	}
383
384	options_here++;
385	}
386
387	// add the last query
388	query_params.push_back (query);
389	}
390
391
392
393	// do query that might involve multiple sub queries
394	// mgsearchptr and gdbmptr are assumed to be valid
395	void queryfilterclass::do_multi_query (const FilterRequest_t &request,
396	const vector<queryparamclass> &query_params,
397	queryresultsclass &multiresults,
398	comerror_t &err, ostream &logout) {
399	outconvertclass text_t2ascii;
400
401	err = noError;
402	mgsearchptr->setcollectdir (collectdir);
403	multiresults.clear();
404
405	vector<queryparamclass>::const_iterator query_here = query_params.begin();
406	vector<queryparamclass>::const_iterator query_end = query_params.end();
407	while (query_here != query_end) {
408	queryresultsclass thisqueryresults;
409
410	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
411	// most likely a system problem
412	logout << text_t2ascii
413	<< "system problem: could not do search with mg for index \""
414	<< (query_here).index << (query_here).subcollection
415	<< (*query_here).language << "\".\n\n";
416	err = systemProblem;
417	return;
418	}
419
420	// combine the results
421	if (need_matching_docs (request.filterResultOptions)) {
422	// post-process the results if needed
423	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
424	!thisqueryresults.docs.docset.empty()) {
425	post_process (*query_here, thisqueryresults);
426	thisqueryresults.postprocessed = true;
427	multiresults.postprocessed = true;
428	}
429
430	if (query_params.size() == 1) {
431	multiresults.docs = thisqueryresults.docs; // just one set of results
432	multiresults.docs_matched = thisqueryresults.docs_matched;
433	multiresults.is_approx = thisqueryresults.is_approx;
434
435	} else {
436	if ((*query_here).combinequery == "and") {
437	multiresults.docs.combine_and (thisqueryresults.docs);
438	} else if ((*query_here).combinequery == "or") {
439	multiresults.docs.combine_or (thisqueryresults.docs);
440	} else if ((*query_here).combinequery == "not") {
441	multiresults.docs.combine_not (thisqueryresults.docs);
442	}
443	multiresults.docs_matched = multiresults.docs.docset.size();
444	multiresults.is_approx = Exact;
445	}
446	}
447
448	// combine the term information
449	if (need_term_info (request.filterResultOptions)) {
450	// append the terms
451	multiresults.orgterms.insert(multiresults.orgterms.end(),
452	thisqueryresults.orgterms.begin(),
453	thisqueryresults.orgterms.end());
454
455	// add the term variants
456	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
457	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
458	while (termvar_here != termvar_end) {
459	multiresults.termvariants.insert(*termvar_here);
460	termvar_here++;
461	}
462	}
463
464	query_here++;
465	}
466
467	// sort and unique the query terms
468	multiresults.sortuniqqueryterms ();
469	}
470
471
472	void queryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
473	docresultsclass &docs) {
474	resultsorderer_t resultsorderer;
475	resultsorderer.compare_phrase_match = true;
476	resultsorderer.docset = &(docs.docset);
477
478	// first get a list of document numbers
479	docs.docnum_order();
480
481	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
482	}
483
484
485
486	queryfilterclass::queryfilterclass () {
487	gdbmptr = NULL;
488	mgsearchptr = NULL;
489	num_phrases = 0;
490
491	FilterOption_t filtopt;
492	filtopt.name = "CombineQuery";
493	filtopt.type = FilterOption_t::enumeratedt;
494	filtopt.repeatable = FilterOption_t::onePerQuery;
495	filtopt.defaultValue = "and";
496	filtopt.validValues.push_back("and");
497	filtopt.validValues.push_back("or");
498	filtopt.validValues.push_back("not");
499	filterOptions["CombineQuery"] = filtopt;
500
501	// -- onePerQuery StartResults integer
502	filtopt.clear();
503	filtopt.name = "StartResults";
504	filtopt.type = FilterOption_t::integert;
505	filtopt.repeatable = FilterOption_t::onePerQuery;
506	filtopt.defaultValue = "1";
507	filtopt.validValues.push_back("1");
508	filtopt.validValues.push_back("1000");
509	filterOptions["StartResults"] = filtopt;
510
511	// -- onePerQuery EndResults integer
512	filtopt.clear();
513	filtopt.name = "EndResults";
514	filtopt.type = FilterOption_t::integert;
515	filtopt.repeatable = FilterOption_t::onePerQuery;
516	filtopt.defaultValue = "10";
517	filtopt.validValues.push_back("-1");
518	filtopt.validValues.push_back("1000");
519	filterOptions["EndResults"] = filtopt;
520
521	// -- onePerQuery QueryType enumerated (boolean, ranked)
522	filtopt.clear();
523	filtopt.name = "QueryType";
524	filtopt.type = FilterOption_t::enumeratedt;
525	filtopt.repeatable = FilterOption_t::onePerQuery;
526	filtopt.defaultValue = "ranked";
527	filtopt.validValues.push_back("boolean");
528	filtopt.validValues.push_back("ranked");
529	filterOptions["QueryType"] = filtopt;
530
531	// -- onePerQuery MatchMode enumerated (some, all)
532	filtopt.clear();
533	filtopt.name = "MatchMode";
534	filtopt.type = FilterOption_t::enumeratedt;
535	filtopt.repeatable = FilterOption_t::onePerQuery;
536	filtopt.defaultValue = "some";
537	filtopt.validValues.push_back("some");
538	filtopt.validValues.push_back("all");
539	filterOptions["MatchMode"] = filtopt;
540
541	// -- onePerTerm Term string ???
542	filtopt.clear();
543	filtopt.name = "Term";
544	filtopt.type = FilterOption_t::stringt;
545	filtopt.repeatable = FilterOption_t::onePerTerm;
546	filtopt.defaultValue = "";
547	filterOptions["Term"] = filtopt;
548
549	// -- onePerTerm Casefold boolean
550	filtopt.clear();
551	filtopt.name = "Casefold";
552	filtopt.type = FilterOption_t::booleant;
553	filtopt.repeatable = FilterOption_t::onePerTerm;
554	filtopt.defaultValue = "true";
555	filtopt.validValues.push_back("false");
556	filtopt.validValues.push_back("true");
557	filterOptions["Casefold"] = filtopt;
558
559	// -- onePerTerm Stem boolean
560	filtopt.clear();
561	filtopt.name = "Stem";
562	filtopt.type = FilterOption_t::booleant;
563	filtopt.repeatable = FilterOption_t::onePerTerm;
564	filtopt.defaultValue = "false";
565	filtopt.validValues.push_back("false");
566	filtopt.validValues.push_back("true");
567	filterOptions["Stem"] = filtopt;
568
569	// -- onePerTerm Index enumerated
570	filtopt.clear();
571	filtopt.name = "Index";
572	filtopt.type = FilterOption_t::enumeratedt;
573	filtopt.repeatable = FilterOption_t::onePerTerm;
574	filtopt.defaultValue = "";
575	filterOptions["Index"] = filtopt;
576
577	// -- onePerTerm Subcollection enumerated
578	filtopt.clear();
579	filtopt.name = "Subcollection";
580	filtopt.type = FilterOption_t::enumeratedt;
581	filtopt.repeatable = FilterOption_t::onePerTerm;
582	filtopt.defaultValue = "";
583	filterOptions["Subcollection"] = filtopt;
584
585	// -- onePerTerm Language enumerated
586	filtopt.clear();
587	filtopt.name = "Language";
588	filtopt.type = FilterOption_t::enumeratedt;
589	filtopt.repeatable = FilterOption_t::onePerTerm;
590	filtopt.defaultValue = "";
591	filterOptions["Language"] = filtopt;
592
593	// -- onePerQuery Maxdocs integer
594	filtopt.clear();
595	filtopt.name = "Maxdocs";
596	filtopt.type = FilterOption_t::integert;
597	filtopt.repeatable = FilterOption_t::onePerQuery;
598	filtopt.defaultValue = "200";
599	filtopt.validValues.push_back("-1");
600	filtopt.validValues.push_back("1000");
601	filterOptions["Maxdocs"] = filtopt;
602
603	// -- onePerQuery PhraseMatch enumerated
604	filtopt.clear();
605	filtopt.name = "PhraseMatch";
606	filtopt.type = FilterOption_t::enumeratedt;
607	filtopt.repeatable = FilterOption_t::onePerQuery;
608	filtopt.defaultValue = "some_phrases";
609	filtopt.validValues.push_back ("all_phrases");
610	filtopt.validValues.push_back ("some_phrases");
611	filtopt.validValues.push_back ("all_docs");
612	filterOptions["PhraseMatch"] = filtopt;
613	}
614
615	queryfilterclass::~queryfilterclass () {
616	}
617
618	void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
619	filterclass::configure (key, cfgline);
620
621	if (key == "indexmap") {
622	indexmap.importmap (cfgline);
623
624	// update the list of indexes in the filter information
625	text_tarray options;
626	indexmap.gettoarray (options);
627	filterOptions["Index"].validValues = options;
628
629	} else if (key == "defaultindex") {
630	indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
631
632	} else if (key == "subcollectionmap") {
633	subcollectionmap.importmap (cfgline);
634
635	// update the list of subcollections in the filter information
636	text_tarray options;
637	subcollectionmap.gettoarray (options);
638	filterOptions["Subcollection"].validValues = options;
639
640	} else if (key == "defaultsubcollection") {
641	subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
642
643	} else if (key == "languagemap") {
644	languagemap.importmap (cfgline);
645
646	// update the list of languages in the filter information
647	text_tarray options;
648	languagemap.gettoarray (options);
649	filterOptions["Language"].validValues = options;
650
651	} else if (key == "defaultlanguage")
652	languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
653	}
654
655	bool queryfilterclass::init (ostream &logout) {
656	outconvertclass text_t2ascii;
657
658	if (!filterclass::init(logout)) return false;
659
660	// get the filename for the database and make sure it exists
661	gdbm_filename = filename_cat(collectdir,"index","text",collection);
662
663	if (littleEndian()) gdbm_filename += ".ldb";
664	else gdbm_filename += ".bdb";
665
666	if (!file_exists(gdbm_filename)) {
667	logout << text_t2ascii
668	<< "warning: gdbm database \"" //****
669	<< gdbm_filename << "\" does not exist\n\n";
670	//return false; //****
671	}
672
673	return true;
674	}
675
676	void queryfilterclass::filter (const FilterRequest_t &request,
677	FilterResponse_t &response,
678	comerror_t &err, ostream &logout) {
679	outconvertclass text_t2ascii;
680
681	response.clear ();
682	err = noError;
683	if (gdbmptr == NULL) {
684	// most likely a configuration problem
685	logout << text_t2ascii
686	<< "configuration error: queryfilter contains a null gdbmclass\n\n";
687	err = configurationError;
688	return;
689	}
690	if (mgsearchptr == NULL) {
691	// most likely a configuration problem
692	logout << text_t2ascii
693	<< "configuration error: queryfilter contains a null mgsearchclass\n\n";
694	err = configurationError;
695	return;
696	}
697
698	// open the database
699	gdbmptr->setlogout(&logout);
700	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
701	// most likely a system problem (we have already checked that the
702	// gdbm database exists)
703	logout << text_t2ascii
704	<< "system problem: open on gdbm database \""
705	<< gdbm_filename << "\" failed\n\n";
706	err = systemProblem;
707	return;
708	}
709
710	// get the query parameters
711	int startresults = filterOptions["StartResults"].defaultValue.getint();
712	int endresults = filterOptions["EndResults"].defaultValue.getint();
713	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
714
715	vector<queryparamclass> queryfilterparams;
716	parse_query_params (request, queryfilterparams, startresults,
717	endresults, phrasematch, logout);
718
719	// do query
720	queryresultsclass queryresults;
721	do_multi_query (request, queryfilterparams, queryresults, err, logout);
722	if (err != noError) return;
723
724	// assemble document results
725	if (need_matching_docs (request.filterResultOptions)) {
726	// sort the query results
727	sort_doc_results (request, queryresults.docs);
728
729	int resultnum = 1;
730	ResultDocInfo_t resultdoc;
731	text_t trans_OID;
732	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
733	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
734
735	if (endresults == -1) endresults = MAXNUMDOCS;
736	while (docorder_here != docorder_end) {
737	if (resultnum > endresults) break;
738
739	// translate the document number
740	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
741	logout << text_t2ascii
742	<< "warning: could not translate mg document number \""
743	<< *docorder_here << "\"to OID.\n\n";
744
745	} else {
746	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
747
748	// documents containing matching phrases will be sorted to the top so
749	// we can break out once we're past those that match the PhraseMatch
750	// option -- "all_phrases" = return only those documents containing all
751	// phrases in query string
752	// "some_phrases" = return only those documents containing
753	// at least 1 of the phrases in the document
754	// "all_docs" = return all documents regardless
755	if (num_phrases > 0) {
756	if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
757	queryresults.docs_matched = response.docInfo.size();
758	break;
759	}
760	if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
761	queryresults.docs_matched = response.docInfo.size();
762	break;
763	}
764	}
765
766	// see if there is a result for this number,
767	// if it is in the request set (or the request set is empty)
768	if (docset_here != queryresults.docs.docset.end() &&
769	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
770	if (resultnum >= startresults) {
771	// add this document
772	resultdoc.OID = trans_OID;
773	resultdoc.result_num = resultnum;
774	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
775
776	// these next two are not available on all versions of mg
777	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
778	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
779
780	response.docInfo.push_back (resultdoc);
781	}
782
783	resultnum++;
784	}
785	}
786
787	docorder_here++;
788	}
789	}
790
791	// assemble the term results
792	if (need_term_info(request.filterResultOptions)) {
793	// note: the terms have already been sorted and uniqued
794
795	TermInfo_t terminfo;
796	bool terms_first = true;
797	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
798	termfreqclassarray::iterator terms_end = queryresults.terms.end();
799
800	while (terms_here != terms_end) {
801	terminfo.clear();
802	terminfo.term = (*terms_here).termstr;
803	terminfo.freq = (*terms_here).termfreq;
804	if (terms_first) {
805	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
806	text_tset::iterator termvariants_end = queryresults.termvariants.end();
807	while (termvariants_here != termvariants_end) {
808	terminfo.matchTerms.push_back (*termvariants_here);
809	termvariants_here++;
810	}
811	}
812	terms_first = false;
813
814	response.termInfo.push_back (terminfo);
815
816	terms_here++;
817	}
818	}
819
820	response.numDocs = queryresults.docs_matched;
821	response.isApprox = queryresults.is_approx;
822	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: