Context Navigation

source: trunk/gsdl/src/colservr/queryfilter.cpp@ 990

Last change on this file since 990 was 990, checked in by sjboddie, 24 years ago
tidied up endianness and fastcgi
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 26.8 KB

Rev	Line
[227]	1	/**********************************************************************
	2	*
	3	* queryfilter.cpp --
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
[534]	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
[227]	9	*
[534]	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
[227]	24	* $Id: queryfilter.cpp 990 2000-02-29 01:35:56Z sjboddie $
	25	*
	26	*********************************************************************/
	27
	28	/*
	29	$Log$
[990]	30	Revision 1.22 2000/02/29 01:35:56 sjboddie
	31	tidied up endianness and fastcgi
	32
[787]	33	Revision 1.21 1999/11/25 02:21:13 sjboddie
	34	fixed bug in phrasematch stuff
	35
[766]	36	Revision 1.20 1999/11/01 22:06:06 sjboddie
	37	Added filter option to remove documents not matching a phrase match.
	38	This used to be done in the receptionist.
	39
[722]	40	Revision 1.19 1999/10/19 03:23:40 davidb
	41	Collection building support through web pages
	42	and internal and external link handling for collection documents
	43
[621]	44	Revision 1.18 1999/09/22 03:43:18 sjboddie
	45	Endresults queryfilter option may now take '-1' for 'all'
	46
[613]	47	Revision 1.17 1999/09/21 12:01:07 sjboddie
	48	added Maxdocs queryfilter option (which may be -1 for 'all')
	49
[534]	50	Revision 1.16 1999/09/07 04:57:24 sjboddie
	51	added gpl notice
	52
[501]	53	Revision 1.15 1999/08/31 22:47:09 rjmcnab
	54	Added matchmode option for some and all.
	55
[398]	56	Revision 1.14 1999/07/16 03:42:21 sjboddie
	57	changed isApprox
	58
[396]	59	Revision 1.13 1999/07/16 00:17:06 sjboddie
	60	got using phrasesearch for post-processing
	61
[358]	62	Revision 1.12 1999/07/09 02:19:43 rjmcnab
	63	Fixed a couple of compiler conflicts
	64
[355]	65	Revision 1.11 1999/07/08 20:49:44 rjmcnab
	66	Added result_num to the ResultDocInto_t structure.
	67
[351]	68	Revision 1.10 1999/07/07 06:19:46 rjmcnab
	69	Added ability to combine two or more independant queries.
	70
[334]	71	Revision 1.9 1999/07/01 09:29:20 rjmcnab
	72	Changes for better reporting of number documents which match a query. Changes
	73	should still work as before with older versions of mg.
	74
[327]	75	Revision 1.8 1999/07/01 03:59:54 rjmcnab
	76	reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
	77	method for post-processing the query.
	78
[319]	79	Revision 1.7 1999/06/30 04:04:13 rjmcnab
	80	made stemming functions available from mgsearch and made the stems
	81	for the query terms available in queryinfo
	82
[311]	83	Revision 1.6 1999/06/29 22:06:23 rjmcnab
	84	Added a couple of fields to queryinfo to handle a special version
	85	of mg.
	86
[302]	87	Revision 1.5 1999/06/27 22:08:48 sjboddie
	88	now check for defaultindex, defaultsubcollection, and defaultlanguage
	89	entries in config files
	90
[273]	91	Revision 1.4 1999/06/16 02:03:25 sjboddie
	92	fixed bug in isApprox and set MAXDOCS to always be 500
	93
[238]	94	Revision 1.3 1999/04/19 23:56:09 rjmcnab
	95	Finished the gdbm metadata stuff
	96
[235]	97	Revision 1.2 1999/04/12 03:45:03 rjmcnab
	98	Finished the query filter.
	99
[227]	100	Revision 1.1 1999/04/06 22:22:09 rjmcnab
	101	Initial revision.
	102
	103	*/
	104
	105
	106	#include "queryfilter.h"
	107	#include "fileutil.h"
[235]	108	#include "queryinfo.h"
[396]	109	#include "phrasesearch.h"
[990]	110	#include "gsdltools.h"
[396]	111	#include <assert.h>
[227]	112
	113
[235]	114	// some useful functions
	115
	116	// translate will return true if successful
	117	static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
	118	infodbclass info;
	119
	120	trans_OID.clear();
	121
	122	// get the info
	123	if (gdbmptr == NULL) return false;
	124	if (!gdbmptr->getinfo(docnum, info)) return false;
	125
	126	// translate
	127	if (info["section"].empty()) return false;
	128
	129	trans_OID = info["section"];
	130	return true;
	131	}
	132
	133
[351]	134	// whether document results are needed
	135	static bool need_matching_docs (int filterResultOptions) {
	136	return ((filterResultOptions & FROID) \|\| (filterResultOptions & FRranking) \|\|
	137	(filterResultOptions & FRmetadata));
	138	}
	139
	140	// whether term information is needed
	141	static bool need_term_info (int filterResultOptions) {
	142	return ((filterResultOptions & FRtermFreq) \|\| (filterResultOptions & FRmatchTerms));
	143	}
	144
	145	///////////////////////////////
	146	// methods for resultsorderer_t
	147	///////////////////////////////
	148
	149	resultsorderer_t::resultsorderer_t() {
	150	clear ();
	151	}
	152
	153	void resultsorderer_t::clear() {
	154	compare_phrase_match = false;
	155	compare_terms_match = false;
	156	compare_doc_weight = true;
	157
	158	docset = NULL;
	159	}
	160
	161	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
	162	if (docset == NULL) return t1>t2;
	163
	164	docresultmap::iterator t1_here = docset->find(t1);
	165	docresultmap::iterator t2_here = docset->find(t2);
	166	docresultmap::iterator end = docset->end();
	167
	168	// sort all the document numbers not in the document set to
	169	// the end of the list
	170	if (t1_here == end) {
	171	if (t2_here == end) return t1>t2;
	172	else return true;
	173	} else if (t2_here == end) return false;
	174
	175	if (compare_phrase_match) {
	176	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
	177	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
	178	}
	179
	180	if (compare_terms_match) {
	181	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
	182	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
	183	}
	184
	185	if (compare_doc_weight) {
	186	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
	187	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
	188	}
	189
	190	return t1>t2;
	191	}
	192
	193
	194
	195
	196	/////////////////////////////////
	197	// functions for queryfilterclass
	198	/////////////////////////////////
	199
[396]	200	// loads up phrases data structure with any phrases (that's the quoted bits)
	201	// occuring in the querystring
	202	void queryfilterclass::get_phrase_terms (const text_t &querystring,
	203	const termfreqclassarray &orgterms,
	204	vector<termfreqclassarray> &phrases) {
	205
	206	text_t::const_iterator here = querystring.begin();
	207	text_t::const_iterator end = querystring.end();
	208
	209	termfreqclassarray tmpterms;
	210
	211	int termcount = 0;
	212	bool foundquote = false;
	213	bool foundbreak = false;
	214	bool start = true;
	215	while (here != end) {
	216	if (*here == '\"') {
	217	if (foundquote) {
	218	if (!foundbreak && !start) {
	219	tmpterms.push_back (orgterms[termcount]);
	220	termcount ++;
	221	}
	222	if (tmpterms.size() > 1) {
	223	phrases.push_back (tmpterms);
	224	tmpterms.erase (tmpterms.begin(), tmpterms.end());
	225	}
	226	foundquote = false;
	227	foundbreak = true;
	228	} else foundquote = true;
	229	} else if (!is_unicode_letdig(*here)) {
	230	// found a break between terms
	231	if (!foundbreak && !start) {
	232	if (foundquote)
	233	tmpterms.push_back (orgterms[termcount]);
	234	termcount ++;
	235	}
	236	foundbreak = true;
	237	} else {
	238	start = false;
	239	foundbreak = false;
	240	}
	241	here++;
	242	}
	243	}
	244
[327]	245	// do aditional query processing
[396]	246	void queryfilterclass::post_process (const queryparamclass &queryparams,
	247	queryresultsclass &queryresults) {
	248
	249	// post-process the results if needed
	250	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
	251
	252	// get the terms between quotes (if any)
	253	vector<termfreqclassarray> phrases;
	254	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
	255
[766]	256	num_phrases = phrases.size();
	257	if (num_phrases > 0) {
[396]	258
	259	// get the long version of the index
	260	text_t longindex;
	261	indexmap.to2from (queryparams.index, longindex);
	262
	263	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
	264	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
	265
	266	while (this_phrase != end_phrase) {
	267
	268	// process each of the matched documents
	269	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	270	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	271	while (docs_here != docs_end) {
	272	if (OID_phrase_search (mgsearchptr, gdbmptr, queryparams.index,
	273	queryparams.subcollection, queryparams.language,
	274	longindex, queryparams.collection, *this_phrase,
	275	(*docs_here).second.docnum)) {
	276	(*docs_here).second.num_phrase_match++;
	277	}
	278
	279	docs_here++;
	280	}
	281	this_phrase++;
	282	}
	283	}
	284	}
[327]	285	}
[235]	286
[351]	287	// get the query parameters
	288	void queryfilterclass::parse_query_params (const FilterRequest_t &request,
	289	vector<queryparamclass> &query_params,
[766]	290	int &startresults, int &endresults,
	291	text_t &phrasematch, ostream &logout) {
[351]	292	outconvertclass text_t2ascii;
[327]	293
[351]	294	// set defaults for the return parameters
[358]	295	query_params.erase(query_params.begin(), query_params.end());
[351]	296	startresults = filterOptions["StartResults"].defaultValue.getint();
	297	endresults = filterOptions["EndResults"].defaultValue.getint();
[766]	298	phrasematch = filterOptions["PhraseMatch"].defaultValue;
[327]	299
[351]	300	// set defaults for query parameters
	301	queryparamclass query;
	302	query.combinequery = "or"; // first one must be "or"
	303	query.collection = collection;
	304	query.index = filterOptions["Index"].defaultValue;
	305	query.subcollection = filterOptions["Subcollection"].defaultValue;
	306	query.language = filterOptions["Language"].defaultValue;
	307	query.querystring.clear();
	308	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
[501]	309	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
[351]	310	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
	311	query.stemming = (filterOptions["Stem"].defaultValue == "true");
[613]	312	query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
[351]	313
	314	OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
	315	OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
	316	while (options_here != options_end) {
	317	if ((*options_here).name == "CombineQuery") {
	318	// add this query
	319
	320	// "all", needed when combining queries where the document results are needed
	321	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
	322	query_params.push_back (query);
	323
	324	// start on next query
	325	query.clear();
	326	query.combinequery = (*options_here).value;
	327
	328	// set defaults for query parameters
	329	query.collection = collection;
	330	query.index = filterOptions["Index"].defaultValue;
	331	query.subcollection = filterOptions["Subcollection"].defaultValue;
	332	query.language = filterOptions["Language"].defaultValue;
	333	query.querystring.clear();
	334	query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
[501]	335	query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
[351]	336	query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
	337	query.stemming = (filterOptions["Stem"].defaultValue == "true");
	338
	339	// "all", needed when combining queries where the document results are needed
	340	if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
[613]	341	else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
[351]	342
	343	} else if ((*options_here).name == "StartResults") {
	344	startresults = (*options_here).value.getint();
	345	} else if ((*options_here).name == "EndResults") {
	346	endresults = (*options_here).value.getint();
	347	} else if ((*options_here).name == "QueryType") {
	348	query.search_type = ((*options_here).value == "ranked");
[501]	349	} else if ((*options_here).name == "MatchMode") {
	350	query.match_mode = ((*options_here).value == "all");
	351	if (query.match_mode == 1) query.maxdocs = -1;
[351]	352	} else if ((*options_here).name == "Term") {
	353	query.querystring = (*options_here).value;
	354	} else if ((*options_here).name == "Casefold") {
	355	query.casefolding = ((*options_here).value == "true");
	356	} else if ((*options_here).name == "Stem") {
	357	query.stemming = ((*options_here).value == "true");
	358	} else if ((*options_here).name == "Index") {
	359	query.index = (*options_here).value;
	360	} else if ((*options_here).name == "Subcollection") {
	361	query.subcollection = (*options_here).value;
	362	} else if ((*options_here).name == "Language") {
	363	query.language = (*options_here).value;
[613]	364	} else if ((*options_here).name == "Maxdocs") {
	365	query.maxdocs = (*options_here).value.getint();
[766]	366	} else if ((*options_here).name == "PhraseMatch") {
	367	phrasematch = (*options_here).value;
[351]	368	} else {
	369	logout << text_t2ascii
	370	<< "warning: unknown queryfilter option \""
	371	<< (*options_here).name
	372	<< "\" ignored.\n\n";
	373	}
	374
	375	options_here++;
	376	}
	377
	378	// add the last query
	379	query_params.push_back (query);
	380	}
	381
	382
	383
	384	// do query that might involve multiple sub queries
	385	// mgsearchptr and gdbmptr are assumed to be valid
	386	void queryfilterclass::do_multi_query (const FilterRequest_t &request,
	387	const vector<queryparamclass> &query_params,
	388	queryresultsclass &multiresults,
	389	comerror_t &err, ostream &logout) {
	390	outconvertclass text_t2ascii;
	391
	392	err = noError;
	393	mgsearchptr->setcollectdir (collectdir);
	394	multiresults.clear();
	395
	396	vector<queryparamclass>::const_iterator query_here = query_params.begin();
	397	vector<queryparamclass>::const_iterator query_end = query_params.end();
	398	while (query_here != query_end) {
	399	queryresultsclass thisqueryresults;
	400
	401	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
	402	// most likely a system problem
	403	logout << text_t2ascii
	404	<< "system problem: could not do search with mg for index \""
	405	<< (query_here).index << (query_here).subcollection
	406	<< (*query_here).language << "\".\n\n";
	407	err = systemProblem;
	408	return;
	409	}
	410
	411	// combine the results
	412	if (need_matching_docs (request.filterResultOptions)) {
	413	// post-process the results if needed
	414	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	415	!thisqueryresults.docs.docset.empty()) {
	416	post_process (*query_here, thisqueryresults);
	417	thisqueryresults.postprocessed = true;
	418	multiresults.postprocessed = true;
	419	}
	420
	421	if (query_params.size() == 1) {
	422	multiresults.docs = thisqueryresults.docs; // just one set of results
	423	multiresults.docs_matched = thisqueryresults.docs_matched;
	424	multiresults.is_approx = thisqueryresults.is_approx;
	425
	426	} else {
	427	if ((*query_here).combinequery == "and") {
	428	multiresults.docs.combine_and (thisqueryresults.docs);
	429	} else if ((*query_here).combinequery == "or") {
	430	multiresults.docs.combine_or (thisqueryresults.docs);
	431	} else if ((*query_here).combinequery == "not") {
	432	multiresults.docs.combine_not (thisqueryresults.docs);
	433	}
	434	multiresults.docs_matched = multiresults.docs.docset.size();
[398]	435	multiresults.is_approx = Exact;
[351]	436	}
	437	}
	438
	439	// combine the term information
	440	if (need_term_info (request.filterResultOptions)) {
	441	// append the terms
	442	multiresults.orgterms.insert(multiresults.orgterms.end(),
	443	thisqueryresults.orgterms.begin(),
	444	thisqueryresults.orgterms.end());
	445
	446	// add the term variants
[358]	447	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
	448	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
	449	while (termvar_here != termvar_end) {
	450	multiresults.termvariants.insert(*termvar_here);
	451	termvar_here++;
	452	}
[351]	453	}
	454
	455	query_here++;
	456	}
	457
	458	// sort and unique the query terms
	459	multiresults.sortuniqqueryterms ();
	460	}
	461
	462
	463	void queryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
	464	docresultsclass &docs) {
	465	resultsorderer_t resultsorderer;
[396]	466	resultsorderer.compare_phrase_match = true;
[351]	467	resultsorderer.docset = &(docs.docset);
	468
	469	// first get a list of document numbers
	470	docs.docnum_order();
	471
	472	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
	473	}
	474
	475
	476
[227]	477	queryfilterclass::queryfilterclass () {
	478	gdbmptr = NULL;
	479	mgsearchptr = NULL;
[766]	480	num_phrases = 0;
[227]	481
[351]	482	FilterOption_t filtopt;
	483	filtopt.name = "CombineQuery";
	484	filtopt.type = FilterOption_t::enumeratedt;
	485	filtopt.repeatable = FilterOption_t::onePerQuery;
	486	filtopt.defaultValue = "and";
	487	filtopt.validValues.push_back("and");
	488	filtopt.validValues.push_back("or");
	489	filtopt.validValues.push_back("not");
	490	filterOptions["CombineQuery"] = filtopt;
	491
[227]	492	// -- onePerQuery StartResults integer
[351]	493	filtopt.clear();
[227]	494	filtopt.name = "StartResults";
	495	filtopt.type = FilterOption_t::integert;
	496	filtopt.repeatable = FilterOption_t::onePerQuery;
	497	filtopt.defaultValue = "1";
	498	filtopt.validValues.push_back("1");
	499	filtopt.validValues.push_back("1000");
	500	filterOptions["StartResults"] = filtopt;
	501
	502	// -- onePerQuery EndResults integer
	503	filtopt.clear();
	504	filtopt.name = "EndResults";
	505	filtopt.type = FilterOption_t::integert;
	506	filtopt.repeatable = FilterOption_t::onePerQuery;
	507	filtopt.defaultValue = "10";
[621]	508	filtopt.validValues.push_back("-1");
[227]	509	filtopt.validValues.push_back("1000");
	510	filterOptions["EndResults"] = filtopt;
	511
	512	// -- onePerQuery QueryType enumerated (boolean, ranked)
	513	filtopt.clear();
	514	filtopt.name = "QueryType";
	515	filtopt.type = FilterOption_t::enumeratedt;
	516	filtopt.repeatable = FilterOption_t::onePerQuery;
	517	filtopt.defaultValue = "ranked";
	518	filtopt.validValues.push_back("boolean");
	519	filtopt.validValues.push_back("ranked");
	520	filterOptions["QueryType"] = filtopt;
	521
[501]	522	// -- onePerQuery MatchMode enumerated (some, all)
	523	filtopt.clear();
	524	filtopt.name = "MatchMode";
	525	filtopt.type = FilterOption_t::enumeratedt;
	526	filtopt.repeatable = FilterOption_t::onePerQuery;
	527	filtopt.defaultValue = "some";
	528	filtopt.validValues.push_back("some");
	529	filtopt.validValues.push_back("all");
[613]	530	filterOptions["MatchMode"] = filtopt;
[501]	531
[227]	532	// -- onePerTerm Term string ???
	533	filtopt.clear();
	534	filtopt.name = "Term";
	535	filtopt.type = FilterOption_t::stringt;
	536	filtopt.repeatable = FilterOption_t::onePerTerm;
	537	filtopt.defaultValue = "";
	538	filterOptions["Term"] = filtopt;
	539
	540	// -- onePerTerm Casefold boolean
	541	filtopt.clear();
	542	filtopt.name = "Casefold";
	543	filtopt.type = FilterOption_t::booleant;
	544	filtopt.repeatable = FilterOption_t::onePerTerm;
	545	filtopt.defaultValue = "true";
	546	filtopt.validValues.push_back("false");
	547	filtopt.validValues.push_back("true");
	548	filterOptions["Casefold"] = filtopt;
	549
	550	// -- onePerTerm Stem boolean
	551	filtopt.clear();
	552	filtopt.name = "Stem";
	553	filtopt.type = FilterOption_t::booleant;
	554	filtopt.repeatable = FilterOption_t::onePerTerm;
	555	filtopt.defaultValue = "false";
	556	filtopt.validValues.push_back("false");
	557	filtopt.validValues.push_back("true");
	558	filterOptions["Stem"] = filtopt;
	559
	560	// -- onePerTerm Index enumerated
	561	filtopt.clear();
	562	filtopt.name = "Index";
	563	filtopt.type = FilterOption_t::enumeratedt;
	564	filtopt.repeatable = FilterOption_t::onePerTerm;
	565	filtopt.defaultValue = "";
	566	filterOptions["Index"] = filtopt;
	567
	568	// -- onePerTerm Subcollection enumerated
	569	filtopt.clear();
	570	filtopt.name = "Subcollection";
	571	filtopt.type = FilterOption_t::enumeratedt;
	572	filtopt.repeatable = FilterOption_t::onePerTerm;
	573	filtopt.defaultValue = "";
	574	filterOptions["Subcollection"] = filtopt;
	575
	576	// -- onePerTerm Language enumerated
	577	filtopt.clear();
	578	filtopt.name = "Language";
	579	filtopt.type = FilterOption_t::enumeratedt;
	580	filtopt.repeatable = FilterOption_t::onePerTerm;
	581	filtopt.defaultValue = "";
	582	filterOptions["Language"] = filtopt;
[613]	583
	584	// -- onePerQuery Maxdocs integer
	585	filtopt.clear();
	586	filtopt.name = "Maxdocs";
	587	filtopt.type = FilterOption_t::integert;
	588	filtopt.repeatable = FilterOption_t::onePerQuery;
	589	filtopt.defaultValue = "200";
	590	filtopt.validValues.push_back("-1");
	591	filtopt.validValues.push_back("1000");
	592	filterOptions["Maxdocs"] = filtopt;
[766]	593
	594	// -- onePerQuery PhraseMatch enumerated
	595	filtopt.clear();
	596	filtopt.name = "PhraseMatch";
	597	filtopt.type = FilterOption_t::enumeratedt;
	598	filtopt.repeatable = FilterOption_t::onePerQuery;
	599	filtopt.defaultValue = "some_phrases";
	600	filtopt.validValues.push_back ("all_phrases");
	601	filtopt.validValues.push_back ("some_phrases");
	602	filtopt.validValues.push_back ("all_docs");
	603	filterOptions["PhraseMatch"] = filtopt;
[227]	604	}
	605
	606	queryfilterclass::~queryfilterclass () {
	607	}
	608
	609	void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
	610	filterclass::configure (key, cfgline);
	611
	612	if (key == "indexmap") {
	613	indexmap.importmap (cfgline);
	614
	615	// update the list of indexes in the filter information
	616	text_tarray options;
	617	indexmap.gettoarray (options);
	618	filterOptions["Index"].validValues = options;
	619
[302]	620	} else if (key == "defaultindex") {
	621	indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
	622
[227]	623	} else if (key == "subcollectionmap") {
	624	subcollectionmap.importmap (cfgline);
	625
	626	// update the list of subcollections in the filter information
	627	text_tarray options;
	628	subcollectionmap.gettoarray (options);
	629	filterOptions["Subcollection"].validValues = options;
	630
[302]	631	} else if (key == "defaultsubcollection") {
	632	subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
	633
[227]	634	} else if (key == "languagemap") {
	635	languagemap.importmap (cfgline);
	636
	637	// update the list of languages in the filter information
	638	text_tarray options;
	639	languagemap.gettoarray (options);
	640	filterOptions["Language"].validValues = options;
[302]	641
	642	} else if (key == "defaultlanguage")
	643	languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
[227]	644	}
	645
	646	bool queryfilterclass::init (ostream &logout) {
	647	outconvertclass text_t2ascii;
	648
	649	if (!filterclass::init(logout)) return false;
	650
	651	// get the filename for the database and make sure it exists
[534]	652	gdbm_filename = filename_cat(collectdir,"index","text",collection);
[396]	653
[990]	654	if (littleEndian()) gdbm_filename += ".ldb";
	655	else gdbm_filename += ".bdb";
	656
[227]	657	if (!file_exists(gdbm_filename)) {
	658	logout << text_t2ascii
[722]	659	<< "warning: gdbm database \"" //****
[227]	660	<< gdbm_filename << "\" does not exist\n\n";
[722]	661	//return false; //****
[227]	662	}
	663
	664	return true;
	665	}
	666
[235]	667	void queryfilterclass::filter (const FilterRequest_t &request,
[273]	668	FilterResponse_t &response,
	669	comerror_t &err, ostream &logout) {
[235]	670	outconvertclass text_t2ascii;
	671
	672	response.clear ();
[227]	673	err = noError;
[235]	674	if (gdbmptr == NULL) {
	675	// most likely a configuration problem
	676	logout << text_t2ascii
	677	<< "configuration error: queryfilter contains a null gdbmclass\n\n";
	678	err = configurationError;
	679	return;
	680	}
	681	if (mgsearchptr == NULL) {
	682	// most likely a configuration problem
	683	logout << text_t2ascii
	684	<< "configuration error: queryfilter contains a null mgsearchclass\n\n";
	685	err = configurationError;
	686	return;
	687	}
	688
	689	// open the database
	690	gdbmptr->setlogout(&logout);
[501]	691	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
[235]	692	// most likely a system problem (we have already checked that the
	693	// gdbm database exists)
	694	logout << text_t2ascii
	695	<< "system problem: open on gdbm database \""
	696	<< gdbm_filename << "\" failed\n\n";
	697	err = systemProblem;
	698	return;
	699	}
	700
	701	// get the query parameters
	702	int startresults = filterOptions["StartResults"].defaultValue.getint();
	703	int endresults = filterOptions["EndResults"].defaultValue.getint();
[766]	704	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
	705
[351]	706	vector<queryparamclass> queryfilterparams;
[766]	707	parse_query_params (request, queryfilterparams, startresults,
	708	endresults, phrasematch, logout);
[351]	709
[235]	710	// do query
	711	queryresultsclass queryresults;
[351]	712	do_multi_query (request, queryfilterparams, queryresults, err, logout);
	713	if (err != noError) return;
	714
[235]	715	// assemble document results
[351]	716	if (need_matching_docs (request.filterResultOptions)) {
	717	// sort the query results
	718	sort_doc_results (request, queryresults.docs);
	719
[235]	720	int resultnum = 1;
	721	ResultDocInfo_t resultdoc;
	722	text_t trans_OID;
[351]	723	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
	724	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
	725
[621]	726	if (endresults == -1) endresults = MAXNUMDOCS;
[351]	727	while (docorder_here != docorder_end) {
[235]	728	if (resultnum > endresults) break;
	729
	730	// translate the document number
[351]	731	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
[235]	732	logout << text_t2ascii
	733	<< "warning: could not translate mg document number \""
[351]	734	<< *docorder_here << "\"to OID.\n\n";
[235]	735
	736	} else {
[351]	737	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	738
[766]	739	// documents containing matching phrases will be sorted to the top so
	740	// we can break out once we're past those that match the PhraseMatch
	741	// option -- "all_phrases" = return only those documents containing all
	742	// phrases in query string
	743	// "some_phrases" = return only those documents containing
	744	// at least 1 of the phrases in the document
	745	// "all_docs" = return all documents regardless
	746	if (num_phrases > 0) {
[787]	747	if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
	748	queryresults.docs_matched = response.docInfo.size();
[766]	749	break;
[787]	750	}
	751	if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
	752	queryresults.docs_matched = response.docInfo.size();
[766]	753	break;
[787]	754	}
[766]	755	}
	756
[351]	757	// see if there is a result for this number,
	758	// if it is in the request set (or the request set is empty)
	759	if (docset_here != queryresults.docs.docset.end() &&
	760	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
[235]	761	if (resultnum >= startresults) {
	762	// add this document
	763	resultdoc.OID = trans_OID;
[355]	764	resultdoc.result_num = resultnum;
[351]	765	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
[311]	766
	767	// these next two are not available on all versions of mg
[351]	768	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	769	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
[311]	770
[235]	771	response.docInfo.push_back (resultdoc);
	772	}
	773
	774	resultnum++;
	775	}
	776	}
	777
[351]	778	docorder_here++;
[235]	779	}
	780	}
	781
	782	// assemble the term results
[351]	783	if (need_term_info(request.filterResultOptions)) {
[319]	784	// note: the terms have already been sorted and uniqued
[235]	785
	786	TermInfo_t terminfo;
	787	bool terms_first = true;
[396]	788	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
	789	termfreqclassarray::iterator terms_end = queryresults.terms.end();
[235]	790
	791	while (terms_here != terms_end) {
	792	terminfo.clear();
	793	terminfo.term = (*terms_here).termstr;
	794	terminfo.freq = (*terms_here).termfreq;
[351]	795	if (terms_first) {
	796	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	797	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	798	while (termvariants_here != termvariants_end) {
	799	terminfo.matchTerms.push_back (*termvariants_here);
	800	termvariants_here++;
	801	}
	802	}
[235]	803	terms_first = false;
	804
	805	response.termInfo.push_back (terminfo);
	806
	807	terms_here++;
	808	}
	809	}
	810
[334]	811	response.numDocs = queryresults.docs_matched;
	812	response.isApprox = queryresults.is_approx;
[227]	813	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: