Context Navigation

source: trunk/gsdl/src/colservr/mgqueryfilter.cpp@ 2134

Last change on this file since 2134 was 2134, checked in by sjboddie, 23 years ago

mg phrase searching now always sets maxdocs to -1 (all) - this means that
a phrase search is guaranteed always to hit any document that contains the
phrase but also means that bad phrase searches (like "and the") will take
a very long time, especially on a large collection.

also added a bit of a hack to handle program number indexes for various
bbc collections.

Property svn:keywords set to Author Date Id Revision

File size: 17.1 KB

Rev	Line
[1324]	1	/**********************************************************************
	2	*
	3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
	24	*********************************************************************/
	25
	26	#include "mgqueryfilter.h"
	27	#include "fileutil.h"
	28	#include "phrasesearch.h"
	29	#include <assert.h>
	30	#include "mgsearch.h"
	31
	32	///////////////////////////////
	33	// methods for resultsorderer_t
	34	///////////////////////////////
	35
	36	resultsorderer_t::resultsorderer_t() {
	37	clear ();
	38	}
	39
	40	void resultsorderer_t::clear() {
	41	compare_phrase_match = false;
	42	compare_terms_match = false;
	43	compare_doc_weight = true;
	44
	45	docset = NULL;
	46	}
	47
	48	bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
	49	if (docset == NULL) return t1>t2;
	50
	51	docresultmap::iterator t1_here = docset->find(t1);
	52	docresultmap::iterator t2_here = docset->find(t2);
	53	docresultmap::iterator end = docset->end();
	54
	55	// sort all the document numbers not in the document set to
	56	// the end of the list
	57	if (t1_here == end) {
	58	if (t2_here == end) return t1>t2;
	59	else return true;
	60	} else if (t2_here == end) return false;
	61
	62	if (compare_phrase_match) {
	63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
	64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
	65	}
	66
	67	if (compare_terms_match) {
	68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
	69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
	70	}
	71
	72	if (compare_doc_weight) {
	73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
	74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
	75	}
	76
	77	return t1>t2;
	78	}
	79
	80
	81
	82
	83	/////////////////////////////////
	84	// functions for mgqueryfilterclass
	85	/////////////////////////////////
	86
	87	// loads up phrases data structure with any phrases (that's the quoted bits)
	88	// occuring in the querystring
	89	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
	90	const termfreqclassarray &orgterms,
	91	vector<termfreqclassarray> &phrases) {
	92
	93	text_t::const_iterator here = querystring.begin();
	94	text_t::const_iterator end = querystring.end();
	95
	96	termfreqclassarray tmpterms;
	97
	98	int termcount = 0;
	99	bool foundquote = false;
	100	bool foundbreak = false;
	101	bool start = true;
	102	while (here != end) {
	103	if (*here == '\"') {
	104	if (foundquote) {
	105	if (!foundbreak && !start) {
	106	tmpterms.push_back (orgterms[termcount]);
	107	termcount ++;
	108	}
	109	if (tmpterms.size() > 1) {
	110	phrases.push_back (tmpterms);
	111	tmpterms.erase (tmpterms.begin(), tmpterms.end());
	112	}
	113	foundquote = false;
	114	foundbreak = true;
	115	} else foundquote = true;
	116	} else if (!is_unicode_letdig(*here)) {
	117	// found a break between terms
	118	if (!foundbreak && !start) {
	119	if (foundquote)
	120	tmpterms.push_back (orgterms[termcount]);
	121	termcount ++;
	122	}
	123	foundbreak = true;
	124	} else {
	125	start = false;
	126	foundbreak = false;
	127	}
	128	here++;
	129	}
	130	}
	131
	132	// do aditional query processing
	133	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
	134	queryresultsclass &queryresults) {
	135
	136	// post-process the results if needed
	137	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
	138
	139	// get the terms between quotes (if any)
	140	vector<termfreqclassarray> phrases;
	141	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
	142
	143	num_phrases = phrases.size();
	144	if (num_phrases > 0) {
	145
	146	// get the long version of the index
	147	text_t longindex;
	148	indexmap.to2from (queryparams.index, longindex);
	149
	150	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
	151	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
	152
	153	while (this_phrase != end_phrase) {
	154
	155	// process each of the matched documents
	156	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	157	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	158	while (docs_here != docs_end) {
	159	if (OID_phrase_search (((mgsearchclass)mgsearchptr), *gdbmptr, queryparams.index,
	160	queryparams.subcollection, queryparams.language,
	161	longindex, queryparams.collection, *this_phrase,
	162	(*docs_here).second.docnum)) {
	163	(*docs_here).second.num_phrase_match++;
	164	}
	165
	166	docs_here++;
	167	}
	168	this_phrase++;
	169	}
	170	}
	171	}
	172	}
	173
	174
	175	// do query that might involve multiple sub queries
	176	// mgsearchptr and gdbmptr are assumed to be valid
	177	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
	178	const vector<queryparamclass> &query_params,
	179	queryresultsclass &multiresults,
	180	comerror_t &err, ostream &logout) {
	181	outconvertclass text_t2ascii;
	182
	183	err = noError;
	184	mgsearchptr->setcollectdir (collectdir);
	185	multiresults.clear();
	186
	187	vector<queryparamclass>::const_iterator query_here = query_params.begin();
	188	vector<queryparamclass>::const_iterator query_end = query_params.end();
	189	while (query_here != query_end) {
	190	queryresultsclass thisqueryresults;
[1662]	191
[1324]	192	if (!mgsearchptr->search(*query_here, thisqueryresults)) {
	193	// most likely a system problem
	194	logout << text_t2ascii
	195	<< "system problem: could not do search with mg for index \""
	196	<< (query_here).index << (query_here).subcollection
	197	<< (*query_here).language << "\".\n\n";
	198	err = systemProblem;
	199	return;
	200	}
	201
	202	// combine the results
	203	if (need_matching_docs (request.filterResultOptions)) {
	204	// post-process the results if needed
	205	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	206	!thisqueryresults.docs.docset.empty()) {
	207	post_process (*query_here, thisqueryresults);
	208	thisqueryresults.postprocessed = true;
	209	multiresults.postprocessed = true;
[1721]	210	} else {
	211	num_phrases = 0;
[1324]	212	}
	213
	214	if (query_params.size() == 1) {
	215	multiresults.docs = thisqueryresults.docs; // just one set of results
	216	multiresults.docs_matched = thisqueryresults.docs_matched;
	217	multiresults.is_approx = thisqueryresults.is_approx;
	218
	219	} else {
	220	if ((*query_here).combinequery == "and") {
	221	multiresults.docs.combine_and (thisqueryresults.docs);
	222	} else if ((*query_here).combinequery == "or") {
	223	multiresults.docs.combine_or (thisqueryresults.docs);
	224	} else if ((*query_here).combinequery == "not") {
	225	multiresults.docs.combine_not (thisqueryresults.docs);
	226	}
	227	multiresults.docs_matched = multiresults.docs.docset.size();
	228	multiresults.is_approx = Exact;
	229	}
	230	}
	231
	232	// combine the term information
	233	if (need_term_info (request.filterResultOptions)) {
	234	// append the terms
	235	multiresults.orgterms.insert(multiresults.orgterms.end(),
	236	thisqueryresults.orgterms.begin(),
	237	thisqueryresults.orgterms.end());
	238
	239	// add the term variants
	240	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
	241	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
	242	while (termvar_here != termvar_end) {
	243	multiresults.termvariants.insert(*termvar_here);
	244	termvar_here++;
	245	}
	246	}
	247
	248	query_here++;
	249	}
	250
	251	// sort and unique the query terms
	252	multiresults.sortuniqqueryterms ();
	253	}
	254
	255
	256	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
	257	docresultsclass &docs) {
	258	resultsorderer_t resultsorderer;
	259	resultsorderer.compare_phrase_match = true;
	260	resultsorderer.docset = &(docs.docset);
	261
	262	// first get a list of document numbers
	263	docs.docnum_order();
	264
	265	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
	266	}
	267
	268
	269
	270	mgqueryfilterclass::mgqueryfilterclass ()
	271	:queryfilterclass() {
	272
	273	num_phrases = 0;
	274
	275	}
	276
	277	mgqueryfilterclass::~mgqueryfilterclass () {
	278	}
	279
	280	void mgqueryfilterclass::filter (const FilterRequest_t &request,
	281	FilterResponse_t &response,
	282	comerror_t &err, ostream &logout) {
	283	outconvertclass text_t2ascii;
	284
	285	response.clear ();
	286	err = noError;
	287	if (gdbmptr == NULL) {
	288	// most likely a configuration problem
	289	logout << text_t2ascii
	290	<< "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
	291	err = configurationError;
	292	return;
	293	}
	294	if (mgsearchptr == NULL) {
	295	// most likely a configuration problem
	296	logout << text_t2ascii
	297	<< "configuration error: mgqueryfilter contains a null mgsearchclass\n\n";
	298	err = configurationError;
	299	return;
	300	}
	301
	302	// open the database
	303	gdbmptr->setlogout(&logout);
	304	if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
	305	// most likely a system problem (we have already checked that the
	306	// gdbm database exists)
	307	logout << text_t2ascii
	308	<< "system problem: open on gdbm database \""
	309	<< gdbm_filename << "\" failed\n\n";
	310	err = systemProblem;
	311	return;
	312	}
	313
	314	// get the query parameters
	315	int startresults = filterOptions["StartResults"].defaultValue.getint();
	316	int endresults = filterOptions["EndResults"].defaultValue.getint();
	317	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
	318
	319	vector<queryparamclass> queryfilterparams;
	320	parse_query_params (request, queryfilterparams, startresults,
[1662]	321	endresults, phrasematch, logout);
	322	// do any mg specific diddling with query parameters that may be required
	323	mg_parse_query_params (request, queryfilterparams, startresults,
	324	endresults, phrasematch, logout);
	325
	326
[1324]	327	// do query
	328	queryresultsclass queryresults;
	329	do_multi_query (request, queryfilterparams, queryresults, err, logout);
	330	if (err != noError) return;
	331
	332	// assemble document results
	333	if (need_matching_docs (request.filterResultOptions)) {
	334	// sort the query results
	335	sort_doc_results (request, queryresults.docs);
	336
	337	int resultnum = 1;
	338	ResultDocInfo_t resultdoc;
	339	text_t trans_OID;
	340	vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
	341	vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
	342
[1662]	343	// documents containing matching phrases will be sorted to the top so
	344	// we can break out once we're past those that match the PhraseMatch
	345	// option -- "all_phrases" = return only those documents containing all
	346	// phrases in query string
	347	// "some_phrases" = return only those documents containing
	348	// at least 1 of the phrases in the document
	349	// "all_docs" = return all documents regardless
	350	if (num_phrases > 0) {
	351	int numdocs = 0;
	352	while (docorder_here != docorder_end) {
	353	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	354
	355	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
	356	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
	357	queryresults.docs_matched = numdocs;
	358	break;
	359	}
	360	numdocs ++;
	361	docorder_here ++;
	362	}
	363	}
	364
[1324]	365	if (endresults == -1) endresults = MAXNUMDOCS;
[1662]	366	docorder_here = queryresults.docs.docorder.begin();
[1324]	367	while (docorder_here != docorder_end) {
[1662]	368	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
[1324]	369
	370	// translate the document number
	371	if (!translate(gdbmptr, *docorder_here, trans_OID)) {
	372	logout << text_t2ascii
	373	<< "warning: could not translate mg document number \""
	374	<< *docorder_here << "\"to OID.\n\n";
	375
	376	} else {
	377	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	378
	379	// see if there is a result for this number,
	380	// if it is in the request set (or the request set is empty)
	381	if (docset_here != queryresults.docs.docset.end() &&
	382	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
	383	if (resultnum >= startresults) {
	384	// add this document
	385	resultdoc.OID = trans_OID;
	386	resultdoc.result_num = resultnum;
	387	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
	388
	389	// these next two are not available on all versions of mg
	390	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	391	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
	392
	393	response.docInfo.push_back (resultdoc);
	394	}
	395
	396	resultnum++;
	397	}
	398	}
	399
	400	docorder_here++;
	401	}
	402	}
	403
	404	// assemble the term results
	405	if (need_term_info(request.filterResultOptions)) {
	406	// note: the terms have already been sorted and uniqued
	407
	408	TermInfo_t terminfo;
	409	bool terms_first = true;
	410	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
	411	termfreqclassarray::iterator terms_end = queryresults.terms.end();
	412
	413	while (terms_here != terms_end) {
	414	terminfo.clear();
	415	terminfo.term = (*terms_here).termstr;
	416	terminfo.freq = (*terms_here).termfreq;
	417	if (terms_first) {
	418	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	419	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	420	while (termvariants_here != termvariants_end) {
	421	terminfo.matchTerms.push_back (*termvariants_here);
	422	termvariants_here++;
	423	}
	424	}
	425	terms_first = false;
	426
	427	response.termInfo.push_back (terminfo);
	428
	429	terms_here++;
	430	}
	431	}
	432
	433	response.numDocs = queryresults.docs_matched;
	434	response.isApprox = queryresults.is_approx;
	435	}
	436
[1662]	437	void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/request/,
	438	vector<queryparamclass> &query_params,
	439	int &/startresults/, int &/endresults/,
	440	text_t &/phrasematch/, ostream &/logout/) {
	441
	442	// outconvertclass text_t2ascii;
	443
	444	vector<queryparamclass>::iterator query_here = query_params.begin();
	445	vector<queryparamclass>::iterator query_end = query_params.end();
	446	while (query_here != query_end) {
	447
[2134]	448	// if we're doing a phrase search we want to maximise hits by making it
	449	// a boolean search on the index with the finest granularity - we'll
	450	// also set maxdocs to "all" (realizing that this will cause searches
	451	// like "and the" on a large collection to take a very very long time).
	452
[1662]	453	// we're deciding it's a phrase search based on if the querystring
	454	// contains at least 2 double quotes (not very scientific but
	455	// then neither is the rest of the mg phrase searching functionality :-)
	456	if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
	457	(*query_here).search_type = 0;
	458
[2134]	459	// set maxdocs to "all"
	460	(*query_here).maxdocs = -1;
	461
[1662]	462	// Get the long version of the index and test to see if any indexes with
	463	// finer granularity exist. Indexes must be the same type (i.e. same metadata
	464	// or "text").
	465	text_t longindex; text_tarray splitindex;
	466	indexmap.to2from ((*query_here).index, longindex);
	467	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
	468	text_t &granularity = splitindex[0];
	469	text_t &indextype = splitindex[1];
	470	bool found = false;
	471	// currently supported granularity options are "document", "section" and "paragraph"
	472	if (granularity == "document" \|\| granularity == "section") {
	473	text_t shortindex;
	474	if (indexmap.fromexists ("paragraph:" + indextype)) {
	475	indexmap.from2to ("paragraph:" + indextype, shortindex);
	476	(*query_here).index = shortindex;
	477	found = true;
	478	}
	479	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	480	indexmap.from2to ("section:" + indextype, shortindex);
	481	(*query_here).index = shortindex;
	482	}
	483	}
	484	}
	485
[2134]	486	#ifdef GSDL_BBC_COLLECTION
	487	// This is a special hack for the BBC collection's ProgNumber index
	488
	489	// if we're searching a ProgNumber index we want to:
	490	// 1. Remove all non-alphanumeric characters from the query string
	491	// 2. Make it a boolean search
	492	// 3. Turn off case-folding
	493	text_t longindex; text_tarray splitindex;
	494	indexmap.to2from ((*query_here).index, longindex);
	495	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
	496	text_t &indextype = splitindex[1];
	497	if (indextype == "ProgNumber") {
	498	(*query_here).search_type = 0;
	499	(*query_here).casefolding = 0;
	500	text_t new_querystring;
	501	text_t::const_iterator here = (*query_here).querystring.begin();
	502	text_t::const_iterator end = (*query_here).querystring.end();
	503	while (here != end) {
	504	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
	505	(here >= '0' && here <= '9')) {
	506	new_querystring.push_back (*here);
	507	}
	508	here ++;
	509	}
	510	(*query_here).querystring = new_querystring;
	511	}
	512	#endif
	513
[1662]	514	query_here ++;
	515	}
	516	}
	517

Note: See TracBrowser for help on using the repository browser.

Download in other formats: