Context Navigation

mgqueryfilter.cpp@ 28762

Last change on this file since 28762 was 27064, checked in by kjdon, 11 years ago
adding reverse sort/sort order in for lucene search results sorting. reorganising code to avoid duplication, added fieldedqueryfilter in the chain of inheritance
Property svn:keywords set to `Author Date Id Revision`
File size: 18.8 KB

Rev	Line
[1324]	1	/**********************************************************************
	2	*
	3	* mgqueryfilter.cpp -- implementation of queryfilter for old mg
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
	24	*********************************************************************/
	25
	26	#include "mgqueryfilter.h"
	27	#include "fileutil.h"
	28	#include "phrasesearch.h"
	29	#include "mgsearch.h"
[11002]	30	#include "phrases.h"
[1324]	31
	32	///////////////////////////////
	33	// methods for resultsorderer_t
	34	///////////////////////////////
	35
	36	resultsorderer_t::resultsorderer_t() {
	37	clear ();
	38	}
	39
	40	void resultsorderer_t::clear() {
	41	compare_phrase_match = false;
	42	compare_terms_match = false;
	43	compare_doc_weight = true;
	44
	45	docset = NULL;
	46	}
	47
[16445]	48	bool resultsorderer_t::operator()(const text_t &t1, const text_t &t2) const {
[1324]	49	if (docset == NULL) return t1>t2;
	50
	51	docresultmap::iterator t1_here = docset->find(t1);
	52	docresultmap::iterator t2_here = docset->find(t2);
	53	docresultmap::iterator end = docset->end();
	54
	55	// sort all the document numbers not in the document set to
	56	// the end of the list
	57	if (t1_here == end) {
	58	if (t2_here == end) return t1>t2;
	59	else return true;
	60	} else if (t2_here == end) return false;
	61
	62	if (compare_phrase_match) {
	63	if ((t1_here).second.num_phrase_match > (t2_here).second.num_phrase_match) return true;
	64	if ((t1_here).second.num_phrase_match < (t2_here).second.num_phrase_match) return false;
	65	}
	66
	67	if (compare_terms_match) {
	68	if ((t1_here).second.num_query_terms_matched > (t2_here).second.num_query_terms_matched) return true;
	69	if ((t1_here).second.num_query_terms_matched < (t2_here).second.num_query_terms_matched) return false;
	70	}
	71
	72	if (compare_doc_weight) {
	73	if ((t1_here).second.docweight > (t2_here).second.docweight) return true;
	74	if ((t1_here).second.docweight < (t2_here).second.docweight) return false;
	75	}
	76
	77	return t1>t2;
	78	}
	79
	80
	81
	82
	83	/////////////////////////////////
	84	// functions for mgqueryfilterclass
	85	/////////////////////////////////
	86
[4193]	87
	88	void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
	89	queryfilterclass::configure (key, cfgline);
	90
[12314]	91	if (key == "indexstem") {
[9937]	92	((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
	93	}
	94
[4193]	95	}
	96
[1324]	97	// loads up phrases data structure with any phrases (that's the quoted bits)
	98	// occuring in the querystring
	99	void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
	100	const termfreqclassarray &orgterms,
	101	vector<termfreqclassarray> &phrases) {
	102
	103	text_t::const_iterator here = querystring.begin();
	104	text_t::const_iterator end = querystring.end();
	105
	106	termfreqclassarray tmpterms;
	107
	108	int termcount = 0;
	109	bool foundquote = false;
	110	bool foundbreak = false;
	111	bool start = true;
	112	while (here != end) {
	113	if (*here == '\"') {
	114	if (foundquote) {
	115	if (!foundbreak && !start) {
	116	tmpterms.push_back (orgterms[termcount]);
[9620]	117	++termcount;
[1324]	118	}
	119	if (tmpterms.size() > 1) {
	120	phrases.push_back (tmpterms);
	121	}
[11002]	122	tmpterms.erase (tmpterms.begin(), tmpterms.end());
	123
[1324]	124	foundquote = false;
	125	foundbreak = true;
	126	} else foundquote = true;
	127	} else if (!is_unicode_letdig(*here)) {
	128	// found a break between terms
	129	if (!foundbreak && !start) {
[11002]	130	if (foundquote) {
[1324]	131	tmpterms.push_back (orgterms[termcount]);
[11002]	132	}
[9620]	133	++termcount;
[1324]	134	}
	135	foundbreak = true;
	136	} else {
	137	start = false;
	138	foundbreak = false;
	139	}
[9620]	140	++here;
[1324]	141	}
	142	}
	143
	144	// do aditional query processing
	145	void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
	146	queryresultsclass &queryresults) {
	147
	148	// post-process the results if needed
	149	if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
	150
	151	// get the terms between quotes (if any)
	152	vector<termfreqclassarray> phrases;
	153	get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
	154
	155	num_phrases = phrases.size();
	156	if (num_phrases > 0) {
	157
	158	// get the long version of the index
	159	text_t longindex;
	160	indexmap.to2from (queryparams.index, longindex);
	161
	162	vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
	163	vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
	164
	165	while (this_phrase != end_phrase) {
	166
	167	// process each of the matched documents
	168	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	169	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	170	while (docs_here != docs_end) {
[15558]	171	if (OID_phrase_search (((mgsearchclass)textsearchptr), *db_ptr, queryparams.index,
[1324]	172	queryparams.subcollection, queryparams.language,
	173	longindex, queryparams.collection, *this_phrase,
	174	(*docs_here).second.docnum)) {
[9620]	175	++docs_here->second.num_phrase_match;
[1324]	176	}
	177
[9620]	178	++docs_here;
[1324]	179	}
[9620]	180	++this_phrase;
[1324]	181	}
	182	}
	183	}
	184	}
	185
	186
	187	// do query that might involve multiple sub queries
[15595]	188	// textsearchptr and db_ptr are assumed to be valid
[1324]	189	void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
	190	const vector<queryparamclass> &query_params,
	191	queryresultsclass &multiresults,
	192	comerror_t &err, ostream &logout) {
	193	outconvertclass text_t2ascii;
	194
	195	err = noError;
[8026]	196	textsearchptr->setcollectdir (collectdir);
[16310]	197
[1324]	198	multiresults.clear();
	199
	200	vector<queryparamclass>::const_iterator query_here = query_params.begin();
	201	vector<queryparamclass>::const_iterator query_end = query_params.end();
	202	while (query_here != query_end) {
	203	queryresultsclass thisqueryresults;
[1662]	204
[8026]	205	if (!textsearchptr->search(*query_here, thisqueryresults)) {
[1324]	206	// most likely a system problem
	207	logout << text_t2ascii
	208	<< "system problem: could not do search with mg for index \""
	209	<< (query_here).index << (query_here).subcollection
	210	<< (*query_here).language << "\".\n\n";
	211	err = systemProblem;
	212	return;
	213	}
	214
	215	// combine the results
	216	if (need_matching_docs (request.filterResultOptions)) {
	217	// post-process the results if needed
	218	if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	219	!thisqueryresults.docs.docset.empty()) {
	220	post_process (*query_here, thisqueryresults);
	221	thisqueryresults.postprocessed = true;
	222	multiresults.postprocessed = true;
[1721]	223	} else {
	224	num_phrases = 0;
[1324]	225	}
	226
	227	if (query_params.size() == 1) {
	228	multiresults.docs = thisqueryresults.docs; // just one set of results
	229	multiresults.docs_matched = thisqueryresults.docs_matched;
	230	multiresults.is_approx = thisqueryresults.is_approx;
	231
	232	} else {
	233	if ((*query_here).combinequery == "and") {
	234	multiresults.docs.combine_and (thisqueryresults.docs);
	235	} else if ((*query_here).combinequery == "or") {
	236	multiresults.docs.combine_or (thisqueryresults.docs);
	237	} else if ((*query_here).combinequery == "not") {
	238	multiresults.docs.combine_not (thisqueryresults.docs);
	239	}
	240	multiresults.docs_matched = multiresults.docs.docset.size();
	241	multiresults.is_approx = Exact;
	242	}
	243	}
	244
	245	// combine the term information
	246	if (need_term_info (request.filterResultOptions)) {
	247	// append the terms
	248	multiresults.orgterms.insert(multiresults.orgterms.end(),
	249	thisqueryresults.orgterms.begin(),
	250	thisqueryresults.orgterms.end());
	251
	252	// add the term variants
	253	text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
	254	text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
	255	while (termvar_here != termvar_end) {
	256	multiresults.termvariants.insert(*termvar_here);
[9620]	257	++termvar_here;
[1324]	258	}
	259	}
	260
[9620]	261	++query_here;
[1324]	262	}
	263
	264	// sort and unique the query terms
	265	multiresults.sortuniqqueryterms ();
	266	}
	267
	268
	269	void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/request/,
	270	docresultsclass &docs) {
	271	resultsorderer_t resultsorderer;
	272	resultsorderer.compare_phrase_match = true;
	273	resultsorderer.docset = &(docs.docset);
	274
	275	// first get a list of document numbers
	276	docs.docnum_order();
	277
	278	sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
	279	}
	280
	281
	282
	283	mgqueryfilterclass::mgqueryfilterclass ()
	284	:queryfilterclass() {
	285
	286	num_phrases = 0;
[27064]	287
	288	FilterOption_t filtopt;
	289	// -- onePerQuery PhraseMatch enumerated
	290	filtopt.name = "PhraseMatch";
	291	filtopt.type = FilterOption_t::enumeratedt;
	292	filtopt.repeatable = FilterOption_t::onePerQuery;
	293	filtopt.defaultValue = "some_phrases";
	294	filtopt.validValues.push_back ("all_phrases");
	295	filtopt.validValues.push_back ("some_phrases");
	296	filtopt.validValues.push_back ("all_docs");
	297	filterOptions["PhraseMatch"] = filtopt;
	298
[1324]	299	}
	300
	301	mgqueryfilterclass::~mgqueryfilterclass () {
	302	}
	303
	304	void mgqueryfilterclass::filter (const FilterRequest_t &request,
	305	FilterResponse_t &response,
	306	comerror_t &err, ostream &logout) {
	307	outconvertclass text_t2ascii;
	308
	309	response.clear ();
	310	err = noError;
[15558]	311	if (db_ptr == NULL) {
[1324]	312	// most likely a configuration problem
	313	logout << text_t2ascii
[15558]	314	<< "configuration error: mgqueryfilter contains a null dbclass\n\n";
[1324]	315	err = configurationError;
	316	return;
	317	}
[8026]	318	if (textsearchptr == NULL) {
[1324]	319	// most likely a configuration problem
	320	logout << text_t2ascii
[8026]	321	<< "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
[1324]	322	err = configurationError;
	323	return;
	324	}
	325
	326	// open the database
[15558]	327	db_ptr->setlogout(&logout);
	328	if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
	329	// most likely a system problem (we have already checked that the database exists)
[1324]	330	logout << text_t2ascii
[15558]	331	<< "system problem: open on database \"" << db_filename << "\" failed\n\n";
[1324]	332	err = systemProblem;
	333	return;
	334	}
	335
	336	// get the query parameters
	337	int startresults = filterOptions["StartResults"].defaultValue.getint();
	338	int endresults = filterOptions["EndResults"].defaultValue.getint();
	339	text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
	340
	341	vector<queryparamclass> queryfilterparams;
	342	parse_query_params (request, queryfilterparams, startresults,
[1662]	343	endresults, phrasematch, logout);
	344	// do any mg specific diddling with query parameters that may be required
[27064]	345	// mg_parse_query_params (request, queryfilterparams, startresults,
	346	// endresults, phrasematch, logout);
[1662]	347
	348
[1324]	349	// do query
	350	queryresultsclass queryresults;
	351	do_multi_query (request, queryfilterparams, queryresults, err, logout);
	352	if (err != noError) return;
	353
	354	// assemble document results
	355	if (need_matching_docs (request.filterResultOptions)) {
	356	// sort the query results
[5850]	357	// only want to sort the docs if we have done a ranked search or there were phrases
	358	if (num_phrases > 0 \|\| (request.filterResultOptions & FRranking)) {
	359	sort_doc_results (request, queryresults.docs);
	360	}
[1324]	361	int resultnum = 1;
	362	ResultDocInfo_t resultdoc;
	363	text_t trans_OID;
[16445]	364	vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
	365	vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();
[1324]	366
[1662]	367	// documents containing matching phrases will be sorted to the top so
	368	// we can break out once we're past those that match the PhraseMatch
	369	// option -- "all_phrases" = return only those documents containing all
	370	// phrases in query string
	371	// "some_phrases" = return only those documents containing
	372	// at least 1 of the phrases in the document
	373	// "all_docs" = return all documents regardless
	374	if (num_phrases > 0) {
	375	int numdocs = 0;
	376	while (docorder_here != docorder_end) {
	377	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	378
	379	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) \|\|
	380	((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
	381	queryresults.docs_matched = numdocs;
	382	break;
	383	}
[9620]	384	++numdocs;
	385	++docorder_here;
[1662]	386	}
	387	}
	388
[1324]	389	if (endresults == -1) endresults = MAXNUMDOCS;
[1662]	390	docorder_here = queryresults.docs.docorder.begin();
[1324]	391	while (docorder_here != docorder_end) {
[1662]	392	if (resultnum > endresults \|\| resultnum > queryresults.docs_matched) break;
[1324]	393
	394	// translate the document number
[15558]	395	if (!translate(db_ptr, *docorder_here, trans_OID)) {
[1324]	396	logout << text_t2ascii
	397	<< "warning: could not translate mg document number \""
	398	<< *docorder_here << "\"to OID.\n\n";
	399
	400	} else {
	401	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	402
	403	// see if there is a result for this number,
	404	// if it is in the request set (or the request set is empty)
	405	if (docset_here != queryresults.docs.docset.end() &&
	406	(request.docSet.empty() \|\| in_set(request.docSet, trans_OID))) {
	407	if (resultnum >= startresults) {
	408	// add this document
	409	resultdoc.OID = trans_OID;
	410	resultdoc.result_num = resultnum;
	411	resultdoc.ranking = (int)((docset_here).second.docweight 10000.0 + 0.5);
	412
	413	// these next two are not available on all versions of mg
	414	resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	415	resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
	416
	417	response.docInfo.push_back (resultdoc);
	418	}
	419
[9620]	420	++resultnum;
[1324]	421	}
	422	}
	423
[9620]	424	++docorder_here;
[1324]	425	}
	426	}
	427
	428	// assemble the term results
	429	if (need_term_info(request.filterResultOptions)) {
	430	// note: the terms have already been sorted and uniqued
	431
	432	TermInfo_t terminfo;
	433	bool terms_first = true;
	434	termfreqclassarray::iterator terms_here = queryresults.terms.begin();
	435	termfreqclassarray::iterator terms_end = queryresults.terms.end();
	436
	437	while (terms_here != terms_end) {
	438	terminfo.clear();
	439	terminfo.term = (*terms_here).termstr;
	440	terminfo.freq = (*terms_here).termfreq;
	441	if (terms_first) {
	442	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	443	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	444	while (termvariants_here != termvariants_end) {
	445	terminfo.matchTerms.push_back (*termvariants_here);
[9620]	446	++termvariants_here;
[1324]	447	}
	448	}
	449	terms_first = false;
	450
	451	response.termInfo.push_back (terminfo);
	452
[9620]	453	++terms_here;
[1324]	454	}
	455	}
	456
[15558]	457	db_ptr->closedatabase(); // Important that local library doesn't leave any files open
[1324]	458	response.numDocs = queryresults.docs_matched;
	459	response.isApprox = queryresults.is_approx;
	460	}
	461
[27064]	462	void mgqueryfilterclass::parse_query_params (const FilterRequest_t &request,
[1662]	463	vector<queryparamclass> &query_params,
[27064]	464	int &startresults, int &endresults,
	465	text_t &phrasematch, ostream &logout) {
[1662]	466
[27064]	467	queryfilterclass::parse_query_params (request, query_params,
	468	startresults, endresults, logout);
	469
	470	phrasematch = filterOptions["PhraseMatch"].defaultValue;
	471
	472	// is there a better way to do this than iterate through all the options again??
	473	OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
	474	OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
	475	while (options_here != options_end) {
	476	if ((*options_here).name == "PhraseMatch") {
	477	phrasematch = (*options_here).value;
	478	break;
	479	}
	480	++options_here;
	481	}
[1662]	482
	483	vector<queryparamclass>::iterator query_here = query_params.begin();
	484	vector<queryparamclass>::iterator query_end = query_params.end();
	485	while (query_here != query_end) {
	486
[2134]	487	// if we're doing a phrase search we want to maximise hits by making it
	488	// a boolean search on the index with the finest granularity - we'll
	489	// also set maxdocs to "all" (realizing that this will cause searches
	490	// like "and the" on a large collection to take a very very long time).
	491
[1662]	492	// we're deciding it's a phrase search based on if the querystring
	493	// contains at least 2 double quotes (not very scientific but
	494	// then neither is the rest of the mg phrase searching functionality :-)
[11002]	495	//if (countchar ((query_here).querystring.begin(), (query_here).querystring.end(), '"') > 1) {
	496
	497	// [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
	498	text_tarray phrases;
	499	get_phrases((*query_here).querystring, phrases);
	500
	501	if (phrases.size() > 0) {
[1662]	502	(*query_here).search_type = 0;
	503
[2134]	504	// set maxdocs to "all"
	505	(*query_here).maxdocs = -1;
	506
[1662]	507	// Get the long version of the index and test to see if any indexes with
	508	// finer granularity exist. Indexes must be the same type (i.e. same metadata
	509	// or "text").
	510	text_t longindex; text_tarray splitindex;
	511	indexmap.to2from ((*query_here).index, longindex);
	512	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
	513	text_t &granularity = splitindex[0];
	514	text_t &indextype = splitindex[1];
	515	bool found = false;
	516	// currently supported granularity options are "document", "section" and "paragraph"
	517	if (granularity == "document" \|\| granularity == "section") {
	518	text_t shortindex;
	519	if (indexmap.fromexists ("paragraph:" + indextype)) {
	520	indexmap.from2to ("paragraph:" + indextype, shortindex);
	521	(*query_here).index = shortindex;
	522	found = true;
	523	}
	524	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	525	indexmap.from2to ("section:" + indextype, shortindex);
	526	(*query_here).index = shortindex;
	527	}
	528	}
	529	}
[4507]	530
	531	#ifdef GSDL_BBC_COLLECTION
	532	// This is a special hack for the BBC collection's ProgNumber and zzabn
	533	// indexes (they're built this way to prevent mg_perf_hash_build from
	534	// dying at build time)
	535
[4735]	536	// if we're searching the ProgNumber index we want to
[4507]	537	// remove all non-alphanumeric characters from the query string
	538	text_t longindex; text_tarray splitindex;
	539	indexmap.to2from ((*query_here).index, longindex);
	540	splitchar (longindex.begin(), longindex.end(), ':', splitindex);
	541	text_t &indextype = splitindex[1];
[4735]	542	if (indextype == "ProgNumber") {
[4507]	543	text_t new_querystring;
	544	text_t::const_iterator here = (*query_here).querystring.begin();
	545	text_t::const_iterator end = (*query_here).querystring.end();
	546	while (here != end) {
	547	if ((here >= 'a' && here <= 'z') \|\| (here >= 'A' && here <= 'Z') \|\|
	548	(here >= '0' && here <= '9')) {
	549	new_querystring.push_back (*here);
	550	}
[9620]	551	++here;
[4507]	552	}
	553	(*query_here).querystring = new_querystring;
	554	}
	555	#endif
[9620]	556	++query_here;
[1662]	557	}
	558	}
	559

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/runtime-src/src/colservr/mgqueryfilter.cpp@ 28762

Download in other formats: