/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
| 25 |
|
---|
| 26 | #include "mgqueryfilter.h"
|
---|
| 27 | #include "fileutil.h"
|
---|
| 28 | #include "phrasesearch.h"
|
---|
| 29 | #include <assert.h>
|
---|
| 30 | #include "mgsearch.h"
|
---|
[11002] | 31 | #include "phrases.h"
|
---|
[1324] | 32 |
|
---|
| 33 | ///////////////////////////////
|
---|
| 34 | // methods for resultsorderer_t
|
---|
| 35 | ///////////////////////////////
|
---|
| 36 |
|
---|
| 37 | resultsorderer_t::resultsorderer_t() {
|
---|
| 38 | clear ();
|
---|
| 39 | }
|
---|
| 40 |
|
---|
| 41 | void resultsorderer_t::clear() {
|
---|
| 42 | compare_phrase_match = false;
|
---|
| 43 | compare_terms_match = false;
|
---|
| 44 | compare_doc_weight = true;
|
---|
| 45 |
|
---|
| 46 | docset = NULL;
|
---|
| 47 | }
|
---|
| 48 |
|
---|
| 49 | bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
|
---|
| 50 | if (docset == NULL) return t1>t2;
|
---|
| 51 |
|
---|
| 52 | docresultmap::iterator t1_here = docset->find(t1);
|
---|
| 53 | docresultmap::iterator t2_here = docset->find(t2);
|
---|
| 54 | docresultmap::iterator end = docset->end();
|
---|
| 55 |
|
---|
| 56 | // sort all the document numbers not in the document set to
|
---|
| 57 | // the end of the list
|
---|
| 58 | if (t1_here == end) {
|
---|
| 59 | if (t2_here == end) return t1>t2;
|
---|
| 60 | else return true;
|
---|
| 61 | } else if (t2_here == end) return false;
|
---|
| 62 |
|
---|
| 63 | if (compare_phrase_match) {
|
---|
| 64 | if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
|
---|
| 65 | if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
|
---|
| 66 | }
|
---|
| 67 |
|
---|
| 68 | if (compare_terms_match) {
|
---|
| 69 | if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
|
---|
| 70 | if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 | if (compare_doc_weight) {
|
---|
| 74 | if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
|
---|
| 75 | if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
|
---|
| 76 | }
|
---|
| 77 |
|
---|
| 78 | return t1>t2;
|
---|
| 79 | }
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 |
|
---|
| 84 | /////////////////////////////////
|
---|
| 85 | // functions for mgqueryfilterclass
|
---|
| 86 | /////////////////////////////////
|
---|
| 87 |
|
---|
[4193] | 88 |
|
---|
| 89 | void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
|
---|
| 90 | queryfilterclass::configure (key, cfgline);
|
---|
| 91 |
|
---|
[12314] | 92 | if (key == "indexstem") {
|
---|
[9937] | 93 | ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
|
---|
| 94 | }
|
---|
| 95 |
|
---|
[4193] | 96 | }
|
---|
| 97 |
|
---|
[1324] | 98 | // loads up phrases data structure with any phrases (that's the quoted bits)
|
---|
| 99 | // occuring in the querystring
|
---|
| 100 | void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
|
---|
| 101 | const termfreqclassarray &orgterms,
|
---|
| 102 | vector<termfreqclassarray> &phrases) {
|
---|
| 103 |
|
---|
| 104 | text_t::const_iterator here = querystring.begin();
|
---|
| 105 | text_t::const_iterator end = querystring.end();
|
---|
| 106 |
|
---|
| 107 | termfreqclassarray tmpterms;
|
---|
| 108 |
|
---|
| 109 | int termcount = 0;
|
---|
| 110 | bool foundquote = false;
|
---|
| 111 | bool foundbreak = false;
|
---|
| 112 | bool start = true;
|
---|
| 113 | while (here != end) {
|
---|
| 114 | if (*here == '\"') {
|
---|
| 115 | if (foundquote) {
|
---|
| 116 | if (!foundbreak && !start) {
|
---|
| 117 | tmpterms.push_back (orgterms[termcount]);
|
---|
[9620] | 118 | ++termcount;
|
---|
[1324] | 119 | }
|
---|
| 120 | if (tmpterms.size() > 1) {
|
---|
| 121 | phrases.push_back (tmpterms);
|
---|
| 122 | }
|
---|
[11002] | 123 | tmpterms.erase (tmpterms.begin(), tmpterms.end());
|
---|
| 124 |
|
---|
[1324] | 125 | foundquote = false;
|
---|
| 126 | foundbreak = true;
|
---|
| 127 | } else foundquote = true;
|
---|
| 128 | } else if (!is_unicode_letdig(*here)) {
|
---|
| 129 | // found a break between terms
|
---|
| 130 | if (!foundbreak && !start) {
|
---|
[11002] | 131 | if (foundquote) {
|
---|
[1324] | 132 | tmpterms.push_back (orgterms[termcount]);
|
---|
[11002] | 133 | }
|
---|
[9620] | 134 | ++termcount;
|
---|
[1324] | 135 | }
|
---|
| 136 | foundbreak = true;
|
---|
| 137 | } else {
|
---|
| 138 | start = false;
|
---|
| 139 | foundbreak = false;
|
---|
| 140 | }
|
---|
[9620] | 141 | ++here;
|
---|
[1324] | 142 | }
|
---|
| 143 | }
|
---|
| 144 |
|
---|
// do additional query processing: phrase matching.
//
// For each quoted phrase in the query string, run a phrase search over
// every matched document and bump that document's num_phrase_match count
// on a hit.  Also sets the member num_phrases to the number of phrases
// found in the query string (read later by filter() to decide whether
// phrase-based sorting/culling is needed).
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
				       queryresultsclass &queryresults) {

  // post-process the results only when there is something to do: more
  // than one query term and a non-empty result set
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {

    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);

    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();

      while (this_phrase != end_phrase) {

	// process each of the matched documents
	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	while (docs_here != docs_end) {
	  // OID_phrase_search checks the document text itself for the
	  // phrase (external helper, see phrasesearch.h)
	  if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
				 queryparams.subcollection, queryparams.language,
				 longindex, queryparams.collection, *this_phrase,
				 (*docs_here).second.docnum)) {
	    ++docs_here->second.num_phrase_match;
	  }

	  ++docs_here;
	}
	++this_phrase;
      }
    }
  }
}
|
---|
| 186 |
|
---|
| 187 |
|
---|
// do query that might involve multiple sub queries
// mgsearchptr and db_ptr are assumed to be valid
//
// Runs each queryparamclass in query_params through the mg search engine
// and folds the per-query results into multiresults.  With a single
// query the results are copied straight across; with several, document
// sets are merged according to each query's combinequery mode
// ("and"/"or"/"not").  Term information is appended when requested.
// On a search failure err is set to systemProblem and the function
// returns early.
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process (phrase matching) only when not already done and
      // when there is more than one term and a non-empty result set
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	// no phrase post-processing for this query; clear the member so
	// filter() does not act on a stale count from an earlier query
	num_phrases = 0;
      }

      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// merge this query's documents into the running set according to
	// the query's combine mode
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// after combining, the count is exact by construction
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants (set insert keeps them unique)
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
|
---|
| 267 |
|
---|
| 268 |
|
---|
| 269 | void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 270 | docresultsclass &docs) {
|
---|
| 271 | resultsorderer_t resultsorderer;
|
---|
| 272 | resultsorderer.compare_phrase_match = true;
|
---|
| 273 | resultsorderer.docset = &(docs.docset);
|
---|
| 274 |
|
---|
| 275 | // first get a list of document numbers
|
---|
| 276 | docs.docnum_order();
|
---|
| 277 |
|
---|
| 278 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 |
|
---|
| 282 |
|
---|
| 283 | mgqueryfilterclass::mgqueryfilterclass ()
|
---|
| 284 | :queryfilterclass() {
|
---|
| 285 |
|
---|
| 286 | num_phrases = 0;
|
---|
| 287 | }
|
---|
| 288 |
|
---|
// Nothing mg-specific to release; the base class destructor runs as usual.
mgqueryfilterclass::~mgqueryfilterclass () {
}
|
---|
| 291 |
|
---|
| 292 | void mgqueryfilterclass::filter (const FilterRequest_t &request,
|
---|
| 293 | FilterResponse_t &response,
|
---|
| 294 | comerror_t &err, ostream &logout) {
|
---|
| 295 | outconvertclass text_t2ascii;
|
---|
| 296 |
|
---|
| 297 | response.clear ();
|
---|
| 298 | err = noError;
|
---|
[15558] | 299 | if (db_ptr == NULL) {
|
---|
[1324] | 300 | // most likely a configuration problem
|
---|
| 301 | logout << text_t2ascii
|
---|
[15558] | 302 | << "configuration error: mgqueryfilter contains a null dbclass\n\n";
|
---|
[1324] | 303 | err = configurationError;
|
---|
| 304 | return;
|
---|
| 305 | }
|
---|
[8026] | 306 | if (textsearchptr == NULL) {
|
---|
[1324] | 307 | // most likely a configuration problem
|
---|
| 308 | logout << text_t2ascii
|
---|
[8026] | 309 | << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
|
---|
[1324] | 310 | err = configurationError;
|
---|
| 311 | return;
|
---|
| 312 | }
|
---|
| 313 |
|
---|
| 314 | // open the database
|
---|
[15558] | 315 | db_ptr->setlogout(&logout);
|
---|
| 316 | if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
|
---|
| 317 | // most likely a system problem (we have already checked that the database exists)
|
---|
[1324] | 318 | logout << text_t2ascii
|
---|
[15558] | 319 | << "system problem: open on database \"" << db_filename << "\" failed\n\n";
|
---|
[1324] | 320 | err = systemProblem;
|
---|
| 321 | return;
|
---|
| 322 | }
|
---|
| 323 |
|
---|
| 324 | // get the query parameters
|
---|
| 325 | int startresults = filterOptions["StartResults"].defaultValue.getint();
|
---|
| 326 | int endresults = filterOptions["EndResults"].defaultValue.getint();
|
---|
| 327 | text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
|
---|
| 328 |
|
---|
| 329 | vector<queryparamclass> queryfilterparams;
|
---|
| 330 | parse_query_params (request, queryfilterparams, startresults,
|
---|
[1662] | 331 | endresults, phrasematch, logout);
|
---|
| 332 | // do any mg specific diddling with query parameters that may be required
|
---|
| 333 | mg_parse_query_params (request, queryfilterparams, startresults,
|
---|
| 334 | endresults, phrasematch, logout);
|
---|
| 335 |
|
---|
| 336 |
|
---|
[1324] | 337 | // do query
|
---|
| 338 | queryresultsclass queryresults;
|
---|
| 339 | do_multi_query (request, queryfilterparams, queryresults, err, logout);
|
---|
| 340 | if (err != noError) return;
|
---|
| 341 |
|
---|
| 342 | // assemble document results
|
---|
| 343 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 344 | // sort the query results
|
---|
[5850] | 345 | // only want to sort the docs if we have done a ranked search or there were phrases
|
---|
| 346 | if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
|
---|
| 347 | sort_doc_results (request, queryresults.docs);
|
---|
| 348 | }
|
---|
[1324] | 349 | int resultnum = 1;
|
---|
| 350 | ResultDocInfo_t resultdoc;
|
---|
| 351 | text_t trans_OID;
|
---|
| 352 | vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
|
---|
| 353 | vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
|
---|
| 354 |
|
---|
[1662] | 355 | // documents containing matching phrases will be sorted to the top so
|
---|
| 356 | // we can break out once we're past those that match the PhraseMatch
|
---|
| 357 | // option -- "all_phrases" = return only those documents containing all
|
---|
| 358 | // phrases in query string
|
---|
| 359 | // "some_phrases" = return only those documents containing
|
---|
| 360 | // at least 1 of the phrases in the document
|
---|
| 361 | // "all_docs" = return all documents regardless
|
---|
| 362 | if (num_phrases > 0) {
|
---|
| 363 | int numdocs = 0;
|
---|
| 364 | while (docorder_here != docorder_end) {
|
---|
| 365 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 366 |
|
---|
| 367 | if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
|
---|
| 368 | ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
|
---|
| 369 | queryresults.docs_matched = numdocs;
|
---|
| 370 | break;
|
---|
| 371 | }
|
---|
[9620] | 372 | ++numdocs;
|
---|
| 373 | ++docorder_here;
|
---|
[1662] | 374 | }
|
---|
| 375 | }
|
---|
| 376 |
|
---|
[1324] | 377 | if (endresults == -1) endresults = MAXNUMDOCS;
|
---|
[1662] | 378 | docorder_here = queryresults.docs.docorder.begin();
|
---|
[1324] | 379 | while (docorder_here != docorder_end) {
|
---|
[1662] | 380 | if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
|
---|
[1324] | 381 |
|
---|
| 382 | // translate the document number
|
---|
[15558] | 383 | if (!translate(db_ptr, *docorder_here, trans_OID)) {
|
---|
[1324] | 384 | logout << text_t2ascii
|
---|
| 385 | << "warning: could not translate mg document number \""
|
---|
| 386 | << *docorder_here << "\"to OID.\n\n";
|
---|
| 387 |
|
---|
| 388 | } else {
|
---|
| 389 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 390 |
|
---|
| 391 | // see if there is a result for this number,
|
---|
| 392 | // if it is in the request set (or the request set is empty)
|
---|
| 393 | if (docset_here != queryresults.docs.docset.end() &&
|
---|
| 394 | (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
|
---|
| 395 | if (resultnum >= startresults) {
|
---|
| 396 | // add this document
|
---|
| 397 | resultdoc.OID = trans_OID;
|
---|
| 398 | resultdoc.result_num = resultnum;
|
---|
| 399 | resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
|
---|
| 400 |
|
---|
| 401 | // these next two are not available on all versions of mg
|
---|
| 402 | resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
|
---|
| 403 | resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
|
---|
| 404 |
|
---|
| 405 | response.docInfo.push_back (resultdoc);
|
---|
| 406 | }
|
---|
| 407 |
|
---|
[9620] | 408 | ++resultnum;
|
---|
[1324] | 409 | }
|
---|
| 410 | }
|
---|
| 411 |
|
---|
[9620] | 412 | ++docorder_here;
|
---|
[1324] | 413 | }
|
---|
| 414 | }
|
---|
| 415 |
|
---|
| 416 | // assemble the term results
|
---|
| 417 | if (need_term_info(request.filterResultOptions)) {
|
---|
| 418 | // note: the terms have already been sorted and uniqued
|
---|
| 419 |
|
---|
| 420 | TermInfo_t terminfo;
|
---|
| 421 | bool terms_first = true;
|
---|
| 422 | termfreqclassarray::iterator terms_here = queryresults.terms.begin();
|
---|
| 423 | termfreqclassarray::iterator terms_end = queryresults.terms.end();
|
---|
| 424 |
|
---|
| 425 | while (terms_here != terms_end) {
|
---|
| 426 | terminfo.clear();
|
---|
| 427 | terminfo.term = (*terms_here).termstr;
|
---|
| 428 | terminfo.freq = (*terms_here).termfreq;
|
---|
| 429 | if (terms_first) {
|
---|
| 430 | text_tset::iterator termvariants_here = queryresults.termvariants.begin();
|
---|
| 431 | text_tset::iterator termvariants_end = queryresults.termvariants.end();
|
---|
| 432 | while (termvariants_here != termvariants_end) {
|
---|
| 433 | terminfo.matchTerms.push_back (*termvariants_here);
|
---|
[9620] | 434 | ++termvariants_here;
|
---|
[1324] | 435 | }
|
---|
| 436 | }
|
---|
| 437 | terms_first = false;
|
---|
| 438 |
|
---|
| 439 | response.termInfo.push_back (terminfo);
|
---|
| 440 |
|
---|
[9620] | 441 | ++terms_here;
|
---|
[1324] | 442 | }
|
---|
| 443 | }
|
---|
| 444 |
|
---|
[15558] | 445 | db_ptr->closedatabase(); // Important that local library doesn't leave any files open
|
---|
[1324] | 446 | response.numDocs = queryresults.docs_matched;
|
---|
| 447 | response.isApprox = queryresults.is_approx;
|
---|
| 448 | }
|
---|
| 449 |
|
---|
// Do any mg-specific adjustment of the parsed query parameters before
// the query runs: switch phrase searches to a boolean search on the
// finest-granularity compatible index (with maxdocs unlimited), plus a
// BBC-collection-only query-string cleanup behind GSDL_BBC_COLLECTION.
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
						vector<queryparamclass> &query_params,
						int &/*startresults*/, int &/*endresults*/,
						text_t &/*phrasematch*/, ostream &/*logout*/) {

  // outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {

    // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);

    if (phrases.size() > 0) {
      // search_type 0 = boolean search
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex; text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      // NOTE(review): splitindex[1] is read unchecked -- assumes longindex
      // always has the form "granularity:type"; confirm against indexmap
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;
      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	// prefer paragraph granularity if such an index was built
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	// otherwise a document-level query can still drop to section level
	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)

    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex; text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];
    if (indextype == "ProgNumber") {
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	// keep ASCII letters and digits only
	if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif
    ++query_here;
  }
}
|
---|
| 533 |
|
---|