Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 539

Last change on this file since 539 was 539, checked in by rjmcnab, 25 years ago
Seems to be an error in mg for retrieving documents using a paragraph based index for some cases. Just added a work around (loads the default index every time).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Rev	Line
[110]	1	/**********************************************************************
	2	*
	3	* mgsearch.cpp --
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
[534]	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
[110]	9	*
[534]	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
[110]	24	* $Id: mgsearch.cpp 539 1999-09-07 22:52:52Z rjmcnab $
	25	*
	26	*********************************************************************/
	27
	28	/*
	29	$Log$
[539]	30	Revision 1.19 1999/09/07 22:52:52 rjmcnab
	31	Seems to be an error in mg for retrieving documents using a paragraph
	32	based index for some cases. Just added a work around (loads the default
	33	index every time).
	34
[534]	35	Revision 1.18 1999/09/07 04:57:22 sjboddie
	36	added gpl notice
	37
[497]	38	Revision 1.17 1999/08/31 22:42:41 rjmcnab
	39	A couple of minor things.
	40
[473]	41	Revision 1.16 1999/08/25 04:51:06 sjboddie
	42	small change to allow for searching using boolean operators
	43
[401]	44	Revision 1.15 1999/07/16 08:35:03 rjmcnab
	45	Fixed a weird bug to do with a faulty case statement.
	46
[398]	47	Revision 1.14 1999/07/16 03:42:22 sjboddie
	48	changed isApprox
	49
[393]	50	Revision 1.13 1999/07/16 00:12:46 sjboddie
	51	removed all the old post-processing stuff
	52
[350]	53	Revision 1.12 1999/07/07 06:17:47 rjmcnab
	54	broke search_index into index+subcollection+language
	55	within mgsearch
	56
[343]	57	Revision 1.11 1999/07/05 21:06:43 rjmcnab
	58	Disabled quoted strings.
	59
[334]	60	Revision 1.10 1999/07/01 09:29:19 rjmcnab
	61	Changes for better reporting of number documents which match a query. Changes
	62	should still work as before with older versions of mg.
	63
[325]	64	Revision 1.9 1999/07/01 03:54:48 rjmcnab
	65	Added code to plug in the equivalent terms of each of the query terms.
	66	Also added a function to get a raw utf8 encoded mg document (for speeding
	67	up a phrase matching function)
	68
[319]	69	Revision 1.8 1999/06/30 04:04:12 rjmcnab
	70	made stemming functions available from mgsearch and made the stems
	71	for the query terms available in queryinfo
	72
[301]	73	Revision 1.7 1999/06/27 22:07:27 sjboddie
	74	got rid of all the old functions for dealing with dir indexes
	75
[265]	76	Revision 1.6 1999/06/09 00:41:32 sjboddie
	77	phrase searching now uses case-folding if it's turned on
	78
[163]	79	Revision 1.5 1999/02/21 22:31:35 rjmcnab
	80
	81	Removed locateinfo.
	82
[138]	83	Revision 1.4 1999/02/03 01:13:27 sjboddie
	84
	85	Got interface to handle subcollections and language subcollections -
	86	committed changes made to some of the collections
	87
[114]	88	Revision 1.3 1999/01/19 01:38:17 rjmcnab
	89
	90	Made the source more portable.
	91
[112]	92	Revision 1.2 1999/01/12 01:51:02 rjmcnab
	93
	94	Standard header.
	95
[110]	96	Revision 1.1 1999/01/08 09:02:16 rjmcnab
	97
	98	Moved from src/library.
	99
	100	*/
	101
	102
[114]	103	#include "gsdlconf.h"
[110]	104	#include "mgsearch.h"
[163]	105	#include "fileutil.h"
[110]	106
	107	#include <string.h>
	108	#include <stdio.h>
	109	#include <stdlib.h>
	110	#include <ctype.h>
	111
[114]	112	#if defined(GSDL_USE_OBJECTSPACE)
	113	# include <ospace\std\iostream>
	114	#elif defined(GSDL_USE_IOS_H)
[110]	115	# include <iostream.h>
	116	#else
[114]	117	# include <iostream>
	118	#endif
[110]	119
[114]	120	#if defined(__WIN32__)
[110]	121	// gdbm stuff
	122	# include "autoconf.h"
	123	# include "systems.h"
	124	# include "gdbmconst.h"
	125	# include "gdbm.h"
[114]	126	#else
	127	# include <gdbm.h>
[110]	128	#endif
[114]	129
[110]	130
	131	#include <assert.h>
	132
	133	#include "mgq.h"
[163]	134	// #include "locateinfo.h"
[110]	135	#include "gsdlunicode.h"
	136	#include "unitool.h"
	137
	138
	139	/////////////
	140	// globals //
	141	/////////////
	142
// Hand-over slots for document text: doctextcallback stores the pointer and
// length it is given here, and mgsearchclass::mgdocument resets them before a
// retrieval and copies them out afterwards.
[325]	143	static char *tempdoc = NULL;
	144	static int templen = 0;
[110]	145
[325]	146
[319]	147	//////////////////////
	148	// useful functions //
	149	//////////////////////
	150
	151
	152	// input and output are in utf8
	153	text_t mgsearch_stemword (const text_t &word) {
	154	// allocate working stem space
	155	int maxstemlen = mgq_getmaxstemlen ();
	156	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
	157	if (word_stem == NULL) return "";
	158
	159	// copy word to word_stem
	160	int len = 0;
	161	text_t::const_iterator here = word.begin();
	162	text_t::const_iterator end = word.end();
	163	while (len < maxstemlen && here != end) {
	164	word_stem[len+1] = (unsigned char)(*here);
	165	len++; here++;
	166	}
	167	word_stem[len+1] = '\0';
	168	word_stem[0] = len;
	169
	170	mgq_stemword (word_stem);
	171
	172	// copy word_stem back to tempstr
	173	text_t tempstr;
	174	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
	175
[325]	176	delete [] word_stem;
	177
[319]	178	return tempstr;
	179	}
	180
	181
	182
[110]	183	////////////////////////
	184	// callback functions //
	185	////////////////////////
	186
	187	// This routine is called for each document found in a search
	188	// it assumes that cache_num is set up correctly to point to
	189	// a suitable result cache
[497]	190	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
[110]	191	float Weight, void *info) {
	192
	193
	194	queryresultsclass queryresults = (queryresultsclass )info;
	195
	196	// append this entry to the document results
	197	docresultclass docresult;
	198	docresult.docnum = DocNum;
[319]	199	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
	200	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
	201
[350]	202	queryresults->docs.docset[DocNum] = docresult;
	203	queryresults->docs.docorder.push_back(DocNum);
[110]	204
	205	return 0;
	206	}
	207
[325]	208	int termequivcallback(char Word, int ULen, int /Freq*/,
	209	float /Weight/, void *info) {
	210	text_tset equivterms = (text_tset )info;
	211	if (equivterms == NULL) return 0;
	212
	213	text_t thisterm;
	214	thisterm.setcarr(Word, ULen);
	215
	216	equivterms->insert(thisterm);
	217
	218	return 0;
	219	}
	220
	221
	222	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
	223	// allocate working stem space
	224	int maxstemlen = mgq_getmaxstemlen ();
	225	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
	226	if (word_stem == NULL) return;
	227
	228	// copy word to word_stem
	229	int len = 0;
	230	text_t::const_iterator here = word.begin();
	231	text_t::const_iterator end = word.end();
	232	while (len < maxstemlen && here != end) {
	233	word_stem[len+1] = (unsigned char)(*here);
	234	len++; here++;
	235	}
	236	word_stem[len+1] = '\0';
	237	word_stem[0] = len;
	238
	239	// get the equivalent terms
	240	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
	241
	242	delete [] word_stem;
	243
	244	return;
	245	}
	246
	// File-scope set of terms. None of the code visible in this listing reads
	// or writes it: termfreqcallback fills the utf8equivterms member of its
	// own termfreqclass entry instead.
	// NOTE(review): possibly a leftover -- confirm against other files before removing.
	247	text_tset utf8equivterms; // kept as utf8 string for fast matching
	248
	249
[110]	250	// This callback is called once for each term in the query
	251	int termfreqcallback(char *Word, int ULen, int Freq,
[114]	252	float /Weight/, void *info) {
[110]	253	queryresultsclass queryresults = (queryresultsclass )info;
[325]	254	if (queryresults == NULL) return 0;
[110]	255
	256	text_t term;
	257	term.setcarr(Word, ULen);
	258	termfreqclass termfreq;
[325]	259
[110]	260	termfreq.termstr = to_uni(term);
[325]	261	text_t utf8termstem = mgsearch_stemword (term);
	262	termfreq.termstemstr = to_uni (utf8termstem);
	263
	264	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
	265
[110]	266	termfreq.termfreq = Freq;
[319]	267	queryresults->orgterms.push_back(termfreq);
[110]	268
	269	return 0;
	270	}
	271
	272	// this callback is called once for each variation of each term
[319]	273	int termvariantscallback(char Word, int ULen, int /Freq*/,
	274	float /Weight/, void *info) {
[110]	275
	276	text_t term;
	277	term.setcarr(Word, ULen);
	278	queryresultsclass queryresults = (queryresultsclass )info;
[350]	279	queryresults->termvariants.insert(to_uni(term));
[110]	280
	281	return 0;
	282	}
	283
	284	// This callback is for getting document text
[325]	285	int doctextcallback(char Doc, int ULen, int /Freq*/,
[497]	286	float /Weight/, void * /info/) {
[325]	287	tempdoc = Doc;
	288	templen = ULen;
[110]	289
	290	return 0;
	291	}
	292
	293
[163]	294	static text_t getindexsuffix (const text_t &collection,
	295	const text_t &index) {
[393]	296
	297	text_t indexsuffix = "index";
[163]	298	indexsuffix = filename_cat (indexsuffix, index);
	299	indexsuffix = filename_cat (indexsuffix, collection);
	300	return indexsuffix;
	301	}
[110]	302
[163]	303
	304
	305
[110]	306	////////////////////
	307	// mgsearch class //
	308	////////////////////
	309
	310	mgsearchclass::mgsearchclass ()
	311	{
	312	cache = new querycache (RESULTCACHESIZE);
	313	}
	314
	315	mgsearchclass::~mgsearchclass ()
	316	{
	317	if (cache != NULL)
	318	{
	319	delete cache;
	320	cache = NULL;
	321	}
	322	}
	323
	324
	325	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
	326	{
	327	collectdir = thecollectdir;
	328	}
	329
[319]	330	// you only need to use this function before doing any stemming
	331	// casefolding and stemming will be set if values for them are
	332	// provided (0 or 1).
	333	// makeindexcurrent returns true if it was able to load the database
	334	bool mgsearchclass::makeindexcurrent (const text_t &index,
[350]	335	const text_t &subcollection,
	336	const text_t &language,
[319]	337	const text_t &collection,
	338	int casefolding,
	339	int stemming) {
	340	bool databaseloaded = true;
[110]	341
[319]	342	// get the names of the collection, index and text suffixes
	343	char *ccollection = collection.getcstr();
	344	assert (ccollection != NULL);
[350]	345	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
[319]	346	assert (idxsuffix != NULL);
	347	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
	348	assert (txtsuffix != NULL);
	349
	350	#ifdef __WIN32__
	351	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
	352	#else
	353	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
	354	#endif
	355
	356	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
	357	if (casefolding == 0) mgq_ask(".set casefold off");
	358	else if (casefolding > 0) mgq_ask(".set casefold on");
	359	if (stemming == 0) mgq_ask(".set stem off");
	360	else if (stemming > 0) mgq_ask(".set stem on");
	361
	362	} else databaseloaded = false;
	363
	364	// free up the c strings
	365	delete ccollection;
	366	delete idxsuffix;
	367	delete txtsuffix;
	368	delete ccollectdir;
	369
	370	return databaseloaded;
	371	}
	372
	373
	374	// stem word uses the values set in the last call to makeindexcurrent
	375	// to stem the word. It is assumed that word is in unicode
	376	text_t mgsearchclass::stemword (const text_t &word) {
	377	return to_uni (mgsearch_stemword (to_utf8 (word)));
	378	}
	379
[325]	380	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
	381	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
	382	}
	383
	384
[110]	385	bool mgsearchclass::search(const queryparamclass &queryparams,
[319]	386	queryresultsclass &queryresults) {
[110]	387	assert (cache != NULL);
	388
	389	queryresults.clear();
	390
	391	// first check the cache
[319]	392	if (cache->find(queryparams, queryresults)) return true;
[110]	393
	394	// make sure there is a query to be processed
	395	text_t::const_iterator queryhere = queryparams.querystring.begin();
	396	text_t::const_iterator queryend = queryparams.querystring.end();
	397	while (queryhere != queryend) {
	398	if (is_unicode_letdig (*queryhere)) break;
	399	queryhere++;
	400	}
	401
	402	// if we reached the end of the query string without finding
	403	// any alphanumeric characters then return no results (and say
	404	// the database was loaded)
	405	if (queryhere == queryend) return true;
	406
[350]	407	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
	408	queryparams.language, queryparams.collection)) {
[319]	409	setsearchmode (queryparams);
	410	submitquery (queryparams);
[334]	411	getresults (queryparams, queryresults);
[319]	412	return true;
	413	}
[110]	414
[319]	415	return false;
[110]	416	}
	417
	418
	419	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
	420	{
	421	mgq_ask(".set expert true");
[319]	422	mgq_ask(".set sorted_terms true");
[110]	423	mgq_ask(".set accumulator_method list");
[497]	424	mgq_ask(".set max_accumulators 500000");
	425	mgq_ask(".set maxparas 500000");
[110]	426	mgq_ask(".set verbatim true");
	427	mgq_ask(".unset skip_dump");
	428	mgq_ask(".set mode docnums");
	429
	430	switch (queryparams.search_type)
	431	{
	432	case 0: mgq_ask(".set query boolean"); break;
	433	case 1: mgq_ask(".set query ranked"); break;
	434	}
	435	switch (queryparams.casefolding)
	436	{
	437	case 1: mgq_ask(".set casefold on"); break;
	438	case 0: mgq_ask(".set casefold off"); break;
	439	}
	440	switch (queryparams.stemming)
	441	{
	442	case 1: mgq_ask(".set stem on"); break;
	443	case 0: mgq_ask(".set stem off"); break;
	444	}
	445	mgq_ask(".set heads_length 150");
	446
[350]	447	if (queryparams.maxdocs == -1) {
	448	mgq_ask(".set maxdocs all");
	449	} else {
	450	char maxdocstr[32];
	451	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
	452	mgq_ask(maxdocstr);
	453	}
[110]	454	}
	455
	456
	457	void mgsearchclass::submitquery (const queryparamclass &queryparams)
	458	{
	459	// sort out the query string
	460	text_t ttquerystring = queryparams.querystring;
	461	filterquery (ttquerystring);
	462	char *querystring = to_utf8(ttquerystring).getcstr();
	463
	464	// submit the query
	465	mgq_ask(querystring);
	466
	467	delete querystring;
	468	}
	469
	470
[334]	471	void mgsearchclass::getresults (const queryparamclass &queryparams,
	472	queryresultsclass &queryresults) {
[393]	473
	474	mgq_results(result_docnums, 0, MAXNUMDOCS,
	475	ourquerycallback, (void *)(&queryresults));
[110]	476
	477	// get the term frequencies
	478	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
	479	termfreqcallback, (void *)(&queryresults));
[319]	480	queryresults.sortuniqqueryterms();
	481
	482	// get term variants
[110]	483	mgq_results(result_terms, 0, MAXNUMTERMS,
[319]	484	termvariantscallback, (void *)(&queryresults));
[334]	485
	486	// get the number of documents retrieved
	487	int total_retrieved = 0, is_approx = 0;
	488	mgq_docsretrieved (&total_retrieved, &is_approx);
	489
	490	if (total_retrieved == 0) {
	491	// not available (or really was zero)
[350]	492	queryresults.docs_matched = queryresults.docs.docset.size();
[334]	493	if (queryresults.docs_matched < queryparams.maxdocs)
[398]	494	queryresults.is_approx = Exact;
[334]	495	else
[398]	496	queryresults.is_approx = MoreThan;
[334]	497	} else {
	498	queryresults.docs_matched = total_retrieved;
[401]	499	if (is_approx) queryresults.is_approx = Approximate;
	500	else queryresults.is_approx = Exact;
[334]	501	}
[110]	502	}
	503
	504	void mgsearchclass::filterquery (text_t &ttquerystring) {
	505	text_t::iterator ithere = ttquerystring.begin ();
	506	text_t::iterator itend = ttquerystring.end ();
	507
[473]	508	// remove all non alphanumeric characters (except
	509	// boolean operators
[110]	510	while (ithere != itend) {
[473]	511	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
	512	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
	513	(ithere != ')')) (ithere) = ' ';
[110]	514	ithere++;
	515	}
	516	}
	517
	518
	519	// the document text for 'docnum' is placed in 'output'
	520	// docTargetDocument returns 'true' if it was able to
	521	// try to get a document
	522	// collection is needed to see if an index from the
	523	// collection is loaded. If no index has been loaded
	524	// defaultindex is needed to load one
[350]	525	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
	526	const text_t &defaultsubcollection,
	527	const text_t &defaultlanguage,
[110]	528	const text_t &collection,
	529	int docnum,
[325]	530	text_t &output) {
[110]	531	output.clear();
	532
[325]	533	// get the mg version of the document
	534	char *mgdoc = NULL;
	535	int doclen = 0;
[350]	536	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
	537	collection, docnum, mgdoc, doclen)) return false;
[325]	538	if (mgdoc == NULL) return false;
[110]	539
[325]	540	// replace all control-Cs with spaces
	541	char *mgdoc_here = mgdoc;
	542	char *mgdoc_end = mgdoc + doclen;
	543	while (mgdoc_here < mgdoc_end) {
	544	if (mgdoc_here == '\x3') mgdoc_here = ' ';
	545	mgdoc_here++;
	546	}
[110]	547
[325]	548	// convert this document to unicode
	549	utf8inconvertclass inconvert;
	550	convertclass::status_t status;
	551	inconvert.reset ();
	552	inconvert.setinput (mgdoc, doclen);
	553	inconvert.convert (output, status);
[110]	554
[325]	555	return true;
	556	}
[110]	557
[325]	558
	559	bool mgsearchclass::mgdocument (const text_t &defaultindex,
[350]	560	const text_t &defaultsubcollection,
	561	const text_t &defaultlanguage,
[325]	562	const text_t &collection,
	563	int docnum,
	564	char *&UDoc, int &ULen) {
[497]	565	int databaseloaded = 0;
[325]	566
	567	UDoc = NULL; ULen = 0;
	568
	569	// see if we can make an appropriate database current
[539]	570	// char *ccollection = collection.getcstr();
	571	// assert (ccollection != NULL);
	572	// databaseloaded = load_text_database (ccollection);
	573	// delete ccollection;
[110]	574
[325]	575	// try and load the database
[539]	576	// if (!databaseloaded)
	577	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
	578	defaultlanguage, collection);
[325]	579
	580	if (databaseloaded) {
	581	// retrieve the document from mg
	582	char docstr[32];
	583	sprintf(docstr, "%i", docnum);
	584
	585	mgq_ask(".set mode text");
	586	mgq_ask(".set query docnums");
	587	mgq_ask(docstr);
[110]	588
[325]	589	tempdoc = NULL;
	590	templen = 0;
	591	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
	592	UDoc = tempdoc;
	593	ULen = templen;
	594	}
[110]	595
[497]	596	return (bool)databaseloaded;
[110]	597	}
	598

Note: See TracBrowser for help on using the repository browser.

Download in other formats: