Context Navigation

source: gsdl/tags/gsdl-2_30d-distribution/gsdl/src/colservr/mgsearch.cpp@ 14121

Last change on this file since 14121 was 2011, checked in by sjboddie, 23 years ago
Set mg's accumulator method back to 'list' as the recent change appeared to introduce a new (and more serious) bug while fixing the old bug. For now we'll just have to live with it the way it is.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.7 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 2011 2001-02-19 02:02:00Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.31 2001/02/19 02:02:00 sjboddie
31	Set mg's accumulator method back to 'list' as the recent change appeared
32	to introduce a new (and more serious) bug while fixing the old bug. For
33	now we'll just have to live with it the way it is.
34
35	Revision 1.30 2001/02/15 22:58:11 kjm18
36	added a comment
37
38	Revision 1.29 2001/02/15 03:57:02 kjm18
39	changed accumulator_method for mg to be array rather than list - it was
40	getting some weird results with ranked searches
41
42	Revision 1.28 2001/01/25 18:26:44 cs025
43	Included CORBA branch for first time
44
45	Revision 1.22.2.1 2000/04/04 15:02:32 cs025
46	Corba first commit
47
48	Revision 1.22 1999/09/24 02:41:21 rjmcnab
49	change to use has_unicode_letdig in text_t
50
51	Revision 1.21 1999/09/21 21:41:41 sjboddie
52	fixed an error in what I committed last
53
54	Revision 1.20 1999/09/21 11:59:26 sjboddie
55	added Maxdocs queryfilter option (which may be -1 for 'all)
56
57	Revision 1.19 1999/09/07 22:52:52 rjmcnab
58	Seems to be an error in mg for retrieving documents using a paragraph
59	based index for some cases. Just added a work around (loads the default
60	index every time).
61
62	Revision 1.18 1999/09/07 04:57:22 sjboddie
63	added gpl notice
64
65	Revision 1.17 1999/08/31 22:42:41 rjmcnab
66	A couple of minor things.
67
68	Revision 1.16 1999/08/25 04:51:06 sjboddie
69	small change to allow for searching using boolean operators
70
71	Revision 1.15 1999/07/16 08:35:03 rjmcnab
72	Fixed a weird bug to do with a faulty case statement.
73
74	Revision 1.14 1999/07/16 03:42:22 sjboddie
75	changed isApprox
76
77	Revision 1.13 1999/07/16 00:12:46 sjboddie
78	removed all the old post-processing stuff
79
80	Revision 1.12 1999/07/07 06:17:47 rjmcnab
81	broke search_index into index+subcollection+language
82	within mgsearch
83
84	Revision 1.11 1999/07/05 21:06:43 rjmcnab
85	Disabled quoted strings.
86
87	Revision 1.10 1999/07/01 09:29:19 rjmcnab
88	Changes for better reporting of number documents which match a query. Changes
89	should still work as before with older versions of mg.
90
91	Revision 1.9 1999/07/01 03:54:48 rjmcnab
92	Added code to plug in the equivalent terms of each of the query terms.
93	Also added a function to get a raw utf8 encoded mg document (for speeding
94	up a phrase matching function)
95
96	Revision 1.8 1999/06/30 04:04:12 rjmcnab
97	made stemming functions available from mgsearch and made the stems
98	for the query terms available in queryinfo
99
100	Revision 1.7 1999/06/27 22:07:27 sjboddie
101	got rid of all the old functions for dealing with dir indexes
102
103	Revision 1.6 1999/06/09 00:41:32 sjboddie
104	phrase searching now uses case-folding if it's turned on
105
106	Revision 1.5 1999/02/21 22:31:35 rjmcnab
107
108	Removed locateinfo.
109
110	Revision 1.4 1999/02/03 01:13:27 sjboddie
111
112	Got interface to handle subcollections and language subcollections -
113	committed changes made to some of the collections
114
115	Revision 1.3 1999/01/19 01:38:17 rjmcnab
116
117	Made the source more portable.
118
119	Revision 1.2 1999/01/12 01:51:02 rjmcnab
120
121	Standard header.
122
123	Revision 1.1 1999/01/08 09:02:16 rjmcnab
124
125	Moved from src/library.
126
127	*/
128
129	#include "gsdlconf.h"
130	#include "mgsearch.h"
131	#include "fileutil.h"
132
133	#include <string.h>
134	#include <stdio.h>
135	#include <stdlib.h>
136	#include <ctype.h>
137
138	#if defined(GSDL_USE_OBJECTSPACE)
139	# include <ospace\std\iostream>
140	#elif defined(GSDL_USE_IOS_H)
141	# include <iostream.h>
142	#else
143	# include <iostream>
144	#endif
145
146	#if defined(__WIN32__)
147	// gdbm stuff
148	# include "autoconf.h"
149	# include "systems.h"
150	# include "gdbmconst.h"
151	# include "gdbm.h"
152	#else
153	# include <gdbm.h>
154	#endif
155
156
157	#include <assert.h>
158
159	#include "mgq.h"
160	// #include "locateinfo.h"
161	#include "gsdlunicode.h"
162	#include "unitool.h"
163
164
165	/////////////
166	// globals //
167	/////////////
168
169	static char *tempdoc = NULL;
170	static int templen = 0;
171
172
173	//////////////////////
174	// useful functions //
175	//////////////////////
176
177
178	// input and output are in utf8
179	text_t mgsearch_stemword (const text_t &word) {
180	// allocate working stem space
181	int maxstemlen = mgq_getmaxstemlen ();
182	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
183	if (word_stem == NULL) return "";
184
185	// copy word to word_stem
186	int len = 0;
187	text_t::const_iterator here = word.begin();
188	text_t::const_iterator end = word.end();
189	while (len < maxstemlen && here != end) {
190	word_stem[len+1] = (unsigned char)(*here);
191	len++; here++;
192	}
193	word_stem[len+1] = '\0';
194	word_stem[0] = len;
195
196	mgq_stemword (word_stem);
197
198	// copy word_stem back to tempstr
199	text_t tempstr;
200	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
201
202	delete [] word_stem;
203
204	return tempstr;
205	}
206
207
208
209	////////////////////////
210	// callback functions //
211	////////////////////////
212
213	// This routine is called for each document found in a search
214	// it assumes that cache_num is set up correctly to point to
215	// a suitable result cache
216	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
217	float Weight, void *info) {
218
219
220	queryresultsclass queryresults = (queryresultsclass )info;
221
222	// append this entry to the document results
223	docresultclass docresult;
224	docresult.docnum = DocNum;
225	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
226	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
227
228	queryresults->docs.docset[DocNum] = docresult;
229	queryresults->docs.docorder.push_back(DocNum);
230
231	return 0;
232	}
233
234	int termequivcallback(char Word, int ULen, int /Freq*/,
235	float /Weight/, void *info) {
236	text_tset equivterms = (text_tset )info;
237	if (equivterms == NULL) return 0;
238
239	text_t thisterm;
240	thisterm.setcarr(Word, ULen);
241
242	equivterms->insert(thisterm);
243
244	return 0;
245	}
246
247
248	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
249	// allocate working stem space
250	int maxstemlen = mgq_getmaxstemlen ();
251	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
252	if (word_stem == NULL) return;
253
254	// copy word to word_stem
255	int len = 0;
256	text_t::const_iterator here = word.begin();
257	text_t::const_iterator end = word.end();
258	while (len < maxstemlen && here != end) {
259	word_stem[len+1] = (unsigned char)(*here);
260	len++; here++;
261	}
262	word_stem[len+1] = '\0';
263	word_stem[0] = len;
264
265	// get the equivalent terms
266	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
267
268	delete [] word_stem;
269
270	return;
271	}
272
273	text_tset utf8equivterms; // kept as utf8 string for fast matching
274
275
276	// This callback is called once for each term in the query
277	int termfreqcallback(char *Word, int ULen, int Freq,
278	float /Weight/, void *info) {
279	queryresultsclass queryresults = (queryresultsclass )info;
280	if (queryresults == NULL) return 0;
281
282	text_t term;
283	term.setcarr(Word, ULen);
284	termfreqclass termfreq;
285
286	termfreq.termstr = to_uni(term);
287	text_t utf8termstem = mgsearch_stemword (term);
288	termfreq.termstemstr = to_uni (utf8termstem);
289
290	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
291
292	termfreq.termfreq = Freq;
293	queryresults->orgterms.push_back(termfreq);
294
295	return 0;
296	}
297
298	// this callback is called once for each variation of each term
299	int termvariantscallback(char Word, int ULen, int /Freq*/,
300	float /Weight/, void *info) {
301
302	text_t term;
303	term.setcarr(Word, ULen);
304	queryresultsclass queryresults = (queryresultsclass )info;
305	queryresults->termvariants.insert(to_uni(term));
306
307	return 0;
308	}
309
310	// This callback is for getting document text
311	int doctextcallback(char Doc, int ULen, int /Freq*/,
312	float /Weight/, void * /info/) {
313	tempdoc = Doc;
314	templen = ULen;
315
316	return 0;
317	}
318
319
320	static text_t getindexsuffix (const text_t &collection,
321	const text_t &index) {
322
323	text_t indexsuffix = "index";
324	indexsuffix = filename_cat (indexsuffix, index);
325	indexsuffix = filename_cat (indexsuffix, collection);
326	return indexsuffix;
327	}
328
329
330
331
332	////////////////////
333	// mgsearch class //
334	////////////////////
335
336	mgsearchclass::mgsearchclass ()
337	: searchclass() {
338
339	}
340
341	mgsearchclass::~mgsearchclass ()
342	{
343	if (cache != NULL)
344	{
345	delete cache;
346	cache = NULL;
347	}
348	}
349
350	// you only need to use this function before doing any stemming
351	// casefolding and stemming will be set if values for them are
352	// provided (0 or 1).
353	// makeindexcurrent returns true if it was able to load the database
354	bool mgsearchclass::makeindexcurrent (const text_t &index,
355	const text_t &subcollection,
356	const text_t &language,
357	const text_t &collection,
358	int casefolding,
359	int stemming) {
360	bool databaseloaded = true;
361
362	// get the names of the collection, index and text suffixes
363	char *ccollection = collection.getcstr();
364	assert (ccollection != NULL);
365	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
366	assert (idxsuffix != NULL);
367	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
368	assert (txtsuffix != NULL);
369
370	#ifdef __WIN32__
371	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
372	#else
373	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
374	#endif
375
376	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
377	if (casefolding == 0) mgq_ask(".set casefold off");
378	else if (casefolding > 0) mgq_ask(".set casefold on");
379	if (stemming == 0) mgq_ask(".set stem off");
380	else if (stemming > 0) mgq_ask(".set stem on");
381
382	} else databaseloaded = false;
383
384	// free up the c strings
385	delete ccollection;
386	delete idxsuffix;
387	delete txtsuffix;
388	delete ccollectdir;
389
390	return databaseloaded;
391	}
392
393
394	// stem word uses the values set in the last call to makeindexcurrent
395	// to stem the word. It is assumed that word is in unicode
396	text_t mgsearchclass::stemword (const text_t &word) {
397	return to_uni (mgsearch_stemword (to_utf8 (word)));
398	}
399
400	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
401	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
402	}
403
404	/**
405	* search directs the whole execution of the search; a number of other
406	* functions in this class are called as a result, and precondition
407	* checks are also made
408	*/
409	bool mgsearchclass::search(const queryparamclass &queryparams,
410	queryresultsclass &queryresults) {
411	// assert (cache != NULL);
412
413	// clear any previous results
414	queryresults.clear();
415	// first check the cache
416	if (cache != NULL) {
417	if (cache->find(queryparams, queryresults)) return true;
418	}
419	// make sure there is a query to be processed
420	if (!has_unicode_letdig(queryparams.querystring)) return true;
421
422	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
423	queryparams.language, queryparams.collection)) {
424	// initialise the form of results
425	setsearchmode (queryparams);
426
427	// execute the query
428	submitquery (queryparams);
429
430	// retrieve the results
431	getresults (queryparams, queryresults);
432
433	return true;
434	}
435
436	return false;
437	}
438
439	/* accumulator_method has been changed to use array rather than list.
440	list appears to be broken somewhat - for some ranked queries, it returned
441	fewer results than it should have (eg 45 instead of 50). The three other
442	methods (array, splay_tree, hash_table) all return the same number of
443	documents, in the same order, with the same ranks. list returns what
444	appears to be the same documents (but less of them), but with different ranks,
445	and in a different order. Minimal time tests dont show any speed improvement
446	of list over array (maybe because its broken??). [02/2001, kjm18]
447
448	... [sjboddie, also 02/2001] turns out that changing the accumulator_method
449	introduced a more serious bug than it fixed (i.e. occasionally when doing a
450	ranked search for a very common word you get no results at all). I've
451	changed it back to list for now, one day we should play with other
452	accumulator_methods but for now I don't have time and don't want to risk
453	introducing bugs (better the devil you know ;)
454	*/
455	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
456	{
457	mgq_ask(".set expert true");
458	mgq_ask(".set sorted_terms true");
459	mgq_ask(".set accumulator_method list");
460	mgq_ask(".set max_accumulators 500000");
461	mgq_ask(".set maxparas 500000");
462	mgq_ask(".set verbatim true");
463	mgq_ask(".unset skip_dump");
464	mgq_ask(".set mode docnums");
465
466	switch (queryparams.search_type)
467	{
468	case 0: mgq_ask(".set query boolean"); break;
469	case 1: mgq_ask(".set query ranked"); break;
470	}
471	switch (queryparams.casefolding)
472	{
473	case 1: mgq_ask(".set casefold on"); break;
474	case 0: mgq_ask(".set casefold off"); break;
475	}
476	switch (queryparams.stemming)
477	{
478	case 1: mgq_ask(".set stem on"); break;
479	case 0: mgq_ask(".set stem off"); break;
480	}
481	mgq_ask(".set heads_length 150");
482
483	if (queryparams.maxdocs == -1) {
484	mgq_ask(".set maxdocs all");
485	} else {
486	char maxdocstr[32];
487	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
488	mgq_ask(maxdocstr);
489	}
490	}
491
492	/**
493	* submitquery constructs the query string (into UTF8 encoding)
494	* and submits it using mgq_ask to the mg search engine. Most
495	* of the processing will be done inside Greenstone
496	*/
497	void mgsearchclass::submitquery (const queryparamclass &queryparams)
498	{
499	// sort out the query string; copy it, remove all special characters
500	// and then convert it to a string in UTF8 format
501	text_t ttquerystring = queryparams.querystring;
502	filterquery (ttquerystring);
503	char *querystring = to_utf8(ttquerystring).getcstr();
504
505	// submit the query
506	mgq_ask(querystring);
507
508	// destroy the temporary character array
509	delete querystring;
510	}
511
512	/**
513	* getrults is called to retrieve the required data on the docs
514	* which responded to the query submitted in submitquery above.
515	*
516	* It calls the local mgquery (mgq) interface to MG several times,
517	* to obtain the document numbers, term frequencies, term variants
518	* etc. All processing of the query will be done by Greenstone
519	* thereafter
520	*/
521	void mgsearchclass::getresults (const queryparamclass &queryparams,
522	queryresultsclass &queryresults) {
523	// get the configuration for the maximum number of documents to
524	// retrieve
525	int howmany = queryparams.maxdocs;
526	if (howmany == -1) howmany = MAXNUMDOCS;
527	mgq_results(result_docnums, 0, howmany,
528	ourquerycallback, (void *)(&queryresults));
529
530	// get the term frequencies
531	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
532	termfreqcallback, (void *)(&queryresults));
533	queryresults.sortuniqqueryterms();
534
535	// get term variants
536	mgq_results(result_terms, 0, MAXNUMTERMS,
537	termvariantscallback, (void *)(&queryresults));
538
539	// get the number of documents retrieved
540	int total_retrieved = 0, is_approx = 0;
541	mgq_docsretrieved (&total_retrieved, &is_approx);
542
543	if (total_retrieved == 0) {
544	// not available (or really was zero)
545	queryresults.docs_matched = queryresults.docs.docset.size();
546	if ((queryparams.maxdocs == -1) \|\|
547	(queryresults.docs_matched < queryparams.maxdocs))
548	queryresults.is_approx = Exact;
549	else
550	queryresults.is_approx = MoreThan;
551	} else {
552	queryresults.docs_matched = total_retrieved;
553	if (is_approx) queryresults.is_approx = Approximate;
554	else queryresults.is_approx = Exact;
555	}
556	}
557
558	/**
559	* Tidies the given querystring, removing special characters
560	*/
561	void mgsearchclass::filterquery (text_t &ttquerystring) {
562	text_t::iterator ithere = ttquerystring.begin ();
563	text_t::iterator itend = ttquerystring.end ();
564
565	// remove all non alphanumeric characters (except
566	// boolean operators
567	while (ithere != itend) {
568	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
569	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
570	(ithere != ')')) (ithere) = ' ';
571	ithere++;
572	}
573	}
574
575
576	// the document text for 'docnum' is placed in 'output'
577	// docTargetDocument returns 'true' if it was able to
578	// try to get a document
579	// collection is needed to see if an index from the
580	// collection is loaded. If no index has been loaded
581	// defaultindex is needed to load one
582	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
583	const text_t &defaultsubcollection,
584	const text_t &defaultlanguage,
585	const text_t &collection,
586	int docnum,
587	text_t &output) {
588	output.clear();
589
590	// get the mg version of the document
591	char *mgdoc = NULL;
592	int doclen = 0;
593	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
594	collection, docnum, mgdoc, doclen)) return false;
595	if (mgdoc == NULL) return false;
596
597	// replace all control-Cs with spaces
598	char *mgdoc_here = mgdoc;
599	char *mgdoc_end = mgdoc + doclen;
600	while (mgdoc_here < mgdoc_end) {
601	if (mgdoc_here == '\x3') mgdoc_here = ' ';
602	mgdoc_here++;
603	}
604
605	// convert this document to unicode
606	utf8inconvertclass inconvert;
607	convertclass::status_t status;
608	inconvert.reset ();
609	inconvert.setinput (mgdoc, doclen);
610	inconvert.convert (output, status);
611
612	return true;
613	}
614
615
616	bool mgsearchclass::mgdocument (const text_t &defaultindex,
617	const text_t &defaultsubcollection,
618	const text_t &defaultlanguage,
619	const text_t &collection,
620	int docnum,
621	char *&UDoc, int &ULen) {
622	int databaseloaded = 0;
623
624	UDoc = NULL; ULen = 0;
625
626	// see if we can make an appropriate database current
627	// char *ccollection = collection.getcstr();
628	// assert (ccollection != NULL);
629	// databaseloaded = load_text_database (ccollection);
630	// delete ccollection;
631
632	// try and load the database
633	// if (!databaseloaded)
634	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
635	defaultlanguage, collection);
636
637	if (databaseloaded) {
638	// retrieve the document from mg
639	char docstr[32];
640	sprintf(docstr, "%i", docnum);
641
642	mgq_ask(".set mode text");
643	mgq_ask(".set query docnums");
644	mgq_ask(docstr);
645
646	tempdoc = NULL;
647	templen = 0;
648	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
649	UDoc = tempdoc;
650	ULen = templen;
651	}
652
653	return (bool)databaseloaded;
654	}
655

Note: See TracBrowser for help on using the repository browser.

Download in other formats: