Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1987

Last change on this file since 1987 was 1987, checked in by kjm18, 23 years ago
changed accumulator_method for mg to be array rather than list - it was getting some weird results with ranked searches
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 17.4 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 1987 2001-02-15 03:57:02Z kjm18 $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.29 2001/02/15 03:57:02 kjm18
31	changed accumulator_method for mg to be array rather than list - it was
32	getting some weird results with ranked searches
33
34	Revision 1.28 2001/01/25 18:26:44 cs025
35	Included CORBA branch for first time
36
37	Revision 1.22.2.1 2000/04/04 15:02:32 cs025
38	Corba first commit
39
40	Revision 1.22 1999/09/24 02:41:21 rjmcnab
41	change to use has_unicode_letdig in text_t
42
43	Revision 1.21 1999/09/21 21:41:41 sjboddie
44	fixed an error in what I committed last
45
46	Revision 1.20 1999/09/21 11:59:26 sjboddie
47	added Maxdocs queryfilter option (which may be -1 for 'all)
48
49	Revision 1.19 1999/09/07 22:52:52 rjmcnab
50	Seems to be an error in mg for retrieving documents using a paragraph
51	based index for some cases. Just added a work around (loads the default
52	index every time).
53
54	Revision 1.18 1999/09/07 04:57:22 sjboddie
55	added gpl notice
56
57	Revision 1.17 1999/08/31 22:42:41 rjmcnab
58	A couple of minor things.
59
60	Revision 1.16 1999/08/25 04:51:06 sjboddie
61	small change to allow for searching using boolean operators
62
63	Revision 1.15 1999/07/16 08:35:03 rjmcnab
64	Fixed a weird bug to do with a faulty case statement.
65
66	Revision 1.14 1999/07/16 03:42:22 sjboddie
67	changed isApprox
68
69	Revision 1.13 1999/07/16 00:12:46 sjboddie
70	removed all the old post-processing stuff
71
72	Revision 1.12 1999/07/07 06:17:47 rjmcnab
73	broke search_index into index+subcollection+language
74	within mgsearch
75
76	Revision 1.11 1999/07/05 21:06:43 rjmcnab
77	Disabled quoted strings.
78
79	Revision 1.10 1999/07/01 09:29:19 rjmcnab
80	Changes for better reporting of number documents which match a query. Changes
81	should still work as before with older versions of mg.
82
83	Revision 1.9 1999/07/01 03:54:48 rjmcnab
84	Added code to plug in the equivalent terms of each of the query terms.
85	Also added a function to get a raw utf8 encoded mg document (for speeding
86	up a phrase matching function)
87
88	Revision 1.8 1999/06/30 04:04:12 rjmcnab
89	made stemming functions available from mgsearch and made the stems
90	for the query terms available in queryinfo
91
92	Revision 1.7 1999/06/27 22:07:27 sjboddie
93	got rid of all the old functions for dealing with dir indexes
94
95	Revision 1.6 1999/06/09 00:41:32 sjboddie
96	phrase searching now uses case-folding if it's turned on
97
98	Revision 1.5 1999/02/21 22:31:35 rjmcnab
99
100	Removed locateinfo.
101
102	Revision 1.4 1999/02/03 01:13:27 sjboddie
103
104	Got interface to handle subcollections and language subcollections -
105	committed changes made to some of the collections
106
107	Revision 1.3 1999/01/19 01:38:17 rjmcnab
108
109	Made the source more portable.
110
111	Revision 1.2 1999/01/12 01:51:02 rjmcnab
112
113	Standard header.
114
115	Revision 1.1 1999/01/08 09:02:16 rjmcnab
116
117	Moved from src/library.
118
119	*/
120
121	#include "gsdlconf.h"
122	#include "mgsearch.h"
123	#include "fileutil.h"
124
125	#include <string.h>
126	#include <stdio.h>
127	#include <stdlib.h>
128	#include <ctype.h>
129
130	#if defined(GSDL_USE_OBJECTSPACE)
131	# include <ospace\std\iostream>
132	#elif defined(GSDL_USE_IOS_H)
133	# include <iostream.h>
134	#else
135	# include <iostream>
136	#endif
137
138	#if defined(__WIN32__)
139	// gdbm stuff
140	# include "autoconf.h"
141	# include "systems.h"
142	# include "gdbmconst.h"
143	# include "gdbm.h"
144	#else
145	# include <gdbm.h>
146	#endif
147
148
149	#include <assert.h>
150
151	#include "mgq.h"
152	// #include "locateinfo.h"
153	#include "gsdlunicode.h"
154	#include "unitool.h"
155
156
157	/////////////
158	// globals //
159	/////////////
160
161	static char *tempdoc = NULL; // document pointer most recently recorded by doctextcallback
162	static int templen = 0; // length recorded by doctextcallback together with tempdoc
163
164
165	//////////////////////
166	// useful functions //
167	//////////////////////
168
169
170	// input and output are in utf8
171	text_t mgsearch_stemword (const text_t &word) {
172	// allocate working stem space
173	int maxstemlen = mgq_getmaxstemlen ();
174	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
175	if (word_stem == NULL) return "";
176
177	// copy word to word_stem
178	int len = 0;
179	text_t::const_iterator here = word.begin();
180	text_t::const_iterator end = word.end();
181	while (len < maxstemlen && here != end) {
182	word_stem[len+1] = (unsigned char)(*here);
183	len++; here++;
184	}
185	word_stem[len+1] = '\0';
186	word_stem[0] = len;
187
188	mgq_stemword (word_stem);
189
190	// copy word_stem back to tempstr
191	text_t tempstr;
192	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
193
194	delete [] word_stem;
195
196	return tempstr;
197	}
198
199
200
201	////////////////////////
202	// callback functions //
203	////////////////////////
204
205	// This routine is called for each document found in a search
206	// it assumes that cache_num is set up correctly to point to
207	// a suitable result cache
208	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
209	float Weight, void *info) {
210
211
212	queryresultsclass queryresults = (queryresultsclass )info;
213
214	// append this entry to the document results
215	docresultclass docresult;
216	docresult.docnum = DocNum;
217	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
218	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
219
220	queryresults->docs.docset[DocNum] = docresult;
221	queryresults->docs.docorder.push_back(DocNum);
222
223	return 0;
224	}
225
226	int termequivcallback(char Word, int ULen, int /Freq*/,
227	float /Weight/, void *info) {
228	text_tset equivterms = (text_tset )info;
229	if (equivterms == NULL) return 0;
230
231	text_t thisterm;
232	thisterm.setcarr(Word, ULen);
233
234	equivterms->insert(thisterm);
235
236	return 0;
237	}
238
239
240	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
241	// allocate working stem space
242	int maxstemlen = mgq_getmaxstemlen ();
243	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
244	if (word_stem == NULL) return;
245
246	// copy word to word_stem
247	int len = 0;
248	text_t::const_iterator here = word.begin();
249	text_t::const_iterator end = word.end();
250	while (len < maxstemlen && here != end) {
251	word_stem[len+1] = (unsigned char)(*here);
252	len++; here++;
253	}
254	word_stem[len+1] = '\0';
255	word_stem[0] = len;
256
257	// get the equivalent terms
258	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
259
260	delete [] word_stem;
261
262	return;
263	}
264
// NOTE(review): no function visible in this file reads or writes this
// global; termfreqcallback fills the utf8equivterms member of a
// termfreqclass instead -- confirm whether other files still use it.
265	text_tset utf8equivterms; // kept as utf8 string for fast matching
266
267
268	// This callback is called once for each term in the query
269	int termfreqcallback(char *Word, int ULen, int Freq,
270	float /Weight/, void *info) {
271	queryresultsclass queryresults = (queryresultsclass )info;
272	if (queryresults == NULL) return 0;
273
274	text_t term;
275	term.setcarr(Word, ULen);
276	termfreqclass termfreq;
277
278	termfreq.termstr = to_uni(term);
279	text_t utf8termstem = mgsearch_stemword (term);
280	termfreq.termstemstr = to_uni (utf8termstem);
281
282	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
283
284	termfreq.termfreq = Freq;
285	queryresults->orgterms.push_back(termfreq);
286
287	return 0;
288	}
289
290	// this callback is called once for each variation of each term
291	int termvariantscallback(char Word, int ULen, int /Freq*/,
292	float /Weight/, void *info) {
293
294	text_t term;
295	term.setcarr(Word, ULen);
296	queryresultsclass queryresults = (queryresultsclass )info;
297	queryresults->termvariants.insert(to_uni(term));
298
299	return 0;
300	}
301
302	// This callback is for getting document text
303	int doctextcallback(char Doc, int ULen, int /Freq*/,
304	float /Weight/, void * /info/) {
305	tempdoc = Doc;
306	templen = ULen;
307
308	return 0;
309	}
310
311
312	static text_t getindexsuffix (const text_t &collection,
313	const text_t &index) {
314
315	text_t indexsuffix = "index";
316	indexsuffix = filename_cat (indexsuffix, index);
317	indexsuffix = filename_cat (indexsuffix, collection);
318	return indexsuffix;
319	}
320
321
322
323
324	////////////////////
325	// mgsearch class //
326	////////////////////
327
328	mgsearchclass::mgsearchclass ()
329	: searchclass() {
330
331	}
332
333	mgsearchclass::~mgsearchclass ()
334	{
335	if (cache != NULL)
336	{
337	delete cache;
338	cache = NULL;
339	}
340	}
341
342	// you only need to use this function before doing any stemming
343	// casefolding and stemming will be set if values for them are
344	// provided (0 or 1).
345	// makeindexcurrent returns true if it was able to load the database
346	bool mgsearchclass::makeindexcurrent (const text_t &index,
347	const text_t &subcollection,
348	const text_t &language,
349	const text_t &collection,
350	int casefolding,
351	int stemming) {
352	bool databaseloaded = true;
353
354	// get the names of the collection, index and text suffixes
355	char *ccollection = collection.getcstr();
356	assert (ccollection != NULL);
357	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
358	assert (idxsuffix != NULL);
359	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
360	assert (txtsuffix != NULL);
361
362	#ifdef __WIN32__
363	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
364	#else
365	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
366	#endif
367
368	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
369	if (casefolding == 0) mgq_ask(".set casefold off");
370	else if (casefolding > 0) mgq_ask(".set casefold on");
371	if (stemming == 0) mgq_ask(".set stem off");
372	else if (stemming > 0) mgq_ask(".set stem on");
373
374	} else databaseloaded = false;
375
376	// free up the c strings
377	delete ccollection;
378	delete idxsuffix;
379	delete txtsuffix;
380	delete ccollectdir;
381
382	return databaseloaded;
383	}
384
385
386	// stem word uses the values set in the last call to makeindexcurrent
387	// to stem the word. It is assumed that word is in unicode
388	text_t mgsearchclass::stemword (const text_t &word) {
389	return to_uni (mgsearch_stemword (to_utf8 (word)));
390	}
391
392	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
393	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
394	}
395
396	/**
397	* search directs the whole execution of the search; a number of other
398	* functions in this class are called as a result, and precondition
399	* checks are also made
400	*/
401	bool mgsearchclass::search(const queryparamclass &queryparams,
402	queryresultsclass &queryresults) {
403	// assert (cache != NULL);
404
405	// clear any previous results
406	queryresults.clear();
407	// first check the cache
408	if (cache != NULL) {
409	if (cache->find(queryparams, queryresults)) return true;
410	}
411	// make sure there is a query to be processed
412	if (!has_unicode_letdig(queryparams.querystring)) return true;
413
414	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
415	queryparams.language, queryparams.collection)) {
416	// initialise the form of results
417	setsearchmode (queryparams);
418
419	// execute the query
420	submitquery (queryparams);
421
422	// retrieve the results
423	getresults (queryparams, queryresults);
424
425	return true;
426	}
427
428	return false;
429	}
430
431
432	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
433	{
434	mgq_ask(".set expert true");
435	mgq_ask(".set sorted_terms true");
436	mgq_ask(".set accumulator_method array");
437	mgq_ask(".set max_accumulators 500000");
438	mgq_ask(".set maxparas 500000");
439	mgq_ask(".set verbatim true");
440	mgq_ask(".unset skip_dump");
441	mgq_ask(".set mode docnums");
442
443	switch (queryparams.search_type)
444	{
445	case 0: mgq_ask(".set query boolean"); break;
446	case 1: mgq_ask(".set query ranked"); break;
447	}
448	switch (queryparams.casefolding)
449	{
450	case 1: mgq_ask(".set casefold on"); break;
451	case 0: mgq_ask(".set casefold off"); break;
452	}
453	switch (queryparams.stemming)
454	{
455	case 1: mgq_ask(".set stem on"); break;
456	case 0: mgq_ask(".set stem off"); break;
457	}
458	mgq_ask(".set heads_length 150");
459
460	if (queryparams.maxdocs == -1) {
461	mgq_ask(".set maxdocs all");
462	} else {
463	char maxdocstr[32];
464	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
465	mgq_ask(maxdocstr);
466	}
467	}
468
469	/**
470	* submitquery constructs the query string (into UTF8 encoding)
471	* and submits it using mgq_ask to the mg search engine. Most
472	* of the processing will be done inside Greenstone
473	*/
474	void mgsearchclass::submitquery (const queryparamclass &queryparams)
475	{
476	// sort out the query string; copy it, remove all special characters
477	// and then convert it to a string in UTF8 format
478	text_t ttquerystring = queryparams.querystring;
479	filterquery (ttquerystring);
480	char *querystring = to_utf8(ttquerystring).getcstr();
481
482	// submit the query
483	mgq_ask(querystring);
484
485	// destroy the temporary character array
486	delete querystring;
487	}
488
489	/**
490	* getrults is called to retrieve the required data on the docs
491	* which responded to the query submitted in submitquery above.
492	*
493	* It calls the local mgquery (mgq) interface to MG several times,
494	* to obtain the document numbers, term frequencies, term variants
495	* etc. All processing of the query will be done by Greenstone
496	* thereafter
497	*/
498	void mgsearchclass::getresults (const queryparamclass &queryparams,
499	queryresultsclass &queryresults) {
500	// get the configuration for the maximum number of documents to
501	// retrieve
502	int howmany = queryparams.maxdocs;
503	if (howmany == -1) howmany = MAXNUMDOCS;
504	mgq_results(result_docnums, 0, howmany,
505	ourquerycallback, (void *)(&queryresults));
506
507	// get the term frequencies
508	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
509	termfreqcallback, (void *)(&queryresults));
510	queryresults.sortuniqqueryterms();
511
512	// get term variants
513	mgq_results(result_terms, 0, MAXNUMTERMS,
514	termvariantscallback, (void *)(&queryresults));
515
516	// get the number of documents retrieved
517	int total_retrieved = 0, is_approx = 0;
518	mgq_docsretrieved (&total_retrieved, &is_approx);
519
520	if (total_retrieved == 0) {
521	// not available (or really was zero)
522	queryresults.docs_matched = queryresults.docs.docset.size();
523	if ((queryparams.maxdocs == -1) \|\|
524	(queryresults.docs_matched < queryparams.maxdocs))
525	queryresults.is_approx = Exact;
526	else
527	queryresults.is_approx = MoreThan;
528	} else {
529	queryresults.docs_matched = total_retrieved;
530	if (is_approx) queryresults.is_approx = Approximate;
531	else queryresults.is_approx = Exact;
532	}
533	}
534
535	/**
536	* Tidies the given querystring, removing special characters
537	*/
538	void mgsearchclass::filterquery (text_t &ttquerystring) {
539	text_t::iterator ithere = ttquerystring.begin ();
540	text_t::iterator itend = ttquerystring.end ();
541
542	// remove all non alphanumeric characters (except
543	// boolean operators
544	while (ithere != itend) {
545	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
546	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
547	(ithere != ')')) (ithere) = ' ';
548	ithere++;
549	}
550	}
551
552
553	// the document text for 'docnum' is placed in 'output'
554	// docTargetDocument returns 'true' if it was able to
555	// try to get a document
556	// collection is needed to see if an index from the
557	// collection is loaded. If no index has been loaded
558	// defaultindex is needed to load one
559	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
560	const text_t &defaultsubcollection,
561	const text_t &defaultlanguage,
562	const text_t &collection,
563	int docnum,
564	text_t &output) {
565	output.clear();
566
567	// get the mg version of the document
568	char *mgdoc = NULL;
569	int doclen = 0;
570	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
571	collection, docnum, mgdoc, doclen)) return false;
572	if (mgdoc == NULL) return false;
573
574	// replace all control-Cs with spaces
575	char *mgdoc_here = mgdoc;
576	char *mgdoc_end = mgdoc + doclen;
577	while (mgdoc_here < mgdoc_end) {
578	if (mgdoc_here == '\x3') mgdoc_here = ' ';
579	mgdoc_here++;
580	}
581
582	// convert this document to unicode
583	utf8inconvertclass inconvert;
584	convertclass::status_t status;
585	inconvert.reset ();
586	inconvert.setinput (mgdoc, doclen);
587	inconvert.convert (output, status);
588
589	return true;
590	}
591
592
593	bool mgsearchclass::mgdocument (const text_t &defaultindex,
594	const text_t &defaultsubcollection,
595	const text_t &defaultlanguage,
596	const text_t &collection,
597	int docnum,
598	char *&UDoc, int &ULen) {
599	int databaseloaded = 0;
600
601	UDoc = NULL; ULen = 0;
602
603	// see if we can make an appropriate database current
604	// char *ccollection = collection.getcstr();
605	// assert (ccollection != NULL);
606	// databaseloaded = load_text_database (ccollection);
607	// delete ccollection;
608
609	// try and load the database
610	// if (!databaseloaded)
611	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
612	defaultlanguage, collection);
613
614	if (databaseloaded) {
615	// retrieve the document from mg
616	char docstr[32];
617	sprintf(docstr, "%i", docnum);
618
619	mgq_ask(".set mode text");
620	mgq_ask(".set query docnums");
621	mgq_ask(docstr);
622
623	tempdoc = NULL;
624	templen = 0;
625	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
626	UDoc = tempdoc;
627	ULen = templen;
628	}
629
630	return (bool)databaseloaded;
631	}
632

Note: See TracBrowser for help on using the repository browser.

Download in other formats: