Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 539

Last change on this file since 539 was 539, checked in by rjmcnab, 25 years ago
Seems to be an error in mg for retrieving documents using a paragraph based index for some cases. Just added a work around (loads the default index every time).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 539 1999-09-07 22:52:52Z rjmcnab $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.19 1999/09/07 22:52:52 rjmcnab
31	Seems to be an error in mg for retrieving documents using a paragraph
32	based index for some cases. Just added a work around (loads the default
33	index every time).
34
35	Revision 1.18 1999/09/07 04:57:22 sjboddie
36	added gpl notice
37
38	Revision 1.17 1999/08/31 22:42:41 rjmcnab
39	A couple of minor things.
40
41	Revision 1.16 1999/08/25 04:51:06 sjboddie
42	small change to allow for searching using boolean operators
43
44	Revision 1.15 1999/07/16 08:35:03 rjmcnab
45	Fixed a weird bug to do with a faulty case statement.
46
47	Revision 1.14 1999/07/16 03:42:22 sjboddie
48	changed isApprox
49
50	Revision 1.13 1999/07/16 00:12:46 sjboddie
51	removed all the old post-processing stuff
52
53	Revision 1.12 1999/07/07 06:17:47 rjmcnab
54	broke search_index into index+subcollection+language
55	within mgsearch
56
57	Revision 1.11 1999/07/05 21:06:43 rjmcnab
58	Disabled quoted strings.
59
60	Revision 1.10 1999/07/01 09:29:19 rjmcnab
61	Changes for better reporting of number documents which match a query. Changes
62	should still work as before with older versions of mg.
63
64	Revision 1.9 1999/07/01 03:54:48 rjmcnab
65	Added code to plug in the equivalent terms of each of the query terms.
66	Also added a function to get a raw utf8 encoded mg document (for speeding
67	up a phrase matching function)
68
69	Revision 1.8 1999/06/30 04:04:12 rjmcnab
70	made stemming functions available from mgsearch and made the stems
71	for the query terms available in queryinfo
72
73	Revision 1.7 1999/06/27 22:07:27 sjboddie
74	got rid of all the old functions for dealing with dir indexes
75
76	Revision 1.6 1999/06/09 00:41:32 sjboddie
77	phrase searching now uses case-folding if it's turned on
78
79	Revision 1.5 1999/02/21 22:31:35 rjmcnab
80
81	Removed locateinfo.
82
83	Revision 1.4 1999/02/03 01:13:27 sjboddie
84
85	Got interface to handle subcollections and language subcollections -
86	committed changes made to some of the collections
87
88	Revision 1.3 1999/01/19 01:38:17 rjmcnab
89
90	Made the source more portable.
91
92	Revision 1.2 1999/01/12 01:51:02 rjmcnab
93
94	Standard header.
95
96	Revision 1.1 1999/01/08 09:02:16 rjmcnab
97
98	Moved from src/library.
99
100	*/
101
102
103	#include "gsdlconf.h"
104	#include "mgsearch.h"
105	#include "fileutil.h"
106
107	#include <string.h>
108	#include <stdio.h>
109	#include <stdlib.h>
110	#include <ctype.h>
111
112	#if defined(GSDL_USE_OBJECTSPACE)
113	# include <ospace\std\iostream>
114	#elif defined(GSDL_USE_IOS_H)
115	# include <iostream.h>
116	#else
117	# include <iostream>
118	#endif
119
120	#if defined(__WIN32__)
121	// gdbm stuff
122	# include "autoconf.h"
123	# include "systems.h"
124	# include "gdbmconst.h"
125	# include "gdbm.h"
126	#else
127	# include <gdbm.h>
128	#endif
129
130
131	#include <assert.h>
132
133	#include "mgq.h"
134	// #include "locateinfo.h"
135	#include "gsdlunicode.h"
136	#include "unitool.h"
137
138
139	/////////////
140	// globals //
141	/////////////
142
// Length of the document text most recently reported to doctextcallback.
static int templen = 0;

// Pointer to the document text most recently reported to doctextcallback.
// doctextcallback writes both values; mgsearchclass::mgdocument resets them
// before a retrieval and reads them back afterwards.
static char *tempdoc = NULL;
145
146
147	//////////////////////
148	// useful functions //
149	//////////////////////
150
151
152	// input and output are in utf8
153	text_t mgsearch_stemword (const text_t &word) {
154	// allocate working stem space
155	int maxstemlen = mgq_getmaxstemlen ();
156	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
157	if (word_stem == NULL) return "";
158
159	// copy word to word_stem
160	int len = 0;
161	text_t::const_iterator here = word.begin();
162	text_t::const_iterator end = word.end();
163	while (len < maxstemlen && here != end) {
164	word_stem[len+1] = (unsigned char)(*here);
165	len++; here++;
166	}
167	word_stem[len+1] = '\0';
168	word_stem[0] = len;
169
170	mgq_stemword (word_stem);
171
172	// copy word_stem back to tempstr
173	text_t tempstr;
174	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
175
176	delete [] word_stem;
177
178	return tempstr;
179	}
180
181
182
183	////////////////////////
184	// callback functions //
185	////////////////////////
186
187	// This routine is called for each document found in a search
188	// it assumes that cache_num is set up correctly to point to
189	// a suitable result cache
190	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
191	float Weight, void *info) {
192
193
194	queryresultsclass queryresults = (queryresultsclass )info;
195
196	// append this entry to the document results
197	docresultclass docresult;
198	docresult.docnum = DocNum;
199	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
200	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
201
202	queryresults->docs.docset[DocNum] = docresult;
203	queryresults->docs.docorder.push_back(DocNum);
204
205	return 0;
206	}
207
208	int termequivcallback(char Word, int ULen, int /Freq*/,
209	float /Weight/, void *info) {
210	text_tset equivterms = (text_tset )info;
211	if (equivterms == NULL) return 0;
212
213	text_t thisterm;
214	thisterm.setcarr(Word, ULen);
215
216	equivterms->insert(thisterm);
217
218	return 0;
219	}
220
221
222	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
223	// allocate working stem space
224	int maxstemlen = mgq_getmaxstemlen ();
225	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
226	if (word_stem == NULL) return;
227
228	// copy word to word_stem
229	int len = 0;
230	text_t::const_iterator here = word.begin();
231	text_t::const_iterator end = word.end();
232	while (len < maxstemlen && here != end) {
233	word_stem[len+1] = (unsigned char)(*here);
234	len++; here++;
235	}
236	word_stem[len+1] = '\0';
237	word_stem[0] = len;
238
239	// get the equivalent terms
240	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
241
242	delete [] word_stem;
243
244	return;
245	}
246
247	text_tset utf8equivterms; // kept as utf8 string for fast matching
248
249
250	// This callback is called once for each term in the query
251	int termfreqcallback(char *Word, int ULen, int Freq,
252	float /Weight/, void *info) {
253	queryresultsclass queryresults = (queryresultsclass )info;
254	if (queryresults == NULL) return 0;
255
256	text_t term;
257	term.setcarr(Word, ULen);
258	termfreqclass termfreq;
259
260	termfreq.termstr = to_uni(term);
261	text_t utf8termstem = mgsearch_stemword (term);
262	termfreq.termstemstr = to_uni (utf8termstem);
263
264	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
265
266	termfreq.termfreq = Freq;
267	queryresults->orgterms.push_back(termfreq);
268
269	return 0;
270	}
271
272	// this callback is called once for each variation of each term
273	int termvariantscallback(char Word, int ULen, int /Freq*/,
274	float /Weight/, void *info) {
275
276	text_t term;
277	term.setcarr(Word, ULen);
278	queryresultsclass queryresults = (queryresultsclass )info;
279	queryresults->termvariants.insert(to_uni(term));
280
281	return 0;
282	}
283
284	// This callback is for getting document text
285	int doctextcallback(char Doc, int ULen, int /Freq*/,
286	float /Weight/, void * /info/) {
287	tempdoc = Doc;
288	templen = ULen;
289
290	return 0;
291	}
292
293
294	static text_t getindexsuffix (const text_t &collection,
295	const text_t &index) {
296
297	text_t indexsuffix = "index";
298	indexsuffix = filename_cat (indexsuffix, index);
299	indexsuffix = filename_cat (indexsuffix, collection);
300	return indexsuffix;
301	}
302
303
304
305
306	////////////////////
307	// mgsearch class //
308	////////////////////
309
310	mgsearchclass::mgsearchclass ()
311	{
312	cache = new querycache (RESULTCACHESIZE);
313	}
314
315	mgsearchclass::~mgsearchclass ()
316	{
317	if (cache != NULL)
318	{
319	delete cache;
320	cache = NULL;
321	}
322	}
323
324
325	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
326	{
327	collectdir = thecollectdir;
328	}
329
330	// you only need to use this function before doing any stemming
331	// casefolding and stemming will be set if values for them are
332	// provided (0 or 1).
333	// makeindexcurrent returns true if it was able to load the database
334	bool mgsearchclass::makeindexcurrent (const text_t &index,
335	const text_t &subcollection,
336	const text_t &language,
337	const text_t &collection,
338	int casefolding,
339	int stemming) {
340	bool databaseloaded = true;
341
342	// get the names of the collection, index and text suffixes
343	char *ccollection = collection.getcstr();
344	assert (ccollection != NULL);
345	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
346	assert (idxsuffix != NULL);
347	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
348	assert (txtsuffix != NULL);
349
350	#ifdef __WIN32__
351	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
352	#else
353	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
354	#endif
355
356	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
357	if (casefolding == 0) mgq_ask(".set casefold off");
358	else if (casefolding > 0) mgq_ask(".set casefold on");
359	if (stemming == 0) mgq_ask(".set stem off");
360	else if (stemming > 0) mgq_ask(".set stem on");
361
362	} else databaseloaded = false;
363
364	// free up the c strings
365	delete ccollection;
366	delete idxsuffix;
367	delete txtsuffix;
368	delete ccollectdir;
369
370	return databaseloaded;
371	}
372
373
374	// stem word uses the values set in the last call to makeindexcurrent
375	// to stem the word. It is assumed that word is in unicode
376	text_t mgsearchclass::stemword (const text_t &word) {
377	return to_uni (mgsearch_stemword (to_utf8 (word)));
378	}
379
380	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
381	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
382	}
383
384
385	bool mgsearchclass::search(const queryparamclass &queryparams,
386	queryresultsclass &queryresults) {
387	assert (cache != NULL);
388
389	queryresults.clear();
390
391	// first check the cache
392	if (cache->find(queryparams, queryresults)) return true;
393
394	// make sure there is a query to be processed
395	text_t::const_iterator queryhere = queryparams.querystring.begin();
396	text_t::const_iterator queryend = queryparams.querystring.end();
397	while (queryhere != queryend) {
398	if (is_unicode_letdig (*queryhere)) break;
399	queryhere++;
400	}
401
402	// if we reached the end of the query string without finding
403	// any alphanumeric characters then return no results (and say
404	// the database was loaded)
405	if (queryhere == queryend) return true;
406
407	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
408	queryparams.language, queryparams.collection)) {
409	setsearchmode (queryparams);
410	submitquery (queryparams);
411	getresults (queryparams, queryresults);
412	return true;
413	}
414
415	return false;
416	}
417
418
419	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
420	{
421	mgq_ask(".set expert true");
422	mgq_ask(".set sorted_terms true");
423	mgq_ask(".set accumulator_method list");
424	mgq_ask(".set max_accumulators 500000");
425	mgq_ask(".set maxparas 500000");
426	mgq_ask(".set verbatim true");
427	mgq_ask(".unset skip_dump");
428	mgq_ask(".set mode docnums");
429
430	switch (queryparams.search_type)
431	{
432	case 0: mgq_ask(".set query boolean"); break;
433	case 1: mgq_ask(".set query ranked"); break;
434	}
435	switch (queryparams.casefolding)
436	{
437	case 1: mgq_ask(".set casefold on"); break;
438	case 0: mgq_ask(".set casefold off"); break;
439	}
440	switch (queryparams.stemming)
441	{
442	case 1: mgq_ask(".set stem on"); break;
443	case 0: mgq_ask(".set stem off"); break;
444	}
445	mgq_ask(".set heads_length 150");
446
447	if (queryparams.maxdocs == -1) {
448	mgq_ask(".set maxdocs all");
449	} else {
450	char maxdocstr[32];
451	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
452	mgq_ask(maxdocstr);
453	}
454	}
455
456
457	void mgsearchclass::submitquery (const queryparamclass &queryparams)
458	{
459	// sort out the query string
460	text_t ttquerystring = queryparams.querystring;
461	filterquery (ttquerystring);
462	char *querystring = to_utf8(ttquerystring).getcstr();
463
464	// submit the query
465	mgq_ask(querystring);
466
467	delete querystring;
468	}
469
470
471	void mgsearchclass::getresults (const queryparamclass &queryparams,
472	queryresultsclass &queryresults) {
473
474	mgq_results(result_docnums, 0, MAXNUMDOCS,
475	ourquerycallback, (void *)(&queryresults));
476
477	// get the term frequencies
478	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
479	termfreqcallback, (void *)(&queryresults));
480	queryresults.sortuniqqueryterms();
481
482	// get term variants
483	mgq_results(result_terms, 0, MAXNUMTERMS,
484	termvariantscallback, (void *)(&queryresults));
485
486	// get the number of documents retrieved
487	int total_retrieved = 0, is_approx = 0;
488	mgq_docsretrieved (&total_retrieved, &is_approx);
489
490	if (total_retrieved == 0) {
491	// not available (or really was zero)
492	queryresults.docs_matched = queryresults.docs.docset.size();
493	if (queryresults.docs_matched < queryparams.maxdocs)
494	queryresults.is_approx = Exact;
495	else
496	queryresults.is_approx = MoreThan;
497	} else {
498	queryresults.docs_matched = total_retrieved;
499	if (is_approx) queryresults.is_approx = Approximate;
500	else queryresults.is_approx = Exact;
501	}
502	}
503
504	void mgsearchclass::filterquery (text_t &ttquerystring) {
505	text_t::iterator ithere = ttquerystring.begin ();
506	text_t::iterator itend = ttquerystring.end ();
507
508	// remove all non alphanumeric characters (except
509	// boolean operators
510	while (ithere != itend) {
511	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
512	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
513	(ithere != ')')) (ithere) = ' ';
514	ithere++;
515	}
516	}
517
518
519	// the document text for 'docnum' is placed in 'output'
520	// docTargetDocument returns 'true' if it was able to
521	// try to get a document
522	// collection is needed to see if an index from the
523	// collection is loaded. If no index has been loaded
524	// defaultindex is needed to load one
525	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
526	const text_t &defaultsubcollection,
527	const text_t &defaultlanguage,
528	const text_t &collection,
529	int docnum,
530	text_t &output) {
531	output.clear();
532
533	// get the mg version of the document
534	char *mgdoc = NULL;
535	int doclen = 0;
536	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
537	collection, docnum, mgdoc, doclen)) return false;
538	if (mgdoc == NULL) return false;
539
540	// replace all control-Cs with spaces
541	char *mgdoc_here = mgdoc;
542	char *mgdoc_end = mgdoc + doclen;
543	while (mgdoc_here < mgdoc_end) {
544	if (mgdoc_here == '\x3') mgdoc_here = ' ';
545	mgdoc_here++;
546	}
547
548	// convert this document to unicode
549	utf8inconvertclass inconvert;
550	convertclass::status_t status;
551	inconvert.reset ();
552	inconvert.setinput (mgdoc, doclen);
553	inconvert.convert (output, status);
554
555	return true;
556	}
557
558
559	bool mgsearchclass::mgdocument (const text_t &defaultindex,
560	const text_t &defaultsubcollection,
561	const text_t &defaultlanguage,
562	const text_t &collection,
563	int docnum,
564	char *&UDoc, int &ULen) {
565	int databaseloaded = 0;
566
567	UDoc = NULL; ULen = 0;
568
569	// see if we can make an appropriate database current
570	// char *ccollection = collection.getcstr();
571	// assert (ccollection != NULL);
572	// databaseloaded = load_text_database (ccollection);
573	// delete ccollection;
574
575	// try and load the database
576	// if (!databaseloaded)
577	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
578	defaultlanguage, collection);
579
580	if (databaseloaded) {
581	// retrieve the document from mg
582	char docstr[32];
583	sprintf(docstr, "%i", docnum);
584
585	mgq_ask(".set mode text");
586	mgq_ask(".set query docnums");
587	mgq_ask(docstr);
588
589	tempdoc = NULL;
590	templen = 0;
591	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
592	UDoc = tempdoc;
593	ULen = templen;
594	}
595
596	return (bool)databaseloaded;
597	}
598

Note: See TracBrowser for help on using the repository browser.

Download in other formats: