Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 615

Last change on this file since 615 was 615, checked in by sjboddie, 25 years ago
fixed an error in what I committed last
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 16.4 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 615 1999-09-21 21:41:41Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.21 1999/09/21 21:41:41 sjboddie
31	fixed an error in what I committed last
32
33	Revision 1.20 1999/09/21 11:59:26 sjboddie
34	added Maxdocs queryfilter option (which may be -1 for 'all)
35
36	Revision 1.19 1999/09/07 22:52:52 rjmcnab
37	Seems to be an error in mg for retrieving documents using a paragraph
38	based index for some cases. Just added a work around (loads the default
39	index every time).
40
41	Revision 1.18 1999/09/07 04:57:22 sjboddie
42	added gpl notice
43
44	Revision 1.17 1999/08/31 22:42:41 rjmcnab
45	A couple of minor things.
46
47	Revision 1.16 1999/08/25 04:51:06 sjboddie
48	small change to allow for searching using boolean operators
49
50	Revision 1.15 1999/07/16 08:35:03 rjmcnab
51	Fixed a weird bug to do with a faulty case statement.
52
53	Revision 1.14 1999/07/16 03:42:22 sjboddie
54	changed isApprox
55
56	Revision 1.13 1999/07/16 00:12:46 sjboddie
57	removed all the old post-processing stuff
58
59	Revision 1.12 1999/07/07 06:17:47 rjmcnab
60	broke search_index into index+subcollection+language
61	within mgsearch
62
63	Revision 1.11 1999/07/05 21:06:43 rjmcnab
64	Disabled quoted strings.
65
66	Revision 1.10 1999/07/01 09:29:19 rjmcnab
67	Changes for better reporting of number documents which match a query. Changes
68	should still work as before with older versions of mg.
69
70	Revision 1.9 1999/07/01 03:54:48 rjmcnab
71	Added code to plug in the equivalent terms of each of the query terms.
72	Also added a function to get a raw utf8 encoded mg document (for speeding
73	up a phrase matching function)
74
75	Revision 1.8 1999/06/30 04:04:12 rjmcnab
76	made stemming functions available from mgsearch and made the stems
77	for the query terms available in queryinfo
78
79	Revision 1.7 1999/06/27 22:07:27 sjboddie
80	got rid of all the old functions for dealing with dir indexes
81
82	Revision 1.6 1999/06/09 00:41:32 sjboddie
83	phrase searching now uses case-folding if it's turned on
84
85	Revision 1.5 1999/02/21 22:31:35 rjmcnab
86
87	Removed locateinfo.
88
89	Revision 1.4 1999/02/03 01:13:27 sjboddie
90
91	Got interface to handle subcollections and language subcollections -
92	committed changes made to some of the collections
93
94	Revision 1.3 1999/01/19 01:38:17 rjmcnab
95
96	Made the source more portable.
97
98	Revision 1.2 1999/01/12 01:51:02 rjmcnab
99
100	Standard header.
101
102	Revision 1.1 1999/01/08 09:02:16 rjmcnab
103
104	Moved from src/library.
105
106	*/
107
108
109	#include "gsdlconf.h"
110	#include "mgsearch.h"
111	#include "fileutil.h"
112
113	#include <string.h>
114	#include <stdio.h>
115	#include <stdlib.h>
116	#include <ctype.h>
117
118	#if defined(GSDL_USE_OBJECTSPACE)
119	# include <ospace\std\iostream>
120	#elif defined(GSDL_USE_IOS_H)
121	# include <iostream.h>
122	#else
123	# include <iostream>
124	#endif
125
126	#if defined(__WIN32__)
127	// gdbm stuff
128	# include "autoconf.h"
129	# include "systems.h"
130	# include "gdbmconst.h"
131	# include "gdbm.h"
132	#else
133	# include <gdbm.h>
134	#endif
135
136
137	#include <assert.h>
138
139	#include "mgq.h"
140	// #include "locateinfo.h"
141	#include "gsdlunicode.h"
142	#include "unitool.h"
143
144
145	/////////////
146	// globals //
147	/////////////
148
149	static char *tempdoc = NULL; // document pointer last stored by doctextcallback; read back by mgsearchclass::mgdocument
150	static int templen = 0; // length stored by doctextcallback together with tempdoc
151
152
153	//////////////////////
154	// useful functions //
155	//////////////////////
156
157
158	// input and output are in utf8
159	text_t mgsearch_stemword (const text_t &word) {
160	// allocate working stem space
161	int maxstemlen = mgq_getmaxstemlen ();
162	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
163	if (word_stem == NULL) return "";
164
165	// copy word to word_stem
166	int len = 0;
167	text_t::const_iterator here = word.begin();
168	text_t::const_iterator end = word.end();
169	while (len < maxstemlen && here != end) {
170	word_stem[len+1] = (unsigned char)(*here);
171	len++; here++;
172	}
173	word_stem[len+1] = '\0';
174	word_stem[0] = len;
175
176	mgq_stemword (word_stem);
177
178	// copy word_stem back to tempstr
179	text_t tempstr;
180	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
181
182	delete [] word_stem;
183
184	return tempstr;
185	}
186
187
188
189	////////////////////////
190	// callback functions //
191	////////////////////////
192
193	// This routine is called for each document found in a search
194	// it assumes that cache_num is set up correctly to point to
195	// a suitable result cache
196	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
197	float Weight, void *info) {
198
199
200	queryresultsclass queryresults = (queryresultsclass )info;
201
202	// append this entry to the document results
203	docresultclass docresult;
204	docresult.docnum = DocNum;
205	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
206	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
207
208	queryresults->docs.docset[DocNum] = docresult;
209	queryresults->docs.docorder.push_back(DocNum);
210
211	return 0;
212	}
213
214	int termequivcallback(char Word, int ULen, int /Freq*/,
215	float /Weight/, void *info) {
216	text_tset equivterms = (text_tset )info;
217	if (equivterms == NULL) return 0;
218
219	text_t thisterm;
220	thisterm.setcarr(Word, ULen);
221
222	equivterms->insert(thisterm);
223
224	return 0;
225	}
226
227
228	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
229	// allocate working stem space
230	int maxstemlen = mgq_getmaxstemlen ();
231	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
232	if (word_stem == NULL) return;
233
234	// copy word to word_stem
235	int len = 0;
236	text_t::const_iterator here = word.begin();
237	text_t::const_iterator end = word.end();
238	while (len < maxstemlen && here != end) {
239	word_stem[len+1] = (unsigned char)(*here);
240	len++; here++;
241	}
242	word_stem[len+1] = '\0';
243	word_stem[0] = len;
244
245	// get the equivalent terms
246	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
247
248	delete [] word_stem;
249
250	return;
251	}
252
253	text_tset utf8equivterms; // kept as utf8 string for fast matching. NOTE(review): nothing visible in this file reads or writes this file-scope set (termfreqcallback fills the termfreqclass member of the same name) -- confirm it is still needed
254
255
256	// This callback is called once for each term in the query
257	int termfreqcallback(char *Word, int ULen, int Freq,
258	float /Weight/, void *info) {
259	queryresultsclass queryresults = (queryresultsclass )info;
260	if (queryresults == NULL) return 0;
261
262	text_t term;
263	term.setcarr(Word, ULen);
264	termfreqclass termfreq;
265
266	termfreq.termstr = to_uni(term);
267	text_t utf8termstem = mgsearch_stemword (term);
268	termfreq.termstemstr = to_uni (utf8termstem);
269
270	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
271
272	termfreq.termfreq = Freq;
273	queryresults->orgterms.push_back(termfreq);
274
275	return 0;
276	}
277
278	// this callback is called once for each variation of each term
279	int termvariantscallback(char Word, int ULen, int /Freq*/,
280	float /Weight/, void *info) {
281
282	text_t term;
283	term.setcarr(Word, ULen);
284	queryresultsclass queryresults = (queryresultsclass )info;
285	queryresults->termvariants.insert(to_uni(term));
286
287	return 0;
288	}
289
290	// This callback is for getting document text
291	int doctextcallback(char Doc, int ULen, int /Freq*/,
292	float /Weight/, void * /info/) {
293	tempdoc = Doc;
294	templen = ULen;
295
296	return 0;
297	}
298
299
300	static text_t getindexsuffix (const text_t &collection,
301	const text_t &index) {
302
303	text_t indexsuffix = "index";
304	indexsuffix = filename_cat (indexsuffix, index);
305	indexsuffix = filename_cat (indexsuffix, collection);
306	return indexsuffix;
307	}
308
309
310
311
312	////////////////////
313	// mgsearch class //
314	////////////////////
315
316	mgsearchclass::mgsearchclass ()
317	{
318	cache = new querycache (RESULTCACHESIZE);
319	}
320
321	mgsearchclass::~mgsearchclass ()
322	{
323	if (cache != NULL)
324	{
325	delete cache;
326	cache = NULL;
327	}
328	}
329
330
331	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
332	{
333	collectdir = thecollectdir;
334	}
335
336	// you only need to use this function before doing any stemming
337	// casefolding and stemming will be set if values for them are
338	// provided (0 or 1).
339	// makeindexcurrent returns true if it was able to load the database
340	bool mgsearchclass::makeindexcurrent (const text_t &index,
341	const text_t &subcollection,
342	const text_t &language,
343	const text_t &collection,
344	int casefolding,
345	int stemming) {
346	bool databaseloaded = true;
347
348	// get the names of the collection, index and text suffixes
349	char *ccollection = collection.getcstr();
350	assert (ccollection != NULL);
351	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
352	assert (idxsuffix != NULL);
353	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
354	assert (txtsuffix != NULL);
355
356	#ifdef __WIN32__
357	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
358	#else
359	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
360	#endif
361
362	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
363	if (casefolding == 0) mgq_ask(".set casefold off");
364	else if (casefolding > 0) mgq_ask(".set casefold on");
365	if (stemming == 0) mgq_ask(".set stem off");
366	else if (stemming > 0) mgq_ask(".set stem on");
367
368	} else databaseloaded = false;
369
370	// free up the c strings
371	delete ccollection;
372	delete idxsuffix;
373	delete txtsuffix;
374	delete ccollectdir;
375
376	return databaseloaded;
377	}
378
379
380	// stem word uses the values set in the last call to makeindexcurrent
381	// to stem the word. It is assumed that word is in unicode
382	text_t mgsearchclass::stemword (const text_t &word) {
383	return to_uni (mgsearch_stemword (to_utf8 (word)));
384	}
385
386	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
387	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
388	}
389
390
391	bool mgsearchclass::search(const queryparamclass &queryparams,
392	queryresultsclass &queryresults) {
393	assert (cache != NULL);
394
395	queryresults.clear();
396
397	// first check the cache
398	if (cache->find(queryparams, queryresults)) return true;
399
400	// make sure there is a query to be processed
401	text_t::const_iterator queryhere = queryparams.querystring.begin();
402	text_t::const_iterator queryend = queryparams.querystring.end();
403	while (queryhere != queryend) {
404	if (is_unicode_letdig (*queryhere)) break;
405	queryhere++;
406	}
407
408	// if we reached the end of the query string without finding
409	// any alphanumeric characters then return no results (and say
410	// the database was loaded)
411	if (queryhere == queryend) return true;
412
413	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
414	queryparams.language, queryparams.collection)) {
415	setsearchmode (queryparams);
416	submitquery (queryparams);
417	getresults (queryparams, queryresults);
418	return true;
419	}
420
421	return false;
422	}
423
424
425	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
426	{
427	mgq_ask(".set expert true");
428	mgq_ask(".set sorted_terms true");
429	mgq_ask(".set accumulator_method list");
430	mgq_ask(".set max_accumulators 500000");
431	mgq_ask(".set maxparas 500000");
432	mgq_ask(".set verbatim true");
433	mgq_ask(".unset skip_dump");
434	mgq_ask(".set mode docnums");
435
436	switch (queryparams.search_type)
437	{
438	case 0: mgq_ask(".set query boolean"); break;
439	case 1: mgq_ask(".set query ranked"); break;
440	}
441	switch (queryparams.casefolding)
442	{
443	case 1: mgq_ask(".set casefold on"); break;
444	case 0: mgq_ask(".set casefold off"); break;
445	}
446	switch (queryparams.stemming)
447	{
448	case 1: mgq_ask(".set stem on"); break;
449	case 0: mgq_ask(".set stem off"); break;
450	}
451	mgq_ask(".set heads_length 150");
452
453	if (queryparams.maxdocs == -1) {
454	mgq_ask(".set maxdocs all");
455	} else {
456	char maxdocstr[32];
457	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
458	mgq_ask(maxdocstr);
459	}
460	}
461
462
463	void mgsearchclass::submitquery (const queryparamclass &queryparams)
464	{
465	// sort out the query string
466	text_t ttquerystring = queryparams.querystring;
467	filterquery (ttquerystring);
468	char *querystring = to_utf8(ttquerystring).getcstr();
469
470	// submit the query
471	mgq_ask(querystring);
472
473	delete querystring;
474	}
475
476
477	void mgsearchclass::getresults (const queryparamclass &queryparams,
478	queryresultsclass &queryresults) {
479
480	int howmany = queryparams.maxdocs;
481	if (howmany == -1) howmany = MAXNUMDOCS;
482	mgq_results(result_docnums, 0, howmany,
483	ourquerycallback, (void *)(&queryresults));
484
485	// get the term frequencies
486	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
487	termfreqcallback, (void *)(&queryresults));
488	queryresults.sortuniqqueryterms();
489
490	// get term variants
491	mgq_results(result_terms, 0, MAXNUMTERMS,
492	termvariantscallback, (void *)(&queryresults));
493
494	// get the number of documents retrieved
495	int total_retrieved = 0, is_approx = 0;
496	mgq_docsretrieved (&total_retrieved, &is_approx);
497
498	if (total_retrieved == 0) {
499	// not available (or really was zero)
500	queryresults.docs_matched = queryresults.docs.docset.size();
501	if ((queryparams.maxdocs == -1) \|\|
502	(queryresults.docs_matched < queryparams.maxdocs))
503	queryresults.is_approx = Exact;
504	else
505	queryresults.is_approx = MoreThan;
506	} else {
507	queryresults.docs_matched = total_retrieved;
508	if (is_approx) queryresults.is_approx = Approximate;
509	else queryresults.is_approx = Exact;
510	}
511	}
512
513	void mgsearchclass::filterquery (text_t &ttquerystring) {
514	text_t::iterator ithere = ttquerystring.begin ();
515	text_t::iterator itend = ttquerystring.end ();
516
517	// remove all non alphanumeric characters (except
518	// boolean operators
519	while (ithere != itend) {
520	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
521	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
522	(ithere != ')')) (ithere) = ' ';
523	ithere++;
524	}
525	}
526
527
528	// the document text for 'docnum' is placed in 'output'
529	// docTargetDocument returns 'true' if it was able to
530	// try to get a document
531	// collection is needed to see if an index from the
532	// collection is loaded. If no index has been loaded
533	// defaultindex is needed to load one
534	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
535	const text_t &defaultsubcollection,
536	const text_t &defaultlanguage,
537	const text_t &collection,
538	int docnum,
539	text_t &output) {
540	output.clear();
541
542	// get the mg version of the document
543	char *mgdoc = NULL;
544	int doclen = 0;
545	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
546	collection, docnum, mgdoc, doclen)) return false;
547	if (mgdoc == NULL) return false;
548
549	// replace all control-Cs with spaces
550	char *mgdoc_here = mgdoc;
551	char *mgdoc_end = mgdoc + doclen;
552	while (mgdoc_here < mgdoc_end) {
553	if (mgdoc_here == '\x3') mgdoc_here = ' ';
554	mgdoc_here++;
555	}
556
557	// convert this document to unicode
558	utf8inconvertclass inconvert;
559	convertclass::status_t status;
560	inconvert.reset ();
561	inconvert.setinput (mgdoc, doclen);
562	inconvert.convert (output, status);
563
564	return true;
565	}
566
567
568	bool mgsearchclass::mgdocument (const text_t &defaultindex,
569	const text_t &defaultsubcollection,
570	const text_t &defaultlanguage,
571	const text_t &collection,
572	int docnum,
573	char *&UDoc, int &ULen) {
574	int databaseloaded = 0;
575
576	UDoc = NULL; ULen = 0;
577
578	// see if we can make an appropriate database current
579	// char *ccollection = collection.getcstr();
580	// assert (ccollection != NULL);
581	// databaseloaded = load_text_database (ccollection);
582	// delete ccollection;
583
584	// try and load the database
585	// if (!databaseloaded)
586	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
587	defaultlanguage, collection);
588
589	if (databaseloaded) {
590	// retrieve the document from mg
591	char docstr[32];
592	sprintf(docstr, "%i", docnum);
593
594	mgq_ask(".set mode text");
595	mgq_ask(".set query docnums");
596	mgq_ask(docstr);
597
598	tempdoc = NULL;
599	templen = 0;
600	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
601	UDoc = tempdoc;
602	ULen = templen;
603	}
604
605	return (bool)databaseloaded;
606	}
607

Note: See TracBrowser for help on using the repository browser.

Download in other formats: