Context Navigation

source: branches/New_Config_Format-branch/gsdl/src/colservr/mgsearch.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago
merged changes to trunk into New_Config_Format branch
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 16.4 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 1279 2000-07-12 22:21:53Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.22.4.1 2000/07/12 22:21:27 sjboddie
31	merged changes to trunk into New_Config_Format branch
32
33	Revision 1.23 2000/07/03 21:58:41 nzdl
34	removed mg directive that was causing meaningless warning messages
35	in errout.txt
36
37	Revision 1.22 1999/09/24 02:41:21 rjmcnab
38	change to use has_unicode_letdig in text_t
39
40	Revision 1.21 1999/09/21 21:41:41 sjboddie
41	fixed an error in what I committed last
42
43	Revision 1.20 1999/09/21 11:59:26 sjboddie
44	added Maxdocs queryfilter option (which may be -1 for 'all)
45
46	Revision 1.19 1999/09/07 22:52:52 rjmcnab
47	Seems to be an error in mg for retrieving documents using a paragraph
48	based index for some cases. Just added a work around (loads the default
49	index every time).
50
51	Revision 1.18 1999/09/07 04:57:22 sjboddie
52	added gpl notice
53
54	Revision 1.17 1999/08/31 22:42:41 rjmcnab
55	A couple of minor things.
56
57	Revision 1.16 1999/08/25 04:51:06 sjboddie
58	small change to allow for searching using boolean operators
59
60	Revision 1.15 1999/07/16 08:35:03 rjmcnab
61	Fixed a weird bug to do with a faulty case statement.
62
63	Revision 1.14 1999/07/16 03:42:22 sjboddie
64	changed isApprox
65
66	Revision 1.13 1999/07/16 00:12:46 sjboddie
67	removed all the old post-processing stuff
68
69	Revision 1.12 1999/07/07 06:17:47 rjmcnab
70	broke search_index into index+subcollection+language
71	within mgsearch
72
73	Revision 1.11 1999/07/05 21:06:43 rjmcnab
74	Disabled quoted strings.
75
76	Revision 1.10 1999/07/01 09:29:19 rjmcnab
77	Changes for better reporting of number documents which match a query. Changes
78	should still work as before with older versions of mg.
79
80	Revision 1.9 1999/07/01 03:54:48 rjmcnab
81	Added code to plug in the equivalent terms of each of the query terms.
82	Also added a function to get a raw utf8 encoded mg document (for speeding
83	up a phrase matching function)
84
85	Revision 1.8 1999/06/30 04:04:12 rjmcnab
86	made stemming functions available from mgsearch and made the stems
87	for the query terms available in queryinfo
88
89	Revision 1.7 1999/06/27 22:07:27 sjboddie
90	got rid of all the old functions for dealing with dir indexes
91
92	Revision 1.6 1999/06/09 00:41:32 sjboddie
93	phrase searching now uses case-folding if it's turned on
94
95	Revision 1.5 1999/02/21 22:31:35 rjmcnab
96
97	Removed locateinfo.
98
99	Revision 1.4 1999/02/03 01:13:27 sjboddie
100
101	Got interface to handle subcollections and language subcollections -
102	committed changes made to some of the collections
103
104	Revision 1.3 1999/01/19 01:38:17 rjmcnab
105
106	Made the source more portable.
107
108	Revision 1.2 1999/01/12 01:51:02 rjmcnab
109
110	Standard header.
111
112	Revision 1.1 1999/01/08 09:02:16 rjmcnab
113
114	Moved from src/library.
115
116	*/
117
118
119	#include "gsdlconf.h"
120	#include "mgsearch.h"
121	#include "fileutil.h"
122
123	#include <string.h>
124	#include <stdio.h>
125	#include <stdlib.h>
126	#include <ctype.h>
127
128	#if defined(GSDL_USE_OBJECTSPACE)
129	# include <ospace\std\iostream>
130	#elif defined(GSDL_USE_IOS_H)
131	# include <iostream.h>
132	#else
133	# include <iostream>
134	#endif
135
136	#if defined(__WIN32__)
137	// gdbm stuff
138	# include "autoconf.h"
139	# include "systems.h"
140	# include "gdbmconst.h"
141	# include "gdbm.h"
142	#else
143	# include <gdbm.h>
144	#endif
145
146
147	#include <assert.h>
148
149	#include "mgq.h"
150	// #include "locateinfo.h"
151	#include "gsdlunicode.h"
152	#include "unitool.h"
153
154
155	/////////////
156	// globals //
157	/////////////
158
// Pointer to, and length of, the document text most recently reported by
// mg. Both are written by doctextcallback() and read back by
// mgsearchclass::mgdocument() straight after its call to mgq_results().
159	static char *tempdoc = NULL;
160	static int templen = 0;
161
162
163	//////////////////////
164	// useful functions //
165	//////////////////////
166
167
168	// input and output are in utf8
169	text_t mgsearch_stemword (const text_t &word) {
170	// allocate working stem space
171	int maxstemlen = mgq_getmaxstemlen ();
172	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
173	if (word_stem == NULL) return "";
174
175	// copy word to word_stem
176	int len = 0;
177	text_t::const_iterator here = word.begin();
178	text_t::const_iterator end = word.end();
179	while (len < maxstemlen && here != end) {
180	word_stem[len+1] = (unsigned char)(*here);
181	len++; here++;
182	}
183	word_stem[len+1] = '\0';
184	word_stem[0] = len;
185
186	mgq_stemword (word_stem);
187
188	// copy word_stem back to tempstr
189	text_t tempstr;
190	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
191
192	delete [] word_stem;
193
194	return tempstr;
195	}
196
197
198
199	////////////////////////
200	// callback functions //
201	////////////////////////
202
203	// This routine is called for each document found in a search
204	// it assumes that cache_num is set up correctly to point to
205	// a suitable result cache
206	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
207	float Weight, void *info) {
208
209
210	queryresultsclass queryresults = (queryresultsclass )info;
211
212	// append this entry to the document results
213	docresultclass docresult;
214	docresult.docnum = DocNum;
215	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
216	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
217
218	queryresults->docs.docset[DocNum] = docresult;
219	queryresults->docs.docorder.push_back(DocNum);
220
221	return 0;
222	}
223
224	int termequivcallback(char Word, int ULen, int /Freq*/,
225	float /Weight/, void *info) {
226	text_tset equivterms = (text_tset )info;
227	if (equivterms == NULL) return 0;
228
229	text_t thisterm;
230	thisterm.setcarr(Word, ULen);
231
232	equivterms->insert(thisterm);
233
234	return 0;
235	}
236
237
238	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
239	// allocate working stem space
240	int maxstemlen = mgq_getmaxstemlen ();
241	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
242	if (word_stem == NULL) return;
243
244	// copy word to word_stem
245	int len = 0;
246	text_t::const_iterator here = word.begin();
247	text_t::const_iterator end = word.end();
248	while (len < maxstemlen && here != end) {
249	word_stem[len+1] = (unsigned char)(*here);
250	len++; here++;
251	}
252	word_stem[len+1] = '\0';
253	word_stem[0] = len;
254
255	// get the equivalent terms
256	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
257
258	delete [] word_stem;
259
260	return;
261	}
262
// NOTE(review): nothing in the visible part of this file reads or writes
// this global; termfreqcallback fills the utf8equivterms member of a
// termfreqclass instead. Confirm whether it is still needed.
263	text_tset utf8equivterms; // kept as utf8 string for fast matching
264
265
266	// This callback is called once for each term in the query
267	int termfreqcallback(char *Word, int ULen, int Freq,
268	float /Weight/, void *info) {
269	queryresultsclass queryresults = (queryresultsclass )info;
270	if (queryresults == NULL) return 0;
271
272	text_t term;
273	term.setcarr(Word, ULen);
274	termfreqclass termfreq;
275
276	termfreq.termstr = to_uni(term);
277	text_t utf8termstem = mgsearch_stemword (term);
278	termfreq.termstemstr = to_uni (utf8termstem);
279
280	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
281
282	termfreq.termfreq = Freq;
283	queryresults->orgterms.push_back(termfreq);
284
285	return 0;
286	}
287
288	// this callback is called once for each variation of each term
289	int termvariantscallback(char Word, int ULen, int /Freq*/,
290	float /Weight/, void *info) {
291
292	text_t term;
293	term.setcarr(Word, ULen);
294	queryresultsclass queryresults = (queryresultsclass )info;
295	queryresults->termvariants.insert(to_uni(term));
296
297	return 0;
298	}
299
300	// This callback is for getting document text
301	int doctextcallback(char Doc, int ULen, int /Freq*/,
302	float /Weight/, void * /info/) {
303	tempdoc = Doc;
304	templen = ULen;
305
306	return 0;
307	}
308
309
310	static text_t getindexsuffix (const text_t &collection,
311	const text_t &index) {
312
313	text_t indexsuffix = "index";
314	indexsuffix = filename_cat (indexsuffix, index);
315	indexsuffix = filename_cat (indexsuffix, collection);
316	return indexsuffix;
317	}
318
319
320
321
322	////////////////////
323	// mgsearch class //
324	////////////////////
325
326	mgsearchclass::mgsearchclass ()
327	{
328	cache = new querycache (RESULTCACHESIZE);
329	}
330
331	mgsearchclass::~mgsearchclass ()
332	{
333	if (cache != NULL)
334	{
335	delete cache;
336	cache = NULL;
337	}
338	}
339
340
341	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
342	{
343	collectdir = thecollectdir;
344	}
345
346	// you only need to use this function before doing any stemming
347	// casefolding and stemming will be set if values for them are
348	// provided (0 or 1).
349	// makeindexcurrent returns true if it was able to load the database
350	bool mgsearchclass::makeindexcurrent (const text_t &index,
351	const text_t &subcollection,
352	const text_t &language,
353	const text_t &collection,
354	int casefolding,
355	int stemming) {
356	bool databaseloaded = true;
357
358	// get the names of the collection, index and text suffixes
359	char *ccollection = collection.getcstr();
360	assert (ccollection != NULL);
361	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
362	assert (idxsuffix != NULL);
363	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
364	assert (txtsuffix != NULL);
365
366	#ifdef __WIN32__
367	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
368	#else
369	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
370	#endif
371
372	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
373	if (casefolding == 0) mgq_ask(".set casefold off");
374	else if (casefolding > 0) mgq_ask(".set casefold on");
375	if (stemming == 0) mgq_ask(".set stem off");
376	else if (stemming > 0) mgq_ask(".set stem on");
377
378	} else databaseloaded = false;
379
380	// free up the c strings
381	delete ccollection;
382	delete idxsuffix;
383	delete txtsuffix;
384	delete ccollectdir;
385
386	return databaseloaded;
387	}
388
389
390	// stem word uses the values set in the last call to makeindexcurrent
391	// to stem the word. It is assumed that word is in unicode
392	text_t mgsearchclass::stemword (const text_t &word) {
393	return to_uni (mgsearch_stemword (to_utf8 (word)));
394	}
395
396	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
397	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
398	}
399
400
401	bool mgsearchclass::search(const queryparamclass &queryparams,
402	queryresultsclass &queryresults) {
403	assert (cache != NULL);
404
405	queryresults.clear();
406
407	// first check the cache
408	if (cache->find(queryparams, queryresults)) return true;
409
410	// make sure there is a query to be processed
411	if (!has_unicode_letdig(queryparams.querystring)) return true;
412
413	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
414	queryparams.language, queryparams.collection)) {
415	setsearchmode (queryparams);
416	submitquery (queryparams);
417	getresults (queryparams, queryresults);
418	return true;
419	}
420
421	return false;
422	}
423
424
425	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
426	{
427	mgq_ask(".set expert true");
428	mgq_ask(".set sorted_terms true");
429	mgq_ask(".set accumulator_method list");
430	mgq_ask(".set max_accumulators 500000");
431	mgq_ask(".set maxparas 500000");
432	mgq_ask(".set verbatim true");
433	// mgq_ask(".unset skip_dump");
434	mgq_ask(".set mode docnums");
435
436	switch (queryparams.search_type)
437	{
438	case 0: mgq_ask(".set query boolean"); break;
439	case 1: mgq_ask(".set query ranked"); break;
440	}
441	switch (queryparams.casefolding)
442	{
443	case 1: mgq_ask(".set casefold on"); break;
444	case 0: mgq_ask(".set casefold off"); break;
445	}
446	switch (queryparams.stemming)
447	{
448	case 1: mgq_ask(".set stem on"); break;
449	case 0: mgq_ask(".set stem off"); break;
450	}
451	mgq_ask(".set heads_length 150");
452
453	if (queryparams.maxdocs == -1) {
454	mgq_ask(".set maxdocs all");
455	} else {
456	char maxdocstr[32];
457	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
458	mgq_ask(maxdocstr);
459	}
460	}
461
462
463	void mgsearchclass::submitquery (const queryparamclass &queryparams)
464	{
465	// sort out the query string
466	text_t ttquerystring = queryparams.querystring;
467	filterquery (ttquerystring);
468	char *querystring = to_utf8(ttquerystring).getcstr();
469
470	// submit the query
471	mgq_ask(querystring);
472
473	delete querystring;
474	}
475
476
477	void mgsearchclass::getresults (const queryparamclass &queryparams,
478	queryresultsclass &queryresults) {
479
480	int howmany = queryparams.maxdocs;
481	if (howmany == -1) howmany = MAXNUMDOCS;
482	mgq_results(result_docnums, 0, howmany,
483	ourquerycallback, (void *)(&queryresults));
484
485	// get the term frequencies
486	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
487	termfreqcallback, (void *)(&queryresults));
488	queryresults.sortuniqqueryterms();
489
490	// get term variants
491	mgq_results(result_terms, 0, MAXNUMTERMS,
492	termvariantscallback, (void *)(&queryresults));
493
494	// get the number of documents retrieved
495	int total_retrieved = 0, is_approx = 0;
496	mgq_docsretrieved (&total_retrieved, &is_approx);
497
498	if (total_retrieved == 0) {
499	// not available (or really was zero)
500	queryresults.docs_matched = queryresults.docs.docset.size();
501	if ((queryparams.maxdocs == -1) \|\|
502	(queryresults.docs_matched < queryparams.maxdocs))
503	queryresults.is_approx = Exact;
504	else
505	queryresults.is_approx = MoreThan;
506	} else {
507	queryresults.docs_matched = total_retrieved;
508	if (is_approx) queryresults.is_approx = Approximate;
509	else queryresults.is_approx = Exact;
510	}
511	}
512
513	void mgsearchclass::filterquery (text_t &ttquerystring) {
514	text_t::iterator ithere = ttquerystring.begin ();
515	text_t::iterator itend = ttquerystring.end ();
516
517	// remove all non alphanumeric characters (except
518	// boolean operators
519	while (ithere != itend) {
520	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
521	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
522	(ithere != ')')) (ithere) = ' ';
523	ithere++;
524	}
525	}
526
527
528	// the document text for 'docnum' is placed in 'output'
529	// docTargetDocument returns 'true' if it was able to
530	// try to get a document
531	// collection is needed to see if an index from the
532	// collection is loaded. If no index has been loaded
533	// defaultindex is needed to load one
534	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
535	const text_t &defaultsubcollection,
536	const text_t &defaultlanguage,
537	const text_t &collection,
538	int docnum,
539	text_t &output) {
540	output.clear();
541
542	// get the mg version of the document
543	char *mgdoc = NULL;
544	int doclen = 0;
545	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
546	collection, docnum, mgdoc, doclen)) return false;
547	if (mgdoc == NULL) return false;
548
549	// replace all control-Cs with spaces
550	char *mgdoc_here = mgdoc;
551	char *mgdoc_end = mgdoc + doclen;
552	while (mgdoc_here < mgdoc_end) {
553	if (mgdoc_here == '\x3') mgdoc_here = ' ';
554	mgdoc_here++;
555	}
556
557	// convert this document to unicode
558	utf8inconvertclass inconvert;
559	convertclass::status_t status;
560	inconvert.reset ();
561	inconvert.setinput (mgdoc, doclen);
562	inconvert.convert (output, status);
563
564	return true;
565	}
566
567
568	bool mgsearchclass::mgdocument (const text_t &defaultindex,
569	const text_t &defaultsubcollection,
570	const text_t &defaultlanguage,
571	const text_t &collection,
572	int docnum,
573	char *&UDoc, int &ULen) {
574	int databaseloaded = 0;
575
576	UDoc = NULL; ULen = 0;
577
578	// see if we can make an appropriate database current
579	// char *ccollection = collection.getcstr();
580	// assert (ccollection != NULL);
581	// databaseloaded = load_text_database (ccollection);
582	// delete ccollection;
583
584	// try and load the database
585	// if (!databaseloaded)
586	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
587	defaultlanguage, collection);
588
589	if (databaseloaded) {
590	// retrieve the document from mg
591	char docstr[32];
592	sprintf(docstr, "%i", docnum);
593
594	mgq_ask(".set mode text");
595	mgq_ask(".set query docnums");
596	mgq_ask(docstr);
597
598	tempdoc = NULL;
599	templen = 0;
600	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
601	UDoc = tempdoc;
602	ULen = templen;
603	}
604
605	return (bool)databaseloaded;
606	}
607

Note: See TracBrowser for help on using the repository browser.

Download in other formats: