Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1990

Last change on this file since 1990 was 1990, checked in by kjm18, 23 years ago
added a comment
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 18.1 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 1990 2001-02-15 22:58:11Z kjm18 $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.30 2001/02/15 22:58:11 kjm18
31	added a comment
32
33	Revision 1.29 2001/02/15 03:57:02 kjm18
34	changed accumulator_method for mg to be array rather than list - it was
35	getting some weird results with ranked searches
36
37	Revision 1.28 2001/01/25 18:26:44 cs025
38	Included CORBA branch for first time
39
40	Revision 1.22.2.1 2000/04/04 15:02:32 cs025
41	Corba first commit
42
43	Revision 1.22 1999/09/24 02:41:21 rjmcnab
44	change to use has_unicode_letdig in text_t
45
46	Revision 1.21 1999/09/21 21:41:41 sjboddie
47	fixed an error in what I committed last
48
49	Revision 1.20 1999/09/21 11:59:26 sjboddie
50	added Maxdocs queryfilter option (which may be -1 for 'all)
51
52	Revision 1.19 1999/09/07 22:52:52 rjmcnab
53	Seems to be an error in mg for retrieving documents using a paragraph
54	based index for some cases. Just added a work around (loads the default
55	index every time).
56
57	Revision 1.18 1999/09/07 04:57:22 sjboddie
58	added gpl notice
59
60	Revision 1.17 1999/08/31 22:42:41 rjmcnab
61	A couple of minor things.
62
63	Revision 1.16 1999/08/25 04:51:06 sjboddie
64	small change to allow for searching using boolean operators
65
66	Revision 1.15 1999/07/16 08:35:03 rjmcnab
67	Fixed a weird bug to do with a faulty case statement.
68
69	Revision 1.14 1999/07/16 03:42:22 sjboddie
70	changed isApprox
71
72	Revision 1.13 1999/07/16 00:12:46 sjboddie
73	removed all the old post-processing stuff
74
75	Revision 1.12 1999/07/07 06:17:47 rjmcnab
76	broke search_index into index+subcollection+language
77	within mgsearch
78
79	Revision 1.11 1999/07/05 21:06:43 rjmcnab
80	Disabled quoted strings.
81
82	Revision 1.10 1999/07/01 09:29:19 rjmcnab
83	Changes for better reporting of number documents which match a query. Changes
84	should still work as before with older versions of mg.
85
86	Revision 1.9 1999/07/01 03:54:48 rjmcnab
87	Added code to plug in the equivalent terms of each of the query terms.
88	Also added a function to get a raw utf8 encoded mg document (for speeding
89	up a phrase matching function)
90
91	Revision 1.8 1999/06/30 04:04:12 rjmcnab
92	made stemming functions available from mgsearch and made the stems
93	for the query terms available in queryinfo
94
95	Revision 1.7 1999/06/27 22:07:27 sjboddie
96	got rid of all the old functions for dealing with dir indexes
97
98	Revision 1.6 1999/06/09 00:41:32 sjboddie
99	phrase searching now uses case-folding if it's turned on
100
101	Revision 1.5 1999/02/21 22:31:35 rjmcnab
102
103	Removed locateinfo.
104
105	Revision 1.4 1999/02/03 01:13:27 sjboddie
106
107	Got interface to handle subcollections and language subcollections -
108	committed changes made to some of the collections
109
110	Revision 1.3 1999/01/19 01:38:17 rjmcnab
111
112	Made the source more portable.
113
114	Revision 1.2 1999/01/12 01:51:02 rjmcnab
115
116	Standard header.
117
118	Revision 1.1 1999/01/08 09:02:16 rjmcnab
119
120	Moved from src/library.
121
122	*/
123
124	#include "gsdlconf.h"
125	#include "mgsearch.h"
126	#include "fileutil.h"
127
128	#include <string.h>
129	#include <stdio.h>
130	#include <stdlib.h>
131	#include <ctype.h>
132
133	#if defined(GSDL_USE_OBJECTSPACE)
134	# include <ospace\std\iostream>
135	#elif defined(GSDL_USE_IOS_H)
136	# include <iostream.h>
137	#else
138	# include <iostream>
139	#endif
140
141	#if defined(__WIN32__)
142	// gdbm stuff
143	# include "autoconf.h"
144	# include "systems.h"
145	# include "gdbmconst.h"
146	# include "gdbm.h"
147	#else
148	# include <gdbm.h>
149	#endif
150
151
152	#include <assert.h>
153
154	#include "mgq.h"
155	// #include "locateinfo.h"
156	#include "gsdlunicode.h"
157	#include "unitool.h"
158
159
160	/////////////
161	// globals //
162	/////////////
163
// Scratch slots used to hand a document from doctextcallback back to
// mgsearchclass::mgdocument: the callback stores the pointer and length
// it is given here, and mgdocument clears them before calling
// mgq_results and copies them out afterwards.
// NOTE(review): the pointed-to buffer is never freed in this file, so it
// is presumably owned by mg -- confirm before holding on to it.
164	static char *tempdoc = NULL;
165	static int templen = 0;
166
167
168	//////////////////////
169	// useful functions //
170	//////////////////////
171
172
173	// input and output are in utf8
174	text_t mgsearch_stemword (const text_t &word) {
175	// allocate working stem space
176	int maxstemlen = mgq_getmaxstemlen ();
177	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
178	if (word_stem == NULL) return "";
179
180	// copy word to word_stem
181	int len = 0;
182	text_t::const_iterator here = word.begin();
183	text_t::const_iterator end = word.end();
184	while (len < maxstemlen && here != end) {
185	word_stem[len+1] = (unsigned char)(*here);
186	len++; here++;
187	}
188	word_stem[len+1] = '\0';
189	word_stem[0] = len;
190
191	mgq_stemword (word_stem);
192
193	// copy word_stem back to tempstr
194	text_t tempstr;
195	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
196
197	delete [] word_stem;
198
199	return tempstr;
200	}
201
202
203
204	////////////////////////
205	// callback functions //
206	////////////////////////
207
208	// This routine is called for each document found in a search
209	// it assumes that cache_num is set up correctly to point to
210	// a suitable result cache
211	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
212	float Weight, void *info) {
213
214
215	queryresultsclass queryresults = (queryresultsclass )info;
216
217	// append this entry to the document results
218	docresultclass docresult;
219	docresult.docnum = DocNum;
220	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
221	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
222
223	queryresults->docs.docset[DocNum] = docresult;
224	queryresults->docs.docorder.push_back(DocNum);
225
226	return 0;
227	}
228
229	int termequivcallback(char Word, int ULen, int /Freq*/,
230	float /Weight/, void *info) {
231	text_tset equivterms = (text_tset )info;
232	if (equivterms == NULL) return 0;
233
234	text_t thisterm;
235	thisterm.setcarr(Word, ULen);
236
237	equivterms->insert(thisterm);
238
239	return 0;
240	}
241
242
243	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
244	// allocate working stem space
245	int maxstemlen = mgq_getmaxstemlen ();
246	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
247	if (word_stem == NULL) return;
248
249	// copy word to word_stem
250	int len = 0;
251	text_t::const_iterator here = word.begin();
252	text_t::const_iterator end = word.end();
253	while (len < maxstemlen && here != end) {
254	word_stem[len+1] = (unsigned char)(*here);
255	len++; here++;
256	}
257	word_stem[len+1] = '\0';
258	word_stem[0] = len;
259
260	// get the equivalent terms
261	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
262
263	delete [] word_stem;
264
265	return;
266	}
267
// NOTE(review): this file-scope set is not referenced anywhere in this
// file -- termfreqcallback fills the utf8equivterms member of
// termfreqclass instead. It has external linkage, so confirm that no
// other translation unit uses it before removing it.
268	text_tset utf8equivterms; // kept as utf8 string for fast matching
269
270
271	// This callback is called once for each term in the query
272	int termfreqcallback(char *Word, int ULen, int Freq,
273	float /Weight/, void *info) {
274	queryresultsclass queryresults = (queryresultsclass )info;
275	if (queryresults == NULL) return 0;
276
277	text_t term;
278	term.setcarr(Word, ULen);
279	termfreqclass termfreq;
280
281	termfreq.termstr = to_uni(term);
282	text_t utf8termstem = mgsearch_stemword (term);
283	termfreq.termstemstr = to_uni (utf8termstem);
284
285	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
286
287	termfreq.termfreq = Freq;
288	queryresults->orgterms.push_back(termfreq);
289
290	return 0;
291	}
292
293	// this callback is called once for each variation of each term
294	int termvariantscallback(char Word, int ULen, int /Freq*/,
295	float /Weight/, void *info) {
296
297	text_t term;
298	term.setcarr(Word, ULen);
299	queryresultsclass queryresults = (queryresultsclass )info;
300	queryresults->termvariants.insert(to_uni(term));
301
302	return 0;
303	}
304
305	// This callback is for getting document text
306	int doctextcallback(char Doc, int ULen, int /Freq*/,
307	float /Weight/, void * /info/) {
308	tempdoc = Doc;
309	templen = ULen;
310
311	return 0;
312	}
313
314
315	static text_t getindexsuffix (const text_t &collection,
316	const text_t &index) {
317
318	text_t indexsuffix = "index";
319	indexsuffix = filename_cat (indexsuffix, index);
320	indexsuffix = filename_cat (indexsuffix, collection);
321	return indexsuffix;
322	}
323
324
325
326
327	////////////////////
328	// mgsearch class //
329	////////////////////
330
331	mgsearchclass::mgsearchclass ()
332	: searchclass() {
333
334	}
335
336	mgsearchclass::~mgsearchclass ()
337	{
338	if (cache != NULL)
339	{
340	delete cache;
341	cache = NULL;
342	}
343	}
344
345	// you only need to use this function before doing any stemming
346	// casefolding and stemming will be set if values for them are
347	// provided (0 or 1).
348	// makeindexcurrent returns true if it was able to load the database
349	bool mgsearchclass::makeindexcurrent (const text_t &index,
350	const text_t &subcollection,
351	const text_t &language,
352	const text_t &collection,
353	int casefolding,
354	int stemming) {
355	bool databaseloaded = true;
356
357	// get the names of the collection, index and text suffixes
358	char *ccollection = collection.getcstr();
359	assert (ccollection != NULL);
360	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
361	assert (idxsuffix != NULL);
362	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
363	assert (txtsuffix != NULL);
364
365	#ifdef __WIN32__
366	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
367	#else
368	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
369	#endif
370
371	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
372	if (casefolding == 0) mgq_ask(".set casefold off");
373	else if (casefolding > 0) mgq_ask(".set casefold on");
374	if (stemming == 0) mgq_ask(".set stem off");
375	else if (stemming > 0) mgq_ask(".set stem on");
376
377	} else databaseloaded = false;
378
379	// free up the c strings
380	delete ccollection;
381	delete idxsuffix;
382	delete txtsuffix;
383	delete ccollectdir;
384
385	return databaseloaded;
386	}
387
388
389	// stem word uses the values set in the last call to makeindexcurrent
390	// to stem the word. It is assumed that word is in unicode
391	text_t mgsearchclass::stemword (const text_t &word) {
392	return to_uni (mgsearch_stemword (to_utf8 (word)));
393	}
394
395	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
396	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
397	}
398
399	/**
400	* search directs the whole execution of the search; a number of other
401	* functions in this class are called as a result, and precondition
402	* checks are also made
403	*/
404	bool mgsearchclass::search(const queryparamclass &queryparams,
405	queryresultsclass &queryresults) {
406	// assert (cache != NULL);
407
408	// clear any previous results
409	queryresults.clear();
410	// first check the cache
411	if (cache != NULL) {
412	if (cache->find(queryparams, queryresults)) return true;
413	}
414	// make sure there is a query to be processed
415	if (!has_unicode_letdig(queryparams.querystring)) return true;
416
417	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
418	queryparams.language, queryparams.collection)) {
419	// initialise the form of results
420	setsearchmode (queryparams);
421
422	// execute the query
423	submitquery (queryparams);
424
425	// retrieve the results
426	getresults (queryparams, queryresults);
427
428	return true;
429	}
430
431	return false;
432	}
433
434	/* accumulator_method has been changed to use array rather than list.
435	list appears to be broken somewhat - for some ranked queries, it returned
436	fewer results than it should have (eg 45 instead of 50). The three other
437	methods (array, splay_tree, hash_table) all return the same number of
438	documents, in the same order, with the same ranks. list returns what
439	appears to be the same documents (but less of them), but with different ranks,
440	and in a different order. Minimal time tests dont show any speed improvement
441	of list over array (maybe because its broken??). [02/2001, kjm18]
442	*/
443	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
444	{
445	mgq_ask(".set expert true");
446	mgq_ask(".set sorted_terms true");
447	mgq_ask(".set accumulator_method array");
448	mgq_ask(".set max_accumulators 500000");
449	mgq_ask(".set maxparas 500000");
450	mgq_ask(".set verbatim true");
451	mgq_ask(".unset skip_dump");
452	mgq_ask(".set mode docnums");
453
454	switch (queryparams.search_type)
455	{
456	case 0: mgq_ask(".set query boolean"); break;
457	case 1: mgq_ask(".set query ranked"); break;
458	}
459	switch (queryparams.casefolding)
460	{
461	case 1: mgq_ask(".set casefold on"); break;
462	case 0: mgq_ask(".set casefold off"); break;
463	}
464	switch (queryparams.stemming)
465	{
466	case 1: mgq_ask(".set stem on"); break;
467	case 0: mgq_ask(".set stem off"); break;
468	}
469	mgq_ask(".set heads_length 150");
470
471	if (queryparams.maxdocs == -1) {
472	mgq_ask(".set maxdocs all");
473	} else {
474	char maxdocstr[32];
475	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
476	mgq_ask(maxdocstr);
477	}
478	}
479
480	/**
481	* submitquery constructs the query string (into UTF8 encoding)
482	* and submits it using mgq_ask to the mg search engine. Most
483	* of the processing will be done inside Greenstone
484	*/
485	void mgsearchclass::submitquery (const queryparamclass &queryparams)
486	{
487	// sort out the query string; copy it, remove all special characters
488	// and then convert it to a string in UTF8 format
489	text_t ttquerystring = queryparams.querystring;
490	filterquery (ttquerystring);
491	char *querystring = to_utf8(ttquerystring).getcstr();
492
493	// submit the query
494	mgq_ask(querystring);
495
496	// destroy the temporary character array
497	delete querystring;
498	}
499
500	/**
501	* getrults is called to retrieve the required data on the docs
502	* which responded to the query submitted in submitquery above.
503	*
504	* It calls the local mgquery (mgq) interface to MG several times,
505	* to obtain the document numbers, term frequencies, term variants
506	* etc. All processing of the query will be done by Greenstone
507	* thereafter
508	*/
509	void mgsearchclass::getresults (const queryparamclass &queryparams,
510	queryresultsclass &queryresults) {
511	// get the configuration for the maximum number of documents to
512	// retrieve
513	int howmany = queryparams.maxdocs;
514	if (howmany == -1) howmany = MAXNUMDOCS;
515	mgq_results(result_docnums, 0, howmany,
516	ourquerycallback, (void *)(&queryresults));
517
518	// get the term frequencies
519	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
520	termfreqcallback, (void *)(&queryresults));
521	queryresults.sortuniqqueryterms();
522
523	// get term variants
524	mgq_results(result_terms, 0, MAXNUMTERMS,
525	termvariantscallback, (void *)(&queryresults));
526
527	// get the number of documents retrieved
528	int total_retrieved = 0, is_approx = 0;
529	mgq_docsretrieved (&total_retrieved, &is_approx);
530
531	if (total_retrieved == 0) {
532	// not available (or really was zero)
533	queryresults.docs_matched = queryresults.docs.docset.size();
534	if ((queryparams.maxdocs == -1) \|\|
535	(queryresults.docs_matched < queryparams.maxdocs))
536	queryresults.is_approx = Exact;
537	else
538	queryresults.is_approx = MoreThan;
539	} else {
540	queryresults.docs_matched = total_retrieved;
541	if (is_approx) queryresults.is_approx = Approximate;
542	else queryresults.is_approx = Exact;
543	}
544	}
545
546	/**
547	* Tidies the given querystring, removing special characters
548	*/
549	void mgsearchclass::filterquery (text_t &ttquerystring) {
550	text_t::iterator ithere = ttquerystring.begin ();
551	text_t::iterator itend = ttquerystring.end ();
552
553	// remove all non alphanumeric characters (except
554	// boolean operators
555	while (ithere != itend) {
556	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
557	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
558	(ithere != ')')) (ithere) = ' ';
559	ithere++;
560	}
561	}
562
563
564	// the document text for 'docnum' is placed in 'output'
565	// docTargetDocument returns 'true' if it was able to
566	// try to get a document
567	// collection is needed to see if an index from the
568	// collection is loaded. If no index has been loaded
569	// defaultindex is needed to load one
570	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
571	const text_t &defaultsubcollection,
572	const text_t &defaultlanguage,
573	const text_t &collection,
574	int docnum,
575	text_t &output) {
576	output.clear();
577
578	// get the mg version of the document
579	char *mgdoc = NULL;
580	int doclen = 0;
581	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
582	collection, docnum, mgdoc, doclen)) return false;
583	if (mgdoc == NULL) return false;
584
585	// replace all control-Cs with spaces
586	char *mgdoc_here = mgdoc;
587	char *mgdoc_end = mgdoc + doclen;
588	while (mgdoc_here < mgdoc_end) {
589	if (mgdoc_here == '\x3') mgdoc_here = ' ';
590	mgdoc_here++;
591	}
592
593	// convert this document to unicode
594	utf8inconvertclass inconvert;
595	convertclass::status_t status;
596	inconvert.reset ();
597	inconvert.setinput (mgdoc, doclen);
598	inconvert.convert (output, status);
599
600	return true;
601	}
602
603
604	bool mgsearchclass::mgdocument (const text_t &defaultindex,
605	const text_t &defaultsubcollection,
606	const text_t &defaultlanguage,
607	const text_t &collection,
608	int docnum,
609	char *&UDoc, int &ULen) {
610	int databaseloaded = 0;
611
612	UDoc = NULL; ULen = 0;
613
614	// see if we can make an appropriate database current
615	// char *ccollection = collection.getcstr();
616	// assert (ccollection != NULL);
617	// databaseloaded = load_text_database (ccollection);
618	// delete ccollection;
619
620	// try and load the database
621	// if (!databaseloaded)
622	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
623	defaultlanguage, collection);
624
625	if (databaseloaded) {
626	// retrieve the document from mg
627	char docstr[32];
628	sprintf(docstr, "%i", docnum);
629
630	mgq_ask(".set mode text");
631	mgq_ask(".set query docnums");
632	mgq_ask(docstr);
633
634	tempdoc = NULL;
635	templen = 0;
636	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
637	UDoc = tempdoc;
638	ULen = templen;
639	}
640
641	return (bool)databaseloaded;
642	}
643

Note: See TracBrowser for help on using the repository browser.

Download in other formats: