Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1860

Last change on this file since 1860 was 1860, checked in by cs025, 23 years ago
Included CORBA branch for first time
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 17.3 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 1860 2001-01-25 18:26:45Z cs025 $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.28 2001/01/25 18:26:44 cs025
31	Included CORBA branch for first time
32
33	Revision 1.22.2.1 2000/04/04 15:02:32 cs025
34	Corba first commit
35
36	Revision 1.22 1999/09/24 02:41:21 rjmcnab
37	change to use has_unicode_letdig in text_t
38
39	Revision 1.21 1999/09/21 21:41:41 sjboddie
40	fixed an error in what I committed last
41
42	Revision 1.20 1999/09/21 11:59:26 sjboddie
43	added Maxdocs queryfilter option (which may be -1 for 'all)
44
45	Revision 1.19 1999/09/07 22:52:52 rjmcnab
46	Seems to be an error in mg for retrieving documents using a paragraph
47	based index for some cases. Just added a work around (loads the default
48	index every time).
49
50	Revision 1.18 1999/09/07 04:57:22 sjboddie
51	added gpl notice
52
53	Revision 1.17 1999/08/31 22:42:41 rjmcnab
54	A couple of minor things.
55
56	Revision 1.16 1999/08/25 04:51:06 sjboddie
57	small change to allow for searching using boolean operators
58
59	Revision 1.15 1999/07/16 08:35:03 rjmcnab
60	Fixed a weird bug to do with a faulty case statement.
61
62	Revision 1.14 1999/07/16 03:42:22 sjboddie
63	changed isApprox
64
65	Revision 1.13 1999/07/16 00:12:46 sjboddie
66	removed all the old post-processing stuff
67
68	Revision 1.12 1999/07/07 06:17:47 rjmcnab
69	broke search_index into index+subcollection+language
70	within mgsearch
71
72	Revision 1.11 1999/07/05 21:06:43 rjmcnab
73	Disabled quoted strings.
74
75	Revision 1.10 1999/07/01 09:29:19 rjmcnab
76	Changes for better reporting of number documents which match a query. Changes
77	should still work as before with older versions of mg.
78
79	Revision 1.9 1999/07/01 03:54:48 rjmcnab
80	Added code to plug in the equivalent terms of each of the query terms.
81	Also added a function to get a raw utf8 encoded mg document (for speeding
82	up a phrase matching function)
83
84	Revision 1.8 1999/06/30 04:04:12 rjmcnab
85	made stemming functions available from mgsearch and made the stems
86	for the query terms available in queryinfo
87
88	Revision 1.7 1999/06/27 22:07:27 sjboddie
89	got rid of all the old functions for dealing with dir indexes
90
91	Revision 1.6 1999/06/09 00:41:32 sjboddie
92	phrase searching now uses case-folding if it's turned on
93
94	Revision 1.5 1999/02/21 22:31:35 rjmcnab
95
96	Removed locateinfo.
97
98	Revision 1.4 1999/02/03 01:13:27 sjboddie
99
100	Got interface to handle subcollections and language subcollections -
101	committed changes made to some of the collections
102
103	Revision 1.3 1999/01/19 01:38:17 rjmcnab
104
105	Made the source more portable.
106
107	Revision 1.2 1999/01/12 01:51:02 rjmcnab
108
109	Standard header.
110
111	Revision 1.1 1999/01/08 09:02:16 rjmcnab
112
113	Moved from src/library.
114
115	*/
116
117	#include "gsdlconf.h"
118	#include "mgsearch.h"
119	#include "fileutil.h"
120
121	#include <string.h>
122	#include <stdio.h>
123	#include <stdlib.h>
124	#include <ctype.h>
125
126	#if defined(GSDL_USE_OBJECTSPACE)
127	# include <ospace\std\iostream>
128	#elif defined(GSDL_USE_IOS_H)
129	# include <iostream.h>
130	#else
131	# include <iostream>
132	#endif
133
134	#if defined(__WIN32__)
135	// gdbm stuff
136	# include "autoconf.h"
137	# include "systems.h"
138	# include "gdbmconst.h"
139	# include "gdbm.h"
140	#else
141	# include <gdbm.h>
142	#endif
143
144
145	#include <assert.h>
146
147	#include "mgq.h"
148	// #include "locateinfo.h"
149	#include "gsdlunicode.h"
150	#include "unitool.h"
151
152
153	/////////////
154	// globals //
155	/////////////
156
// Hand-over slots for document retrieval: doctextcallback stores the
// pointer and length it is given here, and mgsearchclass::mgdocument
// resets them before a retrieval and reads them back afterwards.
static char *tempdoc = NULL;   // text pointer last passed to doctextcallback
static int templen = 0;        // length that accompanied tempdoc
159
160
161	//////////////////////
162	// useful functions //
163	//////////////////////
164
165
166	// input and output are in utf8
167	text_t mgsearch_stemword (const text_t &word) {
168	// allocate working stem space
169	int maxstemlen = mgq_getmaxstemlen ();
170	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
171	if (word_stem == NULL) return "";
172
173	// copy word to word_stem
174	int len = 0;
175	text_t::const_iterator here = word.begin();
176	text_t::const_iterator end = word.end();
177	while (len < maxstemlen && here != end) {
178	word_stem[len+1] = (unsigned char)(*here);
179	len++; here++;
180	}
181	word_stem[len+1] = '\0';
182	word_stem[0] = len;
183
184	mgq_stemword (word_stem);
185
186	// copy word_stem back to tempstr
187	text_t tempstr;
188	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
189
190	delete [] word_stem;
191
192	return tempstr;
193	}
194
195
196
197	////////////////////////
198	// callback functions //
199	////////////////////////
200
201	// This routine is called for each document found in a search
202	// it assumes that cache_num is set up correctly to point to
203	// a suitable result cache
204	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
205	float Weight, void *info) {
206
207
208	queryresultsclass queryresults = (queryresultsclass )info;
209
210	// append this entry to the document results
211	docresultclass docresult;
212	docresult.docnum = DocNum;
213	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
214	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
215
216	queryresults->docs.docset[DocNum] = docresult;
217	queryresults->docs.docorder.push_back(DocNum);
218
219	return 0;
220	}
221
222	int termequivcallback(char Word, int ULen, int /Freq*/,
223	float /Weight/, void *info) {
224	text_tset equivterms = (text_tset )info;
225	if (equivterms == NULL) return 0;
226
227	text_t thisterm;
228	thisterm.setcarr(Word, ULen);
229
230	equivterms->insert(thisterm);
231
232	return 0;
233	}
234
235
236	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
237	// allocate working stem space
238	int maxstemlen = mgq_getmaxstemlen ();
239	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
240	if (word_stem == NULL) return;
241
242	// copy word to word_stem
243	int len = 0;
244	text_t::const_iterator here = word.begin();
245	text_t::const_iterator end = word.end();
246	while (len < maxstemlen && here != end) {
247	word_stem[len+1] = (unsigned char)(*here);
248	len++; here++;
249	}
250	word_stem[len+1] = '\0';
251	word_stem[0] = len;
252
253	// get the equivalent terms
254	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
255
256	delete [] word_stem;
257
258	return;
259	}
260
261	text_tset utf8equivterms; // kept as utf8 string for fast matching
262
263
264	// This callback is called once for each term in the query
265	int termfreqcallback(char *Word, int ULen, int Freq,
266	float /Weight/, void *info) {
267	queryresultsclass queryresults = (queryresultsclass )info;
268	if (queryresults == NULL) return 0;
269
270	text_t term;
271	term.setcarr(Word, ULen);
272	termfreqclass termfreq;
273
274	termfreq.termstr = to_uni(term);
275	text_t utf8termstem = mgsearch_stemword (term);
276	termfreq.termstemstr = to_uni (utf8termstem);
277
278	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
279
280	termfreq.termfreq = Freq;
281	queryresults->orgterms.push_back(termfreq);
282
283	return 0;
284	}
285
286	// this callback is called once for each variation of each term
287	int termvariantscallback(char Word, int ULen, int /Freq*/,
288	float /Weight/, void *info) {
289
290	text_t term;
291	term.setcarr(Word, ULen);
292	queryresultsclass queryresults = (queryresultsclass )info;
293	queryresults->termvariants.insert(to_uni(term));
294
295	return 0;
296	}
297
298	// This callback is for getting document text
299	int doctextcallback(char Doc, int ULen, int /Freq*/,
300	float /Weight/, void * /info/) {
301	tempdoc = Doc;
302	templen = ULen;
303
304	return 0;
305	}
306
307
308	static text_t getindexsuffix (const text_t &collection,
309	const text_t &index) {
310
311	text_t indexsuffix = "index";
312	indexsuffix = filename_cat (indexsuffix, index);
313	indexsuffix = filename_cat (indexsuffix, collection);
314	return indexsuffix;
315	}
316
317
318
319
320	////////////////////
321	// mgsearch class //
322	////////////////////
323
324	mgsearchclass::mgsearchclass ()
325	: searchclass() {
326
327	}
328
329	mgsearchclass::~mgsearchclass ()
330	{
331	if (cache != NULL)
332	{
333	delete cache;
334	cache = NULL;
335	}
336	}
337
338	// you only need to use this function before doing any stemming
339	// casefolding and stemming will be set if values for them are
340	// provided (0 or 1).
341	// makeindexcurrent returns true if it was able to load the database
342	bool mgsearchclass::makeindexcurrent (const text_t &index,
343	const text_t &subcollection,
344	const text_t &language,
345	const text_t &collection,
346	int casefolding,
347	int stemming) {
348	bool databaseloaded = true;
349
350	// get the names of the collection, index and text suffixes
351	char *ccollection = collection.getcstr();
352	assert (ccollection != NULL);
353	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
354	assert (idxsuffix != NULL);
355	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
356	assert (txtsuffix != NULL);
357
358	#ifdef __WIN32__
359	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
360	#else
361	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
362	#endif
363
364	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
365	if (casefolding == 0) mgq_ask(".set casefold off");
366	else if (casefolding > 0) mgq_ask(".set casefold on");
367	if (stemming == 0) mgq_ask(".set stem off");
368	else if (stemming > 0) mgq_ask(".set stem on");
369
370	} else databaseloaded = false;
371
372	// free up the c strings
373	delete ccollection;
374	delete idxsuffix;
375	delete txtsuffix;
376	delete ccollectdir;
377
378	return databaseloaded;
379	}
380
381
382	// stem word uses the values set in the last call to makeindexcurrent
383	// to stem the word. It is assumed that word is in unicode
384	text_t mgsearchclass::stemword (const text_t &word) {
385	return to_uni (mgsearch_stemword (to_utf8 (word)));
386	}
387
388	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
389	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
390	}
391
392	/**
393	* search directs the whole execution of the search; a number of other
394	* functions in this class are called as a result, and precondition
395	* checks are also made
396	*/
397	bool mgsearchclass::search(const queryparamclass &queryparams,
398	queryresultsclass &queryresults) {
399	// assert (cache != NULL);
400
401	// clear any previous results
402	queryresults.clear();
403	// first check the cache
404	if (cache != NULL) {
405	if (cache->find(queryparams, queryresults)) return true;
406	}
407	// make sure there is a query to be processed
408	if (!has_unicode_letdig(queryparams.querystring)) return true;
409
410	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
411	queryparams.language, queryparams.collection)) {
412	// initialise the form of results
413	setsearchmode (queryparams);
414
415	// execute the query
416	submitquery (queryparams);
417
418	// retrieve the results
419	getresults (queryparams, queryresults);
420
421	return true;
422	}
423
424	return false;
425	}
426
427
428	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
429	{
430	mgq_ask(".set expert true");
431	mgq_ask(".set sorted_terms true");
432	mgq_ask(".set accumulator_method list");
433	mgq_ask(".set max_accumulators 500000");
434	mgq_ask(".set maxparas 500000");
435	mgq_ask(".set verbatim true");
436	mgq_ask(".unset skip_dump");
437	mgq_ask(".set mode docnums");
438
439	switch (queryparams.search_type)
440	{
441	case 0: mgq_ask(".set query boolean"); break;
442	case 1: mgq_ask(".set query ranked"); break;
443	}
444	switch (queryparams.casefolding)
445	{
446	case 1: mgq_ask(".set casefold on"); break;
447	case 0: mgq_ask(".set casefold off"); break;
448	}
449	switch (queryparams.stemming)
450	{
451	case 1: mgq_ask(".set stem on"); break;
452	case 0: mgq_ask(".set stem off"); break;
453	}
454	mgq_ask(".set heads_length 150");
455
456	if (queryparams.maxdocs == -1) {
457	mgq_ask(".set maxdocs all");
458	} else {
459	char maxdocstr[32];
460	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
461	mgq_ask(maxdocstr);
462	}
463	}
464
465	/**
466	* submitquery constructs the query string (into UTF8 encoding)
467	* and submits it using mgq_ask to the mg search engine. Most
468	* of the processing will be done inside Greenstone
469	*/
470	void mgsearchclass::submitquery (const queryparamclass &queryparams)
471	{
472	// sort out the query string; copy it, remove all special characters
473	// and then convert it to a string in UTF8 format
474	text_t ttquerystring = queryparams.querystring;
475	filterquery (ttquerystring);
476	char *querystring = to_utf8(ttquerystring).getcstr();
477
478	// submit the query
479	mgq_ask(querystring);
480
481	// destroy the temporary character array
482	delete querystring;
483	}
484
485	/**
486	* getrults is called to retrieve the required data on the docs
487	* which responded to the query submitted in submitquery above.
488	*
489	* It calls the local mgquery (mgq) interface to MG several times,
490	* to obtain the document numbers, term frequencies, term variants
491	* etc. All processing of the query will be done by Greenstone
492	* thereafter
493	*/
494	void mgsearchclass::getresults (const queryparamclass &queryparams,
495	queryresultsclass &queryresults) {
496	// get the configuration for the maximum number of documents to
497	// retrieve
498	int howmany = queryparams.maxdocs;
499	if (howmany == -1) howmany = MAXNUMDOCS;
500	mgq_results(result_docnums, 0, howmany,
501	ourquerycallback, (void *)(&queryresults));
502
503	// get the term frequencies
504	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
505	termfreqcallback, (void *)(&queryresults));
506	queryresults.sortuniqqueryterms();
507
508	// get term variants
509	mgq_results(result_terms, 0, MAXNUMTERMS,
510	termvariantscallback, (void *)(&queryresults));
511
512	// get the number of documents retrieved
513	int total_retrieved = 0, is_approx = 0;
514	mgq_docsretrieved (&total_retrieved, &is_approx);
515
516	if (total_retrieved == 0) {
517	// not available (or really was zero)
518	queryresults.docs_matched = queryresults.docs.docset.size();
519	if ((queryparams.maxdocs == -1) \|\|
520	(queryresults.docs_matched < queryparams.maxdocs))
521	queryresults.is_approx = Exact;
522	else
523	queryresults.is_approx = MoreThan;
524	} else {
525	queryresults.docs_matched = total_retrieved;
526	if (is_approx) queryresults.is_approx = Approximate;
527	else queryresults.is_approx = Exact;
528	}
529	}
530
531	/**
532	* Tidies the given querystring, removing special characters
533	*/
534	void mgsearchclass::filterquery (text_t &ttquerystring) {
535	text_t::iterator ithere = ttquerystring.begin ();
536	text_t::iterator itend = ttquerystring.end ();
537
538	// remove all non alphanumeric characters (except
539	// boolean operators
540	while (ithere != itend) {
541	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
542	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
543	(ithere != ')')) (ithere) = ' ';
544	ithere++;
545	}
546	}
547
548
549	// the document text for 'docnum' is placed in 'output'
550	// docTargetDocument returns 'true' if it was able to
551	// try to get a document
552	// collection is needed to see if an index from the
553	// collection is loaded. If no index has been loaded
554	// defaultindex is needed to load one
555	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
556	const text_t &defaultsubcollection,
557	const text_t &defaultlanguage,
558	const text_t &collection,
559	int docnum,
560	text_t &output) {
561	output.clear();
562
563	// get the mg version of the document
564	char *mgdoc = NULL;
565	int doclen = 0;
566	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
567	collection, docnum, mgdoc, doclen)) return false;
568	if (mgdoc == NULL) return false;
569
570	// replace all control-Cs with spaces
571	char *mgdoc_here = mgdoc;
572	char *mgdoc_end = mgdoc + doclen;
573	while (mgdoc_here < mgdoc_end) {
574	if (mgdoc_here == '\x3') mgdoc_here = ' ';
575	mgdoc_here++;
576	}
577
578	// convert this document to unicode
579	utf8inconvertclass inconvert;
580	convertclass::status_t status;
581	inconvert.reset ();
582	inconvert.setinput (mgdoc, doclen);
583	inconvert.convert (output, status);
584
585	return true;
586	}
587
588
589	bool mgsearchclass::mgdocument (const text_t &defaultindex,
590	const text_t &defaultsubcollection,
591	const text_t &defaultlanguage,
592	const text_t &collection,
593	int docnum,
594	char *&UDoc, int &ULen) {
595	int databaseloaded = 0;
596
597	UDoc = NULL; ULen = 0;
598
599	// see if we can make an appropriate database current
600	// char *ccollection = collection.getcstr();
601	// assert (ccollection != NULL);
602	// databaseloaded = load_text_database (ccollection);
603	// delete ccollection;
604
605	// try and load the database
606	// if (!databaseloaded)
607	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
608	defaultlanguage, collection);
609
610	if (databaseloaded) {
611	// retrieve the document from mg
612	char docstr[32];
613	sprintf(docstr, "%i", docnum);
614
615	mgq_ask(".set mode text");
616	mgq_ask(".set query docnums");
617	mgq_ask(docstr);
618
619	tempdoc = NULL;
620	templen = 0;
621	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
622	UDoc = tempdoc;
623	ULen = templen;
624	}
625
626	return (bool)databaseloaded;
627	}
628

Note: See TracBrowser for help on using the repository browser.

Download in other formats: