Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 13780

Last change on this file since 13780 was 13780, checked in by mdewsnip, 17 years ago
GLI/LOCAL LIBRARY: To prevent the problems with the GLI being unable to install newly built collections because the local library is holding files open, much more care needs to be taken to close files (typically the GDBM database and the MG/MGPP index files) after use. Fixed a lot of places where files were being left open.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.5 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#if defined(__WIN32__)
44	// gdbm stuff
45	# include "autoconf.h"
46	# include "systems.h"
47	# include "gdbmconst.h"
48	# include "gdbm.h"
49	#else
50	# include <gdbm.h>
51	#endif
52
53
54	#include <assert.h>
55
56	#include "mgq.h"
57	// #include "locateinfo.h"
58	#include "gsdlunicode.h"
59	#include "unitool.h"
60
61
/////////////
// globals //
/////////////

// Document buffer and length most recently handed to doctextcallback.
// mgdocument resets both before asking mg for a document and reads them
// back afterwards. Nothing in this file allocates or frees the buffer.
static char *tempdoc = NULL;
static int templen = 0;
68
69
70	//////////////////////
71	// useful functions //
72	//////////////////////
73
74
75	// input and output are in utf8
76	text_t mgsearch_stemword (const text_t &word) {
77	// allocate working stem space
78	int maxstemlen = mgq_getmaxstemlen ();
79	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80	if (word_stem == NULL) return "";
81
82	// copy word to word_stem
83	int len = 0;
84	text_t::const_iterator here = word.begin();
85	text_t::const_iterator end = word.end();
86	while (len < maxstemlen && here != end) {
87	word_stem[len+1] = (unsigned char)(*here);
88	++len; ++here;
89	}
90	word_stem[len+1] = '\0';
91	word_stem[0] = len;
92
93	mgq_stemword (word_stem);
94
95	// copy word_stem back to tempstr
96	text_t tempstr;
97	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99	delete [] word_stem;
100
101	return tempstr;
102	}
103
104
105
106	////////////////////////
107	// callback functions //
108	////////////////////////
109
110	// This routine is called for each document found in a search
111	// it assumes that cache_num is set up correctly to point to
112	// a suitable result cache
113	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
114	float Weight, void *info) {
115
116
117	queryresultsclass queryresults = (queryresultsclass )info;
118
119	// append this entry to the document results
120	docresultclass docresult;
121	docresult.docnum = DocNum;
122	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125	queryresults->docs.docset[DocNum] = docresult;
126	queryresults->docs.docorder.push_back(DocNum);
127
128	return 0;
129	}
130
131	int termequivcallback(char Word, int ULen, int /Freq*/,
132	float /Weight/, void *info) {
133	text_tset equivterms = (text_tset )info;
134	if (equivterms == NULL) return 0;
135
136	text_t thisterm;
137	thisterm.setcarr(Word, ULen);
138
139	equivterms->insert(thisterm);
140
141	return 0;
142	}
143
144
145	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146	// allocate working stem space
147	int maxstemlen = mgq_getmaxstemlen ();
148	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149	if (word_stem == NULL) return;
150
151	// copy word to word_stem
152	int len = 0;
153	text_t::const_iterator here = word.begin();
154	text_t::const_iterator end = word.end();
155	while (len < maxstemlen && here != end) {
156	word_stem[len+1] = (unsigned char)(*here);
157	++len; ++here;
158	}
159	word_stem[len+1] = '\0';
160	word_stem[0] = len;
161
162	// get the equivalent terms
163	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165	delete [] word_stem;
166
167	return;
168	}
169
170	text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173	// This callback is called once for each term in the query
174	int termfreqcallback(char *Word, int ULen, int Freq,
175	float /Weight/, void *info) {
176	queryresultsclass queryresults = (queryresultsclass )info;
177	if (queryresults == NULL) return 0;
178
179	text_t term;
180	term.setcarr(Word, ULen);
181	termfreqclass termfreq;
182
183	termfreq.termstr = to_uni(term);
184	text_t utf8termstem = mgsearch_stemword (term);
185	termfreq.termstemstr = to_uni (utf8termstem);
186
187	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189	termfreq.termfreq = Freq;
190	queryresults->orgterms.push_back(termfreq);
191
192	return 0;
193	}
194
195	// this callback is called once for each variation of each term
196	int termvariantscallback(char Word, int ULen, int /Freq*/,
197	float /Weight/, void *info) {
198
199	text_t term;
200	term.setcarr(Word, ULen);
201	queryresultsclass queryresults = (queryresultsclass )info;
202	queryresults->termvariants.insert(to_uni(term));
203
204	return 0;
205	}
206
207	// This callback is for getting document text
208	int doctextcallback(char Doc, int ULen, int /Freq*/,
209	float /Weight/, void * /info/) {
210	tempdoc = Doc;
211	templen = ULen;
212
213	return 0;
214	}
215
216
217	text_t mgsearchclass::getindexsuffix (const text_t &collection,
218	const text_t &index) {
219
220	text_t indexsuffix = "index";
221	indexsuffix = filename_cat (indexsuffix, index);
222	if (indexstem.empty()) {
223	// no index stem, use the coll name
224	indexsuffix = filename_cat (indexsuffix, collection);
225	} else {
226	indexsuffix = filename_cat (indexsuffix, indexstem);
227	}
228	return indexsuffix;
229	}
230
231
232
233
234	////////////////////
235	// mgsearch class //
236	////////////////////
237
238	mgsearchclass::mgsearchclass ()
239	: searchclass() {
240
241	}
242
243	mgsearchclass::~mgsearchclass ()
244	{
245	if (cache != NULL)
246	{
247	delete cache;
248	cache = NULL;
249	}
250	}
251
252	void mgsearchclass::set_indexstem(const text_t &stem) {
253	indexstem = stem;
254
255	}
256
257	// you only need to use this function before doing any stemming
258	// casefolding and stemming will be set if values for them are
259	// provided (0 or 1).
260	// makeindexcurrent returns true if it was able to load the database
261	bool mgsearchclass::makeindexcurrent (const text_t &index,
262	const text_t &subcollection,
263	const text_t &language,
264	const text_t &collection,
265	int casefolding,
266	int stemming) {
267	bool databaseloaded = true;
268
269	// get the names of the collection, index and text suffixes
270	char *ccollection = collection.getcstr();
271	assert (ccollection != NULL);
272	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
273	assert (idxsuffix != NULL);
274	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
275	assert (txtsuffix != NULL);
276	#ifdef __WIN32__
277	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
278	#else
279	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
280	#endif
281
282	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
283	if (casefolding == 0) mgq_ask(".set casefold off");
284	else if (casefolding > 0) mgq_ask(".set casefold on");
285	if (stemming == 0) mgq_ask(".set stem off");
286	else if (stemming > 0) mgq_ask(".set stem on");
287
288	} else databaseloaded = false;
289
290	// free up the c strings
291	delete []ccollection;
292	delete []idxsuffix;
293	delete []txtsuffix;
294	delete []ccollectdir;
295
296	return databaseloaded;
297	}
298
299
300	// stem word uses the values set in the last call to makeindexcurrent
301	// to stem the word. It is assumed that word is in unicode
302	text_t mgsearchclass::stemword (const text_t &word) {
303	return to_uni (mgsearch_stemword (to_utf8 (word)));
304	}
305
306	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
307	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
308	}
309
310	/**
311	* search directs the whole execution of the search; a number of other
312	* functions in this class are called as a result, and precondition
313	* checks are also made
314	*/
315	bool mgsearchclass::search(const queryparamclass &queryparams,
316	queryresultsclass &queryresults) {
317	// assert (cache != NULL);
318
319	// clear any previous results
320	queryresults.clear();
321	// first check the cache
322	if (cache != NULL) {
323	if (cache->find(queryparams, queryresults)) return true;
324	}
325	// make sure there is a query to be processed
326	if (!has_unicode_letdig(queryparams.querystring)) return true;
327
328	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
329	queryparams.language, queryparams.collection)) {
330	// initialise the form of results
331	setsearchmode (queryparams);
332
333	// execute the query
334	submitquery (queryparams);
335
336	// retrieve the results
337	getresults (queryparams, queryresults);
338	unload_database(); // Important that local library doesn't leave any files open
339	return true;
340	}
341
342	return false;
343	}
344
345	/* accumulator_method has been changed to use array rather than list.
346	list appears to be broken somewhat - for some ranked queries, it returned
347	fewer results than it should have (eg 45 instead of 50). The three other
348	methods (array, splay_tree, hash_table) all return the same number of
349	documents, in the same order, with the same ranks. list returns what
350	appears to be the same documents (but less of them), but with different ranks,
351	and in a different order. Minimal time tests dont show any speed improvement
352	of list over array (maybe because its broken??). [02/2001, kjm18]
353
354	... [sjboddie, also 02/2001] turns out that changing the accumulator_method
355	introduced a more serious bug than it fixed (i.e. occasionally when doing a
356	ranked search for a very common word you get no results at all). I've
357	changed it back to list for now, one day we should play with other
358	accumulator_methods but for now I don't have time and don't want to risk
359	introducing bugs (better the devil you know ;)
360	*/
361	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
362	{
363	mgq_ask(".set expert true");
364	mgq_ask(".set sorted_terms true");
365	mgq_ask(".set accumulator_method list");
366	mgq_ask(".set max_accumulators 500000");
367	mgq_ask(".set maxparas 500000");
368	mgq_ask(".set verbatim true");
369	mgq_ask(".unset skip_dump");
370	mgq_ask(".set mode docnums");
371
372	switch (queryparams.search_type)
373	{
374	case 0: mgq_ask(".set query boolean"); break;
375	case 1: mgq_ask(".set query ranked"); break;
376	}
377	switch (queryparams.casefolding)
378	{
379	case 1: mgq_ask(".set casefold on"); break;
380	case 0: mgq_ask(".set casefold off"); break;
381	}
382	switch (queryparams.stemming)
383	{
384	case 1: mgq_ask(".set stem on"); break;
385	case 0: mgq_ask(".set stem off"); break;
386	}
387	mgq_ask(".set heads_length 150");
388
389	if (queryparams.maxdocs == -1) {
390	mgq_ask(".set maxdocs all");
391	} else {
392	char maxdocstr[32];
393	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
394	mgq_ask(maxdocstr);
395	}
396
397	char maxnumericstr[32];
398	sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
399	mgq_ask(maxnumericstr);
400
401	}
402
403	/**
404	* submitquery constructs the query string (into UTF8 encoding)
405	* and submits it using mgq_ask to the mg search engine. Most
406	* of the processing will be done inside Greenstone
407	*/
408	void mgsearchclass::submitquery (const queryparamclass &queryparams)
409	{
410	// sort out the query string; copy it, remove all special characters
411	// and then convert it to a string in UTF8 format
412	text_t ttquerystring = queryparams.querystring;
413	filterquery (ttquerystring);
414	char *querystring = to_utf8(ttquerystring).getcstr();
415
416	// submit the query
417	mgq_ask(querystring);
418
419	// destroy the temporary character array
420	delete []querystring;
421	}
422
423	/**
424	* getrults is called to retrieve the required data on the docs
425	* which responded to the query submitted in submitquery above.
426	*
427	* It calls the local mgquery (mgq) interface to MG several times,
428	* to obtain the document numbers, term frequencies, term variants
429	* etc. All processing of the query will be done by Greenstone
430	* thereafter
431	*/
432	void mgsearchclass::getresults (const queryparamclass &queryparams,
433	queryresultsclass &queryresults) {
434	// get the configuration for the maximum number of documents to
435	// retrieve
436	int howmany = queryparams.maxdocs;
437	if (howmany == -1) howmany = MAXNUMDOCS;
438	mgq_results(result_docnums, 0, howmany,
439	ourquerycallback, (void *)(&queryresults));
440
441	// get the term frequencies
442	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
443	termfreqcallback, (void *)(&queryresults));
444	queryresults.sortuniqqueryterms();
445
446	// get term variants
447	mgq_results(result_terms, 0, MAXNUMTERMS,
448	termvariantscallback, (void *)(&queryresults));
449
450	// get the number of documents retrieved
451	int total_retrieved = 0, is_approx = 0;
452	mgq_docsretrieved (&total_retrieved, &is_approx);
453
454	if (total_retrieved == 0) {
455	// not available (or really was zero)
456	queryresults.docs_matched = queryresults.docs.docset.size();
457	if ((queryparams.maxdocs == -1) \|\|
458	(queryresults.docs_matched < queryparams.maxdocs))
459	queryresults.is_approx = Exact;
460	else
461	queryresults.is_approx = MoreThan;
462	} else {
463	queryresults.docs_matched = total_retrieved;
464	if (is_approx) queryresults.is_approx = Approximate;
465	else queryresults.is_approx = Exact;
466	}
467	}
468
469	/**
470	* Tidies the given querystring, removing special characters
471	*/
472	void mgsearchclass::filterquery (text_t &ttquerystring) {
473	text_t::iterator ithere = ttquerystring.begin ();
474	text_t::iterator itend = ttquerystring.end ();
475
476	// remove all non alphanumeric characters (except
477	// boolean operators
478	while (ithere != itend) {
479	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
480	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
481	(ithere != ')')) (ithere) = ' ';
482	++ithere;
483	}
484	}
485
486
487	// the document text for 'docnum' is placed in 'output'
488	// docTargetDocument returns 'true' if it was able to
489	// try to get a document
490	// collection is needed to see if an index from the
491	// collection is loaded. If no index has been loaded
492	// defaultindex is needed to load one
493	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
494	const text_t &defaultsubcollection,
495	const text_t &defaultlanguage,
496	const text_t &collection,
497	int docnum,
498	text_t &output) {
499	output.clear();
500
501	// get the mg version of the document
502	char *mgdoc = NULL;
503	int doclen = 0;
504	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
505	collection, docnum, mgdoc, doclen)) return false;
506	if (mgdoc == NULL) return false;
507
508	// replace all control-Cs with spaces
509	char *mgdoc_here = mgdoc;
510	char *mgdoc_end = mgdoc + doclen;
511	while (mgdoc_here < mgdoc_end) {
512	if (mgdoc_here == '\x3') mgdoc_here = ' ';
513	++mgdoc_here;
514	}
515
516	// convert this document to unicode
517	utf8inconvertclass inconvert;
518	convertclass::status_t status;
519	inconvert.reset ();
520	inconvert.setinput (mgdoc, doclen);
521	inconvert.convert (output, status);
522
523	return true;
524	}
525
526
527	bool mgsearchclass::mgdocument (const text_t &defaultindex,
528	const text_t &defaultsubcollection,
529	const text_t &defaultlanguage,
530	const text_t &collection,
531	int docnum,
532	char *&UDoc, int &ULen) {
533	int databaseloaded = 0;
534
535	UDoc = NULL; ULen = 0;
536
537	// see if we can make an appropriate database current
538	// char *ccollection = collection.getcstr();
539	// assert (ccollection != NULL);
540	// databaseloaded = load_text_database (ccollection);
541	// delete []ccollection;
542
543	// try and load the database
544	// if (!databaseloaded)
545	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
546	defaultlanguage, collection);
547
548	if (databaseloaded) {
549	// retrieve the document from mg
550	char docstr[32];
551	sprintf(docstr, "%i", docnum);
552
553	mgq_ask(".set mode text");
554	mgq_ask(".set query docnums");
555	mgq_ask(docstr);
556
557	tempdoc = NULL;
558	templen = 0;
559	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
560	UDoc = tempdoc;
561	ULen = templen;
562	}
563
564	unload_database(); // Important that local library doesn't leave any files open
565	return (bool)databaseloaded;
566	}
567
568	// unload_database simply calls mgq's close_all_databases function to clear
569	// any cached databases - this is useful when attempting to completely
570	// remove all trace of a collectionserver at runtime (when using a
571	// persistent version of Greenstone like the windows local library)
572	void mgsearchclass::unload_database () {
573	close_all_databases();
574	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: