Context Navigation

source: gsdl/trunk/src/colservr/mgsearch.cpp@ 15580

Last change on this file since 15580 was 13789, checked in by mdewsnip, 17 years ago
Fixed a problem with my previous change causing the local library to often crash when displaying a document. This was due to the document text being in memory that was deleted when the database was unloaded.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.6 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#if defined(__WIN32__)
44	// gdbm stuff
45	# include "autoconf.h"
46	# include "systems.h"
47	# include "gdbmconst.h"
48	# include "gdbm.h"
49	#else
50	# include <gdbm.h>
51	#endif
52
53
54	#include <assert.h>
55
56	#include "mgq.h"
57	// #include "locateinfo.h"
58	#include "gsdlunicode.h"
59	#include "unitool.h"
60
61
62	/////////////
63	// globals //
64	/////////////
65
66	static char *tempdoc = NULL; // buffer allocated with new[] by doctextcallback; mgdocument resets it to NULL before each retrieval and hands it to its caller
67	static int templen = 0; // ULen passed to the most recent doctextcallback call; reset to 0 by mgdocument before each retrieval
68
69
70	//////////////////////
71	// useful functions //
72	//////////////////////
73
74
75	// input and output are in utf8
76	text_t mgsearch_stemword (const text_t &word) {
77	// allocate working stem space
78	int maxstemlen = mgq_getmaxstemlen ();
79	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80	if (word_stem == NULL) return "";
81
82	// copy word to word_stem
83	int len = 0;
84	text_t::const_iterator here = word.begin();
85	text_t::const_iterator end = word.end();
86	while (len < maxstemlen && here != end) {
87	word_stem[len+1] = (unsigned char)(*here);
88	++len; ++here;
89	}
90	word_stem[len+1] = '\0';
91	word_stem[0] = len;
92
93	mgq_stemword (word_stem);
94
95	// copy word_stem back to tempstr
96	text_t tempstr;
97	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99	delete [] word_stem;
100
101	return tempstr;
102	}
103
104
105
106	////////////////////////
107	// callback functions //
108	////////////////////////
109
110	// This routine is called for each document found in a search
111	// it assumes that cache_num is set up correctly to point to
112	// a suitable result cache
113	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
114	float Weight, void *info) {
115
116
117	queryresultsclass queryresults = (queryresultsclass )info;
118
119	// append this entry to the document results
120	docresultclass docresult;
121	docresult.docnum = DocNum;
122	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125	queryresults->docs.docset[DocNum] = docresult;
126	queryresults->docs.docorder.push_back(DocNum);
127
128	return 0;
129	}
130
131	int termequivcallback(char Word, int ULen, int /Freq*/,
132	float /Weight/, void *info) {
133	text_tset equivterms = (text_tset )info;
134	if (equivterms == NULL) return 0;
135
136	text_t thisterm;
137	thisterm.setcarr(Word, ULen);
138
139	equivterms->insert(thisterm);
140
141	return 0;
142	}
143
144
145	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146	// allocate working stem space
147	int maxstemlen = mgq_getmaxstemlen ();
148	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149	if (word_stem == NULL) return;
150
151	// copy word to word_stem
152	int len = 0;
153	text_t::const_iterator here = word.begin();
154	text_t::const_iterator end = word.end();
155	while (len < maxstemlen && here != end) {
156	word_stem[len+1] = (unsigned char)(*here);
157	++len; ++here;
158	}
159	word_stem[len+1] = '\0';
160	word_stem[0] = len;
161
162	// get the equivalent terms
163	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165	delete [] word_stem;
166
167	return;
168	}
169
170	text_tset utf8equivterms; // kept as utf8 string for fast matching. NOTE(review): no function visible in this file references this global (termfreqcallback fills the utf8equivterms member of termfreqclass instead) -- confirm it is still needed
171
172
173	// This callback is called once for each term in the query
174	int termfreqcallback(char *Word, int ULen, int Freq,
175	float /Weight/, void *info) {
176	queryresultsclass queryresults = (queryresultsclass )info;
177	if (queryresults == NULL) return 0;
178
179	text_t term;
180	term.setcarr(Word, ULen);
181	termfreqclass termfreq;
182
183	termfreq.termstr = to_uni(term);
184	text_t utf8termstem = mgsearch_stemword (term);
185	termfreq.termstemstr = to_uni (utf8termstem);
186
187	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189	termfreq.termfreq = Freq;
190	queryresults->orgterms.push_back(termfreq);
191
192	return 0;
193	}
194
195	// this callback is called once for each variation of each term
196	int termvariantscallback(char Word, int ULen, int /Freq*/,
197	float /Weight/, void *info) {
198
199	text_t term;
200	term.setcarr(Word, ULen);
201	queryresultsclass queryresults = (queryresultsclass )info;
202	queryresults->termvariants.insert(to_uni(term));
203
204	return 0;
205	}
206
207	// This callback is for getting document text
208	int doctextcallback(char Doc, int ULen, int /Freq*/,
209	float /Weight/, void * /info/) {
210	if (Doc != NULL) {
211	// Make a copy of this string so we can unload the database without losing it
212	tempdoc = new char[ULen + 1];
213	strcpy(tempdoc, Doc);
214	}
215	templen = ULen;
216
217	return 0;
218	}
219
220
221	text_t mgsearchclass::getindexsuffix (const text_t &collection,
222	const text_t &index) {
223
224	text_t indexsuffix = "index";
225	indexsuffix = filename_cat (indexsuffix, index);
226	if (indexstem.empty()) {
227	// no index stem, use the coll name
228	indexsuffix = filename_cat (indexsuffix, collection);
229	} else {
230	indexsuffix = filename_cat (indexsuffix, indexstem);
231	}
232	return indexsuffix;
233	}
234
235
236
237
238	////////////////////
239	// mgsearch class //
240	////////////////////
241
242	mgsearchclass::mgsearchclass ()
243	: searchclass() {
244
245	}
246
247	mgsearchclass::~mgsearchclass ()
248	{
249	if (cache != NULL)
250	{
251	delete cache;
252	cache = NULL;
253	}
254	}
255
256	void mgsearchclass::set_indexstem(const text_t &stem) {
257	indexstem = stem;
258
259	}
260
261	// you only need to use this function before doing any stemming
262	// casefolding and stemming will be set if values for them are
263	// provided (0 or 1).
264	// makeindexcurrent returns true if it was able to load the database
265	bool mgsearchclass::makeindexcurrent (const text_t &index,
266	const text_t &subcollection,
267	const text_t &language,
268	const text_t &collection,
269	int casefolding,
270	int stemming) {
271	bool databaseloaded = true;
272
273	// get the names of the collection, index and text suffixes
274	char *ccollection = collection.getcstr();
275	assert (ccollection != NULL);
276	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
277	assert (idxsuffix != NULL);
278	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
279	assert (txtsuffix != NULL);
280	#ifdef __WIN32__
281	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
282	#else
283	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
284	#endif
285
286	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
287	if (casefolding == 0) mgq_ask(".set casefold off");
288	else if (casefolding > 0) mgq_ask(".set casefold on");
289	if (stemming == 0) mgq_ask(".set stem off");
290	else if (stemming > 0) mgq_ask(".set stem on");
291
292	} else databaseloaded = false;
293
294	// free up the c strings
295	delete []ccollection;
296	delete []idxsuffix;
297	delete []txtsuffix;
298	delete []ccollectdir;
299
300	return databaseloaded;
301	}
302
303
304	// stem word uses the values set in the last call to makeindexcurrent
305	// to stem the word. It is assumed that word is in unicode
306	text_t mgsearchclass::stemword (const text_t &word) {
307	return to_uni (mgsearch_stemword (to_utf8 (word)));
308	}
309
310	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
311	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
312	}
313
314	/**
315	* search directs the whole execution of the search; a number of other
316	* functions in this class are called as a result, and precondition
317	* checks are also made
318	*/
319	bool mgsearchclass::search(const queryparamclass &queryparams,
320	queryresultsclass &queryresults) {
321	// assert (cache != NULL);
322
323	// clear any previous results
324	queryresults.clear();
325	// first check the cache
326	if (cache != NULL) {
327	if (cache->find(queryparams, queryresults)) return true;
328	}
329	// make sure there is a query to be processed
330	if (!has_unicode_letdig(queryparams.querystring)) return true;
331
332	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
333	queryparams.language, queryparams.collection)) {
334	// initialise the form of results
335	setsearchmode (queryparams);
336
337	// execute the query
338	submitquery (queryparams);
339
340	// retrieve the results
341	getresults (queryparams, queryresults);
342	unload_database(); // Important that local library doesn't leave any files open
343	return true;
344	}
345
346	return false;
347	}
348
349	/* accumulator_method has been changed to use array rather than list.
350	list appears to be broken somewhat - for some ranked queries, it returned
351	fewer results than it should have (eg 45 instead of 50). The three other
352	methods (array, splay_tree, hash_table) all return the same number of
353	documents, in the same order, with the same ranks. list returns what
354	appears to be the same documents (but less of them), but with different ranks,
355	and in a different order. Minimal time tests dont show any speed improvement
356	of list over array (maybe because its broken??). [02/2001, kjm18]
357
358	... [sjboddie, also 02/2001] turns out that changing the accumulator_method
359	introduced a more serious bug than it fixed (i.e. occasionally when doing a
360	ranked search for a very common word you get no results at all). I've
361	changed it back to list for now, one day we should play with other
362	accumulator_methods but for now I don't have time and don't want to risk
363	introducing bugs (better the devil you know ;)
364	*/
365	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
366	{
367	mgq_ask(".set expert true");
368	mgq_ask(".set sorted_terms true");
369	mgq_ask(".set accumulator_method list");
370	mgq_ask(".set max_accumulators 500000");
371	mgq_ask(".set maxparas 500000");
372	mgq_ask(".set verbatim true");
373	mgq_ask(".unset skip_dump");
374	mgq_ask(".set mode docnums");
375
376	switch (queryparams.search_type)
377	{
378	case 0: mgq_ask(".set query boolean"); break;
379	case 1: mgq_ask(".set query ranked"); break;
380	}
381	switch (queryparams.casefolding)
382	{
383	case 1: mgq_ask(".set casefold on"); break;
384	case 0: mgq_ask(".set casefold off"); break;
385	}
386	switch (queryparams.stemming)
387	{
388	case 1: mgq_ask(".set stem on"); break;
389	case 0: mgq_ask(".set stem off"); break;
390	}
391	mgq_ask(".set heads_length 150");
392
393	if (queryparams.maxdocs == -1) {
394	mgq_ask(".set maxdocs all");
395	} else {
396	char maxdocstr[32];
397	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
398	mgq_ask(maxdocstr);
399	}
400
401	char maxnumericstr[32];
402	sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
403	mgq_ask(maxnumericstr);
404
405	}
406
407	/**
408	* submitquery constructs the query string (into UTF8 encoding)
409	* and submits it using mgq_ask to the mg search engine. Most
410	* of the processing will be done inside Greenstone
411	*/
412	void mgsearchclass::submitquery (const queryparamclass &queryparams)
413	{
414	// sort out the query string; copy it, remove all special characters
415	// and then convert it to a string in UTF8 format
416	text_t ttquerystring = queryparams.querystring;
417	filterquery (ttquerystring);
418	char *querystring = to_utf8(ttquerystring).getcstr();
419
420	// submit the query
421	mgq_ask(querystring);
422
423	// destroy the temporary character array
424	delete []querystring;
425	}
426
427	/**
428	* getrults is called to retrieve the required data on the docs
429	* which responded to the query submitted in submitquery above.
430	*
431	* It calls the local mgquery (mgq) interface to MG several times,
432	* to obtain the document numbers, term frequencies, term variants
433	* etc. All processing of the query will be done by Greenstone
434	* thereafter
435	*/
436	void mgsearchclass::getresults (const queryparamclass &queryparams,
437	queryresultsclass &queryresults) {
438	// get the configuration for the maximum number of documents to
439	// retrieve
440	int howmany = queryparams.maxdocs;
441	if (howmany == -1) howmany = MAXNUMDOCS;
442	mgq_results(result_docnums, 0, howmany,
443	ourquerycallback, (void *)(&queryresults));
444
445	// get the term frequencies
446	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
447	termfreqcallback, (void *)(&queryresults));
448	queryresults.sortuniqqueryterms();
449
450	// get term variants
451	mgq_results(result_terms, 0, MAXNUMTERMS,
452	termvariantscallback, (void *)(&queryresults));
453
454	// get the number of documents retrieved
455	int total_retrieved = 0, is_approx = 0;
456	mgq_docsretrieved (&total_retrieved, &is_approx);
457
458	if (total_retrieved == 0) {
459	// not available (or really was zero)
460	queryresults.docs_matched = queryresults.docs.docset.size();
461	if ((queryparams.maxdocs == -1) \|\|
462	(queryresults.docs_matched < queryparams.maxdocs))
463	queryresults.is_approx = Exact;
464	else
465	queryresults.is_approx = MoreThan;
466	} else {
467	queryresults.docs_matched = total_retrieved;
468	if (is_approx) queryresults.is_approx = Approximate;
469	else queryresults.is_approx = Exact;
470	}
471	}
472
473	/**
474	* Tidies the given querystring, removing special characters
475	*/
476	void mgsearchclass::filterquery (text_t &ttquerystring) {
477	text_t::iterator ithere = ttquerystring.begin ();
478	text_t::iterator itend = ttquerystring.end ();
479
480	// remove all non alphanumeric characters (except
481	// boolean operators
482	while (ithere != itend) {
483	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
484	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
485	(ithere != ')')) (ithere) = ' ';
486	++ithere;
487	}
488	}
489
490
491	// the document text for 'docnum' is placed in 'output'
492	// docTargetDocument returns 'true' if it was able to
493	// try to get a document
494	// collection is needed to see if an index from the
495	// collection is loaded. If no index has been loaded
496	// defaultindex is needed to load one
497	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
498	const text_t &defaultsubcollection,
499	const text_t &defaultlanguage,
500	const text_t &collection,
501	int docnum,
502	text_t &output) {
503	output.clear();
504
505	// get the mg version of the document
506	char *mgdoc = NULL;
507	int doclen = 0;
508	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
509	collection, docnum, mgdoc, doclen)) return false;
510	if (mgdoc == NULL) return false;
511
512	// replace all control-Cs with spaces
513	char *mgdoc_here = mgdoc;
514	char *mgdoc_end = mgdoc + doclen;
515	while (mgdoc_here < mgdoc_end) {
516	if (mgdoc_here == '\x3') mgdoc_here = ' ';
517	++mgdoc_here;
518	}
519
520	// convert this document to unicode
521	utf8inconvertclass inconvert;
522	convertclass::status_t status;
523	inconvert.reset ();
524	inconvert.setinput (mgdoc, doclen);
525	inconvert.convert (output, status);
526
527	delete[] mgdoc;
528	return true;
529	}
530
531
532	bool mgsearchclass::mgdocument (const text_t &defaultindex,
533	const text_t &defaultsubcollection,
534	const text_t &defaultlanguage,
535	const text_t &collection,
536	int docnum,
537	char *&UDoc, int &ULen) {
538	int databaseloaded = 0;
539
540	UDoc = NULL; ULen = 0;
541
542	// see if we can make an appropriate database current
543	// char *ccollection = collection.getcstr();
544	// assert (ccollection != NULL);
545	// databaseloaded = load_text_database (ccollection);
546	// delete []ccollection;
547
548	// try and load the database
549	// if (!databaseloaded)
550	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
551	defaultlanguage, collection);
552
553	if (databaseloaded) {
554	// retrieve the document from mg
555	char docstr[32];
556	sprintf(docstr, "%i", docnum);
557
558	mgq_ask(".set mode text");
559	mgq_ask(".set query docnums");
560	mgq_ask(docstr);
561
562	tempdoc = NULL;
563	templen = 0;
564	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
565	UDoc = tempdoc;
566	ULen = templen;
567	}
568
569	unload_database(); // Important that local library doesn't leave any files open
570	return (bool)databaseloaded;
571	}
572
573	// unload_database simply calls mgq's close_all_databases function to clear
574	// any cached databases - this is useful when attempting to completely
575	// remove all trace of a collectionserver at runtime (when using a
576	// persistent version of Greenstone like the windows local library)
577	void mgsearchclass::unload_database () {
578	close_all_databases();
579	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: