Context Navigation

source: gsdl/trunk/src/colservr/mgsearch.cpp@ 15757

Last change on this file since 15757 was 15590, checked in by mdewsnip, 16 years ago
Removed unnecessary inclusions of the gdbm headers.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.5 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43
44	#include <assert.h>
45
46	#include "mgq.h"
47	// #include "locateinfo.h"
48	#include "gsdlunicode.h"
49	#include "unitool.h"
50
51
52	/////////////
53	// globals //
54	/////////////
55
// Hand-over slots between doctextcallback (which fills them in) and
// mgsearchclass::mgdocument (which resets them before a retrieval and
// reads them afterwards).
static int templen = 0;       // length reported for the last document
static char *tempdoc = NULL;  // heap copy of the last document text
58
59
60	//////////////////////
61	// useful functions //
62	//////////////////////
63
64
65	// input and output are in utf8
66	text_t mgsearch_stemword (const text_t &word) {
67	// allocate working stem space
68	int maxstemlen = mgq_getmaxstemlen ();
69	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
70	if (word_stem == NULL) return "";
71
72	// copy word to word_stem
73	int len = 0;
74	text_t::const_iterator here = word.begin();
75	text_t::const_iterator end = word.end();
76	while (len < maxstemlen && here != end) {
77	word_stem[len+1] = (unsigned char)(*here);
78	++len; ++here;
79	}
80	word_stem[len+1] = '\0';
81	word_stem[0] = len;
82
83	mgq_stemword (word_stem);
84
85	// copy word_stem back to tempstr
86	text_t tempstr;
87	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
88
89	delete [] word_stem;
90
91	return tempstr;
92	}
93
94
95
96	////////////////////////
97	// callback functions //
98	////////////////////////
99
100	// This routine is called for each document found in a search
101	// it assumes that cache_num is set up correctly to point to
102	// a suitable result cache
103	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
104	float Weight, void *info) {
105
106
107	queryresultsclass queryresults = (queryresultsclass )info;
108
109	// append this entry to the document results
110	docresultclass docresult;
111	docresult.docnum = DocNum;
112	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
113	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
114
115	queryresults->docs.docset[DocNum] = docresult;
116	queryresults->docs.docorder.push_back(DocNum);
117
118	return 0;
119	}
120
121	int termequivcallback(char Word, int ULen, int /Freq*/,
122	float /Weight/, void *info) {
123	text_tset equivterms = (text_tset )info;
124	if (equivterms == NULL) return 0;
125
126	text_t thisterm;
127	thisterm.setcarr(Word, ULen);
128
129	equivterms->insert(thisterm);
130
131	return 0;
132	}
133
134
135	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
136	// allocate working stem space
137	int maxstemlen = mgq_getmaxstemlen ();
138	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
139	if (word_stem == NULL) return;
140
141	// copy word to word_stem
142	int len = 0;
143	text_t::const_iterator here = word.begin();
144	text_t::const_iterator end = word.end();
145	while (len < maxstemlen && here != end) {
146	word_stem[len+1] = (unsigned char)(*here);
147	++len; ++here;
148	}
149	word_stem[len+1] = '\0';
150	word_stem[0] = len;
151
152	// get the equivalent terms
153	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
154
155	delete [] word_stem;
156
157	return;
158	}
159
160	text_tset utf8equivterms; // kept as utf8 string for fast matching
161
162
163	// This callback is called once for each term in the query
164	int termfreqcallback(char *Word, int ULen, int Freq,
165	float /Weight/, void *info) {
166	queryresultsclass queryresults = (queryresultsclass )info;
167	if (queryresults == NULL) return 0;
168
169	text_t term;
170	term.setcarr(Word, ULen);
171	termfreqclass termfreq;
172
173	termfreq.termstr = to_uni(term);
174	text_t utf8termstem = mgsearch_stemword (term);
175	termfreq.termstemstr = to_uni (utf8termstem);
176
177	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
178
179	termfreq.termfreq = Freq;
180	queryresults->orgterms.push_back(termfreq);
181
182	return 0;
183	}
184
185	// this callback is called once for each variation of each term
186	int termvariantscallback(char Word, int ULen, int /Freq*/,
187	float /Weight/, void *info) {
188
189	text_t term;
190	term.setcarr(Word, ULen);
191	queryresultsclass queryresults = (queryresultsclass )info;
192	queryresults->termvariants.insert(to_uni(term));
193
194	return 0;
195	}
196
197	// This callback is for getting document text
198	int doctextcallback(char Doc, int ULen, int /Freq*/,
199	float /Weight/, void * /info/) {
200	if (Doc != NULL) {
201	// Make a copy of this string so we can unload the database without losing it
202	tempdoc = new char[ULen + 1];
203	strcpy(tempdoc, Doc);
204	}
205	templen = ULen;
206
207	return 0;
208	}
209
210
211	text_t mgsearchclass::getindexsuffix (const text_t &collection,
212	const text_t &index) {
213
214	text_t indexsuffix = "index";
215	indexsuffix = filename_cat (indexsuffix, index);
216	if (indexstem.empty()) {
217	// no index stem, use the coll name
218	indexsuffix = filename_cat (indexsuffix, collection);
219	} else {
220	indexsuffix = filename_cat (indexsuffix, indexstem);
221	}
222	return indexsuffix;
223	}
224
225
226
227
228	////////////////////
229	// mgsearch class //
230	////////////////////
231
232	mgsearchclass::mgsearchclass ()
233	: searchclass() {
234
235	}
236
237	mgsearchclass::~mgsearchclass ()
238	{
239	if (cache != NULL)
240	{
241	delete cache;
242	cache = NULL;
243	}
244	}
245
246	void mgsearchclass::set_indexstem(const text_t &stem) {
247	indexstem = stem;
248
249	}
250
251	// you only need to use this function before doing any stemming
252	// casefolding and stemming will be set if values for them are
253	// provided (0 or 1).
254	// makeindexcurrent returns true if it was able to load the database
255	bool mgsearchclass::makeindexcurrent (const text_t &index,
256	const text_t &subcollection,
257	const text_t &language,
258	const text_t &collection,
259	int casefolding,
260	int stemming) {
261	bool databaseloaded = true;
262
263	// get the names of the collection, index and text suffixes
264	char *ccollection = collection.getcstr();
265	assert (ccollection != NULL);
266	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
267	assert (idxsuffix != NULL);
268	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
269	assert (txtsuffix != NULL);
270	#ifdef __WIN32__
271	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
272	#else
273	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
274	#endif
275
276	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
277	if (casefolding == 0) mgq_ask(".set casefold off");
278	else if (casefolding > 0) mgq_ask(".set casefold on");
279	if (stemming == 0) mgq_ask(".set stem off");
280	else if (stemming > 0) mgq_ask(".set stem on");
281
282	} else databaseloaded = false;
283
284	// free up the c strings
285	delete []ccollection;
286	delete []idxsuffix;
287	delete []txtsuffix;
288	delete []ccollectdir;
289
290	return databaseloaded;
291	}
292
293
294	// stem word uses the values set in the last call to makeindexcurrent
295	// to stem the word. It is assumed that word is in unicode
296	text_t mgsearchclass::stemword (const text_t &word) {
297	return to_uni (mgsearch_stemword (to_utf8 (word)));
298	}
299
300	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
301	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
302	}
303
304	/**
305	* search directs the whole execution of the search; a number of other
306	* functions in this class are called as a result, and precondition
307	* checks are also made
308	*/
309	bool mgsearchclass::search(const queryparamclass &queryparams,
310	queryresultsclass &queryresults) {
311	// assert (cache != NULL);
312
313	// clear any previous results
314	queryresults.clear();
315	// first check the cache
316	if (cache != NULL) {
317	if (cache->find(queryparams, queryresults)) return true;
318	}
319	// make sure there is a query to be processed
320	if (!has_unicode_letdig(queryparams.querystring)) return true;
321
322	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
323	queryparams.language, queryparams.collection)) {
324	// initialise the form of results
325	setsearchmode (queryparams);
326
327	// execute the query
328	submitquery (queryparams);
329
330	// retrieve the results
331	getresults (queryparams, queryresults);
332	unload_database(); // Important that local library doesn't leave any files open
333	return true;
334	}
335
336	return false;
337	}
338
339	/* accumulator_method has been changed to use array rather than list.
340	list appears to be broken somewhat - for some ranked queries, it returned
341	fewer results than it should have (eg 45 instead of 50). The three other
342	methods (array, splay_tree, hash_table) all return the same number of
343	documents, in the same order, with the same ranks. list returns what
344	appears to be the same documents (but less of them), but with different ranks,
345	and in a different order. Minimal time tests dont show any speed improvement
346	of list over array (maybe because its broken??). [02/2001, kjm18]
347
348	... [sjboddie, also 02/2001] turns out that changing the accumulator_method
349	introduced a more serious bug than it fixed (i.e. occasionally when doing a
350	ranked search for a very common word you get no results at all). I've
351	changed it back to list for now, one day we should play with other
352	accumulator_methods but for now I don't have time and don't want to risk
353	introducing bugs (better the devil you know ;)
354	*/
355	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
356	{
357	mgq_ask(".set expert true");
358	mgq_ask(".set sorted_terms true");
359	mgq_ask(".set accumulator_method list");
360	mgq_ask(".set max_accumulators 500000");
361	mgq_ask(".set maxparas 500000");
362	mgq_ask(".set verbatim true");
363	mgq_ask(".unset skip_dump");
364	mgq_ask(".set mode docnums");
365
366	switch (queryparams.search_type)
367	{
368	case 0: mgq_ask(".set query boolean"); break;
369	case 1: mgq_ask(".set query ranked"); break;
370	}
371	switch (queryparams.casefolding)
372	{
373	case 1: mgq_ask(".set casefold on"); break;
374	case 0: mgq_ask(".set casefold off"); break;
375	}
376	switch (queryparams.stemming)
377	{
378	case 1: mgq_ask(".set stem on"); break;
379	case 0: mgq_ask(".set stem off"); break;
380	}
381	mgq_ask(".set heads_length 150");
382
383	if (queryparams.maxdocs == -1) {
384	mgq_ask(".set maxdocs all");
385	} else {
386	char maxdocstr[32];
387	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
388	mgq_ask(maxdocstr);
389	}
390
391	char maxnumericstr[32];
392	sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
393	mgq_ask(maxnumericstr);
394
395	}
396
397	/**
398	* submitquery constructs the query string (into UTF8 encoding)
399	* and submits it using mgq_ask to the mg search engine. Most
400	* of the processing will be done inside Greenstone
401	*/
402	void mgsearchclass::submitquery (const queryparamclass &queryparams)
403	{
404	// sort out the query string; copy it, remove all special characters
405	// and then convert it to a string in UTF8 format
406	text_t ttquerystring = queryparams.querystring;
407	filterquery (ttquerystring);
408	char *querystring = to_utf8(ttquerystring).getcstr();
409
410	// submit the query
411	mgq_ask(querystring);
412
413	// destroy the temporary character array
414	delete []querystring;
415	}
416
417	/**
418	* getrults is called to retrieve the required data on the docs
419	* which responded to the query submitted in submitquery above.
420	*
421	* It calls the local mgquery (mgq) interface to MG several times,
422	* to obtain the document numbers, term frequencies, term variants
423	* etc. All processing of the query will be done by Greenstone
424	* thereafter
425	*/
426	void mgsearchclass::getresults (const queryparamclass &queryparams,
427	queryresultsclass &queryresults) {
428	// get the configuration for the maximum number of documents to
429	// retrieve
430	int howmany = queryparams.maxdocs;
431	if (howmany == -1) howmany = MAXNUMDOCS;
432	mgq_results(result_docnums, 0, howmany,
433	ourquerycallback, (void *)(&queryresults));
434
435	// get the term frequencies
436	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
437	termfreqcallback, (void *)(&queryresults));
438	queryresults.sortuniqqueryterms();
439
440	// get term variants
441	mgq_results(result_terms, 0, MAXNUMTERMS,
442	termvariantscallback, (void *)(&queryresults));
443
444	// get the number of documents retrieved
445	int total_retrieved = 0, is_approx = 0;
446	mgq_docsretrieved (&total_retrieved, &is_approx);
447
448	if (total_retrieved == 0) {
449	// not available (or really was zero)
450	queryresults.docs_matched = queryresults.docs.docset.size();
451	if ((queryparams.maxdocs == -1) \|\|
452	(queryresults.docs_matched < queryparams.maxdocs))
453	queryresults.is_approx = Exact;
454	else
455	queryresults.is_approx = MoreThan;
456	} else {
457	queryresults.docs_matched = total_retrieved;
458	if (is_approx) queryresults.is_approx = Approximate;
459	else queryresults.is_approx = Exact;
460	}
461	}
462
463	/**
464	* Tidies the given querystring, removing special characters
465	*/
466	void mgsearchclass::filterquery (text_t &ttquerystring) {
467	text_t::iterator ithere = ttquerystring.begin ();
468	text_t::iterator itend = ttquerystring.end ();
469
470	// remove all non alphanumeric characters (except
471	// boolean operators
472	while (ithere != itend) {
473	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
474	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
475	(ithere != ')')) (ithere) = ' ';
476	++ithere;
477	}
478	}
479
480
481	// the document text for 'docnum' is placed in 'output'
482	// docTargetDocument returns 'true' if it was able to
483	// try to get a document
484	// collection is needed to see if an index from the
485	// collection is loaded. If no index has been loaded
486	// defaultindex is needed to load one
487	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
488	const text_t &defaultsubcollection,
489	const text_t &defaultlanguage,
490	const text_t &collection,
491	int docnum,
492	text_t &output) {
493	output.clear();
494
495	// get the mg version of the document
496	char *mgdoc = NULL;
497	int doclen = 0;
498	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
499	collection, docnum, mgdoc, doclen)) return false;
500	if (mgdoc == NULL) return false;
501
502	// replace all control-Cs with spaces
503	char *mgdoc_here = mgdoc;
504	char *mgdoc_end = mgdoc + doclen;
505	while (mgdoc_here < mgdoc_end) {
506	if (mgdoc_here == '\x3') mgdoc_here = ' ';
507	++mgdoc_here;
508	}
509
510	// convert this document to unicode
511	utf8inconvertclass inconvert;
512	convertclass::status_t status;
513	inconvert.reset ();
514	inconvert.setinput (mgdoc, doclen);
515	inconvert.convert (output, status);
516
517	delete[] mgdoc;
518	return true;
519	}
520
521
522	bool mgsearchclass::mgdocument (const text_t &defaultindex,
523	const text_t &defaultsubcollection,
524	const text_t &defaultlanguage,
525	const text_t &collection,
526	int docnum,
527	char *&UDoc, int &ULen) {
528	int databaseloaded = 0;
529
530	UDoc = NULL; ULen = 0;
531
532	// see if we can make an appropriate database current
533	// char *ccollection = collection.getcstr();
534	// assert (ccollection != NULL);
535	// databaseloaded = load_text_database (ccollection);
536	// delete []ccollection;
537
538	// try and load the database
539	// if (!databaseloaded)
540	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
541	defaultlanguage, collection);
542
543	if (databaseloaded) {
544	// retrieve the document from mg
545	char docstr[32];
546	sprintf(docstr, "%i", docnum);
547
548	mgq_ask(".set mode text");
549	mgq_ask(".set query docnums");
550	mgq_ask(docstr);
551
552	tempdoc = NULL;
553	templen = 0;
554	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
555	UDoc = tempdoc;
556	ULen = templen;
557	}
558
559	unload_database(); // Important that local library doesn't leave any files open
560	return (bool)databaseloaded;
561	}
562
563	// unload_database simply calls mgq's close_all_databases function to clear
564	// any cached databases - this is useful when attempting to completely
565	// remove all trace of a collectionserver at runtime (when using a
566	// persistent version of Greenstone like the windows local library)
567	void mgsearchclass::unload_database () {
568	close_all_databases();
569	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: