Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 9620

Last change on this file since 9620 was 9620, checked in by kjdon, 19 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#if defined(__WIN32__)
44	// gdbm stuff
45	# include "autoconf.h"
46	# include "systems.h"
47	# include "gdbmconst.h"
48	# include "gdbm.h"
49	#else
50	# include <gdbm.h>
51	#endif
52
53
54	#include <assert.h>
55
56	#include "mgq.h"
57	// #include "locateinfo.h"
58	#include "gsdlunicode.h"
59	#include "unitool.h"
60
61
62	/////////////
63	// globals //
64	/////////////
65
66	static char *tempdoc = NULL;
67	static int templen = 0;
68
69
70	//////////////////////
71	// useful functions //
72	//////////////////////
73
74
75	// input and output are in utf8
76	text_t mgsearch_stemword (const text_t &word) {
77	// allocate working stem space
78	int maxstemlen = mgq_getmaxstemlen ();
79	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80	if (word_stem == NULL) return "";
81
82	// copy word to word_stem
83	int len = 0;
84	text_t::const_iterator here = word.begin();
85	text_t::const_iterator end = word.end();
86	while (len < maxstemlen && here != end) {
87	word_stem[len+1] = (unsigned char)(*here);
88	++len; ++here;
89	}
90	word_stem[len+1] = '\0';
91	word_stem[0] = len;
92
93	mgq_stemword (word_stem);
94
95	// copy word_stem back to tempstr
96	text_t tempstr;
97	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99	delete [] word_stem;
100
101	return tempstr;
102	}
103
104
105
106	////////////////////////
107	// callback functions //
108	////////////////////////
109
110	// This routine is called for each document found in a search
111	// it assumes that cache_num is set up correctly to point to
112	// a suitable result cache
113	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
114	float Weight, void *info) {
115
116
117	queryresultsclass queryresults = (queryresultsclass )info;
118
119	// append this entry to the document results
120	docresultclass docresult;
121	docresult.docnum = DocNum;
122	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125	queryresults->docs.docset[DocNum] = docresult;
126	queryresults->docs.docorder.push_back(DocNum);
127
128	return 0;
129	}
130
131	int termequivcallback(char Word, int ULen, int /Freq*/,
132	float /Weight/, void *info) {
133	text_tset equivterms = (text_tset )info;
134	if (equivterms == NULL) return 0;
135
136	text_t thisterm;
137	thisterm.setcarr(Word, ULen);
138
139	equivterms->insert(thisterm);
140
141	return 0;
142	}
143
144
145	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146	// allocate working stem space
147	int maxstemlen = mgq_getmaxstemlen ();
148	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149	if (word_stem == NULL) return;
150
151	// copy word to word_stem
152	int len = 0;
153	text_t::const_iterator here = word.begin();
154	text_t::const_iterator end = word.end();
155	while (len < maxstemlen && here != end) {
156	word_stem[len+1] = (unsigned char)(*here);
157	++len; ++here;
158	}
159	word_stem[len+1] = '\0';
160	word_stem[0] = len;
161
162	// get the equivalent terms
163	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165	delete [] word_stem;
166
167	return;
168	}
169
170	text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173	// This callback is called once for each term in the query
174	int termfreqcallback(char *Word, int ULen, int Freq,
175	float /Weight/, void *info) {
176	queryresultsclass queryresults = (queryresultsclass )info;
177	if (queryresults == NULL) return 0;
178
179	text_t term;
180	term.setcarr(Word, ULen);
181	termfreqclass termfreq;
182
183	termfreq.termstr = to_uni(term);
184	text_t utf8termstem = mgsearch_stemword (term);
185	termfreq.termstemstr = to_uni (utf8termstem);
186
187	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189	termfreq.termfreq = Freq;
190	queryresults->orgterms.push_back(termfreq);
191
192	return 0;
193	}
194
195	// this callback is called once for each variation of each term
196	int termvariantscallback(char Word, int ULen, int /Freq*/,
197	float /Weight/, void *info) {
198
199	text_t term;
200	term.setcarr(Word, ULen);
201	queryresultsclass queryresults = (queryresultsclass )info;
202	queryresults->termvariants.insert(to_uni(term));
203
204	return 0;
205	}
206
207	// This callback is for getting document text
208	int doctextcallback(char Doc, int ULen, int /Freq*/,
209	float /Weight/, void * /info/) {
210	tempdoc = Doc;
211	templen = ULen;
212
213	return 0;
214	}
215
216
217	static text_t getindexsuffix (const text_t &collection,
218	const text_t &index) {
219
220	text_t indexsuffix = "index";
221	indexsuffix = filename_cat (indexsuffix, index);
222	indexsuffix = filename_cat (indexsuffix, collection);
223	return indexsuffix;
224	}
225
226
227
228
229	////////////////////
230	// mgsearch class //
231	////////////////////
232
233	mgsearchclass::mgsearchclass ()
234	: searchclass() {
235
236	}
237
238	mgsearchclass::~mgsearchclass ()
239	{
240	if (cache != NULL)
241	{
242	delete cache;
243	cache = NULL;
244	}
245	}
246
247	// you only need to use this function before doing any stemming
248	// casefolding and stemming will be set if values for them are
249	// provided (0 or 1).
250	// makeindexcurrent returns true if it was able to load the database
251	bool mgsearchclass::makeindexcurrent (const text_t &index,
252	const text_t &subcollection,
253	const text_t &language,
254	const text_t &collection,
255	int casefolding,
256	int stemming) {
257	bool databaseloaded = true;
258
259	// get the names of the collection, index and text suffixes
260	char *ccollection = collection.getcstr();
261	assert (ccollection != NULL);
262	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
263	assert (idxsuffix != NULL);
264	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
265	assert (txtsuffix != NULL);
266
267	#ifdef __WIN32__
268	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
269	#else
270	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
271	#endif
272
273	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
274	if (casefolding == 0) mgq_ask(".set casefold off");
275	else if (casefolding > 0) mgq_ask(".set casefold on");
276	if (stemming == 0) mgq_ask(".set stem off");
277	else if (stemming > 0) mgq_ask(".set stem on");
278
279	} else databaseloaded = false;
280
281	// free up the c strings
282	delete ccollection;
283	delete idxsuffix;
284	delete txtsuffix;
285	delete ccollectdir;
286
287	return databaseloaded;
288	}
289
290
291	// stem word uses the values set in the last call to makeindexcurrent
292	// to stem the word. It is assumed that word is in unicode
293	text_t mgsearchclass::stemword (const text_t &word) {
294	return to_uni (mgsearch_stemword (to_utf8 (word)));
295	}
296
297	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
298	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
299	}
300
301	/**
302	* search directs the whole execution of the search; a number of other
303	* functions in this class are called as a result, and precondition
304	* checks are also made
305	*/
306	bool mgsearchclass::search(const queryparamclass &queryparams,
307	queryresultsclass &queryresults) {
308	// assert (cache != NULL);
309
310	// clear any previous results
311	queryresults.clear();
312	// first check the cache
313	if (cache != NULL) {
314	if (cache->find(queryparams, queryresults)) return true;
315	}
316	// make sure there is a query to be processed
317	if (!has_unicode_letdig(queryparams.querystring)) return true;
318
319	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
320	queryparams.language, queryparams.collection)) {
321	// initialise the form of results
322	setsearchmode (queryparams);
323
324	// execute the query
325	submitquery (queryparams);
326
327	// retrieve the results
328	getresults (queryparams, queryresults);
329	return true;
330	}
331
332	return false;
333	}
334
335	/* accumulator_method has been changed to use array rather than list.
336	list appears to be broken somewhat - for some ranked queries, it returned
337	fewer results than it should have (eg 45 instead of 50). The three other
338	methods (array, splay_tree, hash_table) all return the same number of
339	documents, in the same order, with the same ranks. list returns what
340	appears to be the same documents (but less of them), but with different ranks,
341	and in a different order. Minimal time tests dont show any speed improvement
342	of list over array (maybe because its broken??). [02/2001, kjm18]
343
344	... [sjboddie, also 02/2001] turns out that changing the accumulator_method
345	introduced a more serious bug than it fixed (i.e. occasionally when doing a
346	ranked search for a very common word you get no results at all). I've
347	changed it back to list for now, one day we should play with other
348	accumulator_methods but for now I don't have time and don't want to risk
349	introducing bugs (better the devil you know ;)
350	*/
351	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
352	{
353	mgq_ask(".set expert true");
354	mgq_ask(".set sorted_terms true");
355	mgq_ask(".set accumulator_method list");
356	mgq_ask(".set max_accumulators 500000");
357	mgq_ask(".set maxparas 500000");
358	mgq_ask(".set verbatim true");
359	mgq_ask(".unset skip_dump");
360	mgq_ask(".set mode docnums");
361
362	switch (queryparams.search_type)
363	{
364	case 0: mgq_ask(".set query boolean"); break;
365	case 1: mgq_ask(".set query ranked"); break;
366	}
367	switch (queryparams.casefolding)
368	{
369	case 1: mgq_ask(".set casefold on"); break;
370	case 0: mgq_ask(".set casefold off"); break;
371	}
372	switch (queryparams.stemming)
373	{
374	case 1: mgq_ask(".set stem on"); break;
375	case 0: mgq_ask(".set stem off"); break;
376	}
377	mgq_ask(".set heads_length 150");
378
379	if (queryparams.maxdocs == -1) {
380	mgq_ask(".set maxdocs all");
381	} else {
382	char maxdocstr[32];
383	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
384	mgq_ask(maxdocstr);
385	}
386
387	char maxnumericstr[32];
388	sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
389	mgq_ask(maxnumericstr);
390
391	}
392
393	/**
394	* submitquery constructs the query string (into UTF8 encoding)
395	* and submits it using mgq_ask to the mg search engine. Most
396	* of the processing will be done inside Greenstone
397	*/
398	void mgsearchclass::submitquery (const queryparamclass &queryparams)
399	{
400	// sort out the query string; copy it, remove all special characters
401	// and then convert it to a string in UTF8 format
402	text_t ttquerystring = queryparams.querystring;
403	filterquery (ttquerystring);
404	char *querystring = to_utf8(ttquerystring).getcstr();
405
406	// submit the query
407	mgq_ask(querystring);
408
409	// destroy the temporary character array
410	delete querystring;
411	}
412
413	/**
414	* getrults is called to retrieve the required data on the docs
415	* which responded to the query submitted in submitquery above.
416	*
417	* It calls the local mgquery (mgq) interface to MG several times,
418	* to obtain the document numbers, term frequencies, term variants
419	* etc. All processing of the query will be done by Greenstone
420	* thereafter
421	*/
422	void mgsearchclass::getresults (const queryparamclass &queryparams,
423	queryresultsclass &queryresults) {
424	// get the configuration for the maximum number of documents to
425	// retrieve
426	int howmany = queryparams.maxdocs;
427	if (howmany == -1) howmany = MAXNUMDOCS;
428	mgq_results(result_docnums, 0, howmany,
429	ourquerycallback, (void *)(&queryresults));
430
431	// get the term frequencies
432	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
433	termfreqcallback, (void *)(&queryresults));
434	queryresults.sortuniqqueryterms();
435
436	// get term variants
437	mgq_results(result_terms, 0, MAXNUMTERMS,
438	termvariantscallback, (void *)(&queryresults));
439
440	// get the number of documents retrieved
441	int total_retrieved = 0, is_approx = 0;
442	mgq_docsretrieved (&total_retrieved, &is_approx);
443
444	if (total_retrieved == 0) {
445	// not available (or really was zero)
446	queryresults.docs_matched = queryresults.docs.docset.size();
447	if ((queryparams.maxdocs == -1) \|\|
448	(queryresults.docs_matched < queryparams.maxdocs))
449	queryresults.is_approx = Exact;
450	else
451	queryresults.is_approx = MoreThan;
452	} else {
453	queryresults.docs_matched = total_retrieved;
454	if (is_approx) queryresults.is_approx = Approximate;
455	else queryresults.is_approx = Exact;
456	}
457	}
458
459	/**
460	* Tidies the given querystring, removing special characters
461	*/
462	void mgsearchclass::filterquery (text_t &ttquerystring) {
463	text_t::iterator ithere = ttquerystring.begin ();
464	text_t::iterator itend = ttquerystring.end ();
465
466	// remove all non alphanumeric characters (except
467	// boolean operators
468	while (ithere != itend) {
469	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
470	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
471	(ithere != ')')) (ithere) = ' ';
472	++ithere;
473	}
474	}
475
476
477	// the document text for 'docnum' is placed in 'output'
478	// docTargetDocument returns 'true' if it was able to
479	// try to get a document
480	// collection is needed to see if an index from the
481	// collection is loaded. If no index has been loaded
482	// defaultindex is needed to load one
483	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
484	const text_t &defaultsubcollection,
485	const text_t &defaultlanguage,
486	const text_t &collection,
487	int docnum,
488	text_t &output) {
489	output.clear();
490
491	// get the mg version of the document
492	char *mgdoc = NULL;
493	int doclen = 0;
494	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
495	collection, docnum, mgdoc, doclen)) return false;
496	if (mgdoc == NULL) return false;
497
498	// replace all control-Cs with spaces
499	char *mgdoc_here = mgdoc;
500	char *mgdoc_end = mgdoc + doclen;
501	while (mgdoc_here < mgdoc_end) {
502	if (mgdoc_here == '\x3') mgdoc_here = ' ';
503	++mgdoc_here;
504	}
505
506	// convert this document to unicode
507	utf8inconvertclass inconvert;
508	convertclass::status_t status;
509	inconvert.reset ();
510	inconvert.setinput (mgdoc, doclen);
511	inconvert.convert (output, status);
512
513	return true;
514	}
515
516
517	bool mgsearchclass::mgdocument (const text_t &defaultindex,
518	const text_t &defaultsubcollection,
519	const text_t &defaultlanguage,
520	const text_t &collection,
521	int docnum,
522	char *&UDoc, int &ULen) {
523	int databaseloaded = 0;
524
525	UDoc = NULL; ULen = 0;
526
527	// see if we can make an appropriate database current
528	// char *ccollection = collection.getcstr();
529	// assert (ccollection != NULL);
530	// databaseloaded = load_text_database (ccollection);
531	// delete ccollection;
532
533	// try and load the database
534	// if (!databaseloaded)
535	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
536	defaultlanguage, collection);
537
538	if (databaseloaded) {
539	// retrieve the document from mg
540	char docstr[32];
541	sprintf(docstr, "%i", docnum);
542
543	mgq_ask(".set mode text");
544	mgq_ask(".set query docnums");
545	mgq_ask(docstr);
546
547	tempdoc = NULL;
548	templen = 0;
549	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
550	UDoc = tempdoc;
551	ULen = templen;
552	}
553
554	return (bool)databaseloaded;
555	}
556
557	// unload_database simply calls mgq's close_all_databases function to clear
558	// any cached databases - this is useful when attempting to completely
559	// remove all trace of a collectionserver at runtime (when using a
560	// persistent version of Greenstone like the windows local library)
561	void mgsearchclass::unload_database () {
562	close_all_databases();
563	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: