Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1285

Last change on this file since 1285 was 1285, checked in by sjboddie, 24 years ago
Removed CVS logging information from source files
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 13.6 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#if defined(__WIN32__)
44	// gdbm stuff
45	# include "autoconf.h"
46	# include "systems.h"
47	# include "gdbmconst.h"
48	# include "gdbm.h"
49	#else
50	# include <gdbm.h>
51	#endif
52
53
54	#include <assert.h>
55
56	#include "mgq.h"
57	// #include "locateinfo.h"
58	#include "gsdlunicode.h"
59	#include "unitool.h"
60
61
62	/////////////
63	// globals //
64	/////////////
65
// Document buffer most recently reported through doctextcallback.
// The pointer is only stored and handed on by this file, never freed here --
// presumably the memory stays owned by mg (TODO confirm).
static char *tempdoc = NULL;
// Length reported alongside tempdoc (the callback's ULen argument).
static int templen = 0;
68
69
70	//////////////////////
71	// useful functions //
72	//////////////////////
73
74
75	// input and output are in utf8
76	text_t mgsearch_stemword (const text_t &word) {
77	// allocate working stem space
78	int maxstemlen = mgq_getmaxstemlen ();
79	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80	if (word_stem == NULL) return "";
81
82	// copy word to word_stem
83	int len = 0;
84	text_t::const_iterator here = word.begin();
85	text_t::const_iterator end = word.end();
86	while (len < maxstemlen && here != end) {
87	word_stem[len+1] = (unsigned char)(*here);
88	len++; here++;
89	}
90	word_stem[len+1] = '\0';
91	word_stem[0] = len;
92
93	mgq_stemword (word_stem);
94
95	// copy word_stem back to tempstr
96	text_t tempstr;
97	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99	delete [] word_stem;
100
101	return tempstr;
102	}
103
104
105
106	////////////////////////
107	// callback functions //
108	////////////////////////
109
110	// This routine is called for each document found in a search
111	// it assumes that cache_num is set up correctly to point to
112	// a suitable result cache
113	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
114	float Weight, void *info) {
115
116
117	queryresultsclass queryresults = (queryresultsclass )info;
118
119	// append this entry to the document results
120	docresultclass docresult;
121	docresult.docnum = DocNum;
122	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125	queryresults->docs.docset[DocNum] = docresult;
126	queryresults->docs.docorder.push_back(DocNum);
127
128	return 0;
129	}
130
131	int termequivcallback(char Word, int ULen, int /Freq*/,
132	float /Weight/, void *info) {
133	text_tset equivterms = (text_tset )info;
134	if (equivterms == NULL) return 0;
135
136	text_t thisterm;
137	thisterm.setcarr(Word, ULen);
138
139	equivterms->insert(thisterm);
140
141	return 0;
142	}
143
144
145	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146	// allocate working stem space
147	int maxstemlen = mgq_getmaxstemlen ();
148	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149	if (word_stem == NULL) return;
150
151	// copy word to word_stem
152	int len = 0;
153	text_t::const_iterator here = word.begin();
154	text_t::const_iterator end = word.end();
155	while (len < maxstemlen && here != end) {
156	word_stem[len+1] = (unsigned char)(*here);
157	len++; here++;
158	}
159	word_stem[len+1] = '\0';
160	word_stem[0] = len;
161
162	// get the equivalent terms
163	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165	delete [] word_stem;
166
167	return;
168	}
169
170	text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173	// This callback is called once for each term in the query
174	int termfreqcallback(char *Word, int ULen, int Freq,
175	float /Weight/, void *info) {
176	queryresultsclass queryresults = (queryresultsclass )info;
177	if (queryresults == NULL) return 0;
178
179	text_t term;
180	term.setcarr(Word, ULen);
181	termfreqclass termfreq;
182
183	termfreq.termstr = to_uni(term);
184	text_t utf8termstem = mgsearch_stemword (term);
185	termfreq.termstemstr = to_uni (utf8termstem);
186
187	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189	termfreq.termfreq = Freq;
190	queryresults->orgterms.push_back(termfreq);
191
192	return 0;
193	}
194
195	// this callback is called once for each variation of each term
196	int termvariantscallback(char Word, int ULen, int /Freq*/,
197	float /Weight/, void *info) {
198
199	text_t term;
200	term.setcarr(Word, ULen);
201	queryresultsclass queryresults = (queryresultsclass )info;
202	queryresults->termvariants.insert(to_uni(term));
203
204	return 0;
205	}
206
207	// This callback is for getting document text
208	int doctextcallback(char Doc, int ULen, int /Freq*/,
209	float /Weight/, void * /info/) {
210	tempdoc = Doc;
211	templen = ULen;
212
213	return 0;
214	}
215
216
217	static text_t getindexsuffix (const text_t &collection,
218	const text_t &index) {
219
220	text_t indexsuffix = "index";
221	indexsuffix = filename_cat (indexsuffix, index);
222	indexsuffix = filename_cat (indexsuffix, collection);
223	return indexsuffix;
224	}
225
226
227
228
229	////////////////////
230	// mgsearch class //
231	////////////////////
232
233	mgsearchclass::mgsearchclass ()
234	{
235	cache = new querycache (RESULTCACHESIZE);
236	}
237
238	mgsearchclass::~mgsearchclass ()
239	{
240	if (cache != NULL)
241	{
242	delete cache;
243	cache = NULL;
244	}
245	}
246
247
248	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
249	{
250	collectdir = thecollectdir;
251	}
252
253	// you only need to use this function before doing any stemming
254	// casefolding and stemming will be set if values for them are
255	// provided (0 or 1).
256	// makeindexcurrent returns true if it was able to load the database
257	bool mgsearchclass::makeindexcurrent (const text_t &index,
258	const text_t &subcollection,
259	const text_t &language,
260	const text_t &collection,
261	int casefolding,
262	int stemming) {
263	bool databaseloaded = true;
264
265	// get the names of the collection, index and text suffixes
266	char *ccollection = collection.getcstr();
267	assert (ccollection != NULL);
268	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
269	assert (idxsuffix != NULL);
270	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
271	assert (txtsuffix != NULL);
272
273	#ifdef __WIN32__
274	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
275	#else
276	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
277	#endif
278
279	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
280	if (casefolding == 0) mgq_ask(".set casefold off");
281	else if (casefolding > 0) mgq_ask(".set casefold on");
282	if (stemming == 0) mgq_ask(".set stem off");
283	else if (stemming > 0) mgq_ask(".set stem on");
284
285	} else databaseloaded = false;
286
287	// free up the c strings
288	delete ccollection;
289	delete idxsuffix;
290	delete txtsuffix;
291	delete ccollectdir;
292
293	return databaseloaded;
294	}
295
296
297	// stem word uses the values set in the last call to makeindexcurrent
298	// to stem the word. It is assumed that word is in unicode
299	text_t mgsearchclass::stemword (const text_t &word) {
300	return to_uni (mgsearch_stemword (to_utf8 (word)));
301	}
302
303	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
304	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
305	}
306
307
308	bool mgsearchclass::search(const queryparamclass &queryparams,
309	queryresultsclass &queryresults) {
310	assert (cache != NULL);
311
312	queryresults.clear();
313
314	// first check the cache
315	if (cache->find(queryparams, queryresults)) return true;
316
317	// make sure there is a query to be processed
318	if (!has_unicode_letdig(queryparams.querystring)) return true;
319
320	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
321	queryparams.language, queryparams.collection)) {
322	setsearchmode (queryparams);
323	submitquery (queryparams);
324	getresults (queryparams, queryresults);
325	return true;
326	}
327
328	return false;
329	}
330
331
332	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
333	{
334	mgq_ask(".set expert true");
335	mgq_ask(".set sorted_terms true");
336	mgq_ask(".set accumulator_method list");
337	mgq_ask(".set max_accumulators 500000");
338	mgq_ask(".set maxparas 500000");
339	mgq_ask(".set verbatim true");
340	// mgq_ask(".unset skip_dump");
341	mgq_ask(".set mode docnums");
342
343	switch (queryparams.search_type)
344	{
345	case 0: mgq_ask(".set query boolean"); break;
346	case 1: mgq_ask(".set query ranked"); break;
347	}
348	switch (queryparams.casefolding)
349	{
350	case 1: mgq_ask(".set casefold on"); break;
351	case 0: mgq_ask(".set casefold off"); break;
352	}
353	switch (queryparams.stemming)
354	{
355	case 1: mgq_ask(".set stem on"); break;
356	case 0: mgq_ask(".set stem off"); break;
357	}
358	mgq_ask(".set heads_length 150");
359
360	if (queryparams.maxdocs == -1) {
361	mgq_ask(".set maxdocs all");
362	} else {
363	char maxdocstr[32];
364	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
365	mgq_ask(maxdocstr);
366	}
367	}
368
369
370	void mgsearchclass::submitquery (const queryparamclass &queryparams)
371	{
372	// sort out the query string
373	text_t ttquerystring = queryparams.querystring;
374	filterquery (ttquerystring);
375	char *querystring = to_utf8(ttquerystring).getcstr();
376
377	// submit the query
378	mgq_ask(querystring);
379
380	delete querystring;
381	}
382
383
384	void mgsearchclass::getresults (const queryparamclass &queryparams,
385	queryresultsclass &queryresults) {
386
387	int howmany = queryparams.maxdocs;
388	if (howmany == -1) howmany = MAXNUMDOCS;
389	mgq_results(result_docnums, 0, howmany,
390	ourquerycallback, (void *)(&queryresults));
391
392	// get the term frequencies
393	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
394	termfreqcallback, (void *)(&queryresults));
395	queryresults.sortuniqqueryterms();
396
397	// get term variants
398	mgq_results(result_terms, 0, MAXNUMTERMS,
399	termvariantscallback, (void *)(&queryresults));
400
401	// get the number of documents retrieved
402	int total_retrieved = 0, is_approx = 0;
403	mgq_docsretrieved (&total_retrieved, &is_approx);
404
405	if (total_retrieved == 0) {
406	// not available (or really was zero)
407	queryresults.docs_matched = queryresults.docs.docset.size();
408	if ((queryparams.maxdocs == -1) \|\|
409	(queryresults.docs_matched < queryparams.maxdocs))
410	queryresults.is_approx = Exact;
411	else
412	queryresults.is_approx = MoreThan;
413	} else {
414	queryresults.docs_matched = total_retrieved;
415	if (is_approx) queryresults.is_approx = Approximate;
416	else queryresults.is_approx = Exact;
417	}
418	}
419
420	void mgsearchclass::filterquery (text_t &ttquerystring) {
421	text_t::iterator ithere = ttquerystring.begin ();
422	text_t::iterator itend = ttquerystring.end ();
423
424	// remove all non alphanumeric characters (except
425	// boolean operators
426	while (ithere != itend) {
427	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
428	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
429	(ithere != ')')) (ithere) = ' ';
430	ithere++;
431	}
432	}
433
434
435	// the document text for 'docnum' is placed in 'output'
436	// docTargetDocument returns 'true' if it was able to
437	// try to get a document
438	// collection is needed to see if an index from the
439	// collection is loaded. If no index has been loaded
440	// defaultindex is needed to load one
441	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
442	const text_t &defaultsubcollection,
443	const text_t &defaultlanguage,
444	const text_t &collection,
445	int docnum,
446	text_t &output) {
447	output.clear();
448
449	// get the mg version of the document
450	char *mgdoc = NULL;
451	int doclen = 0;
452	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
453	collection, docnum, mgdoc, doclen)) return false;
454	if (mgdoc == NULL) return false;
455
456	// replace all control-Cs with spaces
457	char *mgdoc_here = mgdoc;
458	char *mgdoc_end = mgdoc + doclen;
459	while (mgdoc_here < mgdoc_end) {
460	if (mgdoc_here == '\x3') mgdoc_here = ' ';
461	mgdoc_here++;
462	}
463
464	// convert this document to unicode
465	utf8inconvertclass inconvert;
466	convertclass::status_t status;
467	inconvert.reset ();
468	inconvert.setinput (mgdoc, doclen);
469	inconvert.convert (output, status);
470
471	return true;
472	}
473
474
475	bool mgsearchclass::mgdocument (const text_t &defaultindex,
476	const text_t &defaultsubcollection,
477	const text_t &defaultlanguage,
478	const text_t &collection,
479	int docnum,
480	char *&UDoc, int &ULen) {
481	int databaseloaded = 0;
482
483	UDoc = NULL; ULen = 0;
484
485	// see if we can make an appropriate database current
486	// char *ccollection = collection.getcstr();
487	// assert (ccollection != NULL);
488	// databaseloaded = load_text_database (ccollection);
489	// delete ccollection;
490
491	// try and load the database
492	// if (!databaseloaded)
493	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
494	defaultlanguage, collection);
495
496	if (databaseloaded) {
497	// retrieve the document from mg
498	char docstr[32];
499	sprintf(docstr, "%i", docnum);
500
501	mgq_ask(".set mode text");
502	mgq_ask(".set query docnums");
503	mgq_ask(docstr);
504
505	tempdoc = NULL;
506	templen = 0;
507	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
508	UDoc = tempdoc;
509	ULen = templen;
510	}
511
512	return (bool)databaseloaded;
513	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: