Context Navigation

source: main/tags/2.25/gsdl/src/colservr/mgsearch.cpp@ 24204

Last change on this file since 24204 was 1324, checked in by kjm18, 24 years ago
mgpp incorporated. the old mgsearchclass and queryfilterclass are changed. Have a base searchclass, from which mgsearchclass and mgppsearchclass inherit. Have a base queryfilterclass, from which mgqueryfilterclass and mgppqueryfilterclass inherit. librarymain in recpt should choose the appropriate type (mg vs mgpp) for each collection.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 13.7 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "gsdlconf.h"
27	#include "mgsearch.h"
28	#include "fileutil.h"
29
30	#include <string.h>
31	#include <stdio.h>
32	#include <stdlib.h>
33	#include <ctype.h>
34
35	#if defined(GSDL_USE_OBJECTSPACE)
36	# include <ospace\std\iostream>
37	#elif defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#if defined(__WIN32__)
44	// gdbm stuff
45	# include "autoconf.h"
46	# include "systems.h"
47	# include "gdbmconst.h"
48	# include "gdbm.h"
49	#else
50	# include <gdbm.h>
51	#endif
52
53
54	#include <assert.h>
55
56	#include "mgq.h"
57	// #include "locateinfo.h"
58	#include "gsdlunicode.h"
59	#include "unitool.h"
60
61
62	/////////////
63	// globals //
64	/////////////
65
66	static char *tempdoc = NULL; // document text pointer most recently handed to doctextcallback; reset and read by mgsearchclass::mgdocument
67	static int templen = 0; // length value that accompanied tempdoc in the same doctextcallback call
68
69
70	//////////////////////
71	// useful functions //
72	//////////////////////
73
74
75	// input and output are in utf8
76	text_t mgsearch_stemword (const text_t &word) {
77	// allocate working stem space
78	int maxstemlen = mgq_getmaxstemlen ();
79	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80	if (word_stem == NULL) return "";
81
82	// copy word to word_stem
83	int len = 0;
84	text_t::const_iterator here = word.begin();
85	text_t::const_iterator end = word.end();
86	while (len < maxstemlen && here != end) {
87	word_stem[len+1] = (unsigned char)(*here);
88	len++; here++;
89	}
90	word_stem[len+1] = '\0';
91	word_stem[0] = len;
92
93	mgq_stemword (word_stem);
94
95	// copy word_stem back to tempstr
96	text_t tempstr;
97	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99	delete [] word_stem;
100
101	return tempstr;
102	}
103
104
105
106	////////////////////////
107	// callback functions //
108	////////////////////////
109
110	// This routine is called for each document found in a search
111	// it assumes that cache_num is set up correctly to point to
112	// a suitable result cache
113	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
114	float Weight, void *info) {
115
116
117	queryresultsclass queryresults = (queryresultsclass )info;
118
119	// append this entry to the document results
120	docresultclass docresult;
121	docresult.docnum = DocNum;
122	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125	queryresults->docs.docset[DocNum] = docresult;
126	queryresults->docs.docorder.push_back(DocNum);
127
128	return 0;
129	}
130
131	int termequivcallback(char Word, int ULen, int /Freq*/,
132	float /Weight/, void *info) {
133	text_tset equivterms = (text_tset )info;
134	if (equivterms == NULL) return 0;
135
136	text_t thisterm;
137	thisterm.setcarr(Word, ULen);
138
139	equivterms->insert(thisterm);
140
141	return 0;
142	}
143
144
145	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146	// allocate working stem space
147	int maxstemlen = mgq_getmaxstemlen ();
148	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149	if (word_stem == NULL) return;
150
151	// copy word to word_stem
152	int len = 0;
153	text_t::const_iterator here = word.begin();
154	text_t::const_iterator end = word.end();
155	while (len < maxstemlen && here != end) {
156	word_stem[len+1] = (unsigned char)(*here);
157	len++; here++;
158	}
159	word_stem[len+1] = '\0';
160	word_stem[0] = len;
161
162	// get the equivalent terms
163	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165	delete [] word_stem;
166
167	return;
168	}
169
170	text_tset utf8equivterms; // kept as utf8 string for fast matching. NOTE(review): nothing in this file reads or writes this file-scope set (termfreqcallback fills the utf8equivterms member of termfreqclass instead) -- confirm it is still needed
171
172
173	// This callback is called once for each term in the query
174	int termfreqcallback(char *Word, int ULen, int Freq,
175	float /Weight/, void *info) {
176	queryresultsclass queryresults = (queryresultsclass )info;
177	if (queryresults == NULL) return 0;
178
179	text_t term;
180	term.setcarr(Word, ULen);
181	termfreqclass termfreq;
182
183	termfreq.termstr = to_uni(term);
184	text_t utf8termstem = mgsearch_stemword (term);
185	termfreq.termstemstr = to_uni (utf8termstem);
186
187	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189	termfreq.termfreq = Freq;
190	queryresults->orgterms.push_back(termfreq);
191
192	return 0;
193	}
194
195	// this callback is called once for each variation of each term
196	int termvariantscallback(char Word, int ULen, int /Freq*/,
197	float /Weight/, void *info) {
198
199	text_t term;
200	term.setcarr(Word, ULen);
201	queryresultsclass queryresults = (queryresultsclass )info;
202	queryresults->termvariants.insert(to_uni(term));
203
204	return 0;
205	}
206
207	// This callback is for getting document text
208	int doctextcallback(char Doc, int ULen, int /Freq*/,
209	float /Weight/, void * /info/) {
210	tempdoc = Doc;
211	templen = ULen;
212
213	return 0;
214	}
215
216
217	static text_t getindexsuffix (const text_t &collection,
218	const text_t &index) {
219
220	text_t indexsuffix = "index";
221	indexsuffix = filename_cat (indexsuffix, index);
222	indexsuffix = filename_cat (indexsuffix, collection);
223	return indexsuffix;
224	}
225
226
227
228
229	////////////////////
230	// mgsearch class //
231	////////////////////
232
233	mgsearchclass::mgsearchclass ()
234	: searchclass() {
235
236	}
237
238	mgsearchclass::~mgsearchclass ()
239	{
240	if (cache != NULL)
241	{
242	delete cache;
243	cache = NULL;
244	}
245	}
246
247	// you only need to use this function before doing any stemming
248	// casefolding and stemming will be set if values for them are
249	// provided (0 or 1).
250	// makeindexcurrent returns true if it was able to load the database
251	bool mgsearchclass::makeindexcurrent (const text_t &index,
252	const text_t &subcollection,
253	const text_t &language,
254	const text_t &collection,
255	int casefolding,
256	int stemming) {
257	bool databaseloaded = true;
258
259	// get the names of the collection, index and text suffixes
260	char *ccollection = collection.getcstr();
261	assert (ccollection != NULL);
262	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
263	assert (idxsuffix != NULL);
264	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
265	assert (txtsuffix != NULL);
266
267	#ifdef __WIN32__
268	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
269	#else
270	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
271	#endif
272
273	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
274	if (casefolding == 0) mgq_ask(".set casefold off");
275	else if (casefolding > 0) mgq_ask(".set casefold on");
276	if (stemming == 0) mgq_ask(".set stem off");
277	else if (stemming > 0) mgq_ask(".set stem on");
278
279	} else databaseloaded = false;
280
281	// free up the c strings
282	delete ccollection;
283	delete idxsuffix;
284	delete txtsuffix;
285	delete ccollectdir;
286
287	return databaseloaded;
288	}
289
290
291	// stem word uses the values set in the last call to makeindexcurrent
292	// to stem the word. It is assumed that word is in unicode
293	text_t mgsearchclass::stemword (const text_t &word) {
294	return to_uni (mgsearch_stemword (to_utf8 (word)));
295	}
296
297	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
298	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
299	}
300
301
302	bool mgsearchclass::search(const queryparamclass &queryparams,
303	queryresultsclass &queryresults) {
304	// assert (cache != NULL);
305
306	queryresults.clear();
307	cerr << "mgsearch start of search"<<endl;
308	// first check the cache
309	if (cache != NULL) {
310	if (cache->find(queryparams, queryresults)) return true;
311	}
312	// make sure there is a query to be processed
313	if (!has_unicode_letdig(queryparams.querystring)) return true;
314
315	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
316	queryparams.language, queryparams.collection)) {
317	cerr << "made index current "<<endl;
318	setsearchmode (queryparams);
319	submitquery (queryparams);
320	getresults (queryparams, queryresults);
321	cerr << "got results"<<endl;
322	return true;
323	}
324
325	return false;
326	}
327
328
329	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
330	{
331	mgq_ask(".set expert true");
332	mgq_ask(".set sorted_terms true");
333	mgq_ask(".set accumulator_method list");
334	mgq_ask(".set max_accumulators 500000");
335	mgq_ask(".set maxparas 500000");
336	mgq_ask(".set verbatim true");
337	mgq_ask(".unset skip_dump");
338	mgq_ask(".set mode docnums");
339
340	switch (queryparams.search_type)
341	{
342	case 0: mgq_ask(".set query boolean"); break;
343	case 1: mgq_ask(".set query ranked"); break;
344	}
345	switch (queryparams.casefolding)
346	{
347	case 1: mgq_ask(".set casefold on"); break;
348	case 0: mgq_ask(".set casefold off"); break;
349	}
350	switch (queryparams.stemming)
351	{
352	case 1: mgq_ask(".set stem on"); break;
353	case 0: mgq_ask(".set stem off"); break;
354	}
355	mgq_ask(".set heads_length 150");
356
357	if (queryparams.maxdocs == -1) {
358	mgq_ask(".set maxdocs all");
359	} else {
360	char maxdocstr[32];
361	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
362	mgq_ask(maxdocstr);
363	}
364	}
365
366
367	void mgsearchclass::submitquery (const queryparamclass &queryparams)
368	{
369	// sort out the query string
370	text_t ttquerystring = queryparams.querystring;
371	filterquery (ttquerystring);
372	char *querystring = to_utf8(ttquerystring).getcstr();
373
374	// submit the query
375	mgq_ask(querystring);
376
377	delete querystring;
378	}
379
380
381	void mgsearchclass::getresults (const queryparamclass &queryparams,
382	queryresultsclass &queryresults) {
383
384	int howmany = queryparams.maxdocs;
385	if (howmany == -1) howmany = MAXNUMDOCS;
386	mgq_results(result_docnums, 0, howmany,
387	ourquerycallback, (void *)(&queryresults));
388
389	// get the term frequencies
390	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
391	termfreqcallback, (void *)(&queryresults));
392	queryresults.sortuniqqueryterms();
393
394	// get term variants
395	mgq_results(result_terms, 0, MAXNUMTERMS,
396	termvariantscallback, (void *)(&queryresults));
397
398	// get the number of documents retrieved
399	int total_retrieved = 0, is_approx = 0;
400	mgq_docsretrieved (&total_retrieved, &is_approx);
401
402	if (total_retrieved == 0) {
403	// not available (or really was zero)
404	queryresults.docs_matched = queryresults.docs.docset.size();
405	if ((queryparams.maxdocs == -1) \|\|
406	(queryresults.docs_matched < queryparams.maxdocs))
407	queryresults.is_approx = Exact;
408	else
409	queryresults.is_approx = MoreThan;
410	} else {
411	queryresults.docs_matched = total_retrieved;
412	if (is_approx) queryresults.is_approx = Approximate;
413	else queryresults.is_approx = Exact;
414	}
415	}
416
417	void mgsearchclass::filterquery (text_t &ttquerystring) {
418	text_t::iterator ithere = ttquerystring.begin ();
419	text_t::iterator itend = ttquerystring.end ();
420
421	// remove all non alphanumeric characters (except
422	// boolean operators
423	while (ithere != itend) {
424	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
425	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
426	(ithere != ')')) (ithere) = ' ';
427	ithere++;
428	}
429	}
430
431
432	// the document text for 'docnum' is placed in 'output'
433	// docTargetDocument returns 'true' if it was able to
434	// try to get a document
435	// collection is needed to see if an index from the
436	// collection is loaded. If no index has been loaded
437	// defaultindex is needed to load one
438	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
439	const text_t &defaultsubcollection,
440	const text_t &defaultlanguage,
441	const text_t &collection,
442	int docnum,
443	text_t &output) {
444	output.clear();
445
446	// get the mg version of the document
447	char *mgdoc = NULL;
448	int doclen = 0;
449	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
450	collection, docnum, mgdoc, doclen)) return false;
451	if (mgdoc == NULL) return false;
452
453	// replace all control-Cs with spaces
454	char *mgdoc_here = mgdoc;
455	char *mgdoc_end = mgdoc + doclen;
456	while (mgdoc_here < mgdoc_end) {
457	if (mgdoc_here == '\x3') mgdoc_here = ' ';
458	mgdoc_here++;
459	}
460
461	// convert this document to unicode
462	utf8inconvertclass inconvert;
463	convertclass::status_t status;
464	inconvert.reset ();
465	inconvert.setinput (mgdoc, doclen);
466	inconvert.convert (output, status);
467
468	return true;
469	}
470
471
472	bool mgsearchclass::mgdocument (const text_t &defaultindex,
473	const text_t &defaultsubcollection,
474	const text_t &defaultlanguage,
475	const text_t &collection,
476	int docnum,
477	char *&UDoc, int &ULen) {
478	int databaseloaded = 0;
479
480	UDoc = NULL; ULen = 0;
481
482	// see if we can make an appropriate database current
483	// char *ccollection = collection.getcstr();
484	// assert (ccollection != NULL);
485	// databaseloaded = load_text_database (ccollection);
486	// delete ccollection;
487
488	// try and load the database
489	// if (!databaseloaded)
490	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
491	defaultlanguage, collection);
492
493	if (databaseloaded) {
494	// retrieve the document from mg
495	char docstr[32];
496	sprintf(docstr, "%i", docnum);
497
498	mgq_ask(".set mode text");
499	mgq_ask(".set query docnums");
500	mgq_ask(docstr);
501
502	tempdoc = NULL;
503	templen = 0;
504	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
505	UDoc = tempdoc;
506	ULen = templen;
507	}
508
509	return (bool)databaseloaded;
510	}
511

Note: See TracBrowser for help on using the repository browser.

Download in other formats: