Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 633

Last change on this file since 633 was 633, checked in by rjmcnab, 25 years ago
change to use has_unicode_letdig in text_t
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: mgsearch.cpp 633 1999-09-24 02:41:21Z rjmcnab $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.22 1999/09/24 02:41:21 rjmcnab
31	change to use has_unicode_letdig in text_t
32
33	Revision 1.21 1999/09/21 21:41:41 sjboddie
34	fixed an error in what I committed last
35
36	Revision 1.20 1999/09/21 11:59:26 sjboddie
37	added Maxdocs queryfilter option (which may be -1 for 'all)
38
39	Revision 1.19 1999/09/07 22:52:52 rjmcnab
40	Seems to be an error in mg for retrieving documents using a paragraph
41	based index for some cases. Just added a work around (loads the default
42	index every time).
43
44	Revision 1.18 1999/09/07 04:57:22 sjboddie
45	added gpl notice
46
47	Revision 1.17 1999/08/31 22:42:41 rjmcnab
48	A couple of minor things.
49
50	Revision 1.16 1999/08/25 04:51:06 sjboddie
51	small change to allow for searching using boolean operators
52
53	Revision 1.15 1999/07/16 08:35:03 rjmcnab
54	Fixed a weird bug to do with a faulty case statement.
55
56	Revision 1.14 1999/07/16 03:42:22 sjboddie
57	changed isApprox
58
59	Revision 1.13 1999/07/16 00:12:46 sjboddie
60	removed all the old post-processing stuff
61
62	Revision 1.12 1999/07/07 06:17:47 rjmcnab
63	broke search_index into index+subcollection+language
64	within mgsearch
65
66	Revision 1.11 1999/07/05 21:06:43 rjmcnab
67	Disabled quoted strings.
68
69	Revision 1.10 1999/07/01 09:29:19 rjmcnab
70	Changes for better reporting of number documents which match a query. Changes
71	should still work as before with older versions of mg.
72
73	Revision 1.9 1999/07/01 03:54:48 rjmcnab
74	Added code to plug in the equivalent terms of each of the query terms.
75	Also added a function to get a raw utf8 encoded mg document (for speeding
76	up a phrase matching function)
77
78	Revision 1.8 1999/06/30 04:04:12 rjmcnab
79	made stemming functions available from mgsearch and made the stems
80	for the query terms available in queryinfo
81
82	Revision 1.7 1999/06/27 22:07:27 sjboddie
83	got rid of all the old functions for dealing with dir indexes
84
85	Revision 1.6 1999/06/09 00:41:32 sjboddie
86	phrase searching now uses case-folding if it's turned on
87
88	Revision 1.5 1999/02/21 22:31:35 rjmcnab
89
90	Removed locateinfo.
91
92	Revision 1.4 1999/02/03 01:13:27 sjboddie
93
94	Got interface to handle subcollections and language subcollections -
95	committed changes made to some of the collections
96
97	Revision 1.3 1999/01/19 01:38:17 rjmcnab
98
99	Made the source more portable.
100
101	Revision 1.2 1999/01/12 01:51:02 rjmcnab
102
103	Standard header.
104
105	Revision 1.1 1999/01/08 09:02:16 rjmcnab
106
107	Moved from src/library.
108
109	*/
110
111
112	#include "gsdlconf.h"
113	#include "mgsearch.h"
114	#include "fileutil.h"
115
116	#include <string.h>
117	#include <stdio.h>
118	#include <stdlib.h>
119	#include <ctype.h>
120
121	#if defined(GSDL_USE_OBJECTSPACE)
122	# include <ospace\std\iostream>
123	#elif defined(GSDL_USE_IOS_H)
124	# include <iostream.h>
125	#else
126	# include <iostream>
127	#endif
128
129	#if defined(__WIN32__)
130	// gdbm stuff
131	# include "autoconf.h"
132	# include "systems.h"
133	# include "gdbmconst.h"
134	# include "gdbm.h"
135	#else
136	# include <gdbm.h>
137	#endif
138
139
140	#include <assert.h>
141
142	#include "mgq.h"
143	// #include "locateinfo.h"
144	#include "gsdlunicode.h"
145	#include "unitool.h"
146
147
148	/////////////
149	// globals //
150	/////////////
151
// Hand-over slot between doctextcallback and mgsearchclass::mgdocument:
// doctextcallback stores the document buffer and length it is given here,
// and mgdocument resets both before calling mgq_results and reads them
// afterwards. Nothing in this file copies or frees the buffer.
// NOTE(review): the buffer presumably stays owned by mg -- confirm its
// lifetime before holding on to the pointer.
152	static char *tempdoc = NULL;
153	static int templen = 0;
154
155
156	//////////////////////
157	// useful functions //
158	//////////////////////
159
160
161	// input and output are in utf8
162	text_t mgsearch_stemword (const text_t &word) {
163	// allocate working stem space
164	int maxstemlen = mgq_getmaxstemlen ();
165	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
166	if (word_stem == NULL) return "";
167
168	// copy word to word_stem
169	int len = 0;
170	text_t::const_iterator here = word.begin();
171	text_t::const_iterator end = word.end();
172	while (len < maxstemlen && here != end) {
173	word_stem[len+1] = (unsigned char)(*here);
174	len++; here++;
175	}
176	word_stem[len+1] = '\0';
177	word_stem[0] = len;
178
179	mgq_stemword (word_stem);
180
181	// copy word_stem back to tempstr
182	text_t tempstr;
183	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
184
185	delete [] word_stem;
186
187	return tempstr;
188	}
189
190
191
192	////////////////////////
193	// callback functions //
194	////////////////////////
195
196	// This routine is called for each document found in a search
197	// it assumes that cache_num is set up correctly to point to
198	// a suitable result cache
199	int ourquerycallback(char * /UDoc/, int /ULen/, int DocNum,
200	float Weight, void *info) {
201
202
203	queryresultsclass queryresults = (queryresultsclass )info;
204
205	// append this entry to the document results
206	docresultclass docresult;
207	docresult.docnum = DocNum;
208	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
209	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
210
211	queryresults->docs.docset[DocNum] = docresult;
212	queryresults->docs.docorder.push_back(DocNum);
213
214	return 0;
215	}
216
217	int termequivcallback(char Word, int ULen, int /Freq*/,
218	float /Weight/, void *info) {
219	text_tset equivterms = (text_tset )info;
220	if (equivterms == NULL) return 0;
221
222	text_t thisterm;
223	thisterm.setcarr(Word, ULen);
224
225	equivterms->insert(thisterm);
226
227	return 0;
228	}
229
230
231	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
232	// allocate working stem space
233	int maxstemlen = mgq_getmaxstemlen ();
234	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
235	if (word_stem == NULL) return;
236
237	// copy word to word_stem
238	int len = 0;
239	text_t::const_iterator here = word.begin();
240	text_t::const_iterator end = word.end();
241	while (len < maxstemlen && here != end) {
242	word_stem[len+1] = (unsigned char)(*here);
243	len++; here++;
244	}
245	word_stem[len+1] = '\0';
246	word_stem[0] = len;
247
248	// get the equivalent terms
249	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
250
251	delete [] word_stem;
252
253	return;
254	}
255
// File-level set of utf8 encoded terms.
// NOTE(review): nothing in this file as shown reads or writes this global
// (termfreqcallback fills the utf8equivterms member of termfreqclass
// instead) -- confirm whether another file still relies on it.
256	text_tset utf8equivterms; // kept as utf8 string for fast matching
257
258
259	// This callback is called once for each term in the query
260	int termfreqcallback(char *Word, int ULen, int Freq,
261	float /Weight/, void *info) {
262	queryresultsclass queryresults = (queryresultsclass )info;
263	if (queryresults == NULL) return 0;
264
265	text_t term;
266	term.setcarr(Word, ULen);
267	termfreqclass termfreq;
268
269	termfreq.termstr = to_uni(term);
270	text_t utf8termstem = mgsearch_stemword (term);
271	termfreq.termstemstr = to_uni (utf8termstem);
272
273	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
274
275	termfreq.termfreq = Freq;
276	queryresults->orgterms.push_back(termfreq);
277
278	return 0;
279	}
280
281	// this callback is called once for each variation of each term
282	int termvariantscallback(char Word, int ULen, int /Freq*/,
283	float /Weight/, void *info) {
284
285	text_t term;
286	term.setcarr(Word, ULen);
287	queryresultsclass queryresults = (queryresultsclass )info;
288	queryresults->termvariants.insert(to_uni(term));
289
290	return 0;
291	}
292
293	// This callback is for getting document text
294	int doctextcallback(char Doc, int ULen, int /Freq*/,
295	float /Weight/, void * /info/) {
296	tempdoc = Doc;
297	templen = ULen;
298
299	return 0;
300	}
301
302
303	static text_t getindexsuffix (const text_t &collection,
304	const text_t &index) {
305
306	text_t indexsuffix = "index";
307	indexsuffix = filename_cat (indexsuffix, index);
308	indexsuffix = filename_cat (indexsuffix, collection);
309	return indexsuffix;
310	}
311
312
313
314
315	////////////////////
316	// mgsearch class //
317	////////////////////
318
319	mgsearchclass::mgsearchclass ()
320	{
321	cache = new querycache (RESULTCACHESIZE);
322	}
323
324	mgsearchclass::~mgsearchclass ()
325	{
326	if (cache != NULL)
327	{
328	delete cache;
329	cache = NULL;
330	}
331	}
332
333
334	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
335	{
336	collectdir = thecollectdir;
337	}
338
339	// you only need to use this function before doing any stemming
340	// casefolding and stemming will be set if values for them are
341	// provided (0 or 1).
342	// makeindexcurrent returns true if it was able to load the database
343	bool mgsearchclass::makeindexcurrent (const text_t &index,
344	const text_t &subcollection,
345	const text_t &language,
346	const text_t &collection,
347	int casefolding,
348	int stemming) {
349	bool databaseloaded = true;
350
351	// get the names of the collection, index and text suffixes
352	char *ccollection = collection.getcstr();
353	assert (ccollection != NULL);
354	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
355	assert (idxsuffix != NULL);
356	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
357	assert (txtsuffix != NULL);
358
359	#ifdef __WIN32__
360	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
361	#else
362	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
363	#endif
364
365	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
366	if (casefolding == 0) mgq_ask(".set casefold off");
367	else if (casefolding > 0) mgq_ask(".set casefold on");
368	if (stemming == 0) mgq_ask(".set stem off");
369	else if (stemming > 0) mgq_ask(".set stem on");
370
371	} else databaseloaded = false;
372
373	// free up the c strings
374	delete ccollection;
375	delete idxsuffix;
376	delete txtsuffix;
377	delete ccollectdir;
378
379	return databaseloaded;
380	}
381
382
383	// stem word uses the values set in the last call to makeindexcurrent
384	// to stem the word. It is assumed that word is in unicode
385	text_t mgsearchclass::stemword (const text_t &word) {
386	return to_uni (mgsearch_stemword (to_utf8 (word)));
387	}
388
389	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
390	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
391	}
392
393
394	bool mgsearchclass::search(const queryparamclass &queryparams,
395	queryresultsclass &queryresults) {
396	assert (cache != NULL);
397
398	queryresults.clear();
399
400	// first check the cache
401	if (cache->find(queryparams, queryresults)) return true;
402
403	// make sure there is a query to be processed
404	if (!has_unicode_letdig(queryparams.querystring)) return true;
405
406	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
407	queryparams.language, queryparams.collection)) {
408	setsearchmode (queryparams);
409	submitquery (queryparams);
410	getresults (queryparams, queryresults);
411	return true;
412	}
413
414	return false;
415	}
416
417
418	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
419	{
420	mgq_ask(".set expert true");
421	mgq_ask(".set sorted_terms true");
422	mgq_ask(".set accumulator_method list");
423	mgq_ask(".set max_accumulators 500000");
424	mgq_ask(".set maxparas 500000");
425	mgq_ask(".set verbatim true");
426	mgq_ask(".unset skip_dump");
427	mgq_ask(".set mode docnums");
428
429	switch (queryparams.search_type)
430	{
431	case 0: mgq_ask(".set query boolean"); break;
432	case 1: mgq_ask(".set query ranked"); break;
433	}
434	switch (queryparams.casefolding)
435	{
436	case 1: mgq_ask(".set casefold on"); break;
437	case 0: mgq_ask(".set casefold off"); break;
438	}
439	switch (queryparams.stemming)
440	{
441	case 1: mgq_ask(".set stem on"); break;
442	case 0: mgq_ask(".set stem off"); break;
443	}
444	mgq_ask(".set heads_length 150");
445
446	if (queryparams.maxdocs == -1) {
447	mgq_ask(".set maxdocs all");
448	} else {
449	char maxdocstr[32];
450	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
451	mgq_ask(maxdocstr);
452	}
453	}
454
455
456	void mgsearchclass::submitquery (const queryparamclass &queryparams)
457	{
458	// sort out the query string
459	text_t ttquerystring = queryparams.querystring;
460	filterquery (ttquerystring);
461	char *querystring = to_utf8(ttquerystring).getcstr();
462
463	// submit the query
464	mgq_ask(querystring);
465
466	delete querystring;
467	}
468
469
470	void mgsearchclass::getresults (const queryparamclass &queryparams,
471	queryresultsclass &queryresults) {
472
473	int howmany = queryparams.maxdocs;
474	if (howmany == -1) howmany = MAXNUMDOCS;
475	mgq_results(result_docnums, 0, howmany,
476	ourquerycallback, (void *)(&queryresults));
477
478	// get the term frequencies
479	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
480	termfreqcallback, (void *)(&queryresults));
481	queryresults.sortuniqqueryterms();
482
483	// get term variants
484	mgq_results(result_terms, 0, MAXNUMTERMS,
485	termvariantscallback, (void *)(&queryresults));
486
487	// get the number of documents retrieved
488	int total_retrieved = 0, is_approx = 0;
489	mgq_docsretrieved (&total_retrieved, &is_approx);
490
491	if (total_retrieved == 0) {
492	// not available (or really was zero)
493	queryresults.docs_matched = queryresults.docs.docset.size();
494	if ((queryparams.maxdocs == -1) \|\|
495	(queryresults.docs_matched < queryparams.maxdocs))
496	queryresults.is_approx = Exact;
497	else
498	queryresults.is_approx = MoreThan;
499	} else {
500	queryresults.docs_matched = total_retrieved;
501	if (is_approx) queryresults.is_approx = Approximate;
502	else queryresults.is_approx = Exact;
503	}
504	}
505
506	void mgsearchclass::filterquery (text_t &ttquerystring) {
507	text_t::iterator ithere = ttquerystring.begin ();
508	text_t::iterator itend = ttquerystring.end ();
509
510	// remove all non alphanumeric characters (except
511	// boolean operators
512	while (ithere != itend) {
513	if ((!is_unicode_letdig(ithere)) && (ithere != '!') &&
514	(ithere != '&') && (ithere != '\|') && (*ithere != '(') &&
515	(ithere != ')')) (ithere) = ' ';
516	ithere++;
517	}
518	}
519
520
521	// the document text for 'docnum' is placed in 'output'
522	// docTargetDocument returns 'true' if it was able to
523	// try to get a document
524	// collection is needed to see if an index from the
525	// collection is loaded. If no index has been loaded
526	// defaultindex is needed to load one
527	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
528	const text_t &defaultsubcollection,
529	const text_t &defaultlanguage,
530	const text_t &collection,
531	int docnum,
532	text_t &output) {
533	output.clear();
534
535	// get the mg version of the document
536	char *mgdoc = NULL;
537	int doclen = 0;
538	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
539	collection, docnum, mgdoc, doclen)) return false;
540	if (mgdoc == NULL) return false;
541
542	// replace all control-Cs with spaces
543	char *mgdoc_here = mgdoc;
544	char *mgdoc_end = mgdoc + doclen;
545	while (mgdoc_here < mgdoc_end) {
546	if (mgdoc_here == '\x3') mgdoc_here = ' ';
547	mgdoc_here++;
548	}
549
550	// convert this document to unicode
551	utf8inconvertclass inconvert;
552	convertclass::status_t status;
553	inconvert.reset ();
554	inconvert.setinput (mgdoc, doclen);
555	inconvert.convert (output, status);
556
557	return true;
558	}
559
560
561	bool mgsearchclass::mgdocument (const text_t &defaultindex,
562	const text_t &defaultsubcollection,
563	const text_t &defaultlanguage,
564	const text_t &collection,
565	int docnum,
566	char *&UDoc, int &ULen) {
567	int databaseloaded = 0;
568
569	UDoc = NULL; ULen = 0;
570
571	// see if we can make an appropriate database current
572	// char *ccollection = collection.getcstr();
573	// assert (ccollection != NULL);
574	// databaseloaded = load_text_database (ccollection);
575	// delete ccollection;
576
577	// try and load the database
578	// if (!databaseloaded)
579	databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
580	defaultlanguage, collection);
581
582	if (databaseloaded) {
583	// retrieve the document from mg
584	char docstr[32];
585	sprintf(docstr, "%i", docnum);
586
587	mgq_ask(".set mode text");
588	mgq_ask(".set query docnums");
589	mgq_ask(docstr);
590
591	tempdoc = NULL;
592	templen = 0;
593	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
594	UDoc = tempdoc;
595	ULen = templen;
596	}
597
598	return (bool)databaseloaded;
599	}
600

Note: See TracBrowser for help on using the repository browser.

Download in other formats: