Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 401

Last change on this file since 401 was 401, checked in by rjmcnab, 25 years ago
Fixed a weird bug to do with a faulty case statement.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 14.8 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: mgsearch.cpp 401 1999-07-16 08:35:03Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.15 1999/07/16 08:35:03 rjmcnab
15	Fixed a weird bug to do with a faulty case statement.
16
17	Revision 1.14 1999/07/16 03:42:22 sjboddie
18	changed isApprox
19
20	Revision 1.13 1999/07/16 00:12:46 sjboddie
21	removed all the old post-processing stuff
22
23	Revision 1.12 1999/07/07 06:17:47 rjmcnab
24	broke search_index into index+subcollection+language
25	within mgsearch
26
27	Revision 1.11 1999/07/05 21:06:43 rjmcnab
28	Disabled quoted strings.
29
30	Revision 1.10 1999/07/01 09:29:19 rjmcnab
31	Changes for better reporting of number documents which match a query. Changes
32	should still work as before with older versions of mg.
33
34	Revision 1.9 1999/07/01 03:54:48 rjmcnab
35	Added code to plug in the equivalent terms of each of the query terms.
36	Also added a function to get a raw utf8 encoded mg document (for speeding
37	up a phrase matching function)
38
39	Revision 1.8 1999/06/30 04:04:12 rjmcnab
40	made stemming functions available from mgsearch and made the stems
41	for the query terms available in queryinfo
42
43	Revision 1.7 1999/06/27 22:07:27 sjboddie
44	got rid of all the old functions for dealing with dir indexes
45
46	Revision 1.6 1999/06/09 00:41:32 sjboddie
47	phrase searching now uses case-folding if it's turned on
48
49	Revision 1.5 1999/02/21 22:31:35 rjmcnab
50
51	Removed locateinfo.
52
53	Revision 1.4 1999/02/03 01:13:27 sjboddie
54
55	Got interface to handle subcollections and language subcollections -
56	committed changes made to some of the collections
57
58	Revision 1.3 1999/01/19 01:38:17 rjmcnab
59
60	Made the source more portable.
61
62	Revision 1.2 1999/01/12 01:51:02 rjmcnab
63
64	Standard header.
65
66	Revision 1.1 1999/01/08 09:02:16 rjmcnab
67
68	Moved from src/library.
69
70	*/
71
72
73	#include "gsdlconf.h"
74	#include "mgsearch.h"
75	#include "fileutil.h"
76
77	#include <string.h>
78	#include <stdio.h>
79	#include <stdlib.h>
80	#include <ctype.h>
81
82	#if defined(GSDL_USE_OBJECTSPACE)
83	# include <ospace\std\iostream>
84	#elif defined(GSDL_USE_IOS_H)
85	# include <iostream.h>
86	#else
87	# include <iostream>
88	#endif
89
90	#if defined(__WIN32__)
91	// gdbm stuff
92	# include "autoconf.h"
93	# include "systems.h"
94	# include "gdbmconst.h"
95	# include "gdbm.h"
96	#else
97	# include <gdbm.h>
98	#endif
99
100
101	#include <assert.h>
102
103	#include "mgq.h"
104	// #include "locateinfo.h"
105	#include "gsdlunicode.h"
106	#include "unitool.h"
107
108
109	/////////////
110	// globals //
111	/////////////
112
// Hand-off slots between doctextcallback (which stores the document pointer
// and length it is given) and mgsearchclass::mgdocument (which resets them
// before asking mg for a document and copies them out afterwards).
113	static char *tempdoc = NULL;
114	static int templen = 0;
115
116
117	//////////////////////
118	// useful functions //
119	//////////////////////
120
121
122	// input and output are in utf8
123	text_t mgsearch_stemword (const text_t &word) {
124	// allocate working stem space
125	int maxstemlen = mgq_getmaxstemlen ();
126	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
127	if (word_stem == NULL) return "";
128
129	// copy word to word_stem
130	int len = 0;
131	text_t::const_iterator here = word.begin();
132	text_t::const_iterator end = word.end();
133	while (len < maxstemlen && here != end) {
134	word_stem[len+1] = (unsigned char)(*here);
135	len++; here++;
136	}
137	word_stem[len+1] = '\0';
138	word_stem[0] = len;
139
140	mgq_stemword (word_stem);
141
142	// copy word_stem back to tempstr
143	text_t tempstr;
144	tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
145
146	delete [] word_stem;
147
148	return tempstr;
149	}
150
151
152
153	////////////////////////
154	// callback functions //
155	////////////////////////
156
157	// This routine is called for each document found in a search
158	// it assumes that cache_num is set up correctly to point to
159	// a suitable result cache
160	int ourquerycallback(char /UDoc/, int /ULen*/, int DocNum,
161	float Weight, void *info) {
162
163
164	queryresultsclass queryresults = (queryresultsclass )info;
165
166	// append this entry to the document results
167	docresultclass docresult;
168	docresult.docnum = DocNum;
169	docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
170	docresult.docweight = Weight - docresult.num_query_terms_matched*100;
171
172	queryresults->docs.docset[DocNum] = docresult;
173	queryresults->docs.docorder.push_back(DocNum);
174
175	return 0;
176	}
177
178	int termequivcallback(char Word, int ULen, int /Freq*/,
179	float /Weight/, void *info) {
180	text_tset equivterms = (text_tset )info;
181	if (equivterms == NULL) return 0;
182
183	text_t thisterm;
184	thisterm.setcarr(Word, ULen);
185
186	equivterms->insert(thisterm);
187
188	return 0;
189	}
190
191
192	void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
193	// allocate working stem space
194	int maxstemlen = mgq_getmaxstemlen ();
195	unsigned char *word_stem = new unsigned char [maxstemlen + 2];
196	if (word_stem == NULL) return;
197
198	// copy word to word_stem
199	int len = 0;
200	text_t::const_iterator here = word.begin();
201	text_t::const_iterator end = word.end();
202	while (len < maxstemlen && here != end) {
203	word_stem[len+1] = (unsigned char)(*here);
204	len++; here++;
205	}
206	word_stem[len+1] = '\0';
207	word_stem[0] = len;
208
209	// get the equivalent terms
210	mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
211
212	delete [] word_stem;
213
214	return;
215	}
216
// NOTE(review): nothing in the visible part of this file reads or writes
// this global (termfreqcallback fills the utf8equivterms member of its
// termfreqclass instead) -- confirm whether it is still needed.
217	text_tset utf8equivterms; // kept as utf8 string for fast matching
218
219
220	// This callback is called once for each term in the query
221	int termfreqcallback(char *Word, int ULen, int Freq,
222	float /Weight/, void *info) {
223	queryresultsclass queryresults = (queryresultsclass )info;
224	if (queryresults == NULL) return 0;
225
226	text_t term;
227	term.setcarr(Word, ULen);
228	termfreqclass termfreq;
229
230	termfreq.termstr = to_uni(term);
231	text_t utf8termstem = mgsearch_stemword (term);
232	termfreq.termstemstr = to_uni (utf8termstem);
233
234	mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
235
236	termfreq.termfreq = Freq;
237	queryresults->orgterms.push_back(termfreq);
238
239	return 0;
240	}
241
242	// this callback is called once for each variation of each term
243	int termvariantscallback(char Word, int ULen, int /Freq*/,
244	float /Weight/, void *info) {
245
246	text_t term;
247	term.setcarr(Word, ULen);
248	queryresultsclass queryresults = (queryresultsclass )info;
249	queryresults->termvariants.insert(to_uni(term));
250
251	return 0;
252	}
253
254	// This callback is for getting document text
255	int doctextcallback(char Doc, int ULen, int /Freq*/,
256	float /Weight/, void /info*/) {
257	tempdoc = Doc;
258	templen = ULen;
259
260	return 0;
261	}
262
263
264	static text_t getindexsuffix (const text_t &collection,
265	const text_t &index) {
266
267	text_t indexsuffix = "index";
268	// temporary hack so old version of niupepa collection
269	// can stay up until new one's finished
270	if (collection == "niupepa") indexsuffix = "index.new";
271
272	indexsuffix = filename_cat (indexsuffix, index);
273	indexsuffix = filename_cat (indexsuffix, collection);
274	return indexsuffix;
275	}
276
277
278
279
280	////////////////////
281	// mgsearch class //
282	////////////////////
283
284	mgsearchclass::mgsearchclass ()
285	{
286	cache = new querycache (RESULTCACHESIZE);
287	}
288
289	mgsearchclass::~mgsearchclass ()
290	{
291	if (cache != NULL)
292	{
293	delete cache;
294	cache = NULL;
295	}
296	}
297
298
299	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
300	{
301	collectdir = thecollectdir;
302	}
303
304	// you only need to use this function before doing any stemming
305	// casefolding and stemming will be set if values for them are
306	// provided (0 or 1).
307	// makeindexcurrent returns true if it was able to load the database
308	bool mgsearchclass::makeindexcurrent (const text_t &index,
309	const text_t &subcollection,
310	const text_t &language,
311	const text_t &collection,
312	int casefolding,
313	int stemming) {
314	bool databaseloaded = true;
315
316	// get the names of the collection, index and text suffixes
317	char *ccollection = collection.getcstr();
318	assert (ccollection != NULL);
319	char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
320	assert (idxsuffix != NULL);
321	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
322	assert (txtsuffix != NULL);
323
324	#ifdef __WIN32__
325	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
326	#else
327	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
328	#endif
329
330	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
331	if (casefolding == 0) mgq_ask(".set casefold off");
332	else if (casefolding > 0) mgq_ask(".set casefold on");
333	if (stemming == 0) mgq_ask(".set stem off");
334	else if (stemming > 0) mgq_ask(".set stem on");
335
336	} else databaseloaded = false;
337
338	// free up the c strings
339	delete ccollection;
340	delete idxsuffix;
341	delete txtsuffix;
342	delete ccollectdir;
343
344	return databaseloaded;
345	}
346
347
348	// stem word uses the values set in the last call to makeindexcurrent
349	// to stem the word. It is assumed that word is in unicode
350	text_t mgsearchclass::stemword (const text_t &word) {
351	return to_uni (mgsearch_stemword (to_utf8 (word)));
352	}
353
354	text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
355	return to_uni (mgsearch_stemword (to_utf8 (here, end)));
356	}
357
358
359	bool mgsearchclass::search(const queryparamclass &queryparams,
360	queryresultsclass &queryresults) {
361	assert (cache != NULL);
362
363	queryresults.clear();
364
365	// first check the cache
366	if (cache->find(queryparams, queryresults)) return true;
367
368	// make sure there is a query to be processed
369	text_t::const_iterator queryhere = queryparams.querystring.begin();
370	text_t::const_iterator queryend = queryparams.querystring.end();
371	while (queryhere != queryend) {
372	if (is_unicode_letdig (*queryhere)) break;
373	queryhere++;
374	}
375
376	// if we reached the end of the query string without finding
377	// any alphanumeric characters then return no results (and say
378	// the database was loaded)
379	if (queryhere == queryend) return true;
380
381	if (makeindexcurrent (queryparams.index, queryparams.subcollection,
382	queryparams.language, queryparams.collection)) {
383	setsearchmode (queryparams);
384	submitquery (queryparams);
385	getresults (queryparams, queryresults);
386	return true;
387	}
388
389	return false;
390	}
391
392
393	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
394	{
395	mgq_ask(".set expert true");
396	mgq_ask(".set sorted_terms true");
397	mgq_ask(".set accumulator_method list");
398	mgq_ask(".set max_accumulators 50000");
399	mgq_ask(".set verbatim true");
400	mgq_ask(".unset skip_dump");
401	mgq_ask(".set mode docnums");
402
403	switch (queryparams.search_type)
404	{
405	case 0: mgq_ask(".set query boolean"); break;
406	case 1: mgq_ask(".set query ranked"); break;
407	}
408	switch (queryparams.casefolding)
409	{
410	case 1: mgq_ask(".set casefold on"); break;
411	case 0: mgq_ask(".set casefold off"); break;
412	}
413	switch (queryparams.stemming)
414	{
415	case 1: mgq_ask(".set stem on"); break;
416	case 0: mgq_ask(".set stem off"); break;
417	}
418	mgq_ask(".set heads_length 150");
419
420	if (queryparams.maxdocs == -1) {
421	mgq_ask(".set maxdocs all");
422	} else {
423	char maxdocstr[32];
424	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
425	mgq_ask(maxdocstr);
426	}
427	}
428
429
430	void mgsearchclass::submitquery (const queryparamclass &queryparams)
431	{
432	// sort out the query string
433	text_t ttquerystring = queryparams.querystring;
434	filterquery (ttquerystring);
435	char *querystring = to_utf8(ttquerystring).getcstr();
436
437	// submit the query
438	mgq_ask(querystring);
439
440	delete querystring;
441	}
442
443
444	void mgsearchclass::getresults (const queryparamclass &queryparams,
445	queryresultsclass &queryresults) {
446
447	mgq_results(result_docnums, 0, MAXNUMDOCS,
448	ourquerycallback, (void *)(&queryresults));
449
450	// get the term frequencies
451	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
452	termfreqcallback, (void *)(&queryresults));
453	queryresults.sortuniqqueryterms();
454
455	// get term variants
456	mgq_results(result_terms, 0, MAXNUMTERMS,
457	termvariantscallback, (void *)(&queryresults));
458
459	// get the number of documents retrieved
460	int total_retrieved = 0, is_approx = 0;
461	mgq_docsretrieved (&total_retrieved, &is_approx);
462
463	if (total_retrieved == 0) {
464	// not available (or really was zero)
465	queryresults.docs_matched = queryresults.docs.docset.size();
466	if (queryresults.docs_matched < queryparams.maxdocs)
467	queryresults.is_approx = Exact;
468	else
469	queryresults.is_approx = MoreThan;
470	} else {
471	queryresults.docs_matched = total_retrieved;
472	if (is_approx) queryresults.is_approx = Approximate;
473	else queryresults.is_approx = Exact;
474	}
475	}
476
477	void mgsearchclass::filterquery (text_t &ttquerystring) {
478	text_t::iterator ithere = ttquerystring.begin ();
479	text_t::iterator itend = ttquerystring.end ();
480
481	// remove all non alphanumeric characters
482	while (ithere != itend) {
483	if (!is_unicode_letdig(ithere)) (ithere) = ' ';
484	ithere++;
485	}
486	}
487
488
489	// the document text for 'docnum' is placed in 'output'
490	// docTargetDocument returns 'true' if it was able to
491	// try to get a document
492	// collection is needed to see if an index from the
493	// collection is loaded. If no index has been loaded
494	// defaultindex is needed to load one
495	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
496	const text_t &defaultsubcollection,
497	const text_t &defaultlanguage,
498	const text_t &collection,
499	int docnum,
500	text_t &output) {
501	output.clear();
502
503	// get the mg version of the document
504	char *mgdoc = NULL;
505	int doclen = 0;
506	if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
507	collection, docnum, mgdoc, doclen)) return false;
508	if (mgdoc == NULL) return false;
509
510	// replace all control-Cs with spaces
511	char *mgdoc_here = mgdoc;
512	char *mgdoc_end = mgdoc + doclen;
513	while (mgdoc_here < mgdoc_end) {
514	if (mgdoc_here == '\x3') mgdoc_here = ' ';
515	mgdoc_here++;
516	}
517
518	// convert this document to unicode
519	utf8inconvertclass inconvert;
520	convertclass::status_t status;
521	inconvert.reset ();
522	inconvert.setinput (mgdoc, doclen);
523	inconvert.convert (output, status);
524
525	return true;
526	}
527
528
529	bool mgsearchclass::mgdocument (const text_t &defaultindex,
530	const text_t &defaultsubcollection,
531	const text_t &defaultlanguage,
532	const text_t &collection,
533	int docnum,
534	char *&UDoc, int &ULen) {
535	bool databaseloaded = 0;
536
537	UDoc = NULL; ULen = 0;
538
539	// see if we can make an appropriate database current
540	char *ccollection = collection.getcstr();
541	assert (ccollection != NULL);
542	databaseloaded = load_text_database (ccollection);
543	delete ccollection;
544
545	// try and load the database
546	if (!databaseloaded) databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
547	defaultlanguage, collection);
548
549	if (databaseloaded) {
550	// retrieve the document from mg
551	char docstr[32];
552	sprintf(docstr, "%i", docnum);
553
554	mgq_ask(".set mode text");
555	mgq_ask(".set query docnums");
556	mgq_ask(docstr);
557
558	tempdoc = NULL;
559	templen = 0;
560	mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
561	UDoc = tempdoc;
562	ULen = templen;
563	}
564
565	return databaseloaded;
566	}
567

Note: See TracBrowser for help on using the repository browser.

Download in other formats: