Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 265

Last change on this file since 265 was 265, checked in by sjboddie, 25 years ago
phrase searching now uses case-folding if it's turned on
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 18.6 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: mgsearch.cpp 265 1999-06-09 00:41:32Z sjboddie $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.6 1999/06/09 00:41:32 sjboddie
15	phrase searching now uses case-folding if it's turned on
16
17	Revision 1.5 1999/02/21 22:31:35 rjmcnab
18
19	Removed locateinfo.
20
21	Revision 1.4 1999/02/03 01:13:27 sjboddie
22
23	Got interface to handle subcollections and language subcollections -
24	committed changes made to some of the collections
25
26	Revision 1.3 1999/01/19 01:38:17 rjmcnab
27
28	Made the source more portable.
29
30	Revision 1.2 1999/01/12 01:51:02 rjmcnab
31
32	Standard header.
33
34	Revision 1.1 1999/01/08 09:02:16 rjmcnab
35
36	Moved from src/library.
37
38	*/
39
40
41	#include "gsdlconf.h"
42	#include "mgsearch.h"
43	#include "fileutil.h"
44
45	#include <string.h>
46	#include <stdio.h>
47	#include <stdlib.h>
48	#include <ctype.h>
49
50	#if defined(GSDL_USE_OBJECTSPACE)
51	# include <ospace\std\iostream>
52	#elif defined(GSDL_USE_IOS_H)
53	# include <iostream.h>
54	#else
55	# include <iostream>
56	#endif
57
58	#if defined(__WIN32__)
59	// gdbm stuff
60	# include "autoconf.h"
61	# include "systems.h"
62	# include "gdbmconst.h"
63	# include "gdbm.h"
64	#else
65	# include <gdbm.h>
66	#endif
67
68
69	#include <assert.h>
70
71	#include "mgq.h"
72	// #include "locateinfo.h"
73	#include "gsdlunicode.h"
74	#include "unitool.h"
75
76
77	/////////////
78	// globals //
79	/////////////
80
81	static char *quotedquery = NULL;
82	static int casefold;
83
84
85	/////////////////////////
86	// index map functions //
87	/////////////////////////
88
89	void getrealdir (const text_t &map, text_t &realpart, text_t &dirpart) {
90	realpart.clear ();
91	dirpart.clear();
92
93	text_t::const_iterator here = map.begin();
94	text_t::const_iterator end = map.end();
95
96	// get the real index
97	while (here != end && *here != '-') {
98	realpart.push_back(*here);
99	here++;
100	}
101
102	if (here != end) here++;
103	if (here != end && *here == '>') here++;
104
105	// get the dir index
106	while (here != end) {
107	dirpart.push_back(*here);
108	here++;
109	}
110	}
111
112	void getrealdirindex (const text_t &indexmap, const text_t &subcollectionmap,
113	const text_t &languagemap, text_t &realindex,
114	text_t &dirindex) {
115	text_t real, dir;
116	realindex.clear();
117	dirindex.clear();
118
119	getrealdir (indexmap, real, dir);
120	realindex += real;
121	dirindex += dir;
122
123	getrealdir (subcollectionmap, real, dir);
124	realindex += real;
125	dirindex += dir;
126
127	getrealdir (languagemap, real, dir);
128	realindex += real;
129	dirindex += dir;
130	}
131
132	//bool isdirindex (const text_tarray &indexmap, const text_t &dirindex) {
133	// text_tarray::const_iterator here = indexmap.begin();
134	// text_tarray::const_iterator end = indexmap.end();
135	// text_t maprealindex, mapdirindex;
136
137	// while (here != end) {
138	// getrealdirindex (*here, maprealindex, mapdirindex);
139	// if (mapdirindex == dirindex) return true;
140	// here++;
141	// }
142
143	// return false;
144	//}
145
146	void getrealindexparts (const text_tarray &/indexmap/, const text_tarray &/subcollectionmap/,
147	const text_tarray &languagemap, const text_t &realindex,
148	text_t &index, text_t &subcollection, text_t &language) {
149
150	index.clear();
151	subcollection.clear();
152	language.clear();
153
154	text_tarray parts;
155	splitchar (realindex.begin(), realindex.end(), ':', parts);
156	int numparts = parts.size();
157
158	if (numparts >= 2) {
159	index = parts[0] + ":" + parts[1];
160
161	if (numparts == 3) {
162	if (languagemap.empty())
163	subcollection = parts[2];
164	else
165	language = parts[2];
166	} else if (numparts == 4) {
167	subcollection = parts[2];
168	language = parts[3];
169	}
170	}
171	}
172
173
174	void getdirindexparts (const text_tarray &/indexmap/, const text_tarray &/subcollectionmap/,
175	const text_tarray &languagemap, const text_t &dirindex,
176	text_t &index, text_t &subcollection, text_t &language) {
177
178	index.clear();
179	subcollection.clear();
180	language.clear();
181
182	int indexsize = dirindex.size();
183	if (indexsize != 3 && indexsize != 5 &&
184	indexsize != 7) return;
185
186	text_t::const_iterator dibegin = dirindex.begin();
187	text_t::const_iterator diend = dirindex.end();
188
189	// first three characters make up index part
190	index = substr(dibegin, dibegin+3);
191
192	if (indexsize == 5) {
193	if (languagemap.empty())
194	subcollection = substr(dibegin+3, dibegin+5);
195	else
196	language = substr(dibegin+3, dibegin+5);
197	} else if (indexsize == 7) {
198	subcollection = substr(dibegin+3, dibegin+5);
199	language = substr(dibegin+5, diend);
200	}
201	}
202
203
204	bool isrealindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
205	const text_tarray &languagemap, const text_t &realindex) {
206
207	text_t index, subcollection, language, realpart, dirpart;
208	getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
209	index, subcollection, language);
210
211	// check index part
212	text_tarray::const_iterator here = indexmap.begin();
213	text_tarray::const_iterator end = indexmap.end();
214	bool exists = false;
215	while (here != end) {
216	getrealdir (*here, realpart, dirpart);
217	if (realpart == index) {exists = true; break;}
218	here++;
219	}
220	if (!exists) return false;
221
222	// check subcollection part if there is one
223	if (!subcollection.empty()) {
224	here = subcollectionmap.begin();
225	end = subcollectionmap.end();
226	exists = false;
227	while (here != end) {
228	getrealdir (*here, realpart, dirpart);
229	if (realpart == subcollection) {exists = true; break;}
230	here++;
231	}
232	if (!exists) return false;
233	}
234
235	// check language part if there is one
236	if (!language.empty()) {
237	here = languagemap.begin();
238	end = languagemap.end();
239	exists = false;
240	while (here != end) {
241	getrealdir (*here, realpart, dirpart);
242	if (realpart == language) {exists = true; break;}
243	here++;
244	}
245	if (!exists) return false;
246	}
247	return true;
248	}
249
250	text_t dir2realindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
251	const text_tarray &languagemap, const text_t &dirindex) {
252
253	text_t index, subcollection, language, realpart, dirpart, realindex;
254	getdirindexparts (indexmap, subcollectionmap, languagemap, dirindex,
255	index, subcollection, language);
256
257	// get index part
258	text_tarray::const_iterator here = indexmap.begin();
259	text_tarray::const_iterator end = indexmap.end();
260	while (here != end) {
261	getrealdir (*here, realpart, dirpart);
262	if (dirpart == index) {realindex += realpart; break;}
263	here++;
264	}
265
266	if (realindex.empty()) return "";
267
268	// get subcollection part
269	here = subcollectionmap.begin();
270	end = subcollectionmap.end();
271	while (here != end) {
272	getrealdir (*here, realpart, dirpart);
273	if (dirpart == subcollection) {realindex += ":" + realpart; break;}
274	here++;
275	}
276
277	// get language part
278	here = languagemap.begin();
279	end = languagemap.end();
280	while (here != end) {
281	getrealdir (*here, realpart, dirpart);
282	if (dirpart == language) {realindex += ":" + realpart; break;}
283	here++;
284	}
285	return realindex;
286	}
287
288	text_t real2dirindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
289	const text_tarray &languagemap, const text_t &realindex) {
290
291	text_t index, subcollection, language, realpart, dirpart, dirindex;
292	getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
293	index, subcollection, language);
294
295	// get index part
296	text_tarray::const_iterator here = indexmap.begin();
297	text_tarray::const_iterator end = indexmap.end();
298	while (here != end) {
299	getrealdir (*here, realpart, dirpart);
300	if (realpart == index) {dirindex += dirpart; break;}
301	here++;
302	}
303
304	if (dirindex.empty()) return "";
305
306	// get subcollection part
307	here = subcollectionmap.begin();
308	end = subcollectionmap.end();
309	while (here != end) {
310	getrealdir (*here, realpart, dirpart);
311	if (realpart == subcollection) {dirindex += dirpart; break;}
312	here++;
313	}
314
315	// get language part
316	here = languagemap.begin();
317	end = languagemap.end();
318	while (here != end) {
319	getrealdir (*here, realpart, dirpart);
320	if (realpart == language) {dirindex += dirpart; break;}
321	here++;
322	}
323	return dirindex;
324	}
325
326	text_t real2macroindex (const text_t &realindex) {
327	text_t macroindex;
328	text_t::const_iterator here = realindex.begin();
329	text_t::const_iterator end = realindex.end();
330	unsigned short c;
331
332	while (here != end) {
333	c = *here;
334	if ((c >= '0' && c <= '9') \|\|
335	(c >= 'A' && c <= 'Z') \|\|
336	(c >= 'a' && c <= 'z'))
337	macroindex.push_back (*here);
338	here++;
339	}
340
341	return macroindex;
342	}
343
344	bool isdoclevelindex (const text_t &realindex) {
345	char *docstr = "document";
346	text_t::const_iterator here = realindex.begin ();
347	text_t::const_iterator end = realindex.end ();
348
349	while (here != end) {
350	if (*docstr == '\0') return true;
351	if (docstr != (char)(here)) return false;
352	docstr++;
353	here++;
354	}
355
356	return false;
357	}
358
359	text_t getdoclevelindex (const text_tarray &/indexmap/) {
360	//text_tarray::const_iterator here = indexmap.begin();
361	//text_tarray::const_iterator end = indexmap.end();
362	//text_t maprealindex, mapdirindex;
363
364	// while (here != end) {
365	// getrealdirindex (*here, maprealindex, mapdirindex);
366	// if (isdoclevelindex (maprealindex)) return maprealindex;
367	// here++;
368	//}
369
370	return "";
371	}
372
373
374
375
376	////////////////////////
377	// callback functions //
378	////////////////////////
379
380	// This routine is called for each document found in a search
381	// it assumes that cache_num is set up correctly to point to
382	// a suitable result cache
383	int ourquerycallback(char UDoc, int /ULen*/, int DocNum,
384	float Weight, void *info) {
385
386
387	queryresultsclass queryresults = (queryresultsclass )info;
388
389	// check the returned document for the presence of the
390	// quoted part of the query, if there was one
391
392	// if (UDoc != NULL && quotedquery != NULL &&
393	// quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
394
395
396	if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0') {
397
398	if (casefold) {
399	int len;
400	for (len = 0; quotedquery[len] != '\0'; len ++)
401	quotedquery[len] = tolower (quotedquery[len]);
402	for (len = 0; UDoc[len] != '\0'; len ++)
403	UDoc[len] = tolower (UDoc[len]);
404	}
405	if (strstr (UDoc, quotedquery) == NULL) return 0;
406	}
407
408	// append this entry to the document results
409	docresultclass docresult;
410	docresult.docnum = DocNum;
411	docresult.docweight = Weight;
412
413	queryresults->docs.push_back(docresult);
414
415	return 0;
416	}
417
418	// This callback is called once for each term in the query
419	int termfreqcallback(char *Word, int ULen, int Freq,
420	float /Weight/, void *info) {
421	queryresultsclass queryresults = (queryresultsclass )info;
422
423	text_t term;
424	term.setcarr(Word, ULen);
425	termfreqclass termfreq;
426	termfreq.termstr = to_uni(term);
427	termfreq.termfreq = Freq;
428	queryresults->terms.push_back(termfreq);
429
430	return 0;
431	}
432
433	// this callback is called once for each variation of each term
434	int termscallback(char Word, int ULen, int /Freq*/,
435	float /Weight/, void *info) {
436
437	text_t term;
438	term.setcarr(Word, ULen);
439	queryresultsclass queryresults = (queryresultsclass )info;
440	queryresults->termvariants.push_back(to_uni(term));
441
442	return 0;
443	}
444
445	// This callback is for getting document text
446	int doctextcallback(char Word, int ULen, int /Freq*/,
447	float /Weight/, void *info) {
448	text_t output = (text_t )info;
449	if (output == NULL) return 0;
450	output->clear();
451
452	utf8inconvertclass inconvert;
453	convertclass::status_t status;
454	inconvert.reset ();
455	inconvert.setinput (Word, ULen);
456	inconvert.convert (*output, status);
457
458	// replace all control-Cs with spaces
459	text_t::iterator here = output->begin();
460	text_t::iterator end = output->end();
461	while (here != end) {
462	if (here == '\x3') here = ' ';
463	here++;
464	}
465
466	return 0;
467	}
468
469
470	static text_t getindexsuffix (const text_t &collection,
471	const text_t &index) {
472	text_t indexsuffix = "index";
473	indexsuffix = filename_cat (indexsuffix, index);
474	indexsuffix = filename_cat (indexsuffix, collection);
475	return indexsuffix;
476	}
477
478
479
480
481	////////////////////
482	// mgsearch class //
483	////////////////////
484
485	mgsearchclass::mgsearchclass ()
486	{
487	cache = new querycache (RESULTCACHESIZE);
488	}
489
490	mgsearchclass::~mgsearchclass ()
491	{
492	if (cache != NULL)
493	{
494	delete cache;
495	cache = NULL;
496	}
497	}
498
499
500	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
501	{
502	collectdir = thecollectdir;
503	}
504
505
506	bool mgsearchclass::search(const queryparamclass &queryparams,
507	queryresultsclass &queryresults)
508	{
509	bool databaseloaded = true;
510
511	assert (cache != NULL);
512
513	queryresults.clear();
514
515	// first check the cache
516	if (cache->find(queryparams, queryresults))
517	return true;
518
519	// make sure there is a query to be processed
520	text_t::const_iterator queryhere = queryparams.querystring.begin();
521	text_t::const_iterator queryend = queryparams.querystring.end();
522	while (queryhere != queryend) {
523	if (is_unicode_letdig (*queryhere)) break;
524	queryhere++;
525	}
526
527	// if we reached the end of the query string without finding
528	// any alphanumeric characters then return no results (and say
529	// the database was loaded)
530	if (queryhere == queryend) return true;
531
532	casefold = queryparams.casefolding;
533
534	// get the names of the collection, index and text suffixes
535	char *ccollection = queryparams.collection.getcstr();
536	assert (ccollection != NULL);
537	char *idxsuffix = (getindexsuffix (queryparams.collection,
538	queryparams.search_index)).getcstr();
539	assert (idxsuffix != NULL);
540	char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
541	assert (txtsuffix != NULL);
542
543	#ifdef __WIN32__
544	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
545	#else
546	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
547	#endif
548
549	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
550	{
551	setsearchmode (queryparams);
552	submitquery (queryparams);
553	getresults (queryresults);
554	}
555	else databaseloaded = false;
556
557	// free up the c strings
558	delete ccollection;
559	delete idxsuffix;
560	delete txtsuffix;
561	delete ccollectdir;
562
563	return databaseloaded;
564	}
565
566
567	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
568	{
569	mgq_ask(".set expert true");
570	mgq_ask(".set accumulator_method list");
571	mgq_ask(".set max_accumulators 50000");
572	mgq_ask(".set verbatim true");
573	mgq_ask(".unset skip_dump");
574	mgq_ask(".set mode docnums");
575
576	switch (queryparams.search_type)
577	{
578	case 0: mgq_ask(".set query boolean"); break;
579	case 1: mgq_ask(".set query ranked"); break;
580	}
581	switch (queryparams.casefolding)
582	{
583	case 1: mgq_ask(".set casefold on"); break;
584	case 0: mgq_ask(".set casefold off"); break;
585	}
586	switch (queryparams.stemming)
587	{
588	case 1: mgq_ask(".set stem on"); break;
589	case 0: mgq_ask(".set stem off"); break;
590	}
591	mgq_ask(".set heads_length 150");
592
593	char maxdocstr[32];
594	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
595	mgq_ask(maxdocstr);
596	}
597
598
599	void mgsearchclass::submitquery (const queryparamclass &queryparams)
600	{
601	// sort out the query string
602	text_t ttquerystring = queryparams.querystring;
603	text_t ttquotedquery;
604	extractquoted (ttquerystring, ttquotedquery);
605	filterquery (ttquerystring);
606
607	// turn the strings into c strings for mg
608	if (quotedquery != NULL) // quotedquery is a global
609	{
610	delete quotedquery;
611	quotedquery = NULL;
612	}
613
614	// quotedquery will be deleted on the next call to this function
615	quotedquery = to_utf8(ttquotedquery).getcstr ();
616	char *querystring = to_utf8(ttquerystring).getcstr();
617
618	// submit the query
619	mgq_ask(querystring);
620
621	delete querystring;
622	}
623
624
625	void mgsearchclass::getresults (queryresultsclass &queryresults)
626	{
627	if (quotedquery[0] == '\0')
628	{
629	// don't need the text
630	mgq_results(result_docnums, 0, MAXNUMDOCS,
631	ourquerycallback, (void *)(&queryresults));
632	}
633	else
634	{
635	// we need the text for this one
636	mgq_results(result_docs, 0, MAXNUMDOCS,
637	ourquerycallback, (void *)(&queryresults));
638	}
639
640	// get the term frequencies
641	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
642	termfreqcallback, (void *)(&queryresults));
643	mgq_results(result_terms, 0, MAXNUMTERMS,
644	termscallback, (void *)(&queryresults));
645	queryresults.sortqueryterms();
646	queryresults.uniqqueryterms();
647	}
648
649
650	void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
651	{
652	ttquotedquery.clear();
653
654	text_t::iterator ithere = ttquerystring.begin ();
655	text_t::iterator itend = ttquerystring.end ();
656
657	bool inquote = false;
658
659	while (ithere != itend)
660	{
661	if ((*ithere) == '\"')
662	{
663	if (!inquote) ttquotedquery.clear ();
664	inquote = !inquote;
665	*ithere = ' '; // delete the quote
666	}
667	else if (inquote)
668	{
669	ttquotedquery.push_back(*ithere);
670	*ithere = ' ';
671	}
672
673	ithere++;
674	}
675	}
676
677
678	void mgsearchclass::filterquery (text_t &ttquerystring) {
679	text_t::iterator ithere = ttquerystring.begin ();
680	text_t::iterator itend = ttquerystring.end ();
681
682	// remove all non alphanumeric characters
683	while (ithere != itend) {
684	if (!is_unicode_letdig(ithere)) (ithere) = ' ';
685	ithere++;
686	}
687	}
688
689
690	// the document text for 'docnum' is placed in 'output'
691	// docTargetDocument returns 'true' if it was able to
692	// try to get a document
693	// collection is needed to see if an index from the
694	// collection is loaded. If no index has been loaded
695	// defaultindex is needed to load one
696	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
697	const text_t &collection,
698	int docnum,
699	text_t &output)
700	{
701	int databaseloaded = 0;
702
703	output.clear();
704
705	char *ccollection = collection.getcstr();
706	assert (ccollection != NULL);
707
708	// see if we can make an appropriate database current
709	databaseloaded = load_text_database (ccollection);
710
711	// try and load the database
712	if (!databaseloaded)
713	{
714	// get the names of the index and text suffixes
715	char *idxsuffix = (getindexsuffix (collection,
716	defaultindex)).getcstr();
717	assert (idxsuffix != NULL);
718	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
719	assert (txtsuffix != NULL);
720
721	#ifdef __WIN32__
722	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
723	#else
724	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
725	#endif
726
727	databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
728
729	// free up the c strings
730	delete idxsuffix;
731	delete txtsuffix;
732	delete ccollectdir;
733	}
734
735	// free up the c collection string
736	delete ccollection;
737
738	if (databaseloaded)
739	{
740	// retrieve the document from mg
741	char docstr[32];
742	sprintf(docstr, "%i", docnum);
743
744	mgq_ask(".set mode text");
745	mgq_ask(".set query docnums");
746	mgq_ask(docstr);
747	mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
748	}
749
750	return databaseloaded;
751	}
752

Note: See TracBrowser for help on using the repository browser.

Download in other formats: