Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 163

Last change on this file since 163 was 163, checked in by rjmcnab, 25 years ago
Removed locateinfo.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 18.1 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: mgsearch.cpp 163 1999-02-21 22:31:35Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.5 1999/02/21 22:31:35 rjmcnab
15
16	Removed locateinfo.
17
18	Revision 1.4 1999/02/03 01:13:27 sjboddie
19
20	Got interface to handle subcollections and language subcollections -
21	committed changes made to some of the collections
22
23	Revision 1.3 1999/01/19 01:38:17 rjmcnab
24
25	Made the source more portable.
26
27	Revision 1.2 1999/01/12 01:51:02 rjmcnab
28
29	Standard header.
30
31	Revision 1.1 1999/01/08 09:02:16 rjmcnab
32
33	Moved from src/library.
34
35	*/
36
37
38	#include "gsdlconf.h"
39	#include "mgsearch.h"
40	#include "fileutil.h"
41
42	#include <string.h>
43	#include <stdio.h>
44	#include <stdlib.h>
45	#include <ctype.h>
46
47	#if defined(GSDL_USE_OBJECTSPACE)
48	# include <ospace\std\iostream>
49	#elif defined(GSDL_USE_IOS_H)
50	# include <iostream.h>
51	#else
52	# include <iostream>
53	#endif
54
55	#if defined(__WIN32__)
56	// gdbm stuff
57	# include "autoconf.h"
58	# include "systems.h"
59	# include "gdbmconst.h"
60	# include "gdbm.h"
61	#else
62	# include <gdbm.h>
63	#endif
64
65
66	#include <assert.h>
67
68	#include "mgq.h"
69	// #include "locateinfo.h"
70	#include "gsdlunicode.h"
71	#include "unitool.h"
72
73
74	/////////////
75	// globals //
76	/////////////
77
// C string holding the quoted part of the most recent query.
// It is (re)allocated by mgsearchclass::submitquery, which frees the
// previous value first, and it is read by ourquerycallback and
// mgsearchclass::getresults. NULL until the first query is submitted.
78	static char *quotedquery = NULL;
79
80
81	/////////////////////////
82	// index map functions //
83	/////////////////////////
84
85	void getrealdir (const text_t &map, text_t &realpart, text_t &dirpart) {
86	realpart.clear ();
87	dirpart.clear();
88
89	text_t::const_iterator here = map.begin();
90	text_t::const_iterator end = map.end();
91
92	// get the real index
93	while (here != end && *here != '-') {
94	realpart.push_back(*here);
95	here++;
96	}
97
98	if (here != end) here++;
99	if (here != end && *here == '>') here++;
100
101	// get the dir index
102	while (here != end) {
103	dirpart.push_back(*here);
104	here++;
105	}
106	}
107
108	void getrealdirindex (const text_t &indexmap, const text_t &subcollectionmap,
109	const text_t &languagemap, text_t &realindex,
110	text_t &dirindex) {
111	text_t real, dir;
112	realindex.clear();
113	dirindex.clear();
114
115	getrealdir (indexmap, real, dir);
116	realindex += real;
117	dirindex += dir;
118
119	getrealdir (subcollectionmap, real, dir);
120	realindex += real;
121	dirindex += dir;
122
123	getrealdir (languagemap, real, dir);
124	realindex += real;
125	dirindex += dir;
126	}
127
128	//bool isdirindex (const text_tarray &indexmap, const text_t &dirindex) {
129	// text_tarray::const_iterator here = indexmap.begin();
130	// text_tarray::const_iterator end = indexmap.end();
131	// text_t maprealindex, mapdirindex;
132
133	// while (here != end) {
134	// getrealdirindex (*here, maprealindex, mapdirindex);
135	// if (mapdirindex == dirindex) return true;
136	// here++;
137	// }
138
139	// return false;
140	//}
141
142	void getrealindexparts (const text_tarray &/indexmap/, const text_tarray &/subcollectionmap/,
143	const text_tarray &languagemap, const text_t &realindex,
144	text_t &index, text_t &subcollection, text_t &language) {
145
146	index.clear();
147	subcollection.clear();
148	language.clear();
149
150	text_tarray parts;
151	splitchar (realindex.begin(), realindex.end(), ':', parts);
152	int numparts = parts.size();
153
154	if (numparts >= 2) {
155	index = parts[0] + ":" + parts[1];
156
157	if (numparts == 3) {
158	if (languagemap.empty())
159	subcollection = parts[2];
160	else
161	language = parts[2];
162	} else if (numparts == 4) {
163	subcollection = parts[2];
164	language = parts[3];
165	}
166	}
167	}
168
169
170	void getdirindexparts (const text_tarray &/indexmap/, const text_tarray &/subcollectionmap/,
171	const text_tarray &languagemap, const text_t &dirindex,
172	text_t &index, text_t &subcollection, text_t &language) {
173
174	index.clear();
175	subcollection.clear();
176	language.clear();
177
178	int indexsize = dirindex.size();
179	if (indexsize != 3 && indexsize != 5 &&
180	indexsize != 7) return;
181
182	text_t::const_iterator dibegin = dirindex.begin();
183	text_t::const_iterator diend = dirindex.end();
184
185	// first three characters make up index part
186	index = substr(dibegin, dibegin+3);
187
188	if (indexsize == 5) {
189	if (languagemap.empty())
190	subcollection = substr(dibegin+3, dibegin+5);
191	else
192	language = substr(dibegin+3, dibegin+5);
193	} else if (indexsize == 7) {
194	subcollection = substr(dibegin+3, dibegin+5);
195	language = substr(dibegin+5, diend);
196	}
197	}
198
199
200	bool isrealindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
201	const text_tarray &languagemap, const text_t &realindex) {
202
203	text_t index, subcollection, language, realpart, dirpart;
204	getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
205	index, subcollection, language);
206
207	// check index part
208	text_tarray::const_iterator here = indexmap.begin();
209	text_tarray::const_iterator end = indexmap.end();
210	bool exists = false;
211	while (here != end) {
212	getrealdir (*here, realpart, dirpart);
213	if (realpart == index) {exists = true; break;}
214	here++;
215	}
216	if (!exists) return false;
217
218	// check subcollection part if there is one
219	if (!subcollection.empty()) {
220	here = subcollectionmap.begin();
221	end = subcollectionmap.end();
222	exists = false;
223	while (here != end) {
224	getrealdir (*here, realpart, dirpart);
225	if (realpart == subcollection) {exists = true; break;}
226	here++;
227	}
228	if (!exists) return false;
229	}
230
231	// check language part if there is one
232	if (!language.empty()) {
233	here = languagemap.begin();
234	end = languagemap.end();
235	exists = false;
236	while (here != end) {
237	getrealdir (*here, realpart, dirpart);
238	if (realpart == language) {exists = true; break;}
239	here++;
240	}
241	if (!exists) return false;
242	}
243	return true;
244	}
245
246	text_t dir2realindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
247	const text_tarray &languagemap, const text_t &dirindex) {
248
249	text_t index, subcollection, language, realpart, dirpart, realindex;
250	getdirindexparts (indexmap, subcollectionmap, languagemap, dirindex,
251	index, subcollection, language);
252
253	// get index part
254	text_tarray::const_iterator here = indexmap.begin();
255	text_tarray::const_iterator end = indexmap.end();
256	while (here != end) {
257	getrealdir (*here, realpart, dirpart);
258	if (dirpart == index) {realindex += realpart; break;}
259	here++;
260	}
261
262	if (realindex.empty()) return "";
263
264	// get subcollection part
265	here = subcollectionmap.begin();
266	end = subcollectionmap.end();
267	while (here != end) {
268	getrealdir (*here, realpart, dirpart);
269	if (dirpart == subcollection) {realindex += ":" + realpart; break;}
270	here++;
271	}
272
273	// get language part
274	here = languagemap.begin();
275	end = languagemap.end();
276	while (here != end) {
277	getrealdir (*here, realpart, dirpart);
278	if (dirpart == language) {realindex += ":" + realpart; break;}
279	here++;
280	}
281	return realindex;
282	}
283
284	text_t real2dirindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
285	const text_tarray &languagemap, const text_t &realindex) {
286
287	text_t index, subcollection, language, realpart, dirpart, dirindex;
288	getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
289	index, subcollection, language);
290
291	// get index part
292	text_tarray::const_iterator here = indexmap.begin();
293	text_tarray::const_iterator end = indexmap.end();
294	while (here != end) {
295	getrealdir (*here, realpart, dirpart);
296	if (realpart == index) {dirindex += dirpart; break;}
297	here++;
298	}
299
300	if (dirindex.empty()) return "";
301
302	// get subcollection part
303	here = subcollectionmap.begin();
304	end = subcollectionmap.end();
305	while (here != end) {
306	getrealdir (*here, realpart, dirpart);
307	if (realpart == subcollection) {dirindex += dirpart; break;}
308	here++;
309	}
310
311	// get language part
312	here = languagemap.begin();
313	end = languagemap.end();
314	while (here != end) {
315	getrealdir (*here, realpart, dirpart);
316	if (realpart == language) {dirindex += dirpart; break;}
317	here++;
318	}
319	return dirindex;
320	}
321
322	text_t real2macroindex (const text_t &realindex) {
323	text_t macroindex;
324	text_t::const_iterator here = realindex.begin();
325	text_t::const_iterator end = realindex.end();
326	unsigned short c;
327
328	while (here != end) {
329	c = *here;
330	if ((c >= '0' && c <= '9') \|\|
331	(c >= 'A' && c <= 'Z') \|\|
332	(c >= 'a' && c <= 'z'))
333	macroindex.push_back (*here);
334	here++;
335	}
336
337	return macroindex;
338	}
339
340	bool isdoclevelindex (const text_t &realindex) {
341	char *docstr = "document";
342	text_t::const_iterator here = realindex.begin ();
343	text_t::const_iterator end = realindex.end ();
344
345	while (here != end) {
346	if (*docstr == '\0') return true;
347	if (docstr != (char)(here)) return false;
348	docstr++;
349	here++;
350	}
351
352	return false;
353	}
354
355	text_t getdoclevelindex (const text_tarray &/indexmap/) {
356	//text_tarray::const_iterator here = indexmap.begin();
357	//text_tarray::const_iterator end = indexmap.end();
358	//text_t maprealindex, mapdirindex;
359
360	// while (here != end) {
361	// getrealdirindex (*here, maprealindex, mapdirindex);
362	// if (isdoclevelindex (maprealindex)) return maprealindex;
363	// here++;
364	//}
365
366	return "";
367	}
368
369
370
371
372	////////////////////////
373	// callback functions //
374	////////////////////////
375
376	// This routine is called for each document found in a search
377	// it assumes that cache_num is set up correctly to point to
378	// a suitable result cache
379	int ourquerycallback(char UDoc, int /ULen*/, int DocNum,
380	float Weight, void *info) {
381
382
383	queryresultsclass queryresults = (queryresultsclass )info;
384
385	// check the returned document for the presence of the
386	// quoted part of the query, if there was one
387
388	if (UDoc != NULL && quotedquery != NULL &&
389	quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
390
391	// append this entry to the document results
392	docresultclass docresult;
393	docresult.docnum = DocNum;
394	docresult.docweight = Weight;
395
396	queryresults->docs.push_back(docresult);
397
398	return 0;
399	}
400
401	// This callback is called once for each term in the query
402	int termfreqcallback(char *Word, int ULen, int Freq,
403	float /Weight/, void *info) {
404	queryresultsclass queryresults = (queryresultsclass )info;
405
406	text_t term;
407	term.setcarr(Word, ULen);
408	termfreqclass termfreq;
409	termfreq.termstr = to_uni(term);
410	termfreq.termfreq = Freq;
411	queryresults->terms.push_back(termfreq);
412
413	return 0;
414	}
415
416	// this callback is called once for each variation of each term
417	int termscallback(char Word, int ULen, int /Freq*/,
418	float /Weight/, void *info) {
419
420	text_t term;
421	term.setcarr(Word, ULen);
422	queryresultsclass queryresults = (queryresultsclass )info;
423	queryresults->termvariants.push_back(to_uni(term));
424
425	return 0;
426	}
427
428	// This callback is for getting document text
429	int doctextcallback(char Word, int ULen, int /Freq*/,
430	float /Weight/, void *info) {
431	text_t output = (text_t )info;
432	if (output == NULL) return 0;
433	output->clear();
434
435	utf8inconvertclass inconvert;
436	convertclass::status_t status;
437	inconvert.reset ();
438	inconvert.setinput (Word, ULen);
439	inconvert.convert (*output, status);
440
441	// replace all control-Cs with spaces
442	text_t::iterator here = output->begin();
443	text_t::iterator end = output->end();
444	while (here != end) {
445	if (here == '\x3') here = ' ';
446	here++;
447	}
448
449	return 0;
450	}
451
452
453	static text_t getindexsuffix (const text_t &collection,
454	const text_t &index) {
455	text_t indexsuffix = "index";
456	indexsuffix = filename_cat (indexsuffix, index);
457	indexsuffix = filename_cat (indexsuffix, collection);
458	return indexsuffix;
459	}
460
461
462
463
464	////////////////////
465	// mgsearch class //
466	////////////////////
467
468	mgsearchclass::mgsearchclass ()
469	{
470	cache = new querycache (RESULTCACHESIZE);
471	}
472
473	mgsearchclass::~mgsearchclass ()
474	{
475	if (cache != NULL)
476	{
477	delete cache;
478	cache = NULL;
479	}
480	}
481
482
483	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
484	{
485	collectdir = thecollectdir;
486	}
487
488
489	bool mgsearchclass::search(const queryparamclass &queryparams,
490	queryresultsclass &queryresults)
491	{
492	bool databaseloaded = true;
493
494	assert (cache != NULL);
495
496	queryresults.clear();
497
498	// first check the cache
499	if (cache->find(queryparams, queryresults))
500	return true;
501
502	// make sure there is a query to be processed
503	text_t::const_iterator queryhere = queryparams.querystring.begin();
504	text_t::const_iterator queryend = queryparams.querystring.end();
505	while (queryhere != queryend) {
506	if (is_unicode_letdig (*queryhere)) break;
507	queryhere++;
508	}
509
510	// if we reached the end of the query string without finding
511	// any alphanumeric characters then return no results (and say
512	// the database was loaded)
513	if (queryhere == queryend) return true;
514
515
516	// get the names of the collection, index and text suffixes
517	char *ccollection = queryparams.collection.getcstr();
518	assert (ccollection != NULL);
519	char *idxsuffix = (getindexsuffix (queryparams.collection,
520	queryparams.search_index)).getcstr();
521	assert (idxsuffix != NULL);
522	char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
523	assert (txtsuffix != NULL);
524
525	#ifdef __WIN32__
526	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
527	#else
528	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
529	#endif
530
531	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
532	{
533	setsearchmode (queryparams);
534	submitquery (queryparams);
535	getresults (queryresults);
536	}
537	else databaseloaded = false;
538
539	// free up the c strings
540	delete ccollection;
541	delete idxsuffix;
542	delete txtsuffix;
543	delete ccollectdir;
544
545	return databaseloaded;
546	}
547
548
549	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
550	{
551	mgq_ask(".set expert true");
552	mgq_ask(".set accumulator_method list");
553	mgq_ask(".set max_accumulators 50000");
554	mgq_ask(".set verbatim true");
555	mgq_ask(".unset skip_dump");
556	mgq_ask(".set mode docnums");
557
558	switch (queryparams.search_type)
559	{
560	case 0: mgq_ask(".set query boolean"); break;
561	case 1: mgq_ask(".set query ranked"); break;
562	}
563	switch (queryparams.casefolding)
564	{
565	case 1: mgq_ask(".set casefold on"); break;
566	case 0: mgq_ask(".set casefold off"); break;
567	}
568	switch (queryparams.stemming)
569	{
570	case 1: mgq_ask(".set stem on"); break;
571	case 0: mgq_ask(".set stem off"); break;
572	}
573	mgq_ask(".set heads_length 150");
574
575	char maxdocstr[32];
576	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
577	mgq_ask(maxdocstr);
578	}
579
580
581	void mgsearchclass::submitquery (const queryparamclass &queryparams)
582	{
583	// sort out the query string
584	text_t ttquerystring = queryparams.querystring;
585	text_t ttquotedquery;
586	extractquoted (ttquerystring, ttquotedquery);
587	filterquery (ttquerystring);
588
589	// turn the strings into c strings for mg
590	if (quotedquery != NULL) // quotedquery is a global
591	{
592	delete quotedquery;
593	quotedquery = NULL;
594	}
595
596	// quotedquery will be deleted on the next call to this function
597	quotedquery = to_utf8(ttquotedquery).getcstr ();
598	char *querystring = to_utf8(ttquerystring).getcstr();
599
600	// submit the query
601	mgq_ask(querystring);
602
603	delete querystring;
604	}
605
606
607	void mgsearchclass::getresults (queryresultsclass &queryresults)
608	{
609	if (quotedquery[0] == '\0')
610	{
611	// don't need the text
612	mgq_results(result_docnums, 0, MAXNUMDOCS,
613	ourquerycallback, (void *)(&queryresults));
614	}
615	else
616	{
617	// we need the text for this one
618	mgq_results(result_docs, 0, MAXNUMDOCS,
619	ourquerycallback, (void *)(&queryresults));
620	}
621
622	// get the term frequencies
623	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
624	termfreqcallback, (void *)(&queryresults));
625	mgq_results(result_terms, 0, MAXNUMTERMS,
626	termscallback, (void *)(&queryresults));
627	queryresults.sortqueryterms();
628	queryresults.uniqqueryterms();
629	}
630
631
632	void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
633	{
634	ttquotedquery.clear();
635
636	text_t::iterator ithere = ttquerystring.begin ();
637	text_t::iterator itend = ttquerystring.end ();
638
639	bool inquote = false;
640
641	while (ithere != itend)
642	{
643	if ((*ithere) == '\"')
644	{
645	if (!inquote) ttquotedquery.clear ();
646	inquote = !inquote;
647	*ithere = ' '; // delete the quote
648	}
649	else if (inquote)
650	{
651	ttquotedquery.push_back(*ithere);
652	*ithere = ' ';
653	}
654
655	ithere++;
656	}
657	}
658
659
660	void mgsearchclass::filterquery (text_t &ttquerystring) {
661	text_t::iterator ithere = ttquerystring.begin ();
662	text_t::iterator itend = ttquerystring.end ();
663
664	// remove all non alphanumeric characters
665	while (ithere != itend) {
666	if (!is_unicode_letdig(ithere)) (ithere) = ' ';
667	ithere++;
668	}
669	}
670
671
672	// the document text for 'docnum' is placed in 'output'
673	// docTargetDocument returns 'true' if it was able to
674	// try to get a document
675	// collection is needed to see if an index from the
676	// collection is loaded. If no index has been loaded
677	// defaultindex is needed to load one
678	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
679	const text_t &collection,
680	int docnum,
681	text_t &output)
682	{
683	int databaseloaded = 0;
684
685	output.clear();
686
687	char *ccollection = collection.getcstr();
688	assert (ccollection != NULL);
689
690	// see if we can make an appropriate database current
691	databaseloaded = load_text_database (ccollection);
692
693	// try and load the database
694	if (!databaseloaded)
695	{
696	// get the names of the index and text suffixes
697	char *idxsuffix = (getindexsuffix (collection,
698	defaultindex)).getcstr();
699	assert (idxsuffix != NULL);
700	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
701	assert (txtsuffix != NULL);
702
703	#ifdef __WIN32__
704	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
705	#else
706	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
707	#endif
708
709	databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
710
711	// free up the c strings
712	delete idxsuffix;
713	delete txtsuffix;
714	delete ccollectdir;
715	}
716
717	// free up the c collection string
718	delete ccollection;
719
720	if (databaseloaded)
721	{
722	// retrieve the document from mg
723	char docstr[32];
724	sprintf(docstr, "%i", docnum);
725
726	mgq_ask(".set mode text");
727	mgq_ask(".set query docnums");
728	mgq_ask(docstr);
729	mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
730	}
731
732	return databaseloaded;
733	}
734

Note: See TracBrowser for help on using the repository browser.

Download in other formats: