Context Navigation

source: trunk/gsdl/src/colservr/mgsearch.cpp@ 301

Last change on this file since 301 was 301, checked in by sjboddie, 25 years ago
got rid of all the old functions for dealing with dir indexes
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 11.2 KB

Line
1	/**********************************************************************
2	*
3	* mgsearch.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: mgsearch.cpp 301 1999-06-27 22:07:27Z sjboddie $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.7 1999/06/27 22:07:27 sjboddie
15	got rid of all the old functions for dealing with dir indexes
16
17	Revision 1.6 1999/06/09 00:41:32 sjboddie
18	phrase searching now uses case-folding if it's turned on
19
20	Revision 1.5 1999/02/21 22:31:35 rjmcnab
21
22	Removed locateinfo.
23
24	Revision 1.4 1999/02/03 01:13:27 sjboddie
25
26	Got interface to handle subcollections and language subcollections -
27	committed changes made to some of the collections
28
29	Revision 1.3 1999/01/19 01:38:17 rjmcnab
30
31	Made the source more portable.
32
33	Revision 1.2 1999/01/12 01:51:02 rjmcnab
34
35	Standard header.
36
37	Revision 1.1 1999/01/08 09:02:16 rjmcnab
38
39	Moved from src/library.
40
41	*/
42
43
44	#include "gsdlconf.h"
45	#include "mgsearch.h"
46	#include "fileutil.h"
47
48	#include <string.h>
49	#include <stdio.h>
50	#include <stdlib.h>
51	#include <ctype.h>
52
53	#if defined(GSDL_USE_OBJECTSPACE)
54	# include <ospace\std\iostream>
55	#elif defined(GSDL_USE_IOS_H)
56	# include <iostream.h>
57	#else
58	# include <iostream>
59	#endif
60
61	#if defined(__WIN32__)
62	// gdbm stuff
63	# include "autoconf.h"
64	# include "systems.h"
65	# include "gdbmconst.h"
66	# include "gdbm.h"
67	#else
68	# include <gdbm.h>
69	#endif
70
71
72	#include <assert.h>
73
74	#include "mgq.h"
75	// #include "locateinfo.h"
76	#include "gsdlunicode.h"
77	#include "unitool.h"
78
79
80	/////////////
81	// globals //
82	/////////////
83
// The quoted (phrase) part of the current query as a C string. It is
// assigned in mgsearchclass::submitquery and read by ourquerycallback
// and mgsearchclass::getresults.
static char *quotedquery = NULL;

// Non-zero when case folding is switched on for the current query;
// assigned in mgsearchclass::search from queryparams.casefolding.
static int casefold = 0;
86
87
88	////////////////////////
89	// callback functions //
90	////////////////////////
91
92	// This routine is called for each document found in a search
93	// it assumes that cache_num is set up correctly to point to
94	// a suitable result cache
95	int ourquerycallback(char UDoc, int /ULen*/, int DocNum,
96	float Weight, void *info) {
97
98
99	queryresultsclass queryresults = (queryresultsclass )info;
100
101	// check the returned document for the presence of the
102	// quoted part of the query, if there was one
103
104	// if (UDoc != NULL && quotedquery != NULL &&
105	// quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
106
107
108	if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0') {
109
110	if (casefold) {
111	int len;
112	for (len = 0; quotedquery[len] != '\0'; len ++)
113	quotedquery[len] = tolower (quotedquery[len]);
114	for (len = 0; UDoc[len] != '\0'; len ++)
115	UDoc[len] = tolower (UDoc[len]);
116	}
117	if (strstr (UDoc, quotedquery) == NULL) return 0;
118	}
119
120	// append this entry to the document results
121	docresultclass docresult;
122	docresult.docnum = DocNum;
123	docresult.docweight = Weight;
124
125	queryresults->docs.push_back(docresult);
126
127	return 0;
128	}
129
130	// This callback is called once for each term in the query
131	int termfreqcallback(char *Word, int ULen, int Freq,
132	float /Weight/, void *info) {
133	queryresultsclass queryresults = (queryresultsclass )info;
134
135	text_t term;
136	term.setcarr(Word, ULen);
137	termfreqclass termfreq;
138	termfreq.termstr = to_uni(term);
139	termfreq.termfreq = Freq;
140	queryresults->terms.push_back(termfreq);
141
142	return 0;
143	}
144
145	// this callback is called once for each variation of each term
146	int termscallback(char Word, int ULen, int /Freq*/,
147	float /Weight/, void *info) {
148
149	text_t term;
150	term.setcarr(Word, ULen);
151	queryresultsclass queryresults = (queryresultsclass )info;
152	queryresults->termvariants.push_back(to_uni(term));
153
154	return 0;
155	}
156
157	// This callback is for getting document text
158	int doctextcallback(char Word, int ULen, int /Freq*/,
159	float /Weight/, void *info) {
160	text_t output = (text_t )info;
161	if (output == NULL) return 0;
162	output->clear();
163
164	utf8inconvertclass inconvert;
165	convertclass::status_t status;
166	inconvert.reset ();
167	inconvert.setinput (Word, ULen);
168	inconvert.convert (*output, status);
169
170	// replace all control-Cs with spaces
171	text_t::iterator here = output->begin();
172	text_t::iterator end = output->end();
173	while (here != end) {
174	if (here == '\x3') here = ' ';
175	here++;
176	}
177
178	return 0;
179	}
180
181
182	static text_t getindexsuffix (const text_t &collection,
183	const text_t &index) {
184	text_t indexsuffix = "index";
185	indexsuffix = filename_cat (indexsuffix, index);
186	indexsuffix = filename_cat (indexsuffix, collection);
187	return indexsuffix;
188	}
189
190
191
192
193	////////////////////
194	// mgsearch class //
195	////////////////////
196
197	mgsearchclass::mgsearchclass ()
198	{
199	cache = new querycache (RESULTCACHESIZE);
200	}
201
202	mgsearchclass::~mgsearchclass ()
203	{
204	if (cache != NULL)
205	{
206	delete cache;
207	cache = NULL;
208	}
209	}
210
211
212	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
213	{
214	collectdir = thecollectdir;
215	}
216
217
218	bool mgsearchclass::search(const queryparamclass &queryparams,
219	queryresultsclass &queryresults)
220	{
221	bool databaseloaded = true;
222
223	assert (cache != NULL);
224
225	queryresults.clear();
226
227	// first check the cache
228	if (cache->find(queryparams, queryresults))
229	return true;
230
231	// make sure there is a query to be processed
232	text_t::const_iterator queryhere = queryparams.querystring.begin();
233	text_t::const_iterator queryend = queryparams.querystring.end();
234	while (queryhere != queryend) {
235	if (is_unicode_letdig (*queryhere)) break;
236	queryhere++;
237	}
238
239	// if we reached the end of the query string without finding
240	// any alphanumeric characters then return no results (and say
241	// the database was loaded)
242	if (queryhere == queryend) return true;
243
244	casefold = queryparams.casefolding;
245
246	// get the names of the collection, index and text suffixes
247	char *ccollection = queryparams.collection.getcstr();
248	assert (ccollection != NULL);
249	char *idxsuffix = (getindexsuffix (queryparams.collection,
250	queryparams.search_index)).getcstr();
251	assert (idxsuffix != NULL);
252	char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
253	assert (txtsuffix != NULL);
254
255	#ifdef __WIN32__
256	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
257	#else
258	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
259	#endif
260
261	if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
262	{
263	setsearchmode (queryparams);
264	submitquery (queryparams);
265	getresults (queryresults);
266	}
267	else databaseloaded = false;
268
269	// free up the c strings
270	delete ccollection;
271	delete idxsuffix;
272	delete txtsuffix;
273	delete ccollectdir;
274
275	return databaseloaded;
276	}
277
278
279	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
280	{
281	mgq_ask(".set expert true");
282	mgq_ask(".set accumulator_method list");
283	mgq_ask(".set max_accumulators 50000");
284	mgq_ask(".set verbatim true");
285	mgq_ask(".unset skip_dump");
286	mgq_ask(".set mode docnums");
287
288	switch (queryparams.search_type)
289	{
290	case 0: mgq_ask(".set query boolean"); break;
291	case 1: mgq_ask(".set query ranked"); break;
292	}
293	switch (queryparams.casefolding)
294	{
295	case 1: mgq_ask(".set casefold on"); break;
296	case 0: mgq_ask(".set casefold off"); break;
297	}
298	switch (queryparams.stemming)
299	{
300	case 1: mgq_ask(".set stem on"); break;
301	case 0: mgq_ask(".set stem off"); break;
302	}
303	mgq_ask(".set heads_length 150");
304
305	char maxdocstr[32];
306	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
307	mgq_ask(maxdocstr);
308	}
309
310
311	void mgsearchclass::submitquery (const queryparamclass &queryparams)
312	{
313	// sort out the query string
314	text_t ttquerystring = queryparams.querystring;
315	text_t ttquotedquery;
316	extractquoted (ttquerystring, ttquotedquery);
317	filterquery (ttquerystring);
318
319	// turn the strings into c strings for mg
320	if (quotedquery != NULL) // quotedquery is a global
321	{
322	delete quotedquery;
323	quotedquery = NULL;
324	}
325
326	// quotedquery will be deleted on the next call to this function
327	quotedquery = to_utf8(ttquotedquery).getcstr ();
328	char *querystring = to_utf8(ttquerystring).getcstr();
329
330	// submit the query
331	mgq_ask(querystring);
332
333	delete querystring;
334	}
335
336
337	void mgsearchclass::getresults (queryresultsclass &queryresults)
338	{
339	if (quotedquery[0] == '\0')
340	{
341	// don't need the text
342	mgq_results(result_docnums, 0, MAXNUMDOCS,
343	ourquerycallback, (void *)(&queryresults));
344	}
345	else
346	{
347	// we need the text for this one
348	mgq_results(result_docs, 0, MAXNUMDOCS,
349	ourquerycallback, (void *)(&queryresults));
350	}
351
352	// get the term frequencies
353	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
354	termfreqcallback, (void *)(&queryresults));
355	mgq_results(result_terms, 0, MAXNUMTERMS,
356	termscallback, (void *)(&queryresults));
357	queryresults.sortqueryterms();
358	queryresults.uniqqueryterms();
359	}
360
361
362	void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
363	{
364	ttquotedquery.clear();
365
366	text_t::iterator ithere = ttquerystring.begin ();
367	text_t::iterator itend = ttquerystring.end ();
368
369	bool inquote = false;
370
371	while (ithere != itend)
372	{
373	if ((*ithere) == '\"')
374	{
375	if (!inquote) ttquotedquery.clear ();
376	inquote = !inquote;
377	*ithere = ' '; // delete the quote
378	}
379	else if (inquote)
380	{
381	ttquotedquery.push_back(*ithere);
382	*ithere = ' ';
383	}
384
385	ithere++;
386	}
387	}
388
389
390	void mgsearchclass::filterquery (text_t &ttquerystring) {
391	text_t::iterator ithere = ttquerystring.begin ();
392	text_t::iterator itend = ttquerystring.end ();
393
394	// remove all non alphanumeric characters
395	while (ithere != itend) {
396	if (!is_unicode_letdig(ithere)) (ithere) = ' ';
397	ithere++;
398	}
399	}
400
401
402	// the document text for 'docnum' is placed in 'output'
403	// docTargetDocument returns 'true' if it was able to
404	// try to get a document
405	// collection is needed to see if an index from the
406	// collection is loaded. If no index has been loaded
407	// defaultindex is needed to load one
408	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
409	const text_t &collection,
410	int docnum,
411	text_t &output)
412	{
413	int databaseloaded = 0;
414
415	output.clear();
416
417	char *ccollection = collection.getcstr();
418	assert (ccollection != NULL);
419
420	// see if we can make an appropriate database current
421	databaseloaded = load_text_database (ccollection);
422
423	// try and load the database
424	if (!databaseloaded)
425	{
426	// get the names of the index and text suffixes
427	char *idxsuffix = (getindexsuffix (collection,
428	defaultindex)).getcstr();
429	assert (idxsuffix != NULL);
430	char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
431	assert (txtsuffix != NULL);
432
433	#ifdef __WIN32__
434	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
435	#else
436	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
437	#endif
438
439	databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
440
441	// free up the c strings
442	delete idxsuffix;
443	delete txtsuffix;
444	delete ccollectdir;
445	}
446
447	// free up the c collection string
448	delete ccollection;
449
450	if (databaseloaded)
451	{
452	// retrieve the document from mg
453	char docstr[32];
454	sprintf(docstr, "%i", docnum);
455
456	mgq_ask(".set mode text");
457	mgq_ask(".set query docnums");
458	mgq_ask(docstr);
459	mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
460	}
461
462	return databaseloaded;
463	}
464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: