Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/library/mgsearch.cpp@ 4

Last change on this file since 4 was 4, checked in by sjboddie, 25 years ago
Initial revision
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 11.3 KB

Line
1	#include <string.h>
2	#include <stdio.h>
3	#include <stdlib.h>
4	#include <ctype.h>
5
6	#ifdef __GNUG__
7	# include <iostream.h>
8	# include <gdbm.h>
9
10	#else
11	# ifndef USE_OBJECTSPACE
12	# include <iostream>
13	# else
14	# include <ospace\std\iostream>
15	# endif
16
17	// gdbm stuff
18	# include "autoconf.h"
19	# include "systems.h"
20	# include "gdbmconst.h"
21	# include "gdbm.h"
22	#endif
23
24	#include <assert.h>
25
26	#include "mgq.h"
27	#include "mgsearch.h"
28	#include "locateinfo.h"
29
30	/////////////
31	// globals //
32	/////////////
33
// C string holding the quoted phrase of the most recent query. It is
// assigned in mgsearchclass::submitquery and read by ourquerycallback
// and mgsearchclass::getresults.
34	static char *quotedquery = NULL;
35
36
37
38	////////////////////////
39	// callback functions //
40	////////////////////////
41
42	// This routine is called for each document found in a search
43	// it assumes that cache_num is set up correctly to point to
44	// a suitable result cache
45	int ourquerycallback(char *UDoc, int ULen, int DocNum,
46	float Weight, void *info) {
47
48
49	queryresultsclass queryresults = (queryresultsclass )info;
50
51	// check the returned document for the presence of the
52	// quoted part of the query, if there was one
53
54	if (UDoc != NULL && quotedquery != NULL &&
55	quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
56
57	// append this entry to the document results
58	docresultclass docresult;
59	docresult.docnum = DocNum;
60	docresult.docweight = Weight;
61
62	queryresults->docs.push_back(docresult);
63
64	return 0;
65	}
66
67	// This callback is called once for each term in the query
68	int termfreqcallback(char *Word, int ULen, int Freq,
69	float Weight, void *info) {
70	queryresultsclass queryresults = (queryresultsclass )info;
71
72	termfreqclass termfreq;
73	termfreq.termstr.setcarr(Word, ULen);
74	termfreq.termfreq = Freq;
75	queryresults->terms.push_back(termfreq);
76
77	return 0;
78	}
79
80	// this callback is called once for each variation of each term
81	int termscallback(char *Word, int ULen, int Freq,
82	float Weight, void *info) {
83
84	queryresultsclass queryresults = (queryresultsclass )info;
85	queryresults->termvariants.push_back(Word);
86
87	return 0;
88	}
89
90	// This callback is for getting document text
91	int doctextcallback(char *Word, int ULen, int Freq,
92	float Weight, void *info) {
93	text_t output = (text_t )info;
94	if (output == NULL) return 0;
95
96	output->setcarr(Word, ULen);
97
98	// replace all control-Cs with spaces
99	text_t::iterator here = output->begin();
100	text_t::iterator end = output->end();
101	while (here != end)
102	{
103	if (here == '\x3') here = ' ';
104	here++;
105	}
106
107	return 0;
108	}
109
110
111
112	////////////////////
113	// mgsearch class //
114	////////////////////
115
116	mgsearchclass::mgsearchclass ()
117	{
118	cache = new querycache (RESULTCACHESIZE);
119	}
120
121	mgsearchclass::~mgsearchclass ()
122	{
123	if (cache != NULL)
124	{
125	delete cache;
126	cache = NULL;
127	}
128	}
129
130
131	void mgsearchclass::setindexhome (const text_t &theindexhome)
132	{
133	indexhome = theindexhome;
134	}
135
136
137	bool mgsearchclass::search(const queryparamclass &queryparams,
138	queryresultsclass &queryresults)
139	{
140	bool databaseloaded = true;
141
142	assert (cache != NULL);
143
144	queryresults.clear();
145
146	// first check the cache
147	if (cache->find(queryparams, queryresults))
148	{
149	return true;
150	}
151
152	// make sure there is a query to be processed
153	text_t::const_iterator queryhere = queryparams.querystring.begin();
154	text_t::const_iterator queryend = queryparams.querystring.end();
155	while (queryhere != queryend) {
156	if (((queryhere >= 65) && (queryhere <= 90)) \|\|
157	((queryhere >= 97) && (queryhere <= 122)) \|\|
158	((queryhere >= 192) && (queryhere <= 214)) \|\|
159	((queryhere >= 216) && (queryhere <= 246)) \|\|
160	((queryhere >= 248) && (queryhere <= 255)) \|\|
161	((queryhere >= '0') && (queryhere <= '9'))) break;
162	queryhere++;
163	}
164
165	// if we reached the end of the query string without finding
166	// any alphanumeric characters then return no results (and say
167	// the database was loaded)
168	if (queryhere == queryend) return true;
169
170
171
172	// get the names of the index and text suffixes
173	text_t ttidxsuffix, tttxtsuffix;
174	getindexsuffix (queryparams.search_index,
175	queryparams.collection, ttidxsuffix);
176	gettextsuffix (queryparams.collection, tttxtsuffix);
177	char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
178	char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
179
180	#ifdef __WIN32__
181	char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
182	#else
183	char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
184	#endif
185
186	if (load_database(cindexhome, idxsuffix, txtsuffix))
187	{
188	setsearchmode (queryparams);
189	submitquery (queryparams);
190	getresults (queryresults);
191	}
192	else databaseloaded = false;
193
194	// free up the c strings
195	delete idxsuffix;
196	delete txtsuffix;
197	delete cindexhome;
198
199	return databaseloaded;
200	}
201
202
203	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
204	{
205	mgq_ask(".set expert true");
206	mgq_ask(".set accumulator_method list");
207	mgq_ask(".set max_accumulators 50000");
208	mgq_ask(".set verbatim true");
209	mgq_ask(".unset skip_dump");
210	mgq_ask(".set mode docnums");
211
212	switch (queryparams.search_type)
213	{
214	case 0: mgq_ask(".set query boolean"); break;
215	case 1: mgq_ask(".set query ranked"); break;
216	}
217	switch (queryparams.casefolding)
218	{
219	case 1: mgq_ask(".set casefold on"); break;
220	case 0: mgq_ask(".set casefold off"); break;
221	}
222	switch (queryparams.stemming)
223	{
224	case 1: mgq_ask(".set stem on"); break;
225	case 0: mgq_ask(".set stem off"); break;
226	}
227	mgq_ask(".set heads_length 150");
228
229	char maxdocstr[32];
230	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
231	mgq_ask(maxdocstr);
232	}
233
234
235	void mgsearchclass::submitquery (const queryparamclass &queryparams)
236	{
237	// sort out the query string
238	text_t ttquerystring = queryparams.querystring;
239	text_t ttquotedquery;
240	extractquoted (ttquerystring, ttquotedquery);
241	filterquery (ttquerystring);
242
243	// turn the strings into c strings for mg
244	if (quotedquery != NULL) // quotedquery is a global
245	{
246	delete quotedquery;
247	quotedquery = NULL;
248	}
249
250	// quotedquery will be deleted on the next call to this function
251	quotedquery = ttquotedquery.getcstr ();
252	char *querystring = ttquerystring.getcstr();
253
254	// submit the query
255	mgq_ask(querystring);
256
257	delete querystring;
258	}
259
260
261	void mgsearchclass::getresults (queryresultsclass &queryresults)
262	{
263	if (quotedquery[0] == '\0')
264	{
265	// don't need the text
266	mgq_results(result_docnums, 0, MAXNUMDOCS,
267	ourquerycallback, (void *)(&queryresults));
268	}
269	else
270	{
271	// we need the text for this one
272	mgq_results(result_docs, 0, MAXNUMDOCS,
273	ourquerycallback, (void *)(&queryresults));
274	}
275
276	// get the term frequencies
277	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
278	termfreqcallback, (void *)(&queryresults));
279	mgq_results(result_terms, 0, MAXNUMTERMS,
280	termscallback, (void *)(&queryresults));
281	queryresults.sortqueryterms();
282	queryresults.uniqqueryterms();
283	}
284
285
286	void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
287	{
288	ttquotedquery.clear();
289
290	text_t::iterator ithere = ttquerystring.begin ();
291	text_t::iterator itend = ttquerystring.end ();
292
293	bool inquote = false;
294
295	while (ithere != itend)
296	{
297	if ((*ithere) == '\"')
298	{
299	if (!inquote) ttquotedquery.clear ();
300	inquote = !inquote;
301	*ithere = ' '; // delete the quote
302	}
303	else if (inquote)
304	{
305	ttquotedquery.push_back(*ithere);
306	*ithere = ' ';
307	}
308
309	ithere++;
310	}
311	}
312
313
314	void mgsearchclass::filterquery (text_t &ttquerystring)
315	{
316
317	text_t::iterator ithere = ttquerystring.begin ();
318	text_t::iterator itend = ttquerystring.end ();
319	unsigned short c;
320
321	// remove all non alphanumeric characters below 127
322	while (ithere != itend)
323	{
324	c = *ithere;
325
326	// if ((c <= 127) && !((c >= '0' && c <= '9') \|\|
327	// (c >= 'A' && c <= 'Z') \|\|
328	// (c >= 'a' && c <= 'z')))
329	if (!(((c >= 65) && (c <= 90)) \|\|
330	((c >= 97) && (c <= 122)) \|\|
331	((c >= 192) && (c <= 214)) \|\|
332	((c >= 216) && (c <= 246)) \|\|
333	((c >= 248) && (c <= 255)) \|\|
334	((c >= '0') && (c <= '9')) \|\|
335	(c == 176)))
336	(*ithere) = ' ';
337
338	ithere++;
339	}
340	}
341
342
343	// the document text for 'docnum' is placed in 'output'
344	// docTargetDocument returns 'true' if it was able to
345	// try to get a document
346	// collection is needed to see if an index from the
347	// collection is loaded. If no index has been loaded
348	// defaultindex is needed to load one
349	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
350	const text_t &collection,
351	int docnum,
352	text_t &output)
353	{
354	bool databaseloaded = true;
355
356	output.clear();
357
358
359	// make sure index is level 2
360
361	////// this changed with new naming scheme in new building software
362	///// i.e paragraph level index no longer contain number '3' but begin
363	///// with letter 'p'
364
365	text_t db_loaded = db_loaded_name;
366
367	if (!db_loaded.empty()) {
368	text_t::const_iterator here = db_loaded.begin();
369	text_t::const_iterator end = db_loaded.end();
370
371
372	//while (here != end) {
373	// if (*here == '3')
374	// databaseloaded = false;
375	// here ++;
376	//}
377
378	char separator = '/';
379	text_t db;
380	int found = 0;
381	#ifdef __WIN32__
382	separator = '\\';
383	#endif;
384	// strip away path to db and following collection name
385	end --;
386	while (end != here) {
387	if (*end == separator) {
388	if (found) break;
389	else {db.clear(); found = 1; end--; continue;}
390	}
391	db.push_back(*end);
392	end --;
393	}
394
395	// string will have been reversed above so see if last
396	// character is 'p'
397	if (db[db.size()-1] == 'p') databaseloaded = false;
398	}
399
400	// find out if the database is already loaded
401	// this is needed because a different index (but valid one)
402	// might be already loaded.
403	// this comparison is needed because 'load_database'
404	// is now more oriented towards indexes
405	if (databaseloaded == true) {
406	text_t::const_iterator here = collection.begin();
407	text_t::const_iterator end = collection.end();
408	char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
409	while (here != end)
410	{
411	if (here != dbhere)
412	{
413	databaseloaded = false;
414	break;
415	}
416	here++;
417	dbhere++;
418	}
419	}
420
421	// try and load the database
422	if (!databaseloaded)
423	{
424	// get the names of the index and text suffixes
425	text_t ttidxsuffix, tttxtsuffix;
426	getindexsuffix (defaultindex, collection, ttidxsuffix);
427	gettextsuffix (collection, tttxtsuffix);
428	char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
429	char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
430
431	#ifdef __WIN32__
432	char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
433	#else
434	char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
435	#endif
436
437
438	if (load_database(cindexhome, idxsuffix, txtsuffix))
439	{
440	databaseloaded = true;
441	}
442	else
443	{
444	databaseloaded = false;
445	}
446
447	// free up the c strings
448	delete idxsuffix;
449	delete txtsuffix;
450	delete cindexhome;
451	}
452
453	if (databaseloaded)
454	{
455	// retrieve the document from mg
456	char docstr[32];
457	sprintf(docstr, "%i", docnum);
458
459	mgq_ask(".set mode text");
460	mgq_ask(".set query docnums");
461	mgq_ask(docstr);
462	mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
463	}
464
465	return databaseloaded;
466	}
467

Note: See TracBrowser for help on using the repository browser.

Download in other formats: