Context Navigation

source: trunk/gsdl/src/library/mgsearch.cpp@ 91

Last change on this file since 91 was 91, checked in by rjmcnab, 25 years ago
Changed the directory structure (collect.cfg and site.cfg now reside in the collection/etc directory). Changed all input to the library software to be converted from utf-8 to unicode (info database, mg, and display). Got lib.init to read in collect.cfg and build.cfg and used the information to read in the macrofiles. Made it check for each macro file in both the collection directory and then the main directory.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 11.2 KB

Line
1	#include <string.h>
2	#include <stdio.h>
3	#include <stdlib.h>
4	#include <ctype.h>
5
6	#ifdef __GNUG__
7	# include <iostream.h>
8	# include <gdbm.h>
9
10	#else
11	# ifndef USE_OBJECTSPACE
12	# include <iostream>
13	# else
14	# include <ospace\std\iostream>
15	# endif
16
17	// gdbm stuff
18	# include "autoconf.h"
19	# include "systems.h"
20	# include "gdbmconst.h"
21	# include "gdbm.h"
22	#endif
23
24	#include <assert.h>
25
26	#include "mgq.h"
27	#include "mgsearch.h"
28	#include "locateinfo.h"
29	#include "gsdlunicode.h"
30	#include "unitool.h"
31
32
33	/////////////
34	// globals //
35	/////////////
36
37	static char *quotedquery = NULL;
38
39
40
41	////////////////////////
42	// callback functions //
43	////////////////////////
44
45	// This routine is called for each document found in a search
46	// it assumes that cache_num is set up correctly to point to
47	// a suitable result cache
48	int ourquerycallback(char *UDoc, int ULen, int DocNum,
49	float Weight, void *info) {
50
51
52	queryresultsclass queryresults = (queryresultsclass )info;
53
54	// check the returned document for the presence of the
55	// quoted part of the query, if there was one
56
57	if (UDoc != NULL && quotedquery != NULL &&
58	quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
59
60	// append this entry to the document results
61	docresultclass docresult;
62	docresult.docnum = DocNum;
63	docresult.docweight = Weight;
64
65	queryresults->docs.push_back(docresult);
66
67	return 0;
68	}
69
70	// This callback is called once for each term in the query
71	int termfreqcallback(char *Word, int ULen, int Freq,
72	float Weight, void *info) {
73	queryresultsclass queryresults = (queryresultsclass )info;
74
75	termfreqclass termfreq;
76	termfreq.termstr.setcarr(Word, ULen);
77	termfreq.termfreq = Freq;
78	queryresults->terms.push_back(termfreq);
79
80	return 0;
81	}
82
83	// this callback is called once for each variation of each term
84	int termscallback(char *Word, int ULen, int Freq,
85	float Weight, void *info) {
86
87	// convert term from utf8 to unicode
88	text_t term;
89	utf8inconvertclass inconvert;
90	convertclass::status_t status;
91	inconvert.reset ();
92	inconvert.setinput (Word, ULen);
93	inconvert.convert (term, status);
94
95	queryresultsclass queryresults = (queryresultsclass )info;
96	queryresults->termvariants.push_back(term);
97
98	return 0;
99	}
100
101	// This callback is for getting document text
102	int doctextcallback(char *Word, int ULen, int Freq,
103	float Weight, void *info) {
104	text_t output = (text_t )info;
105	if (output == NULL) return 0;
106	output->clear();
107
108	utf8inconvertclass inconvert;
109	convertclass::status_t status;
110	inconvert.reset ();
111	inconvert.setinput (Word, ULen);
112	inconvert.convert (*output, status);
113
114	// replace all control-Cs with spaces
115	text_t::iterator here = output->begin();
116	text_t::iterator end = output->end();
117	while (here != end) {
118	if (here == '\x3') here = ' ';
119	here++;
120	}
121
122	return 0;
123	}
124
125
126
127	////////////////////
128	// mgsearch class //
129	////////////////////
130
131	mgsearchclass::mgsearchclass ()
132	{
133	cache = new querycache (RESULTCACHESIZE);
134	}
135
136	mgsearchclass::~mgsearchclass ()
137	{
138	if (cache != NULL)
139	{
140	delete cache;
141	cache = NULL;
142	}
143	}
144
145
146	void mgsearchclass::setcollectdir (const text_t &thecollectdir)
147	{
148	collectdir = thecollectdir;
149	}
150
151
152	bool mgsearchclass::search(const queryparamclass &queryparams,
153	queryresultsclass &queryresults)
154	{
155	bool databaseloaded = true;
156
157	assert (cache != NULL);
158
159	queryresults.clear();
160
161	// first check the cache
162	if (cache->find(queryparams, queryresults))
163	return true;
164
165	// make sure there is a query to be processed
166	text_t::const_iterator queryhere = queryparams.querystring.begin();
167	text_t::const_iterator queryend = queryparams.querystring.end();
168	while (queryhere != queryend) {
169	if (is_unicode_letdig (*queryhere)) break;
170	queryhere++;
171	}
172
173	// if we reached the end of the query string without finding
174	// any alphanumeric characters then return no results (and say
175	// the database was loaded)
176	if (queryhere == queryend) return true;
177
178
179	// get the names of the index and text suffixes
180	char *idxsuffix = (getindexsuffix (queryparams.collection,
181	queryparams.search_index)).getcstr();
182	assert (idxsuffix != NULL);
183	char *txtsuffix = (gettextsuffix (queryparams.collection)).getcstr();
184	assert (txtsuffix != NULL);
185
186	#ifdef __WIN32__
187	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
188	#else
189	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
190	#endif
191
192	if (load_database(ccollectdir, idxsuffix, txtsuffix))
193	{
194	setsearchmode (queryparams);
195	submitquery (queryparams);
196	getresults (queryresults);
197	}
198	else databaseloaded = false;
199
200	// free up the c strings
201	delete idxsuffix;
202	delete txtsuffix;
203	delete ccollectdir;
204
205	return databaseloaded;
206	}
207
208
209	void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
210	{
211	mgq_ask(".set expert true");
212	mgq_ask(".set accumulator_method list");
213	mgq_ask(".set max_accumulators 50000");
214	mgq_ask(".set verbatim true");
215	mgq_ask(".unset skip_dump");
216	mgq_ask(".set mode docnums");
217
218	switch (queryparams.search_type)
219	{
220	case 0: mgq_ask(".set query boolean"); break;
221	case 1: mgq_ask(".set query ranked"); break;
222	}
223	switch (queryparams.casefolding)
224	{
225	case 1: mgq_ask(".set casefold on"); break;
226	case 0: mgq_ask(".set casefold off"); break;
227	}
228	switch (queryparams.stemming)
229	{
230	case 1: mgq_ask(".set stem on"); break;
231	case 0: mgq_ask(".set stem off"); break;
232	}
233	mgq_ask(".set heads_length 150");
234
235	char maxdocstr[32];
236	sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
237	mgq_ask(maxdocstr);
238	}
239
240
241	void mgsearchclass::submitquery (const queryparamclass &queryparams)
242	{
243	// sort out the query string
244	text_t ttquerystring = queryparams.querystring;
245	text_t ttquotedquery;
246	extractquoted (ttquerystring, ttquotedquery);
247	filterquery (ttquerystring);
248
249	// turn the strings into c strings for mg
250	if (quotedquery != NULL) // quotedquery is a global
251	{
252	delete quotedquery;
253	quotedquery = NULL;
254	}
255
256	// quotedquery will be deleted on the next call to this function
257	quotedquery = ttquotedquery.getcstr ();
258	char *querystring = ttquerystring.getcstr();
259
260	// submit the query
261	mgq_ask(querystring);
262
263	delete querystring;
264	}
265
266
267	void mgsearchclass::getresults (queryresultsclass &queryresults)
268	{
269	if (quotedquery[0] == '\0')
270	{
271	// don't need the text
272	mgq_results(result_docnums, 0, MAXNUMDOCS,
273	ourquerycallback, (void *)(&queryresults));
274	}
275	else
276	{
277	// we need the text for this one
278	mgq_results(result_docs, 0, MAXNUMDOCS,
279	ourquerycallback, (void *)(&queryresults));
280	}
281
282	// get the term frequencies
283	mgq_results(result_termfreqs, 0, MAXNUMTERMS,
284	termfreqcallback, (void *)(&queryresults));
285	mgq_results(result_terms, 0, MAXNUMTERMS,
286	termscallback, (void *)(&queryresults));
287	queryresults.sortqueryterms();
288	queryresults.uniqqueryterms();
289	}
290
291
292	void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
293	{
294	ttquotedquery.clear();
295
296	text_t::iterator ithere = ttquerystring.begin ();
297	text_t::iterator itend = ttquerystring.end ();
298
299	bool inquote = false;
300
301	while (ithere != itend)
302	{
303	if ((*ithere) == '\"')
304	{
305	if (!inquote) ttquotedquery.clear ();
306	inquote = !inquote;
307	*ithere = ' '; // delete the quote
308	}
309	else if (inquote)
310	{
311	ttquotedquery.push_back(*ithere);
312	*ithere = ' ';
313	}
314
315	ithere++;
316	}
317	}
318
319
320	void mgsearchclass::filterquery (text_t &ttquerystring)
321	{
322
323	text_t::iterator ithere = ttquerystring.begin ();
324	text_t::iterator itend = ttquerystring.end ();
325	unsigned short c;
326
327	// remove all non alphanumeric characters below 127
328	while (ithere != itend)
329	{
330	c = *ithere;
331
332	// if ((c <= 127) && !((c >= '0' && c <= '9') \|\|
333	// (c >= 'A' && c <= 'Z') \|\|
334	// (c >= 'a' && c <= 'z')))
335	if (!(((c >= 65) && (c <= 90)) \|\|
336	((c >= 97) && (c <= 122)) \|\|
337	((c >= 192) && (c <= 214)) \|\|
338	((c >= 216) && (c <= 246)) \|\|
339	((c >= 248) && (c <= 255)) \|\|
340	((c >= '0') && (c <= '9')) \|\|
341	(c == 176)))
342	(*ithere) = ' ';
343
344	ithere++;
345	}
346	}
347
348
349	// the document text for 'docnum' is placed in 'output'
350	// docTargetDocument returns 'true' if it was able to
351	// try to get a document
352	// collection is needed to see if an index from the
353	// collection is loaded. If no index has been loaded
354	// defaultindex is needed to load one
355	bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
356	const text_t &collection,
357	int docnum,
358	text_t &output)
359	{
360	bool databaseloaded = true;
361
362	output.clear();
363
364
365	// make sure index is level 2
366
367	////// this changed with new naming scheme in new building software
368	///// i.e paragraph level index no longer contain number '3' but begin
369	///// with letter 'p'
370
371	text_t db_loaded = db_loaded_name;
372
373	if (!db_loaded.empty()) {
374	text_t::const_iterator here = db_loaded.begin();
375	text_t::const_iterator end = db_loaded.end();
376
377
378	//while (here != end) {
379	// if (*here == '3')
380	// databaseloaded = false;
381	// here ++;
382	//}
383
384	char separator = '/';
385	text_t db;
386	int found = 0;
387	#ifdef __WIN32__
388	separator = '\\';
389	#endif;
390	// strip away path to db and following collection name
391	end --;
392	while (end != here) {
393	if (*end == separator) {
394	if (found) break;
395	else {db.clear(); found = 1; end--; continue;}
396	}
397	db.push_back(*end);
398	end --;
399	}
400
401	// string will have been reversed above so see if last
402	// character is 'p'
403	if (db[db.size()-1] == 'p') databaseloaded = false;
404	}
405
406	// find out if the database is already loaded
407	// this is needed because a different index (but valid one)
408	// might be already loaded.
409	// this comparison is needed because 'load_database'
410	// is now more oriented towards indexes
411	if (databaseloaded == true) {
412	text_t::const_iterator here = collection.begin();
413	text_t::const_iterator end = collection.end();
414	char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
415	while (here != end)
416	{
417	if (here != dbhere)
418	{
419	databaseloaded = false;
420	break;
421	}
422	here++;
423	dbhere++;
424	}
425	}
426
427	// try and load the database
428	if (!databaseloaded)
429	{
430	// get the names of the index and text suffixes
431	char *idxsuffix = (getindexsuffix (collection,
432	defaultindex)).getcstr();
433	assert (idxsuffix != NULL);
434	char *txtsuffix = (gettextsuffix (collection)).getcstr();
435	assert (txtsuffix != NULL);
436
437	#ifdef __WIN32__
438	char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
439	#else
440	char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
441	#endif
442
443	if (load_database(ccollectdir, idxsuffix, txtsuffix))
444	databaseloaded = true;
445	else
446	databaseloaded = false;
447
448	// free up the c strings
449	delete idxsuffix;
450	delete txtsuffix;
451	delete ccollectdir;
452	}
453
454	if (databaseloaded)
455	{
456	// retrieve the document from mg
457	char docstr[32];
458	sprintf(docstr, "%i", docnum);
459
460	mgq_ask(".set mode text");
461	mgq_ask(".set query docnums");
462	mgq_ask(docstr);
463	mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
464	}
465
466	return databaseloaded;
467	}
468

Note: See TracBrowser for help on using the repository browser.

Download in other formats: