source: trunk/gsdl/src/library/mgsearch.cpp@ 91

Last change on this file since 91 was 91, checked in by rjmcnab, 25 years ago

Changed the directory structure (collect.cfg and site.cfg now reside
in the collection/etc directory). Changed all input to the library
software to be converted from utf-8 to unicode (info database, mg,
and display). Got lib.init to read in collect.cfg and build.cfg and
used the information to read in the macrofiles. Made it check for each
macro file in both the collection directory and then the main directory.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1#include <string.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <ctype.h>
5
6#ifdef __GNUG__
7# include <iostream.h>
8# include <gdbm.h>
9
10#else
11# ifndef USE_OBJECTSPACE
12# include <iostream>
13# else
14# include <ospace\std\iostream>
15# endif
16
17// gdbm stuff
18# include "autoconf.h"
19# include "systems.h"
20# include "gdbmconst.h"
21# include "gdbm.h"
22#endif
23
24#include <assert.h>
25
26#include "mgq.h"
27#include "mgsearch.h"
28#include "locateinfo.h"
29#include "gsdlunicode.h"
30#include "unitool.h"
31
32
33/////////////
34// globals //
35/////////////
36
37static char *quotedquery = NULL;
38
39
40
41////////////////////////
42// callback functions //
43////////////////////////
44
45// This routine is called for each document found in a search
46// it assumes that cache_num is set up correctly to point to
47// a suitable result cache
48int ourquerycallback(char *UDoc, int ULen, int DocNum,
49 float Weight, void *info) {
50
51
52 queryresultsclass *queryresults = (queryresultsclass * )info;
53
54 // check the returned document for the presence of the
55 // quoted part of the query, if there was one
56
57 if (UDoc != NULL && quotedquery != NULL &&
58 quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
59
60 // append this entry to the document results
61 docresultclass docresult;
62 docresult.docnum = DocNum;
63 docresult.docweight = Weight;
64
65 queryresults->docs.push_back(docresult);
66
67 return 0;
68}
69
70// This callback is called once for each term in the query
71int termfreqcallback(char *Word, int ULen, int Freq,
72 float Weight, void *info) {
73 queryresultsclass *queryresults = (queryresultsclass *)info;
74
75 termfreqclass termfreq;
76 termfreq.termstr.setcarr(Word, ULen);
77 termfreq.termfreq = Freq;
78 queryresults->terms.push_back(termfreq);
79
80 return 0;
81}
82
83// this callback is called once for each variation of each term
84int termscallback(char *Word, int ULen, int Freq,
85 float Weight, void *info) {
86
87 // convert term from utf8 to unicode
88 text_t term;
89 utf8inconvertclass inconvert;
90 convertclass::status_t status;
91 inconvert.reset ();
92 inconvert.setinput (Word, ULen);
93 inconvert.convert (term, status);
94
95 queryresultsclass *queryresults = (queryresultsclass *)info;
96 queryresults->termvariants.push_back(term);
97
98 return 0;
99}
100
101// This callback is for getting document text
102int doctextcallback(char *Word, int ULen, int Freq,
103 float Weight, void *info) {
104 text_t *output = (text_t *)info;
105 if (output == NULL) return 0;
106 output->clear();
107
108 utf8inconvertclass inconvert;
109 convertclass::status_t status;
110 inconvert.reset ();
111 inconvert.setinput (Word, ULen);
112 inconvert.convert (*output, status);
113
114 // replace all control-Cs with spaces
115 text_t::iterator here = output->begin();
116 text_t::iterator end = output->end();
117 while (here != end) {
118 if (*here == '\x3') *here = ' ';
119 here++;
120 }
121
122 return 0;
123}
124
125
126
127////////////////////
128// mgsearch class //
129////////////////////
130
131mgsearchclass::mgsearchclass ()
132{
133 cache = new querycache (RESULTCACHESIZE);
134}
135
136mgsearchclass::~mgsearchclass ()
137{
138 if (cache != NULL)
139 {
140 delete cache;
141 cache = NULL;
142 }
143}
144
145
146void mgsearchclass::setcollectdir (const text_t &thecollectdir)
147{
148 collectdir = thecollectdir;
149}
150
151
152bool mgsearchclass::search(const queryparamclass &queryparams,
153 queryresultsclass &queryresults)
154{
155 bool databaseloaded = true;
156
157 assert (cache != NULL);
158
159 queryresults.clear();
160
161 // first check the cache
162 if (cache->find(queryparams, queryresults))
163 return true;
164
165 // make sure there is a query to be processed
166 text_t::const_iterator queryhere = queryparams.querystring.begin();
167 text_t::const_iterator queryend = queryparams.querystring.end();
168 while (queryhere != queryend) {
169 if (is_unicode_letdig (*queryhere)) break;
170 queryhere++;
171 }
172
173 // if we reached the end of the query string without finding
174 // any alphanumeric characters then return no results (and say
175 // the database was loaded)
176 if (queryhere == queryend) return true;
177
178
179 // get the names of the index and text suffixes
180 char *idxsuffix = (getindexsuffix (queryparams.collection,
181 queryparams.search_index)).getcstr();
182 assert (idxsuffix != NULL);
183 char *txtsuffix = (gettextsuffix (queryparams.collection)).getcstr();
184 assert (txtsuffix != NULL);
185
186#ifdef __WIN32__
187 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
188#else
189 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
190#endif
191
192 if (load_database(ccollectdir, idxsuffix, txtsuffix))
193 {
194 setsearchmode (queryparams);
195 submitquery (queryparams);
196 getresults (queryresults);
197 }
198 else databaseloaded = false;
199
200 // free up the c strings
201 delete idxsuffix;
202 delete txtsuffix;
203 delete ccollectdir;
204
205 return databaseloaded;
206}
207
208
209void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
210{
211 mgq_ask(".set expert true");
212 mgq_ask(".set accumulator_method list");
213 mgq_ask(".set max_accumulators 50000");
214 mgq_ask(".set verbatim true");
215 mgq_ask(".unset skip_dump");
216 mgq_ask(".set mode docnums");
217
218 switch (queryparams.search_type)
219 {
220 case 0: mgq_ask(".set query boolean"); break;
221 case 1: mgq_ask(".set query ranked"); break;
222 }
223 switch (queryparams.casefolding)
224 {
225 case 1: mgq_ask(".set casefold on"); break;
226 case 0: mgq_ask(".set casefold off"); break;
227 }
228 switch (queryparams.stemming)
229 {
230 case 1: mgq_ask(".set stem on"); break;
231 case 0: mgq_ask(".set stem off"); break;
232 }
233 mgq_ask(".set heads_length 150");
234
235 char maxdocstr[32];
236 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
237 mgq_ask(maxdocstr);
238}
239
240
241void mgsearchclass::submitquery (const queryparamclass &queryparams)
242{
243 // sort out the query string
244 text_t ttquerystring = queryparams.querystring;
245 text_t ttquotedquery;
246 extractquoted (ttquerystring, ttquotedquery);
247 filterquery (ttquerystring);
248
249 // turn the strings into c strings for mg
250 if (quotedquery != NULL) // quotedquery is a global
251 {
252 delete quotedquery;
253 quotedquery = NULL;
254 }
255
256 // quotedquery will be deleted on the next call to this function
257 quotedquery = ttquotedquery.getcstr ();
258 char *querystring = ttquerystring.getcstr();
259
260 // submit the query
261 mgq_ask(querystring);
262
263 delete querystring;
264}
265
266
267void mgsearchclass::getresults (queryresultsclass &queryresults)
268{
269 if (quotedquery[0] == '\0')
270 {
271 // don't need the text
272 mgq_results(result_docnums, 0, MAXNUMDOCS,
273 ourquerycallback, (void *)(&queryresults));
274 }
275 else
276 {
277 // we need the text for this one
278 mgq_results(result_docs, 0, MAXNUMDOCS,
279 ourquerycallback, (void *)(&queryresults));
280 }
281
282 // get the term frequencies
283 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
284 termfreqcallback, (void *)(&queryresults));
285 mgq_results(result_terms, 0, MAXNUMTERMS,
286 termscallback, (void *)(&queryresults));
287 queryresults.sortqueryterms();
288 queryresults.uniqqueryterms();
289}
290
291
292void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
293{
294 ttquotedquery.clear();
295
296 text_t::iterator ithere = ttquerystring.begin ();
297 text_t::iterator itend = ttquerystring.end ();
298
299 bool inquote = false;
300
301 while (ithere != itend)
302 {
303 if ((*ithere) == '\"')
304 {
305 if (!inquote) ttquotedquery.clear ();
306 inquote = !inquote;
307 *ithere = ' '; // delete the quote
308 }
309 else if (inquote)
310 {
311 ttquotedquery.push_back(*ithere);
312 *ithere = ' ';
313 }
314
315 ithere++;
316 }
317}
318
319
320void mgsearchclass::filterquery (text_t &ttquerystring)
321{
322
323 text_t::iterator ithere = ttquerystring.begin ();
324 text_t::iterator itend = ttquerystring.end ();
325 unsigned short c;
326
327 // remove all non alphanumeric characters below 127
328 while (ithere != itend)
329 {
330 c = *ithere;
331
332 // if ((c <= 127) && !((c >= '0' && c <= '9') ||
333 // (c >= 'A' && c <= 'Z') ||
334 // (c >= 'a' && c <= 'z')))
335 if (!(((c >= 65) && (c <= 90)) ||
336 ((c >= 97) && (c <= 122)) ||
337 ((c >= 192) && (c <= 214)) ||
338 ((c >= 216) && (c <= 246)) ||
339 ((c >= 248) && (c <= 255)) ||
340 ((c >= '0') && (c <= '9')) ||
341 (c == 176)))
342 (*ithere) = ' ';
343
344 ithere++;
345 }
346}
347
348
349// the document text for 'docnum' is placed in 'output'
350// docTargetDocument returns 'true' if it was able to
351// try to get a document
352// collection is needed to see if an index from the
353// collection is loaded. If no index has been loaded
354// defaultindex is needed to load one
355bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
356 const text_t &collection,
357 int docnum,
358 text_t &output)
359{
360 bool databaseloaded = true;
361
362 output.clear();
363
364
365 // make sure index is level 2
366
367 ////// this changed with new naming scheme in new building software
368 ///// i.e paragraph level index no longer contain number '3' but begin
369 ///// with letter 'p'
370
371 text_t db_loaded = db_loaded_name;
372
373 if (!db_loaded.empty()) {
374 text_t::const_iterator here = db_loaded.begin();
375 text_t::const_iterator end = db_loaded.end();
376
377
378 //while (here != end) {
379 // if (*here == '3')
380 // databaseloaded = false;
381 // here ++;
382 //}
383
384 char separator = '/';
385 text_t db;
386 int found = 0;
387#ifdef __WIN32__
388 separator = '\\';
389#endif;
390 // strip away path to db and following collection name
391 end --;
392 while (end != here) {
393 if (*end == separator) {
394 if (found) break;
395 else {db.clear(); found = 1; end--; continue;}
396 }
397 db.push_back(*end);
398 end --;
399 }
400
401 // string will have been reversed above so see if last
402 // character is 'p'
403 if (db[db.size()-1] == 'p') databaseloaded = false;
404 }
405
406 // find out if the database is already loaded
407 // this is needed because a different index (but valid one)
408 // might be already loaded.
409 // this comparison is needed because 'load_database'
410 // is now more oriented towards indexes
411 if (databaseloaded == true) {
412 text_t::const_iterator here = collection.begin();
413 text_t::const_iterator end = collection.end();
414 char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
415 while (here != end)
416 {
417 if (*here != *dbhere)
418 {
419 databaseloaded = false;
420 break;
421 }
422 here++;
423 dbhere++;
424 }
425 }
426
427 // try and load the database
428 if (!databaseloaded)
429 {
430 // get the names of the index and text suffixes
431 char *idxsuffix = (getindexsuffix (collection,
432 defaultindex)).getcstr();
433 assert (idxsuffix != NULL);
434 char *txtsuffix = (gettextsuffix (collection)).getcstr();
435 assert (txtsuffix != NULL);
436
437#ifdef __WIN32__
438 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
439#else
440 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
441#endif
442
443 if (load_database(ccollectdir, idxsuffix, txtsuffix))
444 databaseloaded = true;
445 else
446 databaseloaded = false;
447
448 // free up the c strings
449 delete idxsuffix;
450 delete txtsuffix;
451 delete ccollectdir;
452 }
453
454 if (databaseloaded)
455 {
456 // retrieve the document from mg
457 char docstr[32];
458 sprintf(docstr, "%i", docnum);
459
460 mgq_ask(".set mode text");
461 mgq_ask(".set query docnums");
462 mgq_ask(docstr);
463 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
464 }
465
466 return databaseloaded;
467}
468
Note: See TracBrowser for help on using the repository browser.