source: trunk/gsdl/src/library/mgsearch.cpp@ 4

Last change on this file since 4 was 4, checked in by sjboddie, 25 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1#include <string.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <ctype.h>
5
6#ifdef __GNUG__
7# include <iostream.h>
8# include <gdbm.h>
9
10#else
11# ifndef USE_OBJECTSPACE
12# include <iostream>
13# else
14# include <ospace\std\iostream>
15# endif
16
17// gdbm stuff
18# include "autoconf.h"
19# include "systems.h"
20# include "gdbmconst.h"
21# include "gdbm.h"
22#endif
23
24#include <assert.h>
25
26#include "mgq.h"
27#include "mgsearch.h"
28#include "locateinfo.h"
29
30/////////////
31// globals //
32/////////////
33
34static char *quotedquery = NULL;
35
36
37
38////////////////////////
39// callback functions //
40////////////////////////
41
42// This routine is called for each document found in a search
43// it assumes that cache_num is set up correctly to point to
44// a suitable result cache
45int ourquerycallback(char *UDoc, int ULen, int DocNum,
46 float Weight, void *info) {
47
48
49 queryresultsclass *queryresults = (queryresultsclass * )info;
50
51 // check the returned document for the presence of the
52 // quoted part of the query, if there was one
53
54 if (UDoc != NULL && quotedquery != NULL &&
55 quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
56
57 // append this entry to the document results
58 docresultclass docresult;
59 docresult.docnum = DocNum;
60 docresult.docweight = Weight;
61
62 queryresults->docs.push_back(docresult);
63
64 return 0;
65}
66
67// This callback is called once for each term in the query
68int termfreqcallback(char *Word, int ULen, int Freq,
69 float Weight, void *info) {
70 queryresultsclass *queryresults = (queryresultsclass *)info;
71
72 termfreqclass termfreq;
73 termfreq.termstr.setcarr(Word, ULen);
74 termfreq.termfreq = Freq;
75 queryresults->terms.push_back(termfreq);
76
77 return 0;
78}
79
80// this callback is called once for each variation of each term
81int termscallback(char *Word, int ULen, int Freq,
82 float Weight, void *info) {
83
84 queryresultsclass *queryresults = (queryresultsclass *)info;
85 queryresults->termvariants.push_back(Word);
86
87 return 0;
88}
89
90// This callback is for getting document text
91int doctextcallback(char *Word, int ULen, int Freq,
92 float Weight, void *info) {
93 text_t *output = (text_t *)info;
94 if (output == NULL) return 0;
95
96 output->setcarr(Word, ULen);
97
98 // replace all control-Cs with spaces
99 text_t::iterator here = output->begin();
100 text_t::iterator end = output->end();
101 while (here != end)
102 {
103 if (*here == '\x3') *here = ' ';
104 here++;
105 }
106
107 return 0;
108}
109
110
111
112////////////////////
113// mgsearch class //
114////////////////////
115
116mgsearchclass::mgsearchclass ()
117{
118 cache = new querycache (RESULTCACHESIZE);
119}
120
121mgsearchclass::~mgsearchclass ()
122{
123 if (cache != NULL)
124 {
125 delete cache;
126 cache = NULL;
127 }
128}
129
130
131void mgsearchclass::setindexhome (const text_t &theindexhome)
132{
133 indexhome = theindexhome;
134}
135
136
137bool mgsearchclass::search(const queryparamclass &queryparams,
138 queryresultsclass &queryresults)
139{
140 bool databaseloaded = true;
141
142 assert (cache != NULL);
143
144 queryresults.clear();
145
146 // first check the cache
147 if (cache->find(queryparams, queryresults))
148 {
149 return true;
150 }
151
152 // make sure there is a query to be processed
153 text_t::const_iterator queryhere = queryparams.querystring.begin();
154 text_t::const_iterator queryend = queryparams.querystring.end();
155 while (queryhere != queryend) {
156 if (((*queryhere >= 65) && (*queryhere <= 90)) ||
157 ((*queryhere >= 97) && (*queryhere <= 122)) ||
158 ((*queryhere >= 192) && (*queryhere <= 214)) ||
159 ((*queryhere >= 216) && (*queryhere <= 246)) ||
160 ((*queryhere >= 248) && (*queryhere <= 255)) ||
161 ((*queryhere >= '0') && (*queryhere <= '9'))) break;
162 queryhere++;
163 }
164
165 // if we reached the end of the query string without finding
166 // any alphanumeric characters then return no results (and say
167 // the database was loaded)
168 if (queryhere == queryend) return true;
169
170
171
172 // get the names of the index and text suffixes
173 text_t ttidxsuffix, tttxtsuffix;
174 getindexsuffix (queryparams.search_index,
175 queryparams.collection, ttidxsuffix);
176 gettextsuffix (queryparams.collection, tttxtsuffix);
177 char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
178 char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
179
180#ifdef __WIN32__
181 char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
182#else
183 char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
184#endif
185
186 if (load_database(cindexhome, idxsuffix, txtsuffix))
187 {
188 setsearchmode (queryparams);
189 submitquery (queryparams);
190 getresults (queryresults);
191 }
192 else databaseloaded = false;
193
194 // free up the c strings
195 delete idxsuffix;
196 delete txtsuffix;
197 delete cindexhome;
198
199 return databaseloaded;
200}
201
202
203void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
204{
205 mgq_ask(".set expert true");
206 mgq_ask(".set accumulator_method list");
207 mgq_ask(".set max_accumulators 50000");
208 mgq_ask(".set verbatim true");
209 mgq_ask(".unset skip_dump");
210 mgq_ask(".set mode docnums");
211
212 switch (queryparams.search_type)
213 {
214 case 0: mgq_ask(".set query boolean"); break;
215 case 1: mgq_ask(".set query ranked"); break;
216 }
217 switch (queryparams.casefolding)
218 {
219 case 1: mgq_ask(".set casefold on"); break;
220 case 0: mgq_ask(".set casefold off"); break;
221 }
222 switch (queryparams.stemming)
223 {
224 case 1: mgq_ask(".set stem on"); break;
225 case 0: mgq_ask(".set stem off"); break;
226 }
227 mgq_ask(".set heads_length 150");
228
229 char maxdocstr[32];
230 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
231 mgq_ask(maxdocstr);
232}
233
234
235void mgsearchclass::submitquery (const queryparamclass &queryparams)
236{
237 // sort out the query string
238 text_t ttquerystring = queryparams.querystring;
239 text_t ttquotedquery;
240 extractquoted (ttquerystring, ttquotedquery);
241 filterquery (ttquerystring);
242
243 // turn the strings into c strings for mg
244 if (quotedquery != NULL) // quotedquery is a global
245 {
246 delete quotedquery;
247 quotedquery = NULL;
248 }
249
250 // quotedquery will be deleted on the next call to this function
251 quotedquery = ttquotedquery.getcstr ();
252 char *querystring = ttquerystring.getcstr();
253
254 // submit the query
255 mgq_ask(querystring);
256
257 delete querystring;
258}
259
260
261void mgsearchclass::getresults (queryresultsclass &queryresults)
262{
263 if (quotedquery[0] == '\0')
264 {
265 // don't need the text
266 mgq_results(result_docnums, 0, MAXNUMDOCS,
267 ourquerycallback, (void *)(&queryresults));
268 }
269 else
270 {
271 // we need the text for this one
272 mgq_results(result_docs, 0, MAXNUMDOCS,
273 ourquerycallback, (void *)(&queryresults));
274 }
275
276 // get the term frequencies
277 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
278 termfreqcallback, (void *)(&queryresults));
279 mgq_results(result_terms, 0, MAXNUMTERMS,
280 termscallback, (void *)(&queryresults));
281 queryresults.sortqueryterms();
282 queryresults.uniqqueryterms();
283}
284
285
286void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
287{
288 ttquotedquery.clear();
289
290 text_t::iterator ithere = ttquerystring.begin ();
291 text_t::iterator itend = ttquerystring.end ();
292
293 bool inquote = false;
294
295 while (ithere != itend)
296 {
297 if ((*ithere) == '\"')
298 {
299 if (!inquote) ttquotedquery.clear ();
300 inquote = !inquote;
301 *ithere = ' '; // delete the quote
302 }
303 else if (inquote)
304 {
305 ttquotedquery.push_back(*ithere);
306 *ithere = ' ';
307 }
308
309 ithere++;
310 }
311}
312
313
314void mgsearchclass::filterquery (text_t &ttquerystring)
315{
316
317 text_t::iterator ithere = ttquerystring.begin ();
318 text_t::iterator itend = ttquerystring.end ();
319 unsigned short c;
320
321 // remove all non alphanumeric characters below 127
322 while (ithere != itend)
323 {
324 c = *ithere;
325
326 // if ((c <= 127) && !((c >= '0' && c <= '9') ||
327 // (c >= 'A' && c <= 'Z') ||
328 // (c >= 'a' && c <= 'z')))
329 if (!(((c >= 65) && (c <= 90)) ||
330 ((c >= 97) && (c <= 122)) ||
331 ((c >= 192) && (c <= 214)) ||
332 ((c >= 216) && (c <= 246)) ||
333 ((c >= 248) && (c <= 255)) ||
334 ((c >= '0') && (c <= '9')) ||
335 (c == 176)))
336 (*ithere) = ' ';
337
338 ithere++;
339 }
340}
341
342
343// the document text for 'docnum' is placed in 'output'
344// docTargetDocument returns 'true' if it was able to
345// try to get a document
346// collection is needed to see if an index from the
347// collection is loaded. If no index has been loaded
348// defaultindex is needed to load one
349bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
350 const text_t &collection,
351 int docnum,
352 text_t &output)
353{
354 bool databaseloaded = true;
355
356 output.clear();
357
358
359 // make sure index is level 2
360
361 ////// this changed with new naming scheme in new building software
362 ///// i.e paragraph level index no longer contain number '3' but begin
363 ///// with letter 'p'
364
365 text_t db_loaded = db_loaded_name;
366
367 if (!db_loaded.empty()) {
368 text_t::const_iterator here = db_loaded.begin();
369 text_t::const_iterator end = db_loaded.end();
370
371
372 //while (here != end) {
373 // if (*here == '3')
374 // databaseloaded = false;
375 // here ++;
376 //}
377
378 char separator = '/';
379 text_t db;
380 int found = 0;
381#ifdef __WIN32__
382 separator = '\\';
383#endif;
384 // strip away path to db and following collection name
385 end --;
386 while (end != here) {
387 if (*end == separator) {
388 if (found) break;
389 else {db.clear(); found = 1; end--; continue;}
390 }
391 db.push_back(*end);
392 end --;
393 }
394
395 // string will have been reversed above so see if last
396 // character is 'p'
397 if (db[db.size()-1] == 'p') databaseloaded = false;
398 }
399
400 // find out if the database is already loaded
401 // this is needed because a different index (but valid one)
402 // might be already loaded.
403 // this comparison is needed because 'load_database'
404 // is now more oriented towards indexes
405 if (databaseloaded == true) {
406 text_t::const_iterator here = collection.begin();
407 text_t::const_iterator end = collection.end();
408 char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
409 while (here != end)
410 {
411 if (*here != *dbhere)
412 {
413 databaseloaded = false;
414 break;
415 }
416 here++;
417 dbhere++;
418 }
419 }
420
421 // try and load the database
422 if (!databaseloaded)
423 {
424 // get the names of the index and text suffixes
425 text_t ttidxsuffix, tttxtsuffix;
426 getindexsuffix (defaultindex, collection, ttidxsuffix);
427 gettextsuffix (collection, tttxtsuffix);
428 char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
429 char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
430
431#ifdef __WIN32__
432 char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
433#else
434 char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
435#endif
436
437
438 if (load_database(cindexhome, idxsuffix, txtsuffix))
439 {
440 databaseloaded = true;
441 }
442 else
443 {
444 databaseloaded = false;
445 }
446
447 // free up the c strings
448 delete idxsuffix;
449 delete txtsuffix;
450 delete cindexhome;
451 }
452
453 if (databaseloaded)
454 {
455 // retrieve the document from mg
456 char docstr[32];
457 sprintf(docstr, "%i", docnum);
458
459 mgq_ask(".set mode text");
460 mgq_ask(".set query docnums");
461 mgq_ask(docstr);
462 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
463 }
464
465 return databaseloaded;
466}
467
Note: See TracBrowser for help on using the repository browser.