source: trunk/gsdl/src/library/mgsearch.cpp@ 32

Last change on this file since 32 was 32, checked in by sjboddie, 25 years ago

Altered existing collections (gberg, unu, unesco and hdl) to fit in
with new directory structure.
Altered browse software - put most of contents of collections browse.cpp
files (e.g. hdlbrowse.cpp) into browse.cpp

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1#include <string.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <ctype.h>
5
6#ifdef __GNUG__
7# include <iostream.h>
8# include <gdbm.h>
9
10#else
11# ifndef USE_OBJECTSPACE
12# include <iostream>
13# else
14# include <ospace\std\iostream>
15# endif
16
17// gdbm stuff
18# include "autoconf.h"
19# include "systems.h"
20# include "gdbmconst.h"
21# include "gdbm.h"
22#endif
23
24#include <assert.h>
25
26#include "mgq.h"
27#include "mgsearch.h"
28#include "locateinfo.h"
29
30/////////////
31// globals //
32/////////////
33
// C-string copy of the quoted phrase taken from the most recent query.
// It is assigned in mgsearchclass::submitquery (from text_t::getcstr),
// read by ourquerycallback and mgsearchclass::getresults, and released
// by the next call to submitquery.
static char *quotedquery = 0;
35
36
37
38////////////////////////
39// callback functions //
40////////////////////////
41
42// This routine is called for each document found in a search
43// it assumes that cache_num is set up correctly to point to
44// a suitable result cache
45int ourquerycallback(char *UDoc, int ULen, int DocNum,
46 float Weight, void *info) {
47
48
49 queryresultsclass *queryresults = (queryresultsclass * )info;
50
51 // check the returned document for the presence of the
52 // quoted part of the query, if there was one
53
54 if (UDoc != NULL && quotedquery != NULL &&
55 quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
56
57 // append this entry to the document results
58 docresultclass docresult;
59 docresult.docnum = DocNum;
60 docresult.docweight = Weight;
61
62 queryresults->docs.push_back(docresult);
63
64 return 0;
65}
66
67// This callback is called once for each term in the query
68int termfreqcallback(char *Word, int ULen, int Freq,
69 float Weight, void *info) {
70 queryresultsclass *queryresults = (queryresultsclass *)info;
71
72 termfreqclass termfreq;
73 termfreq.termstr.setcarr(Word, ULen);
74 termfreq.termfreq = Freq;
75 queryresults->terms.push_back(termfreq);
76
77 return 0;
78}
79
80// this callback is called once for each variation of each term
81int termscallback(char *Word, int ULen, int Freq,
82 float Weight, void *info) {
83
84 queryresultsclass *queryresults = (queryresultsclass *)info;
85 queryresults->termvariants.push_back(Word);
86
87 return 0;
88}
89
90// This callback is for getting document text
91int doctextcallback(char *Word, int ULen, int Freq,
92 float Weight, void *info) {
93 text_t *output = (text_t *)info;
94 if (output == NULL) return 0;
95
96 output->setcarr(Word, ULen);
97
98 // replace all control-Cs with spaces
99 text_t::iterator here = output->begin();
100 text_t::iterator end = output->end();
101 while (here != end)
102 {
103 if (*here == '\x3') *here = ' ';
104 here++;
105 }
106
107 return 0;
108}
109
110
111
112////////////////////
113// mgsearch class //
114////////////////////
115
116mgsearchclass::mgsearchclass ()
117{
118 cache = new querycache (RESULTCACHESIZE);
119}
120
121mgsearchclass::~mgsearchclass ()
122{
123 if (cache != NULL)
124 {
125 delete cache;
126 cache = NULL;
127 }
128}
129
130
131void mgsearchclass::setindexhome (const text_t &theindexhome)
132{
133 indexhome = theindexhome;
134}
135
136
137bool mgsearchclass::search(const queryparamclass &queryparams,
138 queryresultsclass &queryresults)
139{
140 bool databaseloaded = true;
141
142 assert (cache != NULL);
143
144 queryresults.clear();
145
146 // first check the cache
147 if (cache->find(queryparams, queryresults))
148 {
149 return true;
150 }
151
152 // make sure there is a query to be processed
153 text_t::const_iterator queryhere = queryparams.querystring.begin();
154 text_t::const_iterator queryend = queryparams.querystring.end();
155 while (queryhere != queryend) {
156 if (((*queryhere >= 65) && (*queryhere <= 90)) ||
157 ((*queryhere >= 97) && (*queryhere <= 122)) ||
158 ((*queryhere >= 192) && (*queryhere <= 214)) ||
159 ((*queryhere >= 216) && (*queryhere <= 246)) ||
160 ((*queryhere >= 248) && (*queryhere <= 255)) ||
161 ((*queryhere >= '0') && (*queryhere <= '9'))) break;
162 queryhere++;
163 }
164
165 // if we reached the end of the query string without finding
166 // any alphanumeric characters then return no results (and say
167 // the database was loaded)
168 if (queryhere == queryend) return true;
169
170
171
172 // get the names of the index and text suffixes
173 text_t ttidxsuffix, tttxtsuffix;
174 getindexsuffix (queryparams.search_index,
175 queryparams.collection, ttidxsuffix);
176 gettextsuffix (queryparams.collection, tttxtsuffix);
177 char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
178 char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
179
180#ifdef __WIN32__
181 char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
182#else
183 char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
184#endif
185
186 if (load_database(cindexhome, idxsuffix, txtsuffix))
187 {
188 setsearchmode (queryparams);
189 submitquery (queryparams);
190 getresults (queryresults);
191 }
192 else databaseloaded = false;
193
194 // free up the c strings
195 delete idxsuffix;
196 delete txtsuffix;
197 delete cindexhome;
198
199 return databaseloaded;
200}
201
202
203void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
204{
205 mgq_ask(".set expert true");
206 mgq_ask(".set accumulator_method list");
207 mgq_ask(".set max_accumulators 50000");
208 mgq_ask(".set verbatim true");
209 mgq_ask(".unset skip_dump");
210 mgq_ask(".set mode docnums");
211
212 switch (queryparams.search_type)
213 {
214 case 0: mgq_ask(".set query boolean"); break;
215 case 1: mgq_ask(".set query ranked"); break;
216 }
217 switch (queryparams.casefolding)
218 {
219 case 1: mgq_ask(".set casefold on"); break;
220 case 0: mgq_ask(".set casefold off"); break;
221 }
222 switch (queryparams.stemming)
223 {
224 case 1: mgq_ask(".set stem on"); break;
225 case 0: mgq_ask(".set stem off"); break;
226 }
227 mgq_ask(".set heads_length 150");
228
229 char maxdocstr[32];
230 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
231 mgq_ask(maxdocstr);
232}
233
234
235void mgsearchclass::submitquery (const queryparamclass &queryparams)
236{
237 // sort out the query string
238 text_t ttquerystring = queryparams.querystring;
239 text_t ttquotedquery;
240 extractquoted (ttquerystring, ttquotedquery);
241 filterquery (ttquerystring);
242
243 // turn the strings into c strings for mg
244 if (quotedquery != NULL) // quotedquery is a global
245 {
246 delete quotedquery;
247 quotedquery = NULL;
248 }
249
250 // quotedquery will be deleted on the next call to this function
251 quotedquery = ttquotedquery.getcstr ();
252 char *querystring = ttquerystring.getcstr();
253
254 // submit the query
255 mgq_ask(querystring);
256
257 delete querystring;
258}
259
260
261void mgsearchclass::getresults (queryresultsclass &queryresults)
262{
263 if (quotedquery[0] == '\0')
264 {
265 // don't need the text
266 mgq_results(result_docnums, 0, MAXNUMDOCS,
267 ourquerycallback, (void *)(&queryresults));
268 }
269 else
270 {
271 // we need the text for this one
272 mgq_results(result_docs, 0, MAXNUMDOCS,
273 ourquerycallback, (void *)(&queryresults));
274 }
275
276 // get the term frequencies
277 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
278 termfreqcallback, (void *)(&queryresults));
279 mgq_results(result_terms, 0, MAXNUMTERMS,
280 termscallback, (void *)(&queryresults));
281 queryresults.sortqueryterms();
282 queryresults.uniqqueryterms();
283}
284
285
286void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
287{
288 ttquotedquery.clear();
289
290 text_t::iterator ithere = ttquerystring.begin ();
291 text_t::iterator itend = ttquerystring.end ();
292
293 bool inquote = false;
294
295 while (ithere != itend)
296 {
297 if ((*ithere) == '\"')
298 {
299 if (!inquote) ttquotedquery.clear ();
300 inquote = !inquote;
301 *ithere = ' '; // delete the quote
302 }
303 else if (inquote)
304 {
305 ttquotedquery.push_back(*ithere);
306 *ithere = ' ';
307 }
308
309 ithere++;
310 }
311}
312
313
314void mgsearchclass::filterquery (text_t &ttquerystring)
315{
316
317 text_t::iterator ithere = ttquerystring.begin ();
318 text_t::iterator itend = ttquerystring.end ();
319 unsigned short c;
320
321 // remove all non alphanumeric characters below 127
322 while (ithere != itend)
323 {
324 c = *ithere;
325
326 // if ((c <= 127) && !((c >= '0' && c <= '9') ||
327 // (c >= 'A' && c <= 'Z') ||
328 // (c >= 'a' && c <= 'z')))
329 if (!(((c >= 65) && (c <= 90)) ||
330 ((c >= 97) && (c <= 122)) ||
331 ((c >= 192) && (c <= 214)) ||
332 ((c >= 216) && (c <= 246)) ||
333 ((c >= 248) && (c <= 255)) ||
334 ((c >= '0') && (c <= '9')) ||
335 (c == 176)))
336 (*ithere) = ' ';
337
338 ithere++;
339 }
340}
341
342
343// the document text for 'docnum' is placed in 'output'
344// docTargetDocument returns 'true' if it was able to
345// try to get a document
346// collection is needed to see if an index from the
347// collection is loaded. If no index has been loaded
348// defaultindex is needed to load one
349bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
350 const text_t &collection,
351 int docnum,
352 text_t &output)
353{
354 bool databaseloaded = true;
355
356 output.clear();
357
358
359 // make sure index is level 2
360
361 ////// this changed with new naming scheme in new building software
362 ///// i.e paragraph level index no longer contain number '3' but begin
363 ///// with letter 'p'
364
365 text_t db_loaded = db_loaded_name;
366
367 if (!db_loaded.empty()) {
368 text_t::const_iterator here = db_loaded.begin();
369 text_t::const_iterator end = db_loaded.end();
370
371
372 //while (here != end) {
373 // if (*here == '3')
374 // databaseloaded = false;
375 // here ++;
376 //}
377
378 char separator = '/';
379 text_t db;
380 int found = 0;
381#ifdef __WIN32__
382 separator = '\\';
383#endif;
384 // strip away path to db and following collection name
385 end --;
386 while (end != here) {
387 if (*end == separator) {
388 if (found) break;
389 else {db.clear(); found = 1; end--; continue;}
390 }
391 db.push_back(*end);
392 end --;
393 }
394
395 // string will have been reversed above so see if last
396 // character is 'p'
397 if (db[db.size()-1] == 'p') databaseloaded = false;
398 }
399
400 // find out if the database is already loaded
401 // this is needed because a different index (but valid one)
402 // might be already loaded.
403 // this comparison is needed because 'load_database'
404 // is now more oriented towards indexes
405 if (databaseloaded == true) {
406 text_t::const_iterator here = collection.begin();
407 text_t::const_iterator end = collection.end();
408 char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
409 while (here != end)
410 {
411 if (*here != *dbhere)
412 {
413 databaseloaded = false;
414 break;
415 }
416 here++;
417 dbhere++;
418 }
419 }
420
421 // try and load the database
422 if (!databaseloaded)
423 {
424 // get the names of the index and text suffixes
425 text_t ttidxsuffix, tttxtsuffix;
426 getindexsuffix (defaultindex, collection, ttidxsuffix);
427 gettextsuffix (collection, tttxtsuffix);
428 char *idxsuffix = ttidxsuffix.getcstr(); assert (idxsuffix != NULL);
429 char *txtsuffix = tttxtsuffix.getcstr(); assert (txtsuffix != NULL);
430
431#ifdef __WIN32__
432 char *cindexhome = (indexhome+"\\").getcstr(); assert (cindexhome != NULL);
433#else
434 char *cindexhome = indexhome.getcstr(); assert (cindexhome != NULL);
435#endif
436
437
438 if (load_database(cindexhome, idxsuffix, txtsuffix))
439 {
440 databaseloaded = true;
441 }
442 else
443 {
444 databaseloaded = false;
445 }
446
447 // free up the c strings
448 delete idxsuffix;
449 delete txtsuffix;
450 delete cindexhome;
451 }
452
453 if (databaseloaded)
454 {
455 // retrieve the document from mg
456 char docstr[32];
457 sprintf(docstr, "%i", docnum);
458
459 mgq_ask(".set mode text");
460 mgq_ask(".set query docnums");
461 mgq_ask(docstr);
462 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
463 }
464
465 return databaseloaded;
466}
467
Note: See TracBrowser for help on using the repository browser.