source: trunk/gsdl/src/colservr/mgsearch.cpp@ 401

Last change on this file since 401 was 401, checked in by rjmcnab, 25 years ago

Fixed a weird bug to do with a faulty case statement.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: mgsearch.cpp 401 1999-07-16 08:35:03Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.15 1999/07/16 08:35:03 rjmcnab
15 Fixed a weird bug to do with a faulty case statement.
16
17 Revision 1.14 1999/07/16 03:42:22 sjboddie
18 changed isApprox
19
20 Revision 1.13 1999/07/16 00:12:46 sjboddie
21 removed all the old post-processing stuff
22
23 Revision 1.12 1999/07/07 06:17:47 rjmcnab
24 broke search_index into index+subcollection+language
25 within mgsearch
26
27 Revision 1.11 1999/07/05 21:06:43 rjmcnab
28 Disabled quoted strings.
29
30 Revision 1.10 1999/07/01 09:29:19 rjmcnab
31 Changes for better reporting of number documents which match a query. Changes
32 should still work as before with older versions of mg.
33
34 Revision 1.9 1999/07/01 03:54:48 rjmcnab
35 Added code to plug in the equivalent terms of each of the query terms.
36 Also added a function to get a raw utf8 encoded mg document (for speeding
37 up a phrase matching function)
38
39 Revision 1.8 1999/06/30 04:04:12 rjmcnab
40 made stemming functions available from mgsearch and made the stems
41 for the query terms available in queryinfo
42
43 Revision 1.7 1999/06/27 22:07:27 sjboddie
44 got rid of all the old functions for dealing with dir indexes
45
46 Revision 1.6 1999/06/09 00:41:32 sjboddie
47 phrase searching now uses case-folding if it's turned on
48
49 Revision 1.5 1999/02/21 22:31:35 rjmcnab
50
51 Removed locateinfo.
52
53 Revision 1.4 1999/02/03 01:13:27 sjboddie
54
55 Got interface to handle subcollections and language subcollections -
56 committed changes made to some of the collections
57
58 Revision 1.3 1999/01/19 01:38:17 rjmcnab
59
60 Made the source more portable.
61
62 Revision 1.2 1999/01/12 01:51:02 rjmcnab
63
64 Standard header.
65
66 Revision 1.1 1999/01/08 09:02:16 rjmcnab
67
68 Moved from src/library.
69
70 */
71
72
73#include "gsdlconf.h"
74#include "mgsearch.h"
75#include "fileutil.h"
76
77#include <string.h>
78#include <stdio.h>
79#include <stdlib.h>
80#include <ctype.h>
81
82#if defined(GSDL_USE_OBJECTSPACE)
83# include <ospace\std\iostream>
84#elif defined(GSDL_USE_IOS_H)
85# include <iostream.h>
86#else
87# include <iostream>
88#endif
89
90#if defined(__WIN32__)
91// gdbm stuff
92# include "autoconf.h"
93# include "systems.h"
94# include "gdbmconst.h"
95# include "gdbm.h"
96#else
97# include <gdbm.h>
98#endif
99
100
101#include <assert.h>
102
103#include "mgq.h"
104// #include "locateinfo.h"
105#include "gsdlunicode.h"
106#include "unitool.h"
107
108
109/////////////
110// globals //
111/////////////
112
// Hand-off slots between doctextcallback and mgsearchclass::mgdocument:
// the callback stores the document pointer and length it was given here,
// and mgdocument clears both before each retrieval and reads them back
// afterwards.
// NOTE(review): the buffer is not allocated anywhere in this file --
// presumably it is owned by mg; confirm before keeping the pointer around.
113static char *tempdoc = NULL;
114static int templen = 0;
115
116
117//////////////////////
118// useful functions //
119//////////////////////
120
121
122// input and output are in utf8
123text_t mgsearch_stemword (const text_t &word) {
124 // allocate working stem space
125 int maxstemlen = mgq_getmaxstemlen ();
126 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
127 if (word_stem == NULL) return "";
128
129 // copy word to word_stem
130 int len = 0;
131 text_t::const_iterator here = word.begin();
132 text_t::const_iterator end = word.end();
133 while (len < maxstemlen && here != end) {
134 word_stem[len+1] = (unsigned char)(*here);
135 len++; here++;
136 }
137 word_stem[len+1] = '\0';
138 word_stem[0] = len;
139
140 mgq_stemword (word_stem);
141
142 // copy word_stem back to tempstr
143 text_t tempstr;
144 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
145
146 delete [] word_stem;
147
148 return tempstr;
149}
150
151
152
153////////////////////////
154// callback functions //
155////////////////////////
156
157// This routine is called for each document found in a search
158// it assumes that cache_num is set up correctly to point to
159// a suitable result cache
160int ourquerycallback(char */*UDoc*/, int /*ULen*/, int DocNum,
161 float Weight, void *info) {
162
163
164 queryresultsclass *queryresults = (queryresultsclass * )info;
165
166 // append this entry to the document results
167 docresultclass docresult;
168 docresult.docnum = DocNum;
169 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
170 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
171
172 queryresults->docs.docset[DocNum] = docresult;
173 queryresults->docs.docorder.push_back(DocNum);
174
175 return 0;
176}
177
178int termequivcallback(char *Word, int ULen, int /*Freq*/,
179 float /*Weight*/, void *info) {
180 text_tset *equivterms = (text_tset *)info;
181 if (equivterms == NULL) return 0;
182
183 text_t thisterm;
184 thisterm.setcarr(Word, ULen);
185
186 equivterms->insert(thisterm);
187
188 return 0;
189}
190
191
192void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
193 // allocate working stem space
194 int maxstemlen = mgq_getmaxstemlen ();
195 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
196 if (word_stem == NULL) return;
197
198 // copy word to word_stem
199 int len = 0;
200 text_t::const_iterator here = word.begin();
201 text_t::const_iterator end = word.end();
202 while (len < maxstemlen && here != end) {
203 word_stem[len+1] = (unsigned char)(*here);
204 len++; here++;
205 }
206 word_stem[len+1] = '\0';
207 word_stem[0] = len;
208
209 // get the equivalent terms
210 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
211
212 delete [] word_stem;
213
214 return;
215}
216
// NOTE(review): nothing in this file refers to this file-scope set;
// termfreqcallback fills termfreq.utf8equivterms, a member that merely
// shares the name. This looks like a stray declaration -- confirm that no
// other translation unit uses it before removing.
217 text_tset utf8equivterms; // kept as utf8 string for fast matching
218
219
220// This callback is called once for each term in the query
221int termfreqcallback(char *Word, int ULen, int Freq,
222 float /*Weight*/, void *info) {
223 queryresultsclass *queryresults = (queryresultsclass *)info;
224 if (queryresults == NULL) return 0;
225
226 text_t term;
227 term.setcarr(Word, ULen);
228 termfreqclass termfreq;
229
230 termfreq.termstr = to_uni(term);
231 text_t utf8termstem = mgsearch_stemword (term);
232 termfreq.termstemstr = to_uni (utf8termstem);
233
234 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
235
236 termfreq.termfreq = Freq;
237 queryresults->orgterms.push_back(termfreq);
238
239 return 0;
240}
241
242// this callback is called once for each variation of each term
243int termvariantscallback(char *Word, int ULen, int /*Freq*/,
244 float /*Weight*/, void *info) {
245
246 text_t term;
247 term.setcarr(Word, ULen);
248 queryresultsclass *queryresults = (queryresultsclass *)info;
249 queryresults->termvariants.insert(to_uni(term));
250
251 return 0;
252}
253
254// This callback is for getting document text
255int doctextcallback(char *Doc, int ULen, int /*Freq*/,
256 float /*Weight*/, void */*info*/) {
257 tempdoc = Doc;
258 templen = ULen;
259
260 return 0;
261}
262
263
264static text_t getindexsuffix (const text_t &collection,
265 const text_t &index) {
266
267 text_t indexsuffix = "index";
268 // temporary hack so old version of niupepa collection
269 // can stay up until new one's finished
270 if (collection == "niupepa") indexsuffix = "index.new";
271
272 indexsuffix = filename_cat (indexsuffix, index);
273 indexsuffix = filename_cat (indexsuffix, collection);
274 return indexsuffix;
275}
276
277
278
279
280////////////////////
281// mgsearch class //
282////////////////////
283
284mgsearchclass::mgsearchclass ()
285{
286 cache = new querycache (RESULTCACHESIZE);
287}
288
289mgsearchclass::~mgsearchclass ()
290{
291 if (cache != NULL)
292 {
293 delete cache;
294 cache = NULL;
295 }
296}
297
298
299void mgsearchclass::setcollectdir (const text_t &thecollectdir)
300{
301 collectdir = thecollectdir;
302}
303
304// you only need to use this function before doing any stemming
305// casefolding and stemming will be set if values for them are
306// provided (0 or 1).
307// makeindexcurrent returns true if it was able to load the database
308bool mgsearchclass::makeindexcurrent (const text_t &index,
309 const text_t &subcollection,
310 const text_t &language,
311 const text_t &collection,
312 int casefolding,
313 int stemming) {
314 bool databaseloaded = true;
315
316 // get the names of the collection, index and text suffixes
317 char *ccollection = collection.getcstr();
318 assert (ccollection != NULL);
319 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
320 assert (idxsuffix != NULL);
321 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
322 assert (txtsuffix != NULL);
323
324#ifdef __WIN32__
325 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
326#else
327 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
328#endif
329
330 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
331 if (casefolding == 0) mgq_ask(".set casefold off");
332 else if (casefolding > 0) mgq_ask(".set casefold on");
333 if (stemming == 0) mgq_ask(".set stem off");
334 else if (stemming > 0) mgq_ask(".set stem on");
335
336 } else databaseloaded = false;
337
338 // free up the c strings
339 delete ccollection;
340 delete idxsuffix;
341 delete txtsuffix;
342 delete ccollectdir;
343
344 return databaseloaded;
345}
346
347
348// stem word uses the values set in the last call to makeindexcurrent
349// to stem the word. It is assumed that word is in unicode
350text_t mgsearchclass::stemword (const text_t &word) {
351 return to_uni (mgsearch_stemword (to_utf8 (word)));
352}
353
354text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
355 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
356}
357
358
359bool mgsearchclass::search(const queryparamclass &queryparams,
360 queryresultsclass &queryresults) {
361 assert (cache != NULL);
362
363 queryresults.clear();
364
365 // first check the cache
366 if (cache->find(queryparams, queryresults)) return true;
367
368 // make sure there is a query to be processed
369 text_t::const_iterator queryhere = queryparams.querystring.begin();
370 text_t::const_iterator queryend = queryparams.querystring.end();
371 while (queryhere != queryend) {
372 if (is_unicode_letdig (*queryhere)) break;
373 queryhere++;
374 }
375
376 // if we reached the end of the query string without finding
377 // any alphanumeric characters then return no results (and say
378 // the database was loaded)
379 if (queryhere == queryend) return true;
380
381 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
382 queryparams.language, queryparams.collection)) {
383 setsearchmode (queryparams);
384 submitquery (queryparams);
385 getresults (queryparams, queryresults);
386 return true;
387 }
388
389 return false;
390}
391
392
393void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
394{
395 mgq_ask(".set expert true");
396 mgq_ask(".set sorted_terms true");
397 mgq_ask(".set accumulator_method list");
398 mgq_ask(".set max_accumulators 50000");
399 mgq_ask(".set verbatim true");
400 mgq_ask(".unset skip_dump");
401 mgq_ask(".set mode docnums");
402
403 switch (queryparams.search_type)
404 {
405 case 0: mgq_ask(".set query boolean"); break;
406 case 1: mgq_ask(".set query ranked"); break;
407 }
408 switch (queryparams.casefolding)
409 {
410 case 1: mgq_ask(".set casefold on"); break;
411 case 0: mgq_ask(".set casefold off"); break;
412 }
413 switch (queryparams.stemming)
414 {
415 case 1: mgq_ask(".set stem on"); break;
416 case 0: mgq_ask(".set stem off"); break;
417 }
418 mgq_ask(".set heads_length 150");
419
420 if (queryparams.maxdocs == -1) {
421 mgq_ask(".set maxdocs all");
422 } else {
423 char maxdocstr[32];
424 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
425 mgq_ask(maxdocstr);
426 }
427}
428
429
430void mgsearchclass::submitquery (const queryparamclass &queryparams)
431{
432 // sort out the query string
433 text_t ttquerystring = queryparams.querystring;
434 filterquery (ttquerystring);
435 char *querystring = to_utf8(ttquerystring).getcstr();
436
437 // submit the query
438 mgq_ask(querystring);
439
440 delete querystring;
441}
442
443
444void mgsearchclass::getresults (const queryparamclass &queryparams,
445 queryresultsclass &queryresults) {
446
447 mgq_results(result_docnums, 0, MAXNUMDOCS,
448 ourquerycallback, (void *)(&queryresults));
449
450 // get the term frequencies
451 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
452 termfreqcallback, (void *)(&queryresults));
453 queryresults.sortuniqqueryterms();
454
455 // get term variants
456 mgq_results(result_terms, 0, MAXNUMTERMS,
457 termvariantscallback, (void *)(&queryresults));
458
459 // get the number of documents retrieved
460 int total_retrieved = 0, is_approx = 0;
461 mgq_docsretrieved (&total_retrieved, &is_approx);
462
463 if (total_retrieved == 0) {
464 // not available (or really was zero)
465 queryresults.docs_matched = queryresults.docs.docset.size();
466 if (queryresults.docs_matched < queryparams.maxdocs)
467 queryresults.is_approx = Exact;
468 else
469 queryresults.is_approx = MoreThan;
470 } else {
471 queryresults.docs_matched = total_retrieved;
472 if (is_approx) queryresults.is_approx = Approximate;
473 else queryresults.is_approx = Exact;
474 }
475}
476
477void mgsearchclass::filterquery (text_t &ttquerystring) {
478 text_t::iterator ithere = ttquerystring.begin ();
479 text_t::iterator itend = ttquerystring.end ();
480
481 // remove all non alphanumeric characters
482 while (ithere != itend) {
483 if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
484 ithere++;
485 }
486}
487
488
489// the document text for 'docnum' is placed in 'output'
490// docTargetDocument returns 'true' if it was able to
491// try to get a document
492// collection is needed to see if an index from the
493// collection is loaded. If no index has been loaded
494// defaultindex is needed to load one
495bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
496 const text_t &defaultsubcollection,
497 const text_t &defaultlanguage,
498 const text_t &collection,
499 int docnum,
500 text_t &output) {
501 output.clear();
502
503 // get the mg version of the document
504 char *mgdoc = NULL;
505 int doclen = 0;
506 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
507 collection, docnum, mgdoc, doclen)) return false;
508 if (mgdoc == NULL) return false;
509
510 // replace all control-Cs with spaces
511 char *mgdoc_here = mgdoc;
512 char *mgdoc_end = mgdoc + doclen;
513 while (mgdoc_here < mgdoc_end) {
514 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
515 mgdoc_here++;
516 }
517
518 // convert this document to unicode
519 utf8inconvertclass inconvert;
520 convertclass::status_t status;
521 inconvert.reset ();
522 inconvert.setinput (mgdoc, doclen);
523 inconvert.convert (output, status);
524
525 return true;
526}
527
528
529bool mgsearchclass::mgdocument (const text_t &defaultindex,
530 const text_t &defaultsubcollection,
531 const text_t &defaultlanguage,
532 const text_t &collection,
533 int docnum,
534 char *&UDoc, int &ULen) {
535 bool databaseloaded = 0;
536
537 UDoc = NULL; ULen = 0;
538
539 // see if we can make an appropriate database current
540 char *ccollection = collection.getcstr();
541 assert (ccollection != NULL);
542 databaseloaded = load_text_database (ccollection);
543 delete ccollection;
544
545 // try and load the database
546 if (!databaseloaded) databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
547 defaultlanguage, collection);
548
549 if (databaseloaded) {
550 // retrieve the document from mg
551 char docstr[32];
552 sprintf(docstr, "%i", docnum);
553
554 mgq_ask(".set mode text");
555 mgq_ask(".set query docnums");
556 mgq_ask(docstr);
557
558 tempdoc = NULL;
559 templen = 0;
560 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
561 UDoc = tempdoc;
562 ULen = templen;
563 }
564
565 return databaseloaded;
566}
567
Note: See TracBrowser for help on using the repository browser.