source: trunk/gsdl/src/colservr/mgsearch.cpp@ 539

Last change on this file since 539 was 539, checked in by rjmcnab, 25 years ago

Seems to be an error in mg for retrieving documents using a paragraph
based index for some cases. Just added a work around (loads the default
index every time).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 539 1999-09-07 22:52:52Z rjmcnab $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.19 1999/09/07 22:52:52 rjmcnab
31 Seems to be an error in mg for retrieving documents using a paragraph
32 based index for some cases. Just added a work around (loads the default
33 index every time).
34
35 Revision 1.18 1999/09/07 04:57:22 sjboddie
36 added gpl notice
37
38 Revision 1.17 1999/08/31 22:42:41 rjmcnab
39 A couple of minor things.
40
41 Revision 1.16 1999/08/25 04:51:06 sjboddie
42 small change to allow for searching using boolean operators
43
44 Revision 1.15 1999/07/16 08:35:03 rjmcnab
45 Fixed a weird bug to do with a faulty case statement.
46
47 Revision 1.14 1999/07/16 03:42:22 sjboddie
48 changed isApprox
49
50 Revision 1.13 1999/07/16 00:12:46 sjboddie
51 removed all the old post-processing stuff
52
53 Revision 1.12 1999/07/07 06:17:47 rjmcnab
54 broke search_index into index+subcollection+language
55 within mgsearch
56
57 Revision 1.11 1999/07/05 21:06:43 rjmcnab
58 Disabled quoted strings.
59
60 Revision 1.10 1999/07/01 09:29:19 rjmcnab
61 Changes for better reporting of number documents which match a query. Changes
62 should still work as before with older versions of mg.
63
64 Revision 1.9 1999/07/01 03:54:48 rjmcnab
65 Added code to plug in the equivalent terms of each of the query terms.
66 Also added a function to get a raw utf8 encoded mg document (for speeding
67 up a phrase matching function)
68
69 Revision 1.8 1999/06/30 04:04:12 rjmcnab
70 made stemming functions available from mgsearch and made the stems
71 for the query terms available in queryinfo
72
73 Revision 1.7 1999/06/27 22:07:27 sjboddie
74 got rid of all the old functions for dealing with dir indexes
75
76 Revision 1.6 1999/06/09 00:41:32 sjboddie
77 phrase searching now uses case-folding if it's turned on
78
79 Revision 1.5 1999/02/21 22:31:35 rjmcnab
80
81 Removed locateinfo.
82
83 Revision 1.4 1999/02/03 01:13:27 sjboddie
84
85 Got interface to handle subcollections and language subcollections -
86 committed changes made to some of the collections
87
88 Revision 1.3 1999/01/19 01:38:17 rjmcnab
89
90 Made the source more portable.
91
92 Revision 1.2 1999/01/12 01:51:02 rjmcnab
93
94 Standard header.
95
96 Revision 1.1 1999/01/08 09:02:16 rjmcnab
97
98 Moved from src/library.
99
100 */
101
102
103#include "gsdlconf.h"
104#include "mgsearch.h"
105#include "fileutil.h"
106
107#include <string.h>
108#include <stdio.h>
109#include <stdlib.h>
110#include <ctype.h>
111
112#if defined(GSDL_USE_OBJECTSPACE)
113# include <ospace\std\iostream>
114#elif defined(GSDL_USE_IOS_H)
115# include <iostream.h>
116#else
117# include <iostream>
118#endif
119
120#if defined(__WIN32__)
121// gdbm stuff
122# include "autoconf.h"
123# include "systems.h"
124# include "gdbmconst.h"
125# include "gdbm.h"
126#else
127# include <gdbm.h>
128#endif
129
130
131#include <assert.h>
132
133#include "mgq.h"
134// #include "locateinfo.h"
135#include "gsdlunicode.h"
136#include "unitool.h"
137
138
139/////////////
140// globals //
141/////////////
142
// Scratch slots used to hand a document back from doctextcallback:
// the callback stores the text pointer and its length here and
// mgdocument reads them straight after asking mg for the results.
static char *tempdoc = NULL;   // most recent document text (not owned here)
static int   templen = 0;      // length of tempdoc in bytes
145
146
147//////////////////////
148// useful functions //
149//////////////////////
150
151
152// input and output are in utf8
153text_t mgsearch_stemword (const text_t &word) {
154 // allocate working stem space
155 int maxstemlen = mgq_getmaxstemlen ();
156 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
157 if (word_stem == NULL) return "";
158
159 // copy word to word_stem
160 int len = 0;
161 text_t::const_iterator here = word.begin();
162 text_t::const_iterator end = word.end();
163 while (len < maxstemlen && here != end) {
164 word_stem[len+1] = (unsigned char)(*here);
165 len++; here++;
166 }
167 word_stem[len+1] = '\0';
168 word_stem[0] = len;
169
170 mgq_stemword (word_stem);
171
172 // copy word_stem back to tempstr
173 text_t tempstr;
174 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
175
176 delete [] word_stem;
177
178 return tempstr;
179}
180
181
182
183////////////////////////
184// callback functions //
185////////////////////////
186
187// This routine is called for each document found in a search
188// it assumes that cache_num is set up correctly to point to
189// a suitable result cache
190int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
191 float Weight, void *info) {
192
193
194 queryresultsclass *queryresults = (queryresultsclass * )info;
195
196 // append this entry to the document results
197 docresultclass docresult;
198 docresult.docnum = DocNum;
199 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
200 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
201
202 queryresults->docs.docset[DocNum] = docresult;
203 queryresults->docs.docorder.push_back(DocNum);
204
205 return 0;
206}
207
208int termequivcallback(char *Word, int ULen, int /*Freq*/,
209 float /*Weight*/, void *info) {
210 text_tset *equivterms = (text_tset *)info;
211 if (equivterms == NULL) return 0;
212
213 text_t thisterm;
214 thisterm.setcarr(Word, ULen);
215
216 equivterms->insert(thisterm);
217
218 return 0;
219}
220
221
222void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
223 // allocate working stem space
224 int maxstemlen = mgq_getmaxstemlen ();
225 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
226 if (word_stem == NULL) return;
227
228 // copy word to word_stem
229 int len = 0;
230 text_t::const_iterator here = word.begin();
231 text_t::const_iterator end = word.end();
232 while (len < maxstemlen && here != end) {
233 word_stem[len+1] = (unsigned char)(*here);
234 len++; here++;
235 }
236 word_stem[len+1] = '\0';
237 word_stem[0] = len;
238
239 // get the equivalent terms
240 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
241
242 delete [] word_stem;
243
244 return;
245}
246
247 text_tset utf8equivterms; // kept as utf8 string for fast matching
248
249
250// This callback is called once for each term in the query
251int termfreqcallback(char *Word, int ULen, int Freq,
252 float /*Weight*/, void *info) {
253 queryresultsclass *queryresults = (queryresultsclass *)info;
254 if (queryresults == NULL) return 0;
255
256 text_t term;
257 term.setcarr(Word, ULen);
258 termfreqclass termfreq;
259
260 termfreq.termstr = to_uni(term);
261 text_t utf8termstem = mgsearch_stemword (term);
262 termfreq.termstemstr = to_uni (utf8termstem);
263
264 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
265
266 termfreq.termfreq = Freq;
267 queryresults->orgterms.push_back(termfreq);
268
269 return 0;
270}
271
272// this callback is called once for each variation of each term
273int termvariantscallback(char *Word, int ULen, int /*Freq*/,
274 float /*Weight*/, void *info) {
275
276 text_t term;
277 term.setcarr(Word, ULen);
278 queryresultsclass *queryresults = (queryresultsclass *)info;
279 queryresults->termvariants.insert(to_uni(term));
280
281 return 0;
282}
283
284// This callback is for getting document text
285int doctextcallback(char *Doc, int ULen, int /*Freq*/,
286 float /*Weight*/, void * /*info*/) {
287 tempdoc = Doc;
288 templen = ULen;
289
290 return 0;
291}
292
293
294static text_t getindexsuffix (const text_t &collection,
295 const text_t &index) {
296
297 text_t indexsuffix = "index";
298 indexsuffix = filename_cat (indexsuffix, index);
299 indexsuffix = filename_cat (indexsuffix, collection);
300 return indexsuffix;
301}
302
303
304
305
306////////////////////
307// mgsearch class //
308////////////////////
309
310mgsearchclass::mgsearchclass ()
311{
312 cache = new querycache (RESULTCACHESIZE);
313}
314
315mgsearchclass::~mgsearchclass ()
316{
317 if (cache != NULL)
318 {
319 delete cache;
320 cache = NULL;
321 }
322}
323
324
325void mgsearchclass::setcollectdir (const text_t &thecollectdir)
326{
327 collectdir = thecollectdir;
328}
329
330// you only need to use this function before doing any stemming
331// casefolding and stemming will be set if values for them are
332// provided (0 or 1).
333// makeindexcurrent returns true if it was able to load the database
334bool mgsearchclass::makeindexcurrent (const text_t &index,
335 const text_t &subcollection,
336 const text_t &language,
337 const text_t &collection,
338 int casefolding,
339 int stemming) {
340 bool databaseloaded = true;
341
342 // get the names of the collection, index and text suffixes
343 char *ccollection = collection.getcstr();
344 assert (ccollection != NULL);
345 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
346 assert (idxsuffix != NULL);
347 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
348 assert (txtsuffix != NULL);
349
350#ifdef __WIN32__
351 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
352#else
353 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
354#endif
355
356 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
357 if (casefolding == 0) mgq_ask(".set casefold off");
358 else if (casefolding > 0) mgq_ask(".set casefold on");
359 if (stemming == 0) mgq_ask(".set stem off");
360 else if (stemming > 0) mgq_ask(".set stem on");
361
362 } else databaseloaded = false;
363
364 // free up the c strings
365 delete ccollection;
366 delete idxsuffix;
367 delete txtsuffix;
368 delete ccollectdir;
369
370 return databaseloaded;
371}
372
373
374// stem word uses the values set in the last call to makeindexcurrent
375// to stem the word. It is assumed that word is in unicode
376text_t mgsearchclass::stemword (const text_t &word) {
377 return to_uni (mgsearch_stemword (to_utf8 (word)));
378}
379
380text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
381 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
382}
383
384
385bool mgsearchclass::search(const queryparamclass &queryparams,
386 queryresultsclass &queryresults) {
387 assert (cache != NULL);
388
389 queryresults.clear();
390
391 // first check the cache
392 if (cache->find(queryparams, queryresults)) return true;
393
394 // make sure there is a query to be processed
395 text_t::const_iterator queryhere = queryparams.querystring.begin();
396 text_t::const_iterator queryend = queryparams.querystring.end();
397 while (queryhere != queryend) {
398 if (is_unicode_letdig (*queryhere)) break;
399 queryhere++;
400 }
401
402 // if we reached the end of the query string without finding
403 // any alphanumeric characters then return no results (and say
404 // the database was loaded)
405 if (queryhere == queryend) return true;
406
407 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
408 queryparams.language, queryparams.collection)) {
409 setsearchmode (queryparams);
410 submitquery (queryparams);
411 getresults (queryparams, queryresults);
412 return true;
413 }
414
415 return false;
416}
417
418
419void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
420{
421 mgq_ask(".set expert true");
422 mgq_ask(".set sorted_terms true");
423 mgq_ask(".set accumulator_method list");
424 mgq_ask(".set max_accumulators 500000");
425 mgq_ask(".set maxparas 500000");
426 mgq_ask(".set verbatim true");
427 mgq_ask(".unset skip_dump");
428 mgq_ask(".set mode docnums");
429
430 switch (queryparams.search_type)
431 {
432 case 0: mgq_ask(".set query boolean"); break;
433 case 1: mgq_ask(".set query ranked"); break;
434 }
435 switch (queryparams.casefolding)
436 {
437 case 1: mgq_ask(".set casefold on"); break;
438 case 0: mgq_ask(".set casefold off"); break;
439 }
440 switch (queryparams.stemming)
441 {
442 case 1: mgq_ask(".set stem on"); break;
443 case 0: mgq_ask(".set stem off"); break;
444 }
445 mgq_ask(".set heads_length 150");
446
447 if (queryparams.maxdocs == -1) {
448 mgq_ask(".set maxdocs all");
449 } else {
450 char maxdocstr[32];
451 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
452 mgq_ask(maxdocstr);
453 }
454}
455
456
457void mgsearchclass::submitquery (const queryparamclass &queryparams)
458{
459 // sort out the query string
460 text_t ttquerystring = queryparams.querystring;
461 filterquery (ttquerystring);
462 char *querystring = to_utf8(ttquerystring).getcstr();
463
464 // submit the query
465 mgq_ask(querystring);
466
467 delete querystring;
468}
469
470
471void mgsearchclass::getresults (const queryparamclass &queryparams,
472 queryresultsclass &queryresults) {
473
474 mgq_results(result_docnums, 0, MAXNUMDOCS,
475 ourquerycallback, (void *)(&queryresults));
476
477 // get the term frequencies
478 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
479 termfreqcallback, (void *)(&queryresults));
480 queryresults.sortuniqqueryterms();
481
482 // get term variants
483 mgq_results(result_terms, 0, MAXNUMTERMS,
484 termvariantscallback, (void *)(&queryresults));
485
486 // get the number of documents retrieved
487 int total_retrieved = 0, is_approx = 0;
488 mgq_docsretrieved (&total_retrieved, &is_approx);
489
490 if (total_retrieved == 0) {
491 // not available (or really was zero)
492 queryresults.docs_matched = queryresults.docs.docset.size();
493 if (queryresults.docs_matched < queryparams.maxdocs)
494 queryresults.is_approx = Exact;
495 else
496 queryresults.is_approx = MoreThan;
497 } else {
498 queryresults.docs_matched = total_retrieved;
499 if (is_approx) queryresults.is_approx = Approximate;
500 else queryresults.is_approx = Exact;
501 }
502}
503
504void mgsearchclass::filterquery (text_t &ttquerystring) {
505 text_t::iterator ithere = ttquerystring.begin ();
506 text_t::iterator itend = ttquerystring.end ();
507
508 // remove all non alphanumeric characters (except
509 // boolean operators
510 while (ithere != itend) {
511 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
512 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
513 (*ithere != ')')) (*ithere) = ' ';
514 ithere++;
515 }
516}
517
518
519// the document text for 'docnum' is placed in 'output'
520// docTargetDocument returns 'true' if it was able to
521// try to get a document
522// collection is needed to see if an index from the
523// collection is loaded. If no index has been loaded
524// defaultindex is needed to load one
525bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
526 const text_t &defaultsubcollection,
527 const text_t &defaultlanguage,
528 const text_t &collection,
529 int docnum,
530 text_t &output) {
531 output.clear();
532
533 // get the mg version of the document
534 char *mgdoc = NULL;
535 int doclen = 0;
536 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
537 collection, docnum, mgdoc, doclen)) return false;
538 if (mgdoc == NULL) return false;
539
540 // replace all control-Cs with spaces
541 char *mgdoc_here = mgdoc;
542 char *mgdoc_end = mgdoc + doclen;
543 while (mgdoc_here < mgdoc_end) {
544 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
545 mgdoc_here++;
546 }
547
548 // convert this document to unicode
549 utf8inconvertclass inconvert;
550 convertclass::status_t status;
551 inconvert.reset ();
552 inconvert.setinput (mgdoc, doclen);
553 inconvert.convert (output, status);
554
555 return true;
556}
557
558
559bool mgsearchclass::mgdocument (const text_t &defaultindex,
560 const text_t &defaultsubcollection,
561 const text_t &defaultlanguage,
562 const text_t &collection,
563 int docnum,
564 char *&UDoc, int &ULen) {
565 int databaseloaded = 0;
566
567 UDoc = NULL; ULen = 0;
568
569 // see if we can make an appropriate database current
570// char *ccollection = collection.getcstr();
571// assert (ccollection != NULL);
572// databaseloaded = load_text_database (ccollection);
573// delete ccollection;
574
575 // try and load the database
576// if (!databaseloaded)
577 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
578 defaultlanguage, collection);
579
580 if (databaseloaded) {
581 // retrieve the document from mg
582 char docstr[32];
583 sprintf(docstr, "%i", docnum);
584
585 mgq_ask(".set mode text");
586 mgq_ask(".set query docnums");
587 mgq_ask(docstr);
588
589 tempdoc = NULL;
590 templen = 0;
591 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
592 UDoc = tempdoc;
593 ULen = templen;
594 }
595
596 return (bool)databaseloaded;
597}
598
Note: See TracBrowser for help on using the repository browser.