source: gsdl/tags/gsdl-2_30d-distribution/gsdl/src/colservr/mgsearch.cpp@ 14121

Last change on this file since 14121 was 2011, checked in by sjboddie, 23 years ago

Set mg's accumulator method back to 'list' as the recent change appeared
to introduce a new (and more serious) bug while fixing the old bug. For
now we'll just have to live with it the way it is.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 2011 2001-02-19 02:02:00Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.31 2001/02/19 02:02:00 sjboddie
31 Set mg's accumulator method back to 'list' as the recent change appeared
32 to introduce a new (and more serious) bug while fixing the old bug. For
33 now we'll just have to live with it the way it is.
34
35 Revision 1.30 2001/02/15 22:58:11 kjm18
36 added a comment
37
38 Revision 1.29 2001/02/15 03:57:02 kjm18
39 changed accumulator_method for mg to be array rather than list - it was
40 getting some weird results with ranked searches
41
42 Revision 1.28 2001/01/25 18:26:44 cs025
43 Included CORBA branch for first time
44
45 Revision 1.22.2.1 2000/04/04 15:02:32 cs025
46 Corba first commit
47
48 Revision 1.22 1999/09/24 02:41:21 rjmcnab
49 change to use has_unicode_letdig in text_t
50
51 Revision 1.21 1999/09/21 21:41:41 sjboddie
52 fixed an error in what I committed last
53
54 Revision 1.20 1999/09/21 11:59:26 sjboddie
55 added Maxdocs queryfilter option (which may be -1 for 'all)
56
57 Revision 1.19 1999/09/07 22:52:52 rjmcnab
58 Seems to be an error in mg for retrieving documents using a paragraph
59 based index for some cases. Just added a work around (loads the default
60 index every time).
61
62 Revision 1.18 1999/09/07 04:57:22 sjboddie
63 added gpl notice
64
65 Revision 1.17 1999/08/31 22:42:41 rjmcnab
66 A couple of minor things.
67
68 Revision 1.16 1999/08/25 04:51:06 sjboddie
69 small change to allow for searching using boolean operators
70
71 Revision 1.15 1999/07/16 08:35:03 rjmcnab
72 Fixed a weird bug to do with a faulty case statement.
73
74 Revision 1.14 1999/07/16 03:42:22 sjboddie
75 changed isApprox
76
77 Revision 1.13 1999/07/16 00:12:46 sjboddie
78 removed all the old post-processing stuff
79
80 Revision 1.12 1999/07/07 06:17:47 rjmcnab
81 broke search_index into index+subcollection+language
82 within mgsearch
83
84 Revision 1.11 1999/07/05 21:06:43 rjmcnab
85 Disabled quoted strings.
86
87 Revision 1.10 1999/07/01 09:29:19 rjmcnab
88 Changes for better reporting of number documents which match a query. Changes
89 should still work as before with older versions of mg.
90
91 Revision 1.9 1999/07/01 03:54:48 rjmcnab
92 Added code to plug in the equivalent terms of each of the query terms.
93 Also added a function to get a raw utf8 encoded mg document (for speeding
94 up a phrase matching function)
95
96 Revision 1.8 1999/06/30 04:04:12 rjmcnab
97 made stemming functions available from mgsearch and made the stems
98 for the query terms available in queryinfo
99
100 Revision 1.7 1999/06/27 22:07:27 sjboddie
101 got rid of all the old functions for dealing with dir indexes
102
103 Revision 1.6 1999/06/09 00:41:32 sjboddie
104 phrase searching now uses case-folding if it's turned on
105
106 Revision 1.5 1999/02/21 22:31:35 rjmcnab
107
108 Removed locateinfo.
109
110 Revision 1.4 1999/02/03 01:13:27 sjboddie
111
112 Got interface to handle subcollections and language subcollections -
113 committed changes made to some of the collections
114
115 Revision 1.3 1999/01/19 01:38:17 rjmcnab
116
117 Made the source more portable.
118
119 Revision 1.2 1999/01/12 01:51:02 rjmcnab
120
121 Standard header.
122
123 Revision 1.1 1999/01/08 09:02:16 rjmcnab
124
125 Moved from src/library.
126
127 */
128
129#include "gsdlconf.h"
130#include "mgsearch.h"
131#include "fileutil.h"
132
133#include <string.h>
134#include <stdio.h>
135#include <stdlib.h>
136#include <ctype.h>
137
138#if defined(GSDL_USE_OBJECTSPACE)
139# include <ospace\std\iostream>
140#elif defined(GSDL_USE_IOS_H)
141# include <iostream.h>
142#else
143# include <iostream>
144#endif
145
146#if defined(__WIN32__)
147// gdbm stuff
148# include "autoconf.h"
149# include "systems.h"
150# include "gdbmconst.h"
151# include "gdbm.h"
152#else
153# include <gdbm.h>
154#endif
155
156
157#include <assert.h>
158
159#include "mgq.h"
160// #include "locateinfo.h"
161#include "gsdlunicode.h"
162#include "unitool.h"
163
164
165/////////////
166// globals //
167/////////////
168
// Hand-off slots between doctextcallback() and mgsearchclass::mgdocument():
// the callback stores the document pointer and length it is given here, and
// mgdocument() clears them before asking for a document and reads them back
// afterwards.
static char *tempdoc = NULL;
static int templen = 0;
171
172
173//////////////////////
174// useful functions //
175//////////////////////
176
177
178// input and output are in utf8
179text_t mgsearch_stemword (const text_t &word) {
180 // allocate working stem space
181 int maxstemlen = mgq_getmaxstemlen ();
182 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
183 if (word_stem == NULL) return "";
184
185 // copy word to word_stem
186 int len = 0;
187 text_t::const_iterator here = word.begin();
188 text_t::const_iterator end = word.end();
189 while (len < maxstemlen && here != end) {
190 word_stem[len+1] = (unsigned char)(*here);
191 len++; here++;
192 }
193 word_stem[len+1] = '\0';
194 word_stem[0] = len;
195
196 mgq_stemword (word_stem);
197
198 // copy word_stem back to tempstr
199 text_t tempstr;
200 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
201
202 delete [] word_stem;
203
204 return tempstr;
205}
206
207
208
209////////////////////////
210// callback functions //
211////////////////////////
212
213// This routine is called for each document found in a search
214// it assumes that cache_num is set up correctly to point to
215// a suitable result cache
216int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
217 float Weight, void *info) {
218
219
220 queryresultsclass *queryresults = (queryresultsclass * )info;
221
222 // append this entry to the document results
223 docresultclass docresult;
224 docresult.docnum = DocNum;
225 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
226 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
227
228 queryresults->docs.docset[DocNum] = docresult;
229 queryresults->docs.docorder.push_back(DocNum);
230
231 return 0;
232}
233
234int termequivcallback(char *Word, int ULen, int /*Freq*/,
235 float /*Weight*/, void *info) {
236 text_tset *equivterms = (text_tset *)info;
237 if (equivterms == NULL) return 0;
238
239 text_t thisterm;
240 thisterm.setcarr(Word, ULen);
241
242 equivterms->insert(thisterm);
243
244 return 0;
245}
246
247
248void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
249 // allocate working stem space
250 int maxstemlen = mgq_getmaxstemlen ();
251 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
252 if (word_stem == NULL) return;
253
254 // copy word to word_stem
255 int len = 0;
256 text_t::const_iterator here = word.begin();
257 text_t::const_iterator end = word.end();
258 while (len < maxstemlen && here != end) {
259 word_stem[len+1] = (unsigned char)(*here);
260 len++; here++;
261 }
262 word_stem[len+1] = '\0';
263 word_stem[0] = len;
264
265 // get the equivalent terms
266 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
267
268 delete [] word_stem;
269
270 return;
271}
272
// NOTE(review): nothing in this file refers to this file-scope set; the
// termfreqcallback below fills the member of the same name on
// termfreqclass instead. It may still be referenced from another
// translation unit -- confirm before removing.
text_tset utf8equivterms; // kept as utf8 string for fast matching
274
275
276// This callback is called once for each term in the query
277int termfreqcallback(char *Word, int ULen, int Freq,
278 float /*Weight*/, void *info) {
279 queryresultsclass *queryresults = (queryresultsclass *)info;
280 if (queryresults == NULL) return 0;
281
282 text_t term;
283 term.setcarr(Word, ULen);
284 termfreqclass termfreq;
285
286 termfreq.termstr = to_uni(term);
287 text_t utf8termstem = mgsearch_stemword (term);
288 termfreq.termstemstr = to_uni (utf8termstem);
289
290 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
291
292 termfreq.termfreq = Freq;
293 queryresults->orgterms.push_back(termfreq);
294
295 return 0;
296}
297
298// this callback is called once for each variation of each term
299int termvariantscallback(char *Word, int ULen, int /*Freq*/,
300 float /*Weight*/, void *info) {
301
302 text_t term;
303 term.setcarr(Word, ULen);
304 queryresultsclass *queryresults = (queryresultsclass *)info;
305 queryresults->termvariants.insert(to_uni(term));
306
307 return 0;
308}
309
310// This callback is for getting document text
311int doctextcallback(char *Doc, int ULen, int /*Freq*/,
312 float /*Weight*/, void * /*info*/) {
313 tempdoc = Doc;
314 templen = ULen;
315
316 return 0;
317}
318
319
320static text_t getindexsuffix (const text_t &collection,
321 const text_t &index) {
322
323 text_t indexsuffix = "index";
324 indexsuffix = filename_cat (indexsuffix, index);
325 indexsuffix = filename_cat (indexsuffix, collection);
326 return indexsuffix;
327}
328
329
330
331
332////////////////////
333// mgsearch class //
334////////////////////
335
336mgsearchclass::mgsearchclass ()
337 : searchclass() {
338
339}
340
341mgsearchclass::~mgsearchclass ()
342{
343 if (cache != NULL)
344 {
345 delete cache;
346 cache = NULL;
347 }
348}
349
350// you only need to use this function before doing any stemming
351// casefolding and stemming will be set if values for them are
352// provided (0 or 1).
353// makeindexcurrent returns true if it was able to load the database
354bool mgsearchclass::makeindexcurrent (const text_t &index,
355 const text_t &subcollection,
356 const text_t &language,
357 const text_t &collection,
358 int casefolding,
359 int stemming) {
360 bool databaseloaded = true;
361
362 // get the names of the collection, index and text suffixes
363 char *ccollection = collection.getcstr();
364 assert (ccollection != NULL);
365 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
366 assert (idxsuffix != NULL);
367 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
368 assert (txtsuffix != NULL);
369
370#ifdef __WIN32__
371 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
372#else
373 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
374#endif
375
376 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
377 if (casefolding == 0) mgq_ask(".set casefold off");
378 else if (casefolding > 0) mgq_ask(".set casefold on");
379 if (stemming == 0) mgq_ask(".set stem off");
380 else if (stemming > 0) mgq_ask(".set stem on");
381
382 } else databaseloaded = false;
383
384 // free up the c strings
385 delete ccollection;
386 delete idxsuffix;
387 delete txtsuffix;
388 delete ccollectdir;
389
390 return databaseloaded;
391}
392
393
394// stem word uses the values set in the last call to makeindexcurrent
395// to stem the word. It is assumed that word is in unicode
396text_t mgsearchclass::stemword (const text_t &word) {
397 return to_uni (mgsearch_stemword (to_utf8 (word)));
398}
399
400text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
401 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
402}
403
404/**
405 * search directs the whole execution of the search; a number of other
406 * functions in this class are called as a result, and precondition
407 * checks are also made
408 */
409bool mgsearchclass::search(const queryparamclass &queryparams,
410 queryresultsclass &queryresults) {
411 // assert (cache != NULL);
412
413 // clear any previous results
414 queryresults.clear();
415 // first check the cache
416 if (cache != NULL) {
417 if (cache->find(queryparams, queryresults)) return true;
418 }
419 // make sure there is a query to be processed
420 if (!has_unicode_letdig(queryparams.querystring)) return true;
421
422 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
423 queryparams.language, queryparams.collection)) {
424 // initialise the form of results
425 setsearchmode (queryparams);
426
427 // execute the query
428 submitquery (queryparams);
429
430 // retrieve the results
431 getresults (queryparams, queryresults);
432
433 return true;
434 }
435
436 return false;
437}
438
439/* accumulator_method has been changed to use array rather than list.
440list appears to be broken somewhat - for some ranked queries, it returned
441fewer results than it should have (eg 45 instead of 50). The three other
442methods (array, splay_tree, hash_table) all return the same number of
443documents, in the same order, with the same ranks. list returns what
444appears to be the same documents (but less of them), but with different ranks,
445and in a different order. Minimal time tests dont show any speed improvement
446of list over array (maybe because its broken??). [02/2001, kjm18]
447
448... [sjboddie, also 02/2001] turns out that changing the accumulator_method
449introduced a more serious bug than it fixed (i.e. occasionally when doing a
450ranked search for a very common word you get no results at all). I've
451changed it back to list for now, one day we should play with other
452accumulator_methods but for now I don't have time and don't want to risk
453introducing bugs (better the devil you know ;)
454*/
455void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
456{
457 mgq_ask(".set expert true");
458 mgq_ask(".set sorted_terms true");
459 mgq_ask(".set accumulator_method list");
460 mgq_ask(".set max_accumulators 500000");
461 mgq_ask(".set maxparas 500000");
462 mgq_ask(".set verbatim true");
463 mgq_ask(".unset skip_dump");
464 mgq_ask(".set mode docnums");
465
466 switch (queryparams.search_type)
467 {
468 case 0: mgq_ask(".set query boolean"); break;
469 case 1: mgq_ask(".set query ranked"); break;
470 }
471 switch (queryparams.casefolding)
472 {
473 case 1: mgq_ask(".set casefold on"); break;
474 case 0: mgq_ask(".set casefold off"); break;
475 }
476 switch (queryparams.stemming)
477 {
478 case 1: mgq_ask(".set stem on"); break;
479 case 0: mgq_ask(".set stem off"); break;
480 }
481 mgq_ask(".set heads_length 150");
482
483 if (queryparams.maxdocs == -1) {
484 mgq_ask(".set maxdocs all");
485 } else {
486 char maxdocstr[32];
487 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
488 mgq_ask(maxdocstr);
489 }
490}
491
492/**
493 * submitquery constructs the query string (into UTF8 encoding)
494 * and submits it using mgq_ask to the mg search engine. Most
495 * of the processing will be done inside Greenstone
496 */
497void mgsearchclass::submitquery (const queryparamclass &queryparams)
498{
499 // sort out the query string; copy it, remove all special characters
500 // and then convert it to a string in UTF8 format
501 text_t ttquerystring = queryparams.querystring;
502 filterquery (ttquerystring);
503 char *querystring = to_utf8(ttquerystring).getcstr();
504
505 // submit the query
506 mgq_ask(querystring);
507
508 // destroy the temporary character array
509 delete querystring;
510}
511
512/**
513 * getrults is called to retrieve the required data on the docs
514 * which responded to the query submitted in submitquery above.
515 *
516 * It calls the local mgquery (mgq) interface to MG several times,
517 * to obtain the document numbers, term frequencies, term variants
518 * etc. All processing of the query will be done by Greenstone
519 * thereafter
520 */
521void mgsearchclass::getresults (const queryparamclass &queryparams,
522 queryresultsclass &queryresults) {
523 // get the configuration for the maximum number of documents to
524 // retrieve
525 int howmany = queryparams.maxdocs;
526 if (howmany == -1) howmany = MAXNUMDOCS;
527 mgq_results(result_docnums, 0, howmany,
528 ourquerycallback, (void *)(&queryresults));
529
530 // get the term frequencies
531 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
532 termfreqcallback, (void *)(&queryresults));
533 queryresults.sortuniqqueryterms();
534
535 // get term variants
536 mgq_results(result_terms, 0, MAXNUMTERMS,
537 termvariantscallback, (void *)(&queryresults));
538
539 // get the number of documents retrieved
540 int total_retrieved = 0, is_approx = 0;
541 mgq_docsretrieved (&total_retrieved, &is_approx);
542
543 if (total_retrieved == 0) {
544 // not available (or really was zero)
545 queryresults.docs_matched = queryresults.docs.docset.size();
546 if ((queryparams.maxdocs == -1) ||
547 (queryresults.docs_matched < queryparams.maxdocs))
548 queryresults.is_approx = Exact;
549 else
550 queryresults.is_approx = MoreThan;
551 } else {
552 queryresults.docs_matched = total_retrieved;
553 if (is_approx) queryresults.is_approx = Approximate;
554 else queryresults.is_approx = Exact;
555 }
556}
557
558/**
559 * Tidies the given querystring, removing special characters
560 */
561void mgsearchclass::filterquery (text_t &ttquerystring) {
562 text_t::iterator ithere = ttquerystring.begin ();
563 text_t::iterator itend = ttquerystring.end ();
564
565 // remove all non alphanumeric characters (except
566 // boolean operators
567 while (ithere != itend) {
568 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
569 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
570 (*ithere != ')')) (*ithere) = ' ';
571 ithere++;
572 }
573}
574
575
576// the document text for 'docnum' is placed in 'output'
577// docTargetDocument returns 'true' if it was able to
578// try to get a document
579// collection is needed to see if an index from the
580// collection is loaded. If no index has been loaded
581// defaultindex is needed to load one
582bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
583 const text_t &defaultsubcollection,
584 const text_t &defaultlanguage,
585 const text_t &collection,
586 int docnum,
587 text_t &output) {
588 output.clear();
589
590 // get the mg version of the document
591 char *mgdoc = NULL;
592 int doclen = 0;
593 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
594 collection, docnum, mgdoc, doclen)) return false;
595 if (mgdoc == NULL) return false;
596
597 // replace all control-Cs with spaces
598 char *mgdoc_here = mgdoc;
599 char *mgdoc_end = mgdoc + doclen;
600 while (mgdoc_here < mgdoc_end) {
601 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
602 mgdoc_here++;
603 }
604
605 // convert this document to unicode
606 utf8inconvertclass inconvert;
607 convertclass::status_t status;
608 inconvert.reset ();
609 inconvert.setinput (mgdoc, doclen);
610 inconvert.convert (output, status);
611
612 return true;
613}
614
615
616bool mgsearchclass::mgdocument (const text_t &defaultindex,
617 const text_t &defaultsubcollection,
618 const text_t &defaultlanguage,
619 const text_t &collection,
620 int docnum,
621 char *&UDoc, int &ULen) {
622 int databaseloaded = 0;
623
624 UDoc = NULL; ULen = 0;
625
626 // see if we can make an appropriate database current
627// char *ccollection = collection.getcstr();
628// assert (ccollection != NULL);
629// databaseloaded = load_text_database (ccollection);
630// delete ccollection;
631
632 // try and load the database
633// if (!databaseloaded)
634 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
635 defaultlanguage, collection);
636
637 if (databaseloaded) {
638 // retrieve the document from mg
639 char docstr[32];
640 sprintf(docstr, "%i", docnum);
641
642 mgq_ask(".set mode text");
643 mgq_ask(".set query docnums");
644 mgq_ask(docstr);
645
646 tempdoc = NULL;
647 templen = 0;
648 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
649 UDoc = tempdoc;
650 ULen = templen;
651 }
652
653 return (bool)databaseloaded;
654}
655
Note: See TracBrowser for help on using the repository browser.