source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1990

Last change on this file since 1990 was 1990, checked in by kjm18, 23 years ago

added a comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 1990 2001-02-15 22:58:11Z kjm18 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.30 2001/02/15 22:58:11 kjm18
31 added a comment
32
33 Revision 1.29 2001/02/15 03:57:02 kjm18
34 changed accumulator_method for mg to be array rather than list - it was
35 getting some weird results with ranked searches
36
37 Revision 1.28 2001/01/25 18:26:44 cs025
38 Included CORBA branch for first time
39
40 Revision 1.22.2.1 2000/04/04 15:02:32 cs025
41 Corba first commit
42
43 Revision 1.22 1999/09/24 02:41:21 rjmcnab
44 change to use has_unicode_letdig in text_t
45
46 Revision 1.21 1999/09/21 21:41:41 sjboddie
47 fixed an error in what I committed last
48
49 Revision 1.20 1999/09/21 11:59:26 sjboddie
50 added Maxdocs queryfilter option (which may be -1 for 'all)
51
52 Revision 1.19 1999/09/07 22:52:52 rjmcnab
53 Seems to be an error in mg for retrieving documents using a paragraph
54 based index for some cases. Just added a work around (loads the default
55 index every time).
56
57 Revision 1.18 1999/09/07 04:57:22 sjboddie
58 added gpl notice
59
60 Revision 1.17 1999/08/31 22:42:41 rjmcnab
61 A couple of minor things.
62
63 Revision 1.16 1999/08/25 04:51:06 sjboddie
64 small change to allow for searching using boolean operators
65
66 Revision 1.15 1999/07/16 08:35:03 rjmcnab
67 Fixed a weird bug to do with a faulty case statement.
68
69 Revision 1.14 1999/07/16 03:42:22 sjboddie
70 changed isApprox
71
72 Revision 1.13 1999/07/16 00:12:46 sjboddie
73 removed all the old post-processing stuff
74
75 Revision 1.12 1999/07/07 06:17:47 rjmcnab
76 broke search_index into index+subcollection+language
77 within mgsearch
78
79 Revision 1.11 1999/07/05 21:06:43 rjmcnab
80 Disabled quoted strings.
81
82 Revision 1.10 1999/07/01 09:29:19 rjmcnab
83 Changes for better reporting of number documents which match a query. Changes
84 should still work as before with older versions of mg.
85
86 Revision 1.9 1999/07/01 03:54:48 rjmcnab
87 Added code to plug in the equivalent terms of each of the query terms.
88 Also added a function to get a raw utf8 encoded mg document (for speeding
89 up a phrase matching function)
90
91 Revision 1.8 1999/06/30 04:04:12 rjmcnab
92 made stemming functions available from mgsearch and made the stems
93 for the query terms available in queryinfo
94
95 Revision 1.7 1999/06/27 22:07:27 sjboddie
96 got rid of all the old functions for dealing with dir indexes
97
98 Revision 1.6 1999/06/09 00:41:32 sjboddie
99 phrase searching now uses case-folding if it's turned on
100
101 Revision 1.5 1999/02/21 22:31:35 rjmcnab
102
103 Removed locateinfo.
104
105 Revision 1.4 1999/02/03 01:13:27 sjboddie
106
107 Got interface to handle subcollections and language subcollections -
108 committed changes made to some of the collections
109
110 Revision 1.3 1999/01/19 01:38:17 rjmcnab
111
112 Made the source more portable.
113
114 Revision 1.2 1999/01/12 01:51:02 rjmcnab
115
116 Standard header.
117
118 Revision 1.1 1999/01/08 09:02:16 rjmcnab
119
120 Moved from src/library.
121
122 */
123
124#include "gsdlconf.h"
125#include "mgsearch.h"
126#include "fileutil.h"
127
128#include <string.h>
129#include <stdio.h>
130#include <stdlib.h>
131#include <ctype.h>
132
133#if defined(GSDL_USE_OBJECTSPACE)
134# include <ospace\std\iostream>
135#elif defined(GSDL_USE_IOS_H)
136# include <iostream.h>
137#else
138# include <iostream>
139#endif
140
141#if defined(__WIN32__)
142// gdbm stuff
143# include "autoconf.h"
144# include "systems.h"
145# include "gdbmconst.h"
146# include "gdbm.h"
147#else
148# include <gdbm.h>
149#endif
150
151
152#include <assert.h>
153
154#include "mgq.h"
155// #include "locateinfo.h"
156#include "gsdlunicode.h"
157#include "unitool.h"
158
159
160/////////////
161// globals //
162/////////////
163
164static char *tempdoc = NULL;
165static int templen = 0;
166
167
168//////////////////////
169// useful functions //
170//////////////////////
171
172
173// input and output are in utf8
174text_t mgsearch_stemword (const text_t &word) {
175 // allocate working stem space
176 int maxstemlen = mgq_getmaxstemlen ();
177 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
178 if (word_stem == NULL) return "";
179
180 // copy word to word_stem
181 int len = 0;
182 text_t::const_iterator here = word.begin();
183 text_t::const_iterator end = word.end();
184 while (len < maxstemlen && here != end) {
185 word_stem[len+1] = (unsigned char)(*here);
186 len++; here++;
187 }
188 word_stem[len+1] = '\0';
189 word_stem[0] = len;
190
191 mgq_stemword (word_stem);
192
193 // copy word_stem back to tempstr
194 text_t tempstr;
195 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
196
197 delete [] word_stem;
198
199 return tempstr;
200}
201
202
203
204////////////////////////
205// callback functions //
206////////////////////////
207
208// This routine is called for each document found in a search
209// it assumes that cache_num is set up correctly to point to
210// a suitable result cache
211int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
212 float Weight, void *info) {
213
214
215 queryresultsclass *queryresults = (queryresultsclass * )info;
216
217 // append this entry to the document results
218 docresultclass docresult;
219 docresult.docnum = DocNum;
220 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
221 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
222
223 queryresults->docs.docset[DocNum] = docresult;
224 queryresults->docs.docorder.push_back(DocNum);
225
226 return 0;
227}
228
229int termequivcallback(char *Word, int ULen, int /*Freq*/,
230 float /*Weight*/, void *info) {
231 text_tset *equivterms = (text_tset *)info;
232 if (equivterms == NULL) return 0;
233
234 text_t thisterm;
235 thisterm.setcarr(Word, ULen);
236
237 equivterms->insert(thisterm);
238
239 return 0;
240}
241
242
243void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
244 // allocate working stem space
245 int maxstemlen = mgq_getmaxstemlen ();
246 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
247 if (word_stem == NULL) return;
248
249 // copy word to word_stem
250 int len = 0;
251 text_t::const_iterator here = word.begin();
252 text_t::const_iterator end = word.end();
253 while (len < maxstemlen && here != end) {
254 word_stem[len+1] = (unsigned char)(*here);
255 len++; here++;
256 }
257 word_stem[len+1] = '\0';
258 word_stem[0] = len;
259
260 // get the equivalent terms
261 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
262
263 delete [] word_stem;
264
265 return;
266}
267
// NOTE(review): free-standing global; nothing visible in this file reads or
// writes it (termfreqcallback fills the utf8equivterms member of its local
// termfreqclass instead). It has external linkage, so confirm that no other
// translation unit refers to it before removing it.
text_tset utf8equivterms; // kept as utf8 string for fast matching
269
270
271// This callback is called once for each term in the query
272int termfreqcallback(char *Word, int ULen, int Freq,
273 float /*Weight*/, void *info) {
274 queryresultsclass *queryresults = (queryresultsclass *)info;
275 if (queryresults == NULL) return 0;
276
277 text_t term;
278 term.setcarr(Word, ULen);
279 termfreqclass termfreq;
280
281 termfreq.termstr = to_uni(term);
282 text_t utf8termstem = mgsearch_stemword (term);
283 termfreq.termstemstr = to_uni (utf8termstem);
284
285 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
286
287 termfreq.termfreq = Freq;
288 queryresults->orgterms.push_back(termfreq);
289
290 return 0;
291}
292
293// this callback is called once for each variation of each term
294int termvariantscallback(char *Word, int ULen, int /*Freq*/,
295 float /*Weight*/, void *info) {
296
297 text_t term;
298 term.setcarr(Word, ULen);
299 queryresultsclass *queryresults = (queryresultsclass *)info;
300 queryresults->termvariants.insert(to_uni(term));
301
302 return 0;
303}
304
305// This callback is for getting document text
306int doctextcallback(char *Doc, int ULen, int /*Freq*/,
307 float /*Weight*/, void * /*info*/) {
308 tempdoc = Doc;
309 templen = ULen;
310
311 return 0;
312}
313
314
315static text_t getindexsuffix (const text_t &collection,
316 const text_t &index) {
317
318 text_t indexsuffix = "index";
319 indexsuffix = filename_cat (indexsuffix, index);
320 indexsuffix = filename_cat (indexsuffix, collection);
321 return indexsuffix;
322}
323
324
325
326
327////////////////////
328// mgsearch class //
329////////////////////
330
331mgsearchclass::mgsearchclass ()
332 : searchclass() {
333
334}
335
336mgsearchclass::~mgsearchclass ()
337{
338 if (cache != NULL)
339 {
340 delete cache;
341 cache = NULL;
342 }
343}
344
345// you only need to use this function before doing any stemming
346// casefolding and stemming will be set if values for them are
347// provided (0 or 1).
348// makeindexcurrent returns true if it was able to load the database
349bool mgsearchclass::makeindexcurrent (const text_t &index,
350 const text_t &subcollection,
351 const text_t &language,
352 const text_t &collection,
353 int casefolding,
354 int stemming) {
355 bool databaseloaded = true;
356
357 // get the names of the collection, index and text suffixes
358 char *ccollection = collection.getcstr();
359 assert (ccollection != NULL);
360 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
361 assert (idxsuffix != NULL);
362 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
363 assert (txtsuffix != NULL);
364
365#ifdef __WIN32__
366 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
367#else
368 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
369#endif
370
371 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
372 if (casefolding == 0) mgq_ask(".set casefold off");
373 else if (casefolding > 0) mgq_ask(".set casefold on");
374 if (stemming == 0) mgq_ask(".set stem off");
375 else if (stemming > 0) mgq_ask(".set stem on");
376
377 } else databaseloaded = false;
378
379 // free up the c strings
380 delete ccollection;
381 delete idxsuffix;
382 delete txtsuffix;
383 delete ccollectdir;
384
385 return databaseloaded;
386}
387
388
389// stem word uses the values set in the last call to makeindexcurrent
390// to stem the word. It is assumed that word is in unicode
391text_t mgsearchclass::stemword (const text_t &word) {
392 return to_uni (mgsearch_stemword (to_utf8 (word)));
393}
394
395text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
396 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
397}
398
399/**
400 * search directs the whole execution of the search; a number of other
401 * functions in this class are called as a result, and precondition
402 * checks are also made
403 */
404bool mgsearchclass::search(const queryparamclass &queryparams,
405 queryresultsclass &queryresults) {
406 // assert (cache != NULL);
407
408 // clear any previous results
409 queryresults.clear();
410 // first check the cache
411 if (cache != NULL) {
412 if (cache->find(queryparams, queryresults)) return true;
413 }
414 // make sure there is a query to be processed
415 if (!has_unicode_letdig(queryparams.querystring)) return true;
416
417 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
418 queryparams.language, queryparams.collection)) {
419 // initialise the form of results
420 setsearchmode (queryparams);
421
422 // execute the query
423 submitquery (queryparams);
424
425 // retrieve the results
426 getresults (queryparams, queryresults);
427
428 return true;
429 }
430
431 return false;
432}
433
434/* accumulator_method has been changed to use array rather than list.
435list appears to be broken somewhat - for some ranked queries, it returned
436fewer results than it should have (eg 45 instead of 50). The three other
437methods (array, splay_tree, hash_table) all return the same number of
438documents, in the same order, with the same ranks. list returns what
439appears to be the same documents (but less of them), but with different ranks,
440and in a different order. Minimal time tests dont show any speed improvement
441of list over array (maybe because its broken??). [02/2001, kjm18]
442*/
443void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
444{
445 mgq_ask(".set expert true");
446 mgq_ask(".set sorted_terms true");
447 mgq_ask(".set accumulator_method array");
448 mgq_ask(".set max_accumulators 500000");
449 mgq_ask(".set maxparas 500000");
450 mgq_ask(".set verbatim true");
451 mgq_ask(".unset skip_dump");
452 mgq_ask(".set mode docnums");
453
454 switch (queryparams.search_type)
455 {
456 case 0: mgq_ask(".set query boolean"); break;
457 case 1: mgq_ask(".set query ranked"); break;
458 }
459 switch (queryparams.casefolding)
460 {
461 case 1: mgq_ask(".set casefold on"); break;
462 case 0: mgq_ask(".set casefold off"); break;
463 }
464 switch (queryparams.stemming)
465 {
466 case 1: mgq_ask(".set stem on"); break;
467 case 0: mgq_ask(".set stem off"); break;
468 }
469 mgq_ask(".set heads_length 150");
470
471 if (queryparams.maxdocs == -1) {
472 mgq_ask(".set maxdocs all");
473 } else {
474 char maxdocstr[32];
475 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
476 mgq_ask(maxdocstr);
477 }
478}
479
480/**
481 * submitquery constructs the query string (into UTF8 encoding)
482 * and submits it using mgq_ask to the mg search engine. Most
483 * of the processing will be done inside Greenstone
484 */
485void mgsearchclass::submitquery (const queryparamclass &queryparams)
486{
487 // sort out the query string; copy it, remove all special characters
488 // and then convert it to a string in UTF8 format
489 text_t ttquerystring = queryparams.querystring;
490 filterquery (ttquerystring);
491 char *querystring = to_utf8(ttquerystring).getcstr();
492
493 // submit the query
494 mgq_ask(querystring);
495
496 // destroy the temporary character array
497 delete querystring;
498}
499
500/**
501 * getrults is called to retrieve the required data on the docs
502 * which responded to the query submitted in submitquery above.
503 *
504 * It calls the local mgquery (mgq) interface to MG several times,
505 * to obtain the document numbers, term frequencies, term variants
506 * etc. All processing of the query will be done by Greenstone
507 * thereafter
508 */
509void mgsearchclass::getresults (const queryparamclass &queryparams,
510 queryresultsclass &queryresults) {
511 // get the configuration for the maximum number of documents to
512 // retrieve
513 int howmany = queryparams.maxdocs;
514 if (howmany == -1) howmany = MAXNUMDOCS;
515 mgq_results(result_docnums, 0, howmany,
516 ourquerycallback, (void *)(&queryresults));
517
518 // get the term frequencies
519 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
520 termfreqcallback, (void *)(&queryresults));
521 queryresults.sortuniqqueryterms();
522
523 // get term variants
524 mgq_results(result_terms, 0, MAXNUMTERMS,
525 termvariantscallback, (void *)(&queryresults));
526
527 // get the number of documents retrieved
528 int total_retrieved = 0, is_approx = 0;
529 mgq_docsretrieved (&total_retrieved, &is_approx);
530
531 if (total_retrieved == 0) {
532 // not available (or really was zero)
533 queryresults.docs_matched = queryresults.docs.docset.size();
534 if ((queryparams.maxdocs == -1) ||
535 (queryresults.docs_matched < queryparams.maxdocs))
536 queryresults.is_approx = Exact;
537 else
538 queryresults.is_approx = MoreThan;
539 } else {
540 queryresults.docs_matched = total_retrieved;
541 if (is_approx) queryresults.is_approx = Approximate;
542 else queryresults.is_approx = Exact;
543 }
544}
545
546/**
547 * Tidies the given querystring, removing special characters
548 */
549void mgsearchclass::filterquery (text_t &ttquerystring) {
550 text_t::iterator ithere = ttquerystring.begin ();
551 text_t::iterator itend = ttquerystring.end ();
552
553 // remove all non alphanumeric characters (except
554 // boolean operators
555 while (ithere != itend) {
556 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
557 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
558 (*ithere != ')')) (*ithere) = ' ';
559 ithere++;
560 }
561}
562
563
564// the document text for 'docnum' is placed in 'output'
565// docTargetDocument returns 'true' if it was able to
566// try to get a document
567// collection is needed to see if an index from the
568// collection is loaded. If no index has been loaded
569// defaultindex is needed to load one
570bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
571 const text_t &defaultsubcollection,
572 const text_t &defaultlanguage,
573 const text_t &collection,
574 int docnum,
575 text_t &output) {
576 output.clear();
577
578 // get the mg version of the document
579 char *mgdoc = NULL;
580 int doclen = 0;
581 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
582 collection, docnum, mgdoc, doclen)) return false;
583 if (mgdoc == NULL) return false;
584
585 // replace all control-Cs with spaces
586 char *mgdoc_here = mgdoc;
587 char *mgdoc_end = mgdoc + doclen;
588 while (mgdoc_here < mgdoc_end) {
589 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
590 mgdoc_here++;
591 }
592
593 // convert this document to unicode
594 utf8inconvertclass inconvert;
595 convertclass::status_t status;
596 inconvert.reset ();
597 inconvert.setinput (mgdoc, doclen);
598 inconvert.convert (output, status);
599
600 return true;
601}
602
603
604bool mgsearchclass::mgdocument (const text_t &defaultindex,
605 const text_t &defaultsubcollection,
606 const text_t &defaultlanguage,
607 const text_t &collection,
608 int docnum,
609 char *&UDoc, int &ULen) {
610 int databaseloaded = 0;
611
612 UDoc = NULL; ULen = 0;
613
614 // see if we can make an appropriate database current
615// char *ccollection = collection.getcstr();
616// assert (ccollection != NULL);
617// databaseloaded = load_text_database (ccollection);
618// delete ccollection;
619
620 // try and load the database
621// if (!databaseloaded)
622 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
623 defaultlanguage, collection);
624
625 if (databaseloaded) {
626 // retrieve the document from mg
627 char docstr[32];
628 sprintf(docstr, "%i", docnum);
629
630 mgq_ask(".set mode text");
631 mgq_ask(".set query docnums");
632 mgq_ask(docstr);
633
634 tempdoc = NULL;
635 templen = 0;
636 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
637 UDoc = tempdoc;
638 ULen = templen;
639 }
640
641 return (bool)databaseloaded;
642}
643
Note: See TracBrowser for help on using the repository browser.