source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1987

Last change on this file since 1987 was 1987, checked in by kjm18, 23 years ago

changed accumulator_method for mg to be array rather than list - it was
getting some weird results with ranked searches

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.4 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 1987 2001-02-15 03:57:02Z kjm18 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.29 2001/02/15 03:57:02 kjm18
31 changed accumulator_method for mg to be array rather than list - it was
32 getting some weird results with ranked searches
33
34 Revision 1.28 2001/01/25 18:26:44 cs025
35 Included CORBA branch for first time
36
37 Revision 1.22.2.1 2000/04/04 15:02:32 cs025
38 Corba first commit
39
40 Revision 1.22 1999/09/24 02:41:21 rjmcnab
41 change to use has_unicode_letdig in text_t
42
43 Revision 1.21 1999/09/21 21:41:41 sjboddie
44 fixed an error in what I committed last
45
46 Revision 1.20 1999/09/21 11:59:26 sjboddie
47 added Maxdocs queryfilter option (which may be -1 for 'all)
48
49 Revision 1.19 1999/09/07 22:52:52 rjmcnab
50 Seems to be an error in mg for retrieving documents using a paragraph
51 based index for some cases. Just added a work around (loads the default
52 index every time).
53
54 Revision 1.18 1999/09/07 04:57:22 sjboddie
55 added gpl notice
56
57 Revision 1.17 1999/08/31 22:42:41 rjmcnab
58 A couple of minor things.
59
60 Revision 1.16 1999/08/25 04:51:06 sjboddie
61 small change to allow for searching using boolean operators
62
63 Revision 1.15 1999/07/16 08:35:03 rjmcnab
64 Fixed a weird bug to do with a faulty case statement.
65
66 Revision 1.14 1999/07/16 03:42:22 sjboddie
67 changed isApprox
68
69 Revision 1.13 1999/07/16 00:12:46 sjboddie
70 removed all the old post-processing stuff
71
72 Revision 1.12 1999/07/07 06:17:47 rjmcnab
73 broke search_index into index+subcollection+language
74 within mgsearch
75
76 Revision 1.11 1999/07/05 21:06:43 rjmcnab
77 Disabled quoted strings.
78
79 Revision 1.10 1999/07/01 09:29:19 rjmcnab
80 Changes for better reporting of number documents which match a query. Changes
81 should still work as before with older versions of mg.
82
83 Revision 1.9 1999/07/01 03:54:48 rjmcnab
84 Added code to plug in the equivalent terms of each of the query terms.
85 Also added a function to get a raw utf8 encoded mg document (for speeding
86 up a phrase matching function)
87
88 Revision 1.8 1999/06/30 04:04:12 rjmcnab
89 made stemming functions available from mgsearch and made the stems
90 for the query terms available in queryinfo
91
92 Revision 1.7 1999/06/27 22:07:27 sjboddie
93 got rid of all the old functions for dealing with dir indexes
94
95 Revision 1.6 1999/06/09 00:41:32 sjboddie
96 phrase searching now uses case-folding if it's turned on
97
98 Revision 1.5 1999/02/21 22:31:35 rjmcnab
99
100 Removed locateinfo.
101
102 Revision 1.4 1999/02/03 01:13:27 sjboddie
103
104 Got interface to handle subcollections and language subcollections -
105 committed changes made to some of the collections
106
107 Revision 1.3 1999/01/19 01:38:17 rjmcnab
108
109 Made the source more portable.
110
111 Revision 1.2 1999/01/12 01:51:02 rjmcnab
112
113 Standard header.
114
115 Revision 1.1 1999/01/08 09:02:16 rjmcnab
116
117 Moved from src/library.
118
119 */
120
121#include "gsdlconf.h"
122#include "mgsearch.h"
123#include "fileutil.h"
124
125#include <string.h>
126#include <stdio.h>
127#include <stdlib.h>
128#include <ctype.h>
129
130#if defined(GSDL_USE_OBJECTSPACE)
131# include <ospace\std\iostream>
132#elif defined(GSDL_USE_IOS_H)
133# include <iostream.h>
134#else
135# include <iostream>
136#endif
137
138#if defined(__WIN32__)
139// gdbm stuff
140# include "autoconf.h"
141# include "systems.h"
142# include "gdbmconst.h"
143# include "gdbm.h"
144#else
145# include <gdbm.h>
146#endif
147
148
149#include <assert.h>
150
151#include "mgq.h"
152// #include "locateinfo.h"
153#include "gsdlunicode.h"
154#include "unitool.h"
155
156
157/////////////
158// globals //
159/////////////
160
161static char *tempdoc = NULL;
162static int templen = 0;
163
164
165//////////////////////
166// useful functions //
167//////////////////////
168
169
170// input and output are in utf8
171text_t mgsearch_stemword (const text_t &word) {
172 // allocate working stem space
173 int maxstemlen = mgq_getmaxstemlen ();
174 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
175 if (word_stem == NULL) return "";
176
177 // copy word to word_stem
178 int len = 0;
179 text_t::const_iterator here = word.begin();
180 text_t::const_iterator end = word.end();
181 while (len < maxstemlen && here != end) {
182 word_stem[len+1] = (unsigned char)(*here);
183 len++; here++;
184 }
185 word_stem[len+1] = '\0';
186 word_stem[0] = len;
187
188 mgq_stemword (word_stem);
189
190 // copy word_stem back to tempstr
191 text_t tempstr;
192 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
193
194 delete [] word_stem;
195
196 return tempstr;
197}
198
199
200
201////////////////////////
202// callback functions //
203////////////////////////
204
205// This routine is called for each document found in a search
206// it assumes that cache_num is set up correctly to point to
207// a suitable result cache
208int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
209 float Weight, void *info) {
210
211
212 queryresultsclass *queryresults = (queryresultsclass * )info;
213
214 // append this entry to the document results
215 docresultclass docresult;
216 docresult.docnum = DocNum;
217 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
218 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
219
220 queryresults->docs.docset[DocNum] = docresult;
221 queryresults->docs.docorder.push_back(DocNum);
222
223 return 0;
224}
225
226int termequivcallback(char *Word, int ULen, int /*Freq*/,
227 float /*Weight*/, void *info) {
228 text_tset *equivterms = (text_tset *)info;
229 if (equivterms == NULL) return 0;
230
231 text_t thisterm;
232 thisterm.setcarr(Word, ULen);
233
234 equivterms->insert(thisterm);
235
236 return 0;
237}
238
239
240void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
241 // allocate working stem space
242 int maxstemlen = mgq_getmaxstemlen ();
243 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
244 if (word_stem == NULL) return;
245
246 // copy word to word_stem
247 int len = 0;
248 text_t::const_iterator here = word.begin();
249 text_t::const_iterator end = word.end();
250 while (len < maxstemlen && here != end) {
251 word_stem[len+1] = (unsigned char)(*here);
252 len++; here++;
253 }
254 word_stem[len+1] = '\0';
255 word_stem[0] = len;
256
257 // get the equivalent terms
258 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
259
260 delete [] word_stem;
261
262 return;
263}
264
265 text_tset utf8equivterms; // kept as utf8 string for fast matching
266
267
268// This callback is called once for each term in the query
269int termfreqcallback(char *Word, int ULen, int Freq,
270 float /*Weight*/, void *info) {
271 queryresultsclass *queryresults = (queryresultsclass *)info;
272 if (queryresults == NULL) return 0;
273
274 text_t term;
275 term.setcarr(Word, ULen);
276 termfreqclass termfreq;
277
278 termfreq.termstr = to_uni(term);
279 text_t utf8termstem = mgsearch_stemword (term);
280 termfreq.termstemstr = to_uni (utf8termstem);
281
282 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
283
284 termfreq.termfreq = Freq;
285 queryresults->orgterms.push_back(termfreq);
286
287 return 0;
288}
289
290// this callback is called once for each variation of each term
291int termvariantscallback(char *Word, int ULen, int /*Freq*/,
292 float /*Weight*/, void *info) {
293
294 text_t term;
295 term.setcarr(Word, ULen);
296 queryresultsclass *queryresults = (queryresultsclass *)info;
297 queryresults->termvariants.insert(to_uni(term));
298
299 return 0;
300}
301
302// This callback is for getting document text
303int doctextcallback(char *Doc, int ULen, int /*Freq*/,
304 float /*Weight*/, void * /*info*/) {
305 tempdoc = Doc;
306 templen = ULen;
307
308 return 0;
309}
310
311
312static text_t getindexsuffix (const text_t &collection,
313 const text_t &index) {
314
315 text_t indexsuffix = "index";
316 indexsuffix = filename_cat (indexsuffix, index);
317 indexsuffix = filename_cat (indexsuffix, collection);
318 return indexsuffix;
319}
320
321
322
323
324////////////////////
325// mgsearch class //
326////////////////////
327
328mgsearchclass::mgsearchclass ()
329 : searchclass() {
330
331}
332
333mgsearchclass::~mgsearchclass ()
334{
335 if (cache != NULL)
336 {
337 delete cache;
338 cache = NULL;
339 }
340}
341
342// you only need to use this function before doing any stemming
343// casefolding and stemming will be set if values for them are
344// provided (0 or 1).
345// makeindexcurrent returns true if it was able to load the database
346bool mgsearchclass::makeindexcurrent (const text_t &index,
347 const text_t &subcollection,
348 const text_t &language,
349 const text_t &collection,
350 int casefolding,
351 int stemming) {
352 bool databaseloaded = true;
353
354 // get the names of the collection, index and text suffixes
355 char *ccollection = collection.getcstr();
356 assert (ccollection != NULL);
357 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
358 assert (idxsuffix != NULL);
359 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
360 assert (txtsuffix != NULL);
361
362#ifdef __WIN32__
363 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
364#else
365 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
366#endif
367
368 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
369 if (casefolding == 0) mgq_ask(".set casefold off");
370 else if (casefolding > 0) mgq_ask(".set casefold on");
371 if (stemming == 0) mgq_ask(".set stem off");
372 else if (stemming > 0) mgq_ask(".set stem on");
373
374 } else databaseloaded = false;
375
376 // free up the c strings
377 delete ccollection;
378 delete idxsuffix;
379 delete txtsuffix;
380 delete ccollectdir;
381
382 return databaseloaded;
383}
384
385
386// stem word uses the values set in the last call to makeindexcurrent
387// to stem the word. It is assumed that word is in unicode
388text_t mgsearchclass::stemword (const text_t &word) {
389 return to_uni (mgsearch_stemword (to_utf8 (word)));
390}
391
392text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
393 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
394}
395
396/**
397 * search directs the whole execution of the search; a number of other
398 * functions in this class are called as a result, and precondition
399 * checks are also made
400 */
401bool mgsearchclass::search(const queryparamclass &queryparams,
402 queryresultsclass &queryresults) {
403 // assert (cache != NULL);
404
405 // clear any previous results
406 queryresults.clear();
407 // first check the cache
408 if (cache != NULL) {
409 if (cache->find(queryparams, queryresults)) return true;
410 }
411 // make sure there is a query to be processed
412 if (!has_unicode_letdig(queryparams.querystring)) return true;
413
414 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
415 queryparams.language, queryparams.collection)) {
416 // initialise the form of results
417 setsearchmode (queryparams);
418
419 // execute the query
420 submitquery (queryparams);
421
422 // retrieve the results
423 getresults (queryparams, queryresults);
424
425 return true;
426 }
427
428 return false;
429}
430
431
432void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
433{
434 mgq_ask(".set expert true");
435 mgq_ask(".set sorted_terms true");
436 mgq_ask(".set accumulator_method array");
437 mgq_ask(".set max_accumulators 500000");
438 mgq_ask(".set maxparas 500000");
439 mgq_ask(".set verbatim true");
440 mgq_ask(".unset skip_dump");
441 mgq_ask(".set mode docnums");
442
443 switch (queryparams.search_type)
444 {
445 case 0: mgq_ask(".set query boolean"); break;
446 case 1: mgq_ask(".set query ranked"); break;
447 }
448 switch (queryparams.casefolding)
449 {
450 case 1: mgq_ask(".set casefold on"); break;
451 case 0: mgq_ask(".set casefold off"); break;
452 }
453 switch (queryparams.stemming)
454 {
455 case 1: mgq_ask(".set stem on"); break;
456 case 0: mgq_ask(".set stem off"); break;
457 }
458 mgq_ask(".set heads_length 150");
459
460 if (queryparams.maxdocs == -1) {
461 mgq_ask(".set maxdocs all");
462 } else {
463 char maxdocstr[32];
464 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
465 mgq_ask(maxdocstr);
466 }
467}
468
469/**
470 * submitquery constructs the query string (into UTF8 encoding)
471 * and submits it using mgq_ask to the mg search engine. Most
472 * of the processing will be done inside Greenstone
473 */
474void mgsearchclass::submitquery (const queryparamclass &queryparams)
475{
476 // sort out the query string; copy it, remove all special characters
477 // and then convert it to a string in UTF8 format
478 text_t ttquerystring = queryparams.querystring;
479 filterquery (ttquerystring);
480 char *querystring = to_utf8(ttquerystring).getcstr();
481
482 // submit the query
483 mgq_ask(querystring);
484
485 // destroy the temporary character array
486 delete querystring;
487}
488
489/**
490 * getrults is called to retrieve the required data on the docs
491 * which responded to the query submitted in submitquery above.
492 *
493 * It calls the local mgquery (mgq) interface to MG several times,
494 * to obtain the document numbers, term frequencies, term variants
495 * etc. All processing of the query will be done by Greenstone
496 * thereafter
497 */
498void mgsearchclass::getresults (const queryparamclass &queryparams,
499 queryresultsclass &queryresults) {
500 // get the configuration for the maximum number of documents to
501 // retrieve
502 int howmany = queryparams.maxdocs;
503 if (howmany == -1) howmany = MAXNUMDOCS;
504 mgq_results(result_docnums, 0, howmany,
505 ourquerycallback, (void *)(&queryresults));
506
507 // get the term frequencies
508 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
509 termfreqcallback, (void *)(&queryresults));
510 queryresults.sortuniqqueryterms();
511
512 // get term variants
513 mgq_results(result_terms, 0, MAXNUMTERMS,
514 termvariantscallback, (void *)(&queryresults));
515
516 // get the number of documents retrieved
517 int total_retrieved = 0, is_approx = 0;
518 mgq_docsretrieved (&total_retrieved, &is_approx);
519
520 if (total_retrieved == 0) {
521 // not available (or really was zero)
522 queryresults.docs_matched = queryresults.docs.docset.size();
523 if ((queryparams.maxdocs == -1) ||
524 (queryresults.docs_matched < queryparams.maxdocs))
525 queryresults.is_approx = Exact;
526 else
527 queryresults.is_approx = MoreThan;
528 } else {
529 queryresults.docs_matched = total_retrieved;
530 if (is_approx) queryresults.is_approx = Approximate;
531 else queryresults.is_approx = Exact;
532 }
533}
534
535/**
536 * Tidies the given querystring, removing special characters
537 */
538void mgsearchclass::filterquery (text_t &ttquerystring) {
539 text_t::iterator ithere = ttquerystring.begin ();
540 text_t::iterator itend = ttquerystring.end ();
541
542 // remove all non alphanumeric characters (except
543 // boolean operators
544 while (ithere != itend) {
545 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
546 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
547 (*ithere != ')')) (*ithere) = ' ';
548 ithere++;
549 }
550}
551
552
553// the document text for 'docnum' is placed in 'output'
554// docTargetDocument returns 'true' if it was able to
555// try to get a document
556// collection is needed to see if an index from the
557// collection is loaded. If no index has been loaded
558// defaultindex is needed to load one
559bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
560 const text_t &defaultsubcollection,
561 const text_t &defaultlanguage,
562 const text_t &collection,
563 int docnum,
564 text_t &output) {
565 output.clear();
566
567 // get the mg version of the document
568 char *mgdoc = NULL;
569 int doclen = 0;
570 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
571 collection, docnum, mgdoc, doclen)) return false;
572 if (mgdoc == NULL) return false;
573
574 // replace all control-Cs with spaces
575 char *mgdoc_here = mgdoc;
576 char *mgdoc_end = mgdoc + doclen;
577 while (mgdoc_here < mgdoc_end) {
578 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
579 mgdoc_here++;
580 }
581
582 // convert this document to unicode
583 utf8inconvertclass inconvert;
584 convertclass::status_t status;
585 inconvert.reset ();
586 inconvert.setinput (mgdoc, doclen);
587 inconvert.convert (output, status);
588
589 return true;
590}
591
592
593bool mgsearchclass::mgdocument (const text_t &defaultindex,
594 const text_t &defaultsubcollection,
595 const text_t &defaultlanguage,
596 const text_t &collection,
597 int docnum,
598 char *&UDoc, int &ULen) {
599 int databaseloaded = 0;
600
601 UDoc = NULL; ULen = 0;
602
603 // see if we can make an appropriate database current
604// char *ccollection = collection.getcstr();
605// assert (ccollection != NULL);
606// databaseloaded = load_text_database (ccollection);
607// delete ccollection;
608
609 // try and load the database
610// if (!databaseloaded)
611 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
612 defaultlanguage, collection);
613
614 if (databaseloaded) {
615 // retrieve the document from mg
616 char docstr[32];
617 sprintf(docstr, "%i", docnum);
618
619 mgq_ask(".set mode text");
620 mgq_ask(".set query docnums");
621 mgq_ask(docstr);
622
623 tempdoc = NULL;
624 templen = 0;
625 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
626 UDoc = tempdoc;
627 ULen = templen;
628 }
629
630 return (bool)databaseloaded;
631}
632
Note: See TracBrowser for help on using the repository browser.