source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1860

Last change on this file since 1860 was 1860, checked in by cs025, 23 years ago

Included CORBA branch for first time

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 1860 2001-01-25 18:26:45Z cs025 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.28 2001/01/25 18:26:44 cs025
31 Included CORBA branch for first time
32
33 Revision 1.22.2.1 2000/04/04 15:02:32 cs025
34 Corba first commit
35
36 Revision 1.22 1999/09/24 02:41:21 rjmcnab
37 change to use has_unicode_letdig in text_t
38
39 Revision 1.21 1999/09/21 21:41:41 sjboddie
40 fixed an error in what I committed last
41
42 Revision 1.20 1999/09/21 11:59:26 sjboddie
43 added Maxdocs queryfilter option (which may be -1 for 'all)
44
45 Revision 1.19 1999/09/07 22:52:52 rjmcnab
46 Seems to be an error in mg for retrieving documents using a paragraph
47 based index for some cases. Just added a work around (loads the default
48 index every time).
49
50 Revision 1.18 1999/09/07 04:57:22 sjboddie
51 added gpl notice
52
53 Revision 1.17 1999/08/31 22:42:41 rjmcnab
54 A couple of minor things.
55
56 Revision 1.16 1999/08/25 04:51:06 sjboddie
57 small change to allow for searching using boolean operators
58
59 Revision 1.15 1999/07/16 08:35:03 rjmcnab
60 Fixed a weird bug to do with a faulty case statement.
61
62 Revision 1.14 1999/07/16 03:42:22 sjboddie
63 changed isApprox
64
65 Revision 1.13 1999/07/16 00:12:46 sjboddie
66 removed all the old post-processing stuff
67
68 Revision 1.12 1999/07/07 06:17:47 rjmcnab
69 broke search_index into index+subcollection+language
70 within mgsearch
71
72 Revision 1.11 1999/07/05 21:06:43 rjmcnab
73 Disabled quoted strings.
74
75 Revision 1.10 1999/07/01 09:29:19 rjmcnab
76 Changes for better reporting of number documents which match a query. Changes
77 should still work as before with older versions of mg.
78
79 Revision 1.9 1999/07/01 03:54:48 rjmcnab
80 Added code to plug in the equivalent terms of each of the query terms.
81 Also added a function to get a raw utf8 encoded mg document (for speeding
82 up a phrase matching function)
83
84 Revision 1.8 1999/06/30 04:04:12 rjmcnab
85 made stemming functions available from mgsearch and made the stems
86 for the query terms available in queryinfo
87
88 Revision 1.7 1999/06/27 22:07:27 sjboddie
89 got rid of all the old functions for dealing with dir indexes
90
91 Revision 1.6 1999/06/09 00:41:32 sjboddie
92 phrase searching now uses case-folding if it's turned on
93
94 Revision 1.5 1999/02/21 22:31:35 rjmcnab
95
96 Removed locateinfo.
97
98 Revision 1.4 1999/02/03 01:13:27 sjboddie
99
100 Got interface to handle subcollections and language subcollections -
101 committed changes made to some of the collections
102
103 Revision 1.3 1999/01/19 01:38:17 rjmcnab
104
105 Made the source more portable.
106
107 Revision 1.2 1999/01/12 01:51:02 rjmcnab
108
109 Standard header.
110
111 Revision 1.1 1999/01/08 09:02:16 rjmcnab
112
113 Moved from src/library.
114
115 */
116
117#include "gsdlconf.h"
118#include "mgsearch.h"
119#include "fileutil.h"
120
121#include <string.h>
122#include <stdio.h>
123#include <stdlib.h>
124#include <ctype.h>
125
126#if defined(GSDL_USE_OBJECTSPACE)
127# include <ospace\std\iostream>
128#elif defined(GSDL_USE_IOS_H)
129# include <iostream.h>
130#else
131# include <iostream>
132#endif
133
134#if defined(__WIN32__)
135// gdbm stuff
136# include "autoconf.h"
137# include "systems.h"
138# include "gdbmconst.h"
139# include "gdbm.h"
140#else
141# include <gdbm.h>
142#endif
143
144
145#include <assert.h>
146
147#include "mgq.h"
148// #include "locateinfo.h"
149#include "gsdlunicode.h"
150#include "unitool.h"
151
152
153/////////////
154// globals //
155/////////////
156
// Pointer to, and length of, the document text most recently handed to
// doctextcallback. mgsearchclass::mgdocument resets both before calling
// mgq_results and copies them out afterwards.
157static char *tempdoc = NULL;
158static int templen = 0;
159
160
161//////////////////////
162// useful functions //
163//////////////////////
164
165
166// input and output are in utf8
167text_t mgsearch_stemword (const text_t &word) {
168 // allocate working stem space
169 int maxstemlen = mgq_getmaxstemlen ();
170 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
171 if (word_stem == NULL) return "";
172
173 // copy word to word_stem
174 int len = 0;
175 text_t::const_iterator here = word.begin();
176 text_t::const_iterator end = word.end();
177 while (len < maxstemlen && here != end) {
178 word_stem[len+1] = (unsigned char)(*here);
179 len++; here++;
180 }
181 word_stem[len+1] = '\0';
182 word_stem[0] = len;
183
184 mgq_stemword (word_stem);
185
186 // copy word_stem back to tempstr
187 text_t tempstr;
188 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
189
190 delete [] word_stem;
191
192 return tempstr;
193}
194
195
196
197////////////////////////
198// callback functions //
199////////////////////////
200
201// This routine is called for each document found in a search
202// it assumes that cache_num is set up correctly to point to
203// a suitable result cache
204int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
205 float Weight, void *info) {
206
207
208 queryresultsclass *queryresults = (queryresultsclass * )info;
209
210 // append this entry to the document results
211 docresultclass docresult;
212 docresult.docnum = DocNum;
213 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
214 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
215
216 queryresults->docs.docset[DocNum] = docresult;
217 queryresults->docs.docorder.push_back(DocNum);
218
219 return 0;
220}
221
222int termequivcallback(char *Word, int ULen, int /*Freq*/,
223 float /*Weight*/, void *info) {
224 text_tset *equivterms = (text_tset *)info;
225 if (equivterms == NULL) return 0;
226
227 text_t thisterm;
228 thisterm.setcarr(Word, ULen);
229
230 equivterms->insert(thisterm);
231
232 return 0;
233}
234
235
236void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
237 // allocate working stem space
238 int maxstemlen = mgq_getmaxstemlen ();
239 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
240 if (word_stem == NULL) return;
241
242 // copy word to word_stem
243 int len = 0;
244 text_t::const_iterator here = word.begin();
245 text_t::const_iterator end = word.end();
246 while (len < maxstemlen && here != end) {
247 word_stem[len+1] = (unsigned char)(*here);
248 len++; here++;
249 }
250 word_stem[len+1] = '\0';
251 word_stem[0] = len;
252
253 // get the equivalent terms
254 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
255
256 delete [] word_stem;
257
258 return;
259}
260
// NOTE(review): this file-scope set is not read or written anywhere in the
// visible part of this file; termfreqcallback fills the utf8equivterms
// member of termfreqclass instead. Confirm that no other translation unit
// refers to it before removing it.
261 text_tset utf8equivterms; // kept as utf8 string for fast matching
262
263
264// This callback is called once for each term in the query
265int termfreqcallback(char *Word, int ULen, int Freq,
266 float /*Weight*/, void *info) {
267 queryresultsclass *queryresults = (queryresultsclass *)info;
268 if (queryresults == NULL) return 0;
269
270 text_t term;
271 term.setcarr(Word, ULen);
272 termfreqclass termfreq;
273
274 termfreq.termstr = to_uni(term);
275 text_t utf8termstem = mgsearch_stemword (term);
276 termfreq.termstemstr = to_uni (utf8termstem);
277
278 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
279
280 termfreq.termfreq = Freq;
281 queryresults->orgterms.push_back(termfreq);
282
283 return 0;
284}
285
286// this callback is called once for each variation of each term
287int termvariantscallback(char *Word, int ULen, int /*Freq*/,
288 float /*Weight*/, void *info) {
289
290 text_t term;
291 term.setcarr(Word, ULen);
292 queryresultsclass *queryresults = (queryresultsclass *)info;
293 queryresults->termvariants.insert(to_uni(term));
294
295 return 0;
296}
297
298// This callback is for getting document text
299int doctextcallback(char *Doc, int ULen, int /*Freq*/,
300 float /*Weight*/, void * /*info*/) {
301 tempdoc = Doc;
302 templen = ULen;
303
304 return 0;
305}
306
307
308static text_t getindexsuffix (const text_t &collection,
309 const text_t &index) {
310
311 text_t indexsuffix = "index";
312 indexsuffix = filename_cat (indexsuffix, index);
313 indexsuffix = filename_cat (indexsuffix, collection);
314 return indexsuffix;
315}
316
317
318
319
320////////////////////
321// mgsearch class //
322////////////////////
323
324mgsearchclass::mgsearchclass ()
325 : searchclass() {
326
327}
328
329mgsearchclass::~mgsearchclass ()
330{
331 if (cache != NULL)
332 {
333 delete cache;
334 cache = NULL;
335 }
336}
337
338// you only need to use this function before doing any stemming
339// casefolding and stemming will be set if values for them are
340// provided (0 or 1).
341// makeindexcurrent returns true if it was able to load the database
342bool mgsearchclass::makeindexcurrent (const text_t &index,
343 const text_t &subcollection,
344 const text_t &language,
345 const text_t &collection,
346 int casefolding,
347 int stemming) {
348 bool databaseloaded = true;
349
350 // get the names of the collection, index and text suffixes
351 char *ccollection = collection.getcstr();
352 assert (ccollection != NULL);
353 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
354 assert (idxsuffix != NULL);
355 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
356 assert (txtsuffix != NULL);
357
358#ifdef __WIN32__
359 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
360#else
361 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
362#endif
363
364 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
365 if (casefolding == 0) mgq_ask(".set casefold off");
366 else if (casefolding > 0) mgq_ask(".set casefold on");
367 if (stemming == 0) mgq_ask(".set stem off");
368 else if (stemming > 0) mgq_ask(".set stem on");
369
370 } else databaseloaded = false;
371
372 // free up the c strings
373 delete ccollection;
374 delete idxsuffix;
375 delete txtsuffix;
376 delete ccollectdir;
377
378 return databaseloaded;
379}
380
381
382// stem word uses the values set in the last call to makeindexcurrent
383// to stem the word. It is assumed that word is in unicode
384text_t mgsearchclass::stemword (const text_t &word) {
385 return to_uni (mgsearch_stemword (to_utf8 (word)));
386}
387
388text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
389 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
390}
391
392/**
393 * search directs the whole execution of the search; a number of other
394 * functions in this class are called as a result, and precondition
395 * checks are also made
396 */
397bool mgsearchclass::search(const queryparamclass &queryparams,
398 queryresultsclass &queryresults) {
399 // assert (cache != NULL);
400
401 // clear any previous results
402 queryresults.clear();
403 // first check the cache
404 if (cache != NULL) {
405 if (cache->find(queryparams, queryresults)) return true;
406 }
407 // make sure there is a query to be processed
408 if (!has_unicode_letdig(queryparams.querystring)) return true;
409
410 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
411 queryparams.language, queryparams.collection)) {
412 // initialise the form of results
413 setsearchmode (queryparams);
414
415 // execute the query
416 submitquery (queryparams);
417
418 // retrieve the results
419 getresults (queryparams, queryresults);
420
421 return true;
422 }
423
424 return false;
425}
426
427
428void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
429{
430 mgq_ask(".set expert true");
431 mgq_ask(".set sorted_terms true");
432 mgq_ask(".set accumulator_method list");
433 mgq_ask(".set max_accumulators 500000");
434 mgq_ask(".set maxparas 500000");
435 mgq_ask(".set verbatim true");
436 mgq_ask(".unset skip_dump");
437 mgq_ask(".set mode docnums");
438
439 switch (queryparams.search_type)
440 {
441 case 0: mgq_ask(".set query boolean"); break;
442 case 1: mgq_ask(".set query ranked"); break;
443 }
444 switch (queryparams.casefolding)
445 {
446 case 1: mgq_ask(".set casefold on"); break;
447 case 0: mgq_ask(".set casefold off"); break;
448 }
449 switch (queryparams.stemming)
450 {
451 case 1: mgq_ask(".set stem on"); break;
452 case 0: mgq_ask(".set stem off"); break;
453 }
454 mgq_ask(".set heads_length 150");
455
456 if (queryparams.maxdocs == -1) {
457 mgq_ask(".set maxdocs all");
458 } else {
459 char maxdocstr[32];
460 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
461 mgq_ask(maxdocstr);
462 }
463}
464
465/**
466 * submitquery constructs the query string (into UTF8 encoding)
467 * and submits it using mgq_ask to the mg search engine. Most
468 * of the processing will be done inside Greenstone
469 */
470void mgsearchclass::submitquery (const queryparamclass &queryparams)
471{
472 // sort out the query string; copy it, remove all special characters
473 // and then convert it to a string in UTF8 format
474 text_t ttquerystring = queryparams.querystring;
475 filterquery (ttquerystring);
476 char *querystring = to_utf8(ttquerystring).getcstr();
477
478 // submit the query
479 mgq_ask(querystring);
480
481 // destroy the temporary character array
482 delete querystring;
483}
484
485/**
486 * getrults is called to retrieve the required data on the docs
487 * which responded to the query submitted in submitquery above.
488 *
489 * It calls the local mgquery (mgq) interface to MG several times,
490 * to obtain the document numbers, term frequencies, term variants
491 * etc. All processing of the query will be done by Greenstone
492 * thereafter
493 */
494void mgsearchclass::getresults (const queryparamclass &queryparams,
495 queryresultsclass &queryresults) {
496 // get the configuration for the maximum number of documents to
497 // retrieve
498 int howmany = queryparams.maxdocs;
499 if (howmany == -1) howmany = MAXNUMDOCS;
500 mgq_results(result_docnums, 0, howmany,
501 ourquerycallback, (void *)(&queryresults));
502
503 // get the term frequencies
504 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
505 termfreqcallback, (void *)(&queryresults));
506 queryresults.sortuniqqueryterms();
507
508 // get term variants
509 mgq_results(result_terms, 0, MAXNUMTERMS,
510 termvariantscallback, (void *)(&queryresults));
511
512 // get the number of documents retrieved
513 int total_retrieved = 0, is_approx = 0;
514 mgq_docsretrieved (&total_retrieved, &is_approx);
515
516 if (total_retrieved == 0) {
517 // not available (or really was zero)
518 queryresults.docs_matched = queryresults.docs.docset.size();
519 if ((queryparams.maxdocs == -1) ||
520 (queryresults.docs_matched < queryparams.maxdocs))
521 queryresults.is_approx = Exact;
522 else
523 queryresults.is_approx = MoreThan;
524 } else {
525 queryresults.docs_matched = total_retrieved;
526 if (is_approx) queryresults.is_approx = Approximate;
527 else queryresults.is_approx = Exact;
528 }
529}
530
531/**
532 * Tidies the given querystring, removing special characters
533 */
534void mgsearchclass::filterquery (text_t &ttquerystring) {
535 text_t::iterator ithere = ttquerystring.begin ();
536 text_t::iterator itend = ttquerystring.end ();
537
538 // remove all non alphanumeric characters (except
539 // boolean operators
540 while (ithere != itend) {
541 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
542 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
543 (*ithere != ')')) (*ithere) = ' ';
544 ithere++;
545 }
546}
547
548
549// the document text for 'docnum' is placed in 'output'
550// docTargetDocument returns 'true' if it was able to
551// try to get a document
552// collection is needed to see if an index from the
553// collection is loaded. If no index has been loaded
554// defaultindex is needed to load one
555bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
556 const text_t &defaultsubcollection,
557 const text_t &defaultlanguage,
558 const text_t &collection,
559 int docnum,
560 text_t &output) {
561 output.clear();
562
563 // get the mg version of the document
564 char *mgdoc = NULL;
565 int doclen = 0;
566 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
567 collection, docnum, mgdoc, doclen)) return false;
568 if (mgdoc == NULL) return false;
569
570 // replace all control-Cs with spaces
571 char *mgdoc_here = mgdoc;
572 char *mgdoc_end = mgdoc + doclen;
573 while (mgdoc_here < mgdoc_end) {
574 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
575 mgdoc_here++;
576 }
577
578 // convert this document to unicode
579 utf8inconvertclass inconvert;
580 convertclass::status_t status;
581 inconvert.reset ();
582 inconvert.setinput (mgdoc, doclen);
583 inconvert.convert (output, status);
584
585 return true;
586}
587
588
589bool mgsearchclass::mgdocument (const text_t &defaultindex,
590 const text_t &defaultsubcollection,
591 const text_t &defaultlanguage,
592 const text_t &collection,
593 int docnum,
594 char *&UDoc, int &ULen) {
595 int databaseloaded = 0;
596
597 UDoc = NULL; ULen = 0;
598
599 // see if we can make an appropriate database current
600// char *ccollection = collection.getcstr();
601// assert (ccollection != NULL);
602// databaseloaded = load_text_database (ccollection);
603// delete ccollection;
604
605 // try and load the database
606// if (!databaseloaded)
607 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
608 defaultlanguage, collection);
609
610 if (databaseloaded) {
611 // retrieve the document from mg
612 char docstr[32];
613 sprintf(docstr, "%i", docnum);
614
615 mgq_ask(".set mode text");
616 mgq_ask(".set query docnums");
617 mgq_ask(docstr);
618
619 tempdoc = NULL;
620 templen = 0;
621 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
622 UDoc = tempdoc;
623 ULen = templen;
624 }
625
626 return (bool)databaseloaded;
627}
628
Note: See TracBrowser for help on using the repository browser.