source: trunk/gsdl/src/colservr/mgsearch.cpp@ 615

Last change on this file since 615 was 615, checked in by sjboddie, 25 years ago

fixed an error in what I committed last

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 615 1999-09-21 21:41:41Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.21 1999/09/21 21:41:41 sjboddie
31 fixed an error in what I committed last
32
33 Revision 1.20 1999/09/21 11:59:26 sjboddie
34 added Maxdocs queryfilter option (which may be -1 for 'all)
35
36 Revision 1.19 1999/09/07 22:52:52 rjmcnab
37 Seems to be an error in mg for retrieving documents using a paragraph
38 based index for some cases. Just added a work around (loads the default
39 index every time).
40
41 Revision 1.18 1999/09/07 04:57:22 sjboddie
42 added gpl notice
43
44 Revision 1.17 1999/08/31 22:42:41 rjmcnab
45 A couple of minor things.
46
47 Revision 1.16 1999/08/25 04:51:06 sjboddie
48 small change to allow for searching using boolean operators
49
50 Revision 1.15 1999/07/16 08:35:03 rjmcnab
51 Fixed a weird bug to do with a faulty case statement.
52
53 Revision 1.14 1999/07/16 03:42:22 sjboddie
54 changed isApprox
55
56 Revision 1.13 1999/07/16 00:12:46 sjboddie
57 removed all the old post-processing stuff
58
59 Revision 1.12 1999/07/07 06:17:47 rjmcnab
60 broke search_index into index+subcollection+language
61 within mgsearch
62
63 Revision 1.11 1999/07/05 21:06:43 rjmcnab
64 Disabled quoted strings.
65
66 Revision 1.10 1999/07/01 09:29:19 rjmcnab
67 Changes for better reporting of number documents which match a query. Changes
68 should still work as before with older versions of mg.
69
70 Revision 1.9 1999/07/01 03:54:48 rjmcnab
71 Added code to plug in the equivalent terms of each of the query terms.
72 Also added a function to get a raw utf8 encoded mg document (for speeding
73 up a phrase matching function)
74
75 Revision 1.8 1999/06/30 04:04:12 rjmcnab
76 made stemming functions available from mgsearch and made the stems
77 for the query terms available in queryinfo
78
79 Revision 1.7 1999/06/27 22:07:27 sjboddie
80 got rid of all the old functions for dealing with dir indexes
81
82 Revision 1.6 1999/06/09 00:41:32 sjboddie
83 phrase searching now uses case-folding if it's turned on
84
85 Revision 1.5 1999/02/21 22:31:35 rjmcnab
86
87 Removed locateinfo.
88
89 Revision 1.4 1999/02/03 01:13:27 sjboddie
90
91 Got interface to handle subcollections and language subcollections -
92 committed changes made to some of the collections
93
94 Revision 1.3 1999/01/19 01:38:17 rjmcnab
95
96 Made the source more portable.
97
98 Revision 1.2 1999/01/12 01:51:02 rjmcnab
99
100 Standard header.
101
102 Revision 1.1 1999/01/08 09:02:16 rjmcnab
103
104 Moved from src/library.
105
106 */
107
108
109#include "gsdlconf.h"
110#include "mgsearch.h"
111#include "fileutil.h"
112
113#include <string.h>
114#include <stdio.h>
115#include <stdlib.h>
116#include <ctype.h>
117
118#if defined(GSDL_USE_OBJECTSPACE)
119# include <ospace\std\iostream>
120#elif defined(GSDL_USE_IOS_H)
121# include <iostream.h>
122#else
123# include <iostream>
124#endif
125
126#if defined(__WIN32__)
127// gdbm stuff
128# include "autoconf.h"
129# include "systems.h"
130# include "gdbmconst.h"
131# include "gdbm.h"
132#else
133# include <gdbm.h>
134#endif
135
136
137#include <assert.h>
138
139#include "mgq.h"
140// #include "locateinfo.h"
141#include "gsdlunicode.h"
142#include "unitool.h"
143
144
145/////////////
146// globals //
147/////////////
148
// Scratch slots shared between doctextcallback() and
// mgsearchclass::mgdocument(): the callback stores the document pointer and
// length here, and mgdocument() copies them out right after mgq_results()
// returns.
static char *tempdoc = NULL;
static int templen = 0;
151
152
153//////////////////////
154// useful functions //
155//////////////////////
156
157
158// input and output are in utf8
159text_t mgsearch_stemword (const text_t &word) {
160 // allocate working stem space
161 int maxstemlen = mgq_getmaxstemlen ();
162 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
163 if (word_stem == NULL) return "";
164
165 // copy word to word_stem
166 int len = 0;
167 text_t::const_iterator here = word.begin();
168 text_t::const_iterator end = word.end();
169 while (len < maxstemlen && here != end) {
170 word_stem[len+1] = (unsigned char)(*here);
171 len++; here++;
172 }
173 word_stem[len+1] = '\0';
174 word_stem[0] = len;
175
176 mgq_stemword (word_stem);
177
178 // copy word_stem back to tempstr
179 text_t tempstr;
180 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
181
182 delete [] word_stem;
183
184 return tempstr;
185}
186
187
188
189////////////////////////
190// callback functions //
191////////////////////////
192
193// This routine is called for each document found in a search
194// it assumes that cache_num is set up correctly to point to
195// a suitable result cache
196int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
197 float Weight, void *info) {
198
199
200 queryresultsclass *queryresults = (queryresultsclass * )info;
201
202 // append this entry to the document results
203 docresultclass docresult;
204 docresult.docnum = DocNum;
205 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
206 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
207
208 queryresults->docs.docset[DocNum] = docresult;
209 queryresults->docs.docorder.push_back(DocNum);
210
211 return 0;
212}
213
214int termequivcallback(char *Word, int ULen, int /*Freq*/,
215 float /*Weight*/, void *info) {
216 text_tset *equivterms = (text_tset *)info;
217 if (equivterms == NULL) return 0;
218
219 text_t thisterm;
220 thisterm.setcarr(Word, ULen);
221
222 equivterms->insert(thisterm);
223
224 return 0;
225}
226
227
228void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
229 // allocate working stem space
230 int maxstemlen = mgq_getmaxstemlen ();
231 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
232 if (word_stem == NULL) return;
233
234 // copy word to word_stem
235 int len = 0;
236 text_t::const_iterator here = word.begin();
237 text_t::const_iterator end = word.end();
238 while (len < maxstemlen && here != end) {
239 word_stem[len+1] = (unsigned char)(*here);
240 len++; here++;
241 }
242 word_stem[len+1] = '\0';
243 word_stem[0] = len;
244
245 // get the equivalent terms
246 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
247
248 delete [] word_stem;
249
250 return;
251}
252
// NOTE(review): this file-scope variable is not referenced anywhere else in
// this file (termfreqcallback fills the termfreq.utf8equivterms member
// instead) -- confirm whether anything outside this file relies on it.
text_tset utf8equivterms; // kept as utf8 string for fast matching
254
255
256// This callback is called once for each term in the query
257int termfreqcallback(char *Word, int ULen, int Freq,
258 float /*Weight*/, void *info) {
259 queryresultsclass *queryresults = (queryresultsclass *)info;
260 if (queryresults == NULL) return 0;
261
262 text_t term;
263 term.setcarr(Word, ULen);
264 termfreqclass termfreq;
265
266 termfreq.termstr = to_uni(term);
267 text_t utf8termstem = mgsearch_stemword (term);
268 termfreq.termstemstr = to_uni (utf8termstem);
269
270 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
271
272 termfreq.termfreq = Freq;
273 queryresults->orgterms.push_back(termfreq);
274
275 return 0;
276}
277
278// this callback is called once for each variation of each term
279int termvariantscallback(char *Word, int ULen, int /*Freq*/,
280 float /*Weight*/, void *info) {
281
282 text_t term;
283 term.setcarr(Word, ULen);
284 queryresultsclass *queryresults = (queryresultsclass *)info;
285 queryresults->termvariants.insert(to_uni(term));
286
287 return 0;
288}
289
290// This callback is for getting document text
291int doctextcallback(char *Doc, int ULen, int /*Freq*/,
292 float /*Weight*/, void * /*info*/) {
293 tempdoc = Doc;
294 templen = ULen;
295
296 return 0;
297}
298
299
300static text_t getindexsuffix (const text_t &collection,
301 const text_t &index) {
302
303 text_t indexsuffix = "index";
304 indexsuffix = filename_cat (indexsuffix, index);
305 indexsuffix = filename_cat (indexsuffix, collection);
306 return indexsuffix;
307}
308
309
310
311
312////////////////////
313// mgsearch class //
314////////////////////
315
316mgsearchclass::mgsearchclass ()
317{
318 cache = new querycache (RESULTCACHESIZE);
319}
320
321mgsearchclass::~mgsearchclass ()
322{
323 if (cache != NULL)
324 {
325 delete cache;
326 cache = NULL;
327 }
328}
329
330
331void mgsearchclass::setcollectdir (const text_t &thecollectdir)
332{
333 collectdir = thecollectdir;
334}
335
336// you only need to use this function before doing any stemming
337// casefolding and stemming will be set if values for them are
338// provided (0 or 1).
339// makeindexcurrent returns true if it was able to load the database
340bool mgsearchclass::makeindexcurrent (const text_t &index,
341 const text_t &subcollection,
342 const text_t &language,
343 const text_t &collection,
344 int casefolding,
345 int stemming) {
346 bool databaseloaded = true;
347
348 // get the names of the collection, index and text suffixes
349 char *ccollection = collection.getcstr();
350 assert (ccollection != NULL);
351 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
352 assert (idxsuffix != NULL);
353 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
354 assert (txtsuffix != NULL);
355
356#ifdef __WIN32__
357 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
358#else
359 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
360#endif
361
362 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
363 if (casefolding == 0) mgq_ask(".set casefold off");
364 else if (casefolding > 0) mgq_ask(".set casefold on");
365 if (stemming == 0) mgq_ask(".set stem off");
366 else if (stemming > 0) mgq_ask(".set stem on");
367
368 } else databaseloaded = false;
369
370 // free up the c strings
371 delete ccollection;
372 delete idxsuffix;
373 delete txtsuffix;
374 delete ccollectdir;
375
376 return databaseloaded;
377}
378
379
380// stem word uses the values set in the last call to makeindexcurrent
381// to stem the word. It is assumed that word is in unicode
382text_t mgsearchclass::stemword (const text_t &word) {
383 return to_uni (mgsearch_stemword (to_utf8 (word)));
384}
385
386text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
387 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
388}
389
390
391bool mgsearchclass::search(const queryparamclass &queryparams,
392 queryresultsclass &queryresults) {
393 assert (cache != NULL);
394
395 queryresults.clear();
396
397 // first check the cache
398 if (cache->find(queryparams, queryresults)) return true;
399
400 // make sure there is a query to be processed
401 text_t::const_iterator queryhere = queryparams.querystring.begin();
402 text_t::const_iterator queryend = queryparams.querystring.end();
403 while (queryhere != queryend) {
404 if (is_unicode_letdig (*queryhere)) break;
405 queryhere++;
406 }
407
408 // if we reached the end of the query string without finding
409 // any alphanumeric characters then return no results (and say
410 // the database was loaded)
411 if (queryhere == queryend) return true;
412
413 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
414 queryparams.language, queryparams.collection)) {
415 setsearchmode (queryparams);
416 submitquery (queryparams);
417 getresults (queryparams, queryresults);
418 return true;
419 }
420
421 return false;
422}
423
424
425void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
426{
427 mgq_ask(".set expert true");
428 mgq_ask(".set sorted_terms true");
429 mgq_ask(".set accumulator_method list");
430 mgq_ask(".set max_accumulators 500000");
431 mgq_ask(".set maxparas 500000");
432 mgq_ask(".set verbatim true");
433 mgq_ask(".unset skip_dump");
434 mgq_ask(".set mode docnums");
435
436 switch (queryparams.search_type)
437 {
438 case 0: mgq_ask(".set query boolean"); break;
439 case 1: mgq_ask(".set query ranked"); break;
440 }
441 switch (queryparams.casefolding)
442 {
443 case 1: mgq_ask(".set casefold on"); break;
444 case 0: mgq_ask(".set casefold off"); break;
445 }
446 switch (queryparams.stemming)
447 {
448 case 1: mgq_ask(".set stem on"); break;
449 case 0: mgq_ask(".set stem off"); break;
450 }
451 mgq_ask(".set heads_length 150");
452
453 if (queryparams.maxdocs == -1) {
454 mgq_ask(".set maxdocs all");
455 } else {
456 char maxdocstr[32];
457 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
458 mgq_ask(maxdocstr);
459 }
460}
461
462
463void mgsearchclass::submitquery (const queryparamclass &queryparams)
464{
465 // sort out the query string
466 text_t ttquerystring = queryparams.querystring;
467 filterquery (ttquerystring);
468 char *querystring = to_utf8(ttquerystring).getcstr();
469
470 // submit the query
471 mgq_ask(querystring);
472
473 delete querystring;
474}
475
476
477void mgsearchclass::getresults (const queryparamclass &queryparams,
478 queryresultsclass &queryresults) {
479
480 int howmany = queryparams.maxdocs;
481 if (howmany == -1) howmany = MAXNUMDOCS;
482 mgq_results(result_docnums, 0, howmany,
483 ourquerycallback, (void *)(&queryresults));
484
485 // get the term frequencies
486 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
487 termfreqcallback, (void *)(&queryresults));
488 queryresults.sortuniqqueryterms();
489
490 // get term variants
491 mgq_results(result_terms, 0, MAXNUMTERMS,
492 termvariantscallback, (void *)(&queryresults));
493
494 // get the number of documents retrieved
495 int total_retrieved = 0, is_approx = 0;
496 mgq_docsretrieved (&total_retrieved, &is_approx);
497
498 if (total_retrieved == 0) {
499 // not available (or really was zero)
500 queryresults.docs_matched = queryresults.docs.docset.size();
501 if ((queryparams.maxdocs == -1) ||
502 (queryresults.docs_matched < queryparams.maxdocs))
503 queryresults.is_approx = Exact;
504 else
505 queryresults.is_approx = MoreThan;
506 } else {
507 queryresults.docs_matched = total_retrieved;
508 if (is_approx) queryresults.is_approx = Approximate;
509 else queryresults.is_approx = Exact;
510 }
511}
512
513void mgsearchclass::filterquery (text_t &ttquerystring) {
514 text_t::iterator ithere = ttquerystring.begin ();
515 text_t::iterator itend = ttquerystring.end ();
516
517 // remove all non alphanumeric characters (except
518 // boolean operators
519 while (ithere != itend) {
520 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
521 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
522 (*ithere != ')')) (*ithere) = ' ';
523 ithere++;
524 }
525}
526
527
528// the document text for 'docnum' is placed in 'output'
529// docTargetDocument returns 'true' if it was able to
530// try to get a document
531// collection is needed to see if an index from the
532// collection is loaded. If no index has been loaded
533// defaultindex is needed to load one
534bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
535 const text_t &defaultsubcollection,
536 const text_t &defaultlanguage,
537 const text_t &collection,
538 int docnum,
539 text_t &output) {
540 output.clear();
541
542 // get the mg version of the document
543 char *mgdoc = NULL;
544 int doclen = 0;
545 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
546 collection, docnum, mgdoc, doclen)) return false;
547 if (mgdoc == NULL) return false;
548
549 // replace all control-Cs with spaces
550 char *mgdoc_here = mgdoc;
551 char *mgdoc_end = mgdoc + doclen;
552 while (mgdoc_here < mgdoc_end) {
553 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
554 mgdoc_here++;
555 }
556
557 // convert this document to unicode
558 utf8inconvertclass inconvert;
559 convertclass::status_t status;
560 inconvert.reset ();
561 inconvert.setinput (mgdoc, doclen);
562 inconvert.convert (output, status);
563
564 return true;
565}
566
567
568bool mgsearchclass::mgdocument (const text_t &defaultindex,
569 const text_t &defaultsubcollection,
570 const text_t &defaultlanguage,
571 const text_t &collection,
572 int docnum,
573 char *&UDoc, int &ULen) {
574 int databaseloaded = 0;
575
576 UDoc = NULL; ULen = 0;
577
578 // see if we can make an appropriate database current
579// char *ccollection = collection.getcstr();
580// assert (ccollection != NULL);
581// databaseloaded = load_text_database (ccollection);
582// delete ccollection;
583
584 // try and load the database
585// if (!databaseloaded)
586 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
587 defaultlanguage, collection);
588
589 if (databaseloaded) {
590 // retrieve the document from mg
591 char docstr[32];
592 sprintf(docstr, "%i", docnum);
593
594 mgq_ask(".set mode text");
595 mgq_ask(".set query docnums");
596 mgq_ask(docstr);
597
598 tempdoc = NULL;
599 templen = 0;
600 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
601 UDoc = tempdoc;
602 ULen = templen;
603 }
604
605 return (bool)databaseloaded;
606}
607
Note: See TracBrowser for help on using the repository browser.