source: trunk/gsdl/src/colservr/mgsearch.cpp@ 633

Last change on this file since 633 was 633, checked in by rjmcnab, 25 years ago

change to use has_unicode_letdig in text_t

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 633 1999-09-24 02:41:21Z rjmcnab $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.22 1999/09/24 02:41:21 rjmcnab
31 change to use has_unicode_letdig in text_t
32
33 Revision 1.21 1999/09/21 21:41:41 sjboddie
34 fixed an error in what I committed last
35
36 Revision 1.20 1999/09/21 11:59:26 sjboddie
37 added Maxdocs queryfilter option (which may be -1 for 'all')
38
39 Revision 1.19 1999/09/07 22:52:52 rjmcnab
40 Seems to be an error in mg for retrieving documents using a paragraph
41 based index for some cases. Just added a work around (loads the default
42 index every time).
43
44 Revision 1.18 1999/09/07 04:57:22 sjboddie
45 added gpl notice
46
47 Revision 1.17 1999/08/31 22:42:41 rjmcnab
48 A couple of minor things.
49
50 Revision 1.16 1999/08/25 04:51:06 sjboddie
51 small change to allow for searching using boolean operators
52
53 Revision 1.15 1999/07/16 08:35:03 rjmcnab
54 Fixed a weird bug to do with a faulty case statement.
55
56 Revision 1.14 1999/07/16 03:42:22 sjboddie
57 changed isApprox
58
59 Revision 1.13 1999/07/16 00:12:46 sjboddie
60 removed all the old post-processing stuff
61
62 Revision 1.12 1999/07/07 06:17:47 rjmcnab
63 broke search_index into index+subcollection+language
64 within mgsearch
65
66 Revision 1.11 1999/07/05 21:06:43 rjmcnab
67 Disabled quoted strings.
68
69 Revision 1.10 1999/07/01 09:29:19 rjmcnab
70 Changes for better reporting of number documents which match a query. Changes
71 should still work as before with older versions of mg.
72
73 Revision 1.9 1999/07/01 03:54:48 rjmcnab
74 Added code to plug in the equivalent terms of each of the query terms.
75 Also added a function to get a raw utf8 encoded mg document (for speeding
76 up a phrase matching function)
77
78 Revision 1.8 1999/06/30 04:04:12 rjmcnab
79 made stemming functions available from mgsearch and made the stems
80 for the query terms available in queryinfo
81
82 Revision 1.7 1999/06/27 22:07:27 sjboddie
83 got rid of all the old functions for dealing with dir indexes
84
85 Revision 1.6 1999/06/09 00:41:32 sjboddie
86 phrase searching now uses case-folding if it's turned on
87
88 Revision 1.5 1999/02/21 22:31:35 rjmcnab
89
90 Removed locateinfo.
91
92 Revision 1.4 1999/02/03 01:13:27 sjboddie
93
94 Got interface to handle subcollections and language subcollections -
95 committed changes made to some of the collections
96
97 Revision 1.3 1999/01/19 01:38:17 rjmcnab
98
99 Made the source more portable.
100
101 Revision 1.2 1999/01/12 01:51:02 rjmcnab
102
103 Standard header.
104
105 Revision 1.1 1999/01/08 09:02:16 rjmcnab
106
107 Moved from src/library.
108
109 */
110
111
112#include "gsdlconf.h"
113#include "mgsearch.h"
114#include "fileutil.h"
115
116#include <string.h>
117#include <stdio.h>
118#include <stdlib.h>
119#include <ctype.h>
120
121#if defined(GSDL_USE_OBJECTSPACE)
122# include <ospace\std\iostream>
123#elif defined(GSDL_USE_IOS_H)
124# include <iostream.h>
125#else
126# include <iostream>
127#endif
128
129#if defined(__WIN32__)
130// gdbm stuff
131# include "autoconf.h"
132# include "systems.h"
133# include "gdbmconst.h"
134# include "gdbm.h"
135#else
136# include <gdbm.h>
137#endif
138
139
140#include <assert.h>
141
142#include "mgq.h"
143// #include "locateinfo.h"
144#include "gsdlunicode.h"
145#include "unitool.h"
146
147
148/////////////
149// globals //
150/////////////
151
// Document text pointer and length handed over by mg: written by
// doctextcallback and read back by mgsearchclass::mgdocument, which
// resets both before each retrieval.
152static char *tempdoc = NULL;
153static int templen = 0;
154
155
156//////////////////////
157// useful functions //
158//////////////////////
159
160
161// input and output are in utf8
162text_t mgsearch_stemword (const text_t &word) {
163 // allocate working stem space
164 int maxstemlen = mgq_getmaxstemlen ();
165 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
166 if (word_stem == NULL) return "";
167
168 // copy word to word_stem
169 int len = 0;
170 text_t::const_iterator here = word.begin();
171 text_t::const_iterator end = word.end();
172 while (len < maxstemlen && here != end) {
173 word_stem[len+1] = (unsigned char)(*here);
174 len++; here++;
175 }
176 word_stem[len+1] = '\0';
177 word_stem[0] = len;
178
179 mgq_stemword (word_stem);
180
181 // copy word_stem back to tempstr
182 text_t tempstr;
183 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
184
185 delete [] word_stem;
186
187 return tempstr;
188}
189
190
191
192////////////////////////
193// callback functions //
194////////////////////////
195
196// This routine is called for each document found in a search
197// it assumes that cache_num is set up correctly to point to
198// a suitable result cache
199int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
200 float Weight, void *info) {
201
202
203 queryresultsclass *queryresults = (queryresultsclass * )info;
204
205 // append this entry to the document results
206 docresultclass docresult;
207 docresult.docnum = DocNum;
208 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
209 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
210
211 queryresults->docs.docset[DocNum] = docresult;
212 queryresults->docs.docorder.push_back(DocNum);
213
214 return 0;
215}
216
217int termequivcallback(char *Word, int ULen, int /*Freq*/,
218 float /*Weight*/, void *info) {
219 text_tset *equivterms = (text_tset *)info;
220 if (equivterms == NULL) return 0;
221
222 text_t thisterm;
223 thisterm.setcarr(Word, ULen);
224
225 equivterms->insert(thisterm);
226
227 return 0;
228}
229
230
231void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
232 // allocate working stem space
233 int maxstemlen = mgq_getmaxstemlen ();
234 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
235 if (word_stem == NULL) return;
236
237 // copy word to word_stem
238 int len = 0;
239 text_t::const_iterator here = word.begin();
240 text_t::const_iterator end = word.end();
241 while (len < maxstemlen && here != end) {
242 word_stem[len+1] = (unsigned char)(*here);
243 len++; here++;
244 }
245 word_stem[len+1] = '\0';
246 word_stem[0] = len;
247
248 // get the equivalent terms
249 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
250
251 delete [] word_stem;
252
253 return;
254}
255
// NOTE(review): nothing visible in this file reads or writes this
// file-scope set; termfreqcallback fills the utf8equivterms member of a
// termfreqclass instead. Confirm whether any other file depends on this
// global before removing it.
256 text_tset utf8equivterms; // kept as utf8 string for fast matching
257
258
259// This callback is called once for each term in the query
260int termfreqcallback(char *Word, int ULen, int Freq,
261 float /*Weight*/, void *info) {
262 queryresultsclass *queryresults = (queryresultsclass *)info;
263 if (queryresults == NULL) return 0;
264
265 text_t term;
266 term.setcarr(Word, ULen);
267 termfreqclass termfreq;
268
269 termfreq.termstr = to_uni(term);
270 text_t utf8termstem = mgsearch_stemword (term);
271 termfreq.termstemstr = to_uni (utf8termstem);
272
273 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
274
275 termfreq.termfreq = Freq;
276 queryresults->orgterms.push_back(termfreq);
277
278 return 0;
279}
280
281// this callback is called once for each variation of each term
282int termvariantscallback(char *Word, int ULen, int /*Freq*/,
283 float /*Weight*/, void *info) {
284
285 text_t term;
286 term.setcarr(Word, ULen);
287 queryresultsclass *queryresults = (queryresultsclass *)info;
288 queryresults->termvariants.insert(to_uni(term));
289
290 return 0;
291}
292
293// This callback is for getting document text
294int doctextcallback(char *Doc, int ULen, int /*Freq*/,
295 float /*Weight*/, void * /*info*/) {
296 tempdoc = Doc;
297 templen = ULen;
298
299 return 0;
300}
301
302
303static text_t getindexsuffix (const text_t &collection,
304 const text_t &index) {
305
306 text_t indexsuffix = "index";
307 indexsuffix = filename_cat (indexsuffix, index);
308 indexsuffix = filename_cat (indexsuffix, collection);
309 return indexsuffix;
310}
311
312
313
314
315////////////////////
316// mgsearch class //
317////////////////////
318
319mgsearchclass::mgsearchclass ()
320{
321 cache = new querycache (RESULTCACHESIZE);
322}
323
324mgsearchclass::~mgsearchclass ()
325{
326 if (cache != NULL)
327 {
328 delete cache;
329 cache = NULL;
330 }
331}
332
333
334void mgsearchclass::setcollectdir (const text_t &thecollectdir)
335{
336 collectdir = thecollectdir;
337}
338
339// you only need to use this function before doing any stemming
340// casefolding and stemming will be set if values for them are
341// provided (0 or 1).
342// makeindexcurrent returns true if it was able to load the database
343bool mgsearchclass::makeindexcurrent (const text_t &index,
344 const text_t &subcollection,
345 const text_t &language,
346 const text_t &collection,
347 int casefolding,
348 int stemming) {
349 bool databaseloaded = true;
350
351 // get the names of the collection, index and text suffixes
352 char *ccollection = collection.getcstr();
353 assert (ccollection != NULL);
354 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
355 assert (idxsuffix != NULL);
356 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
357 assert (txtsuffix != NULL);
358
359#ifdef __WIN32__
360 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
361#else
362 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
363#endif
364
365 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
366 if (casefolding == 0) mgq_ask(".set casefold off");
367 else if (casefolding > 0) mgq_ask(".set casefold on");
368 if (stemming == 0) mgq_ask(".set stem off");
369 else if (stemming > 0) mgq_ask(".set stem on");
370
371 } else databaseloaded = false;
372
373 // free up the c strings
374 delete ccollection;
375 delete idxsuffix;
376 delete txtsuffix;
377 delete ccollectdir;
378
379 return databaseloaded;
380}
381
382
383// stem word uses the values set in the last call to makeindexcurrent
384// to stem the word. It is assumed that word is in unicode
385text_t mgsearchclass::stemword (const text_t &word) {
386 return to_uni (mgsearch_stemword (to_utf8 (word)));
387}
388
389text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
390 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
391}
392
393
394bool mgsearchclass::search(const queryparamclass &queryparams,
395 queryresultsclass &queryresults) {
396 assert (cache != NULL);
397
398 queryresults.clear();
399
400 // first check the cache
401 if (cache->find(queryparams, queryresults)) return true;
402
403 // make sure there is a query to be processed
404 if (!has_unicode_letdig(queryparams.querystring)) return true;
405
406 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
407 queryparams.language, queryparams.collection)) {
408 setsearchmode (queryparams);
409 submitquery (queryparams);
410 getresults (queryparams, queryresults);
411 return true;
412 }
413
414 return false;
415}
416
417
418void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
419{
420 mgq_ask(".set expert true");
421 mgq_ask(".set sorted_terms true");
422 mgq_ask(".set accumulator_method list");
423 mgq_ask(".set max_accumulators 500000");
424 mgq_ask(".set maxparas 500000");
425 mgq_ask(".set verbatim true");
426 mgq_ask(".unset skip_dump");
427 mgq_ask(".set mode docnums");
428
429 switch (queryparams.search_type)
430 {
431 case 0: mgq_ask(".set query boolean"); break;
432 case 1: mgq_ask(".set query ranked"); break;
433 }
434 switch (queryparams.casefolding)
435 {
436 case 1: mgq_ask(".set casefold on"); break;
437 case 0: mgq_ask(".set casefold off"); break;
438 }
439 switch (queryparams.stemming)
440 {
441 case 1: mgq_ask(".set stem on"); break;
442 case 0: mgq_ask(".set stem off"); break;
443 }
444 mgq_ask(".set heads_length 150");
445
446 if (queryparams.maxdocs == -1) {
447 mgq_ask(".set maxdocs all");
448 } else {
449 char maxdocstr[32];
450 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
451 mgq_ask(maxdocstr);
452 }
453}
454
455
456void mgsearchclass::submitquery (const queryparamclass &queryparams)
457{
458 // sort out the query string
459 text_t ttquerystring = queryparams.querystring;
460 filterquery (ttquerystring);
461 char *querystring = to_utf8(ttquerystring).getcstr();
462
463 // submit the query
464 mgq_ask(querystring);
465
466 delete querystring;
467}
468
469
470void mgsearchclass::getresults (const queryparamclass &queryparams,
471 queryresultsclass &queryresults) {
472
473 int howmany = queryparams.maxdocs;
474 if (howmany == -1) howmany = MAXNUMDOCS;
475 mgq_results(result_docnums, 0, howmany,
476 ourquerycallback, (void *)(&queryresults));
477
478 // get the term frequencies
479 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
480 termfreqcallback, (void *)(&queryresults));
481 queryresults.sortuniqqueryterms();
482
483 // get term variants
484 mgq_results(result_terms, 0, MAXNUMTERMS,
485 termvariantscallback, (void *)(&queryresults));
486
487 // get the number of documents retrieved
488 int total_retrieved = 0, is_approx = 0;
489 mgq_docsretrieved (&total_retrieved, &is_approx);
490
491 if (total_retrieved == 0) {
492 // not available (or really was zero)
493 queryresults.docs_matched = queryresults.docs.docset.size();
494 if ((queryparams.maxdocs == -1) ||
495 (queryresults.docs_matched < queryparams.maxdocs))
496 queryresults.is_approx = Exact;
497 else
498 queryresults.is_approx = MoreThan;
499 } else {
500 queryresults.docs_matched = total_retrieved;
501 if (is_approx) queryresults.is_approx = Approximate;
502 else queryresults.is_approx = Exact;
503 }
504}
505
506void mgsearchclass::filterquery (text_t &ttquerystring) {
507 text_t::iterator ithere = ttquerystring.begin ();
508 text_t::iterator itend = ttquerystring.end ();
509
510 // remove all non alphanumeric characters (except
511 // boolean operators
512 while (ithere != itend) {
513 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
514 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
515 (*ithere != ')')) (*ithere) = ' ';
516 ithere++;
517 }
518}
519
520
521// the document text for 'docnum' is placed in 'output'
522// docTargetDocument returns 'true' if it was able to
523// try to get a document
524// collection is needed to see if an index from the
525// collection is loaded. If no index has been loaded
526// defaultindex is needed to load one
527bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
528 const text_t &defaultsubcollection,
529 const text_t &defaultlanguage,
530 const text_t &collection,
531 int docnum,
532 text_t &output) {
533 output.clear();
534
535 // get the mg version of the document
536 char *mgdoc = NULL;
537 int doclen = 0;
538 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
539 collection, docnum, mgdoc, doclen)) return false;
540 if (mgdoc == NULL) return false;
541
542 // replace all control-Cs with spaces
543 char *mgdoc_here = mgdoc;
544 char *mgdoc_end = mgdoc + doclen;
545 while (mgdoc_here < mgdoc_end) {
546 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
547 mgdoc_here++;
548 }
549
550 // convert this document to unicode
551 utf8inconvertclass inconvert;
552 convertclass::status_t status;
553 inconvert.reset ();
554 inconvert.setinput (mgdoc, doclen);
555 inconvert.convert (output, status);
556
557 return true;
558}
559
560
561bool mgsearchclass::mgdocument (const text_t &defaultindex,
562 const text_t &defaultsubcollection,
563 const text_t &defaultlanguage,
564 const text_t &collection,
565 int docnum,
566 char *&UDoc, int &ULen) {
567 int databaseloaded = 0;
568
569 UDoc = NULL; ULen = 0;
570
571 // see if we can make an appropriate database current
572// char *ccollection = collection.getcstr();
573// assert (ccollection != NULL);
574// databaseloaded = load_text_database (ccollection);
575// delete ccollection;
576
577 // try and load the database
578// if (!databaseloaded)
579 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
580 defaultlanguage, collection);
581
582 if (databaseloaded) {
583 // retrieve the document from mg
584 char docstr[32];
585 sprintf(docstr, "%i", docnum);
586
587 mgq_ask(".set mode text");
588 mgq_ask(".set query docnums");
589 mgq_ask(docstr);
590
591 tempdoc = NULL;
592 templen = 0;
593 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
594 UDoc = tempdoc;
595 ULen = templen;
596 }
597
598 return (bool)databaseloaded;
599}
600
Note: See TracBrowser for help on using the repository browser.