source: branches/New_Config_Format-branch/gsdl/src/colservr/mgsearch.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: mgsearch.cpp 1279 2000-07-12 22:21:53Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.22.4.1 2000/07/12 22:21:27 sjboddie
31 merged changes to trunk into New_Config_Format branch
32
33 Revision 1.23 2000/07/03 21:58:41 nzdl
34 removed mg directive that was causing meaningless warning messages
35 in errout.txt
36
37 Revision 1.22 1999/09/24 02:41:21 rjmcnab
38 change to use has_unicode_letdig in text_t
39
40 Revision 1.21 1999/09/21 21:41:41 sjboddie
41 fixed an error in what I committed last
42
43 Revision 1.20 1999/09/21 11:59:26 sjboddie
44 added Maxdocs queryfilter option (which may be -1 for 'all')
45
46 Revision 1.19 1999/09/07 22:52:52 rjmcnab
47 Seems to be an error in mg for retrieving documents using a paragraph
48 based index for some cases. Just added a work around (loads the default
49 index every time).
50
51 Revision 1.18 1999/09/07 04:57:22 sjboddie
52 added gpl notice
53
54 Revision 1.17 1999/08/31 22:42:41 rjmcnab
55 A couple of minor things.
56
57 Revision 1.16 1999/08/25 04:51:06 sjboddie
58 small change to allow for searching using boolean operators
59
60 Revision 1.15 1999/07/16 08:35:03 rjmcnab
61 Fixed a weird bug to do with a faulty case statement.
62
63 Revision 1.14 1999/07/16 03:42:22 sjboddie
64 changed isApprox
65
66 Revision 1.13 1999/07/16 00:12:46 sjboddie
67 removed all the old post-processing stuff
68
69 Revision 1.12 1999/07/07 06:17:47 rjmcnab
70 broke search_index into index+subcollection+language
71 within mgsearch
72
73 Revision 1.11 1999/07/05 21:06:43 rjmcnab
74 Disabled quoted strings.
75
76 Revision 1.10 1999/07/01 09:29:19 rjmcnab
77 Changes for better reporting of the number of documents which match a query. Changes
78 should still work as before with older versions of mg.
79
80 Revision 1.9 1999/07/01 03:54:48 rjmcnab
81 Added code to plug in the equivalent terms of each of the query terms.
82 Also added a function to get a raw utf8 encoded mg document (for speeding
83 up a phrase matching function)
84
85 Revision 1.8 1999/06/30 04:04:12 rjmcnab
86 made stemming functions available from mgsearch and made the stems
87 for the query terms available in queryinfo
88
89 Revision 1.7 1999/06/27 22:07:27 sjboddie
90 got rid of all the old functions for dealing with dir indexes
91
92 Revision 1.6 1999/06/09 00:41:32 sjboddie
93 phrase searching now uses case-folding if it's turned on
94
95 Revision 1.5 1999/02/21 22:31:35 rjmcnab
96
97 Removed locateinfo.
98
99 Revision 1.4 1999/02/03 01:13:27 sjboddie
100
101 Got interface to handle subcollections and language subcollections -
102 committed changes made to some of the collections
103
104 Revision 1.3 1999/01/19 01:38:17 rjmcnab
105
106 Made the source more portable.
107
108 Revision 1.2 1999/01/12 01:51:02 rjmcnab
109
110 Standard header.
111
112 Revision 1.1 1999/01/08 09:02:16 rjmcnab
113
114 Moved from src/library.
115
116 */
117
118
119#include "gsdlconf.h"
120#include "mgsearch.h"
121#include "fileutil.h"
122
123#include <string.h>
124#include <stdio.h>
125#include <stdlib.h>
126#include <ctype.h>
127
128#if defined(GSDL_USE_OBJECTSPACE)
129# include <ospace\std\iostream>
130#elif defined(GSDL_USE_IOS_H)
131# include <iostream.h>
132#else
133# include <iostream>
134#endif
135
136#if defined(__WIN32__)
137// gdbm stuff
138# include "autoconf.h"
139# include "systems.h"
140# include "gdbmconst.h"
141# include "gdbm.h"
142#else
143# include <gdbm.h>
144#endif
145
146
147#include <assert.h>
148
149#include "mgq.h"
150// #include "locateinfo.h"
151#include "gsdlunicode.h"
152#include "unitool.h"
153
154
155/////////////
156// globals //
157/////////////
158
// Hand-off slots between doctextcallback (which writes them) and
// mgsearchclass::mgdocument (which resets them before asking mg for a
// document and reads them straight after).
static char *tempdoc = NULL;   // text pointer most recently passed to doctextcallback
static int templen = 0;        // length that was passed along with it
161
162
163//////////////////////
164// useful functions //
165//////////////////////
166
167
168// input and output are in utf8
169text_t mgsearch_stemword (const text_t &word) {
170 // allocate working stem space
171 int maxstemlen = mgq_getmaxstemlen ();
172 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
173 if (word_stem == NULL) return "";
174
175 // copy word to word_stem
176 int len = 0;
177 text_t::const_iterator here = word.begin();
178 text_t::const_iterator end = word.end();
179 while (len < maxstemlen && here != end) {
180 word_stem[len+1] = (unsigned char)(*here);
181 len++; here++;
182 }
183 word_stem[len+1] = '\0';
184 word_stem[0] = len;
185
186 mgq_stemword (word_stem);
187
188 // copy word_stem back to tempstr
189 text_t tempstr;
190 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
191
192 delete [] word_stem;
193
194 return tempstr;
195}
196
197
198
199////////////////////////
200// callback functions //
201////////////////////////
202
203// This routine is called for each document found in a search
204// it assumes that cache_num is set up correctly to point to
205// a suitable result cache
206int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
207 float Weight, void *info) {
208
209
210 queryresultsclass *queryresults = (queryresultsclass * )info;
211
212 // append this entry to the document results
213 docresultclass docresult;
214 docresult.docnum = DocNum;
215 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
216 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
217
218 queryresults->docs.docset[DocNum] = docresult;
219 queryresults->docs.docorder.push_back(DocNum);
220
221 return 0;
222}
223
224int termequivcallback(char *Word, int ULen, int /*Freq*/,
225 float /*Weight*/, void *info) {
226 text_tset *equivterms = (text_tset *)info;
227 if (equivterms == NULL) return 0;
228
229 text_t thisterm;
230 thisterm.setcarr(Word, ULen);
231
232 equivterms->insert(thisterm);
233
234 return 0;
235}
236
237
238void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
239 // allocate working stem space
240 int maxstemlen = mgq_getmaxstemlen ();
241 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
242 if (word_stem == NULL) return;
243
244 // copy word to word_stem
245 int len = 0;
246 text_t::const_iterator here = word.begin();
247 text_t::const_iterator end = word.end();
248 while (len < maxstemlen && here != end) {
249 word_stem[len+1] = (unsigned char)(*here);
250 len++; here++;
251 }
252 word_stem[len+1] = '\0';
253 word_stem[0] = len;
254
255 // get the equivalent terms
256 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
257
258 delete [] word_stem;
259
260 return;
261}
262
263 text_tset utf8equivterms; // kept as utf8 string for fast matching
264
265
266// This callback is called once for each term in the query
267int termfreqcallback(char *Word, int ULen, int Freq,
268 float /*Weight*/, void *info) {
269 queryresultsclass *queryresults = (queryresultsclass *)info;
270 if (queryresults == NULL) return 0;
271
272 text_t term;
273 term.setcarr(Word, ULen);
274 termfreqclass termfreq;
275
276 termfreq.termstr = to_uni(term);
277 text_t utf8termstem = mgsearch_stemword (term);
278 termfreq.termstemstr = to_uni (utf8termstem);
279
280 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
281
282 termfreq.termfreq = Freq;
283 queryresults->orgterms.push_back(termfreq);
284
285 return 0;
286}
287
288// this callback is called once for each variation of each term
289int termvariantscallback(char *Word, int ULen, int /*Freq*/,
290 float /*Weight*/, void *info) {
291
292 text_t term;
293 term.setcarr(Word, ULen);
294 queryresultsclass *queryresults = (queryresultsclass *)info;
295 queryresults->termvariants.insert(to_uni(term));
296
297 return 0;
298}
299
300// This callback is for getting document text
301int doctextcallback(char *Doc, int ULen, int /*Freq*/,
302 float /*Weight*/, void * /*info*/) {
303 tempdoc = Doc;
304 templen = ULen;
305
306 return 0;
307}
308
309
310static text_t getindexsuffix (const text_t &collection,
311 const text_t &index) {
312
313 text_t indexsuffix = "index";
314 indexsuffix = filename_cat (indexsuffix, index);
315 indexsuffix = filename_cat (indexsuffix, collection);
316 return indexsuffix;
317}
318
319
320
321
322////////////////////
323// mgsearch class //
324////////////////////
325
326mgsearchclass::mgsearchclass ()
327{
328 cache = new querycache (RESULTCACHESIZE);
329}
330
331mgsearchclass::~mgsearchclass ()
332{
333 if (cache != NULL)
334 {
335 delete cache;
336 cache = NULL;
337 }
338}
339
340
341void mgsearchclass::setcollectdir (const text_t &thecollectdir)
342{
343 collectdir = thecollectdir;
344}
345
346// you only need to use this function before doing any stemming
347// casefolding and stemming will be set if values for them are
348// provided (0 or 1).
349// makeindexcurrent returns true if it was able to load the database
350bool mgsearchclass::makeindexcurrent (const text_t &index,
351 const text_t &subcollection,
352 const text_t &language,
353 const text_t &collection,
354 int casefolding,
355 int stemming) {
356 bool databaseloaded = true;
357
358 // get the names of the collection, index and text suffixes
359 char *ccollection = collection.getcstr();
360 assert (ccollection != NULL);
361 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
362 assert (idxsuffix != NULL);
363 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
364 assert (txtsuffix != NULL);
365
366#ifdef __WIN32__
367 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
368#else
369 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
370#endif
371
372 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
373 if (casefolding == 0) mgq_ask(".set casefold off");
374 else if (casefolding > 0) mgq_ask(".set casefold on");
375 if (stemming == 0) mgq_ask(".set stem off");
376 else if (stemming > 0) mgq_ask(".set stem on");
377
378 } else databaseloaded = false;
379
380 // free up the c strings
381 delete ccollection;
382 delete idxsuffix;
383 delete txtsuffix;
384 delete ccollectdir;
385
386 return databaseloaded;
387}
388
389
390// stem word uses the values set in the last call to makeindexcurrent
391// to stem the word. It is assumed that word is in unicode
392text_t mgsearchclass::stemword (const text_t &word) {
393 return to_uni (mgsearch_stemword (to_utf8 (word)));
394}
395
396text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
397 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
398}
399
400
401bool mgsearchclass::search(const queryparamclass &queryparams,
402 queryresultsclass &queryresults) {
403 assert (cache != NULL);
404
405 queryresults.clear();
406
407 // first check the cache
408 if (cache->find(queryparams, queryresults)) return true;
409
410 // make sure there is a query to be processed
411 if (!has_unicode_letdig(queryparams.querystring)) return true;
412
413 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
414 queryparams.language, queryparams.collection)) {
415 setsearchmode (queryparams);
416 submitquery (queryparams);
417 getresults (queryparams, queryresults);
418 return true;
419 }
420
421 return false;
422}
423
424
425void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
426{
427 mgq_ask(".set expert true");
428 mgq_ask(".set sorted_terms true");
429 mgq_ask(".set accumulator_method list");
430 mgq_ask(".set max_accumulators 500000");
431 mgq_ask(".set maxparas 500000");
432 mgq_ask(".set verbatim true");
433 // mgq_ask(".unset skip_dump");
434 mgq_ask(".set mode docnums");
435
436 switch (queryparams.search_type)
437 {
438 case 0: mgq_ask(".set query boolean"); break;
439 case 1: mgq_ask(".set query ranked"); break;
440 }
441 switch (queryparams.casefolding)
442 {
443 case 1: mgq_ask(".set casefold on"); break;
444 case 0: mgq_ask(".set casefold off"); break;
445 }
446 switch (queryparams.stemming)
447 {
448 case 1: mgq_ask(".set stem on"); break;
449 case 0: mgq_ask(".set stem off"); break;
450 }
451 mgq_ask(".set heads_length 150");
452
453 if (queryparams.maxdocs == -1) {
454 mgq_ask(".set maxdocs all");
455 } else {
456 char maxdocstr[32];
457 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
458 mgq_ask(maxdocstr);
459 }
460}
461
462
463void mgsearchclass::submitquery (const queryparamclass &queryparams)
464{
465 // sort out the query string
466 text_t ttquerystring = queryparams.querystring;
467 filterquery (ttquerystring);
468 char *querystring = to_utf8(ttquerystring).getcstr();
469
470 // submit the query
471 mgq_ask(querystring);
472
473 delete querystring;
474}
475
476
477void mgsearchclass::getresults (const queryparamclass &queryparams,
478 queryresultsclass &queryresults) {
479
480 int howmany = queryparams.maxdocs;
481 if (howmany == -1) howmany = MAXNUMDOCS;
482 mgq_results(result_docnums, 0, howmany,
483 ourquerycallback, (void *)(&queryresults));
484
485 // get the term frequencies
486 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
487 termfreqcallback, (void *)(&queryresults));
488 queryresults.sortuniqqueryterms();
489
490 // get term variants
491 mgq_results(result_terms, 0, MAXNUMTERMS,
492 termvariantscallback, (void *)(&queryresults));
493
494 // get the number of documents retrieved
495 int total_retrieved = 0, is_approx = 0;
496 mgq_docsretrieved (&total_retrieved, &is_approx);
497
498 if (total_retrieved == 0) {
499 // not available (or really was zero)
500 queryresults.docs_matched = queryresults.docs.docset.size();
501 if ((queryparams.maxdocs == -1) ||
502 (queryresults.docs_matched < queryparams.maxdocs))
503 queryresults.is_approx = Exact;
504 else
505 queryresults.is_approx = MoreThan;
506 } else {
507 queryresults.docs_matched = total_retrieved;
508 if (is_approx) queryresults.is_approx = Approximate;
509 else queryresults.is_approx = Exact;
510 }
511}
512
513void mgsearchclass::filterquery (text_t &ttquerystring) {
514 text_t::iterator ithere = ttquerystring.begin ();
515 text_t::iterator itend = ttquerystring.end ();
516
517 // remove all non alphanumeric characters (except
518 // boolean operators
519 while (ithere != itend) {
520 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
521 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
522 (*ithere != ')')) (*ithere) = ' ';
523 ithere++;
524 }
525}
526
527
528// the document text for 'docnum' is placed in 'output'
529// docTargetDocument returns 'true' if it was able to
530// try to get a document
531// collection is needed to see if an index from the
532// collection is loaded. If no index has been loaded
533// defaultindex is needed to load one
534bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
535 const text_t &defaultsubcollection,
536 const text_t &defaultlanguage,
537 const text_t &collection,
538 int docnum,
539 text_t &output) {
540 output.clear();
541
542 // get the mg version of the document
543 char *mgdoc = NULL;
544 int doclen = 0;
545 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
546 collection, docnum, mgdoc, doclen)) return false;
547 if (mgdoc == NULL) return false;
548
549 // replace all control-Cs with spaces
550 char *mgdoc_here = mgdoc;
551 char *mgdoc_end = mgdoc + doclen;
552 while (mgdoc_here < mgdoc_end) {
553 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
554 mgdoc_here++;
555 }
556
557 // convert this document to unicode
558 utf8inconvertclass inconvert;
559 convertclass::status_t status;
560 inconvert.reset ();
561 inconvert.setinput (mgdoc, doclen);
562 inconvert.convert (output, status);
563
564 return true;
565}
566
567
568bool mgsearchclass::mgdocument (const text_t &defaultindex,
569 const text_t &defaultsubcollection,
570 const text_t &defaultlanguage,
571 const text_t &collection,
572 int docnum,
573 char *&UDoc, int &ULen) {
574 int databaseloaded = 0;
575
576 UDoc = NULL; ULen = 0;
577
578 // see if we can make an appropriate database current
579// char *ccollection = collection.getcstr();
580// assert (ccollection != NULL);
581// databaseloaded = load_text_database (ccollection);
582// delete ccollection;
583
584 // try and load the database
585// if (!databaseloaded)
586 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
587 defaultlanguage, collection);
588
589 if (databaseloaded) {
590 // retrieve the document from mg
591 char docstr[32];
592 sprintf(docstr, "%i", docnum);
593
594 mgq_ask(".set mode text");
595 mgq_ask(".set query docnums");
596 mgq_ask(docstr);
597
598 tempdoc = NULL;
599 templen = 0;
600 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
601 UDoc = tempdoc;
602 ULen = templen;
603 }
604
605 return (bool)databaseloaded;
606}
607
Note: See TracBrowser for help on using the repository browser.