source: trunk/gsdl/src/colservr/mgsearch.cpp@ 13780

Last change on this file since 13780 was 13780, checked in by mdewsnip, 17 years ago

GLI/LOCAL LIBRARY: To prevent the problems with the GLI being unable to install newly built collections because the local library is holding files open, much more care needs to be taken to close files (typically the GDBM database and the MG/MGPP index files) after use. Fixed a lot of places where files were being left open.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.5 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45# include "autoconf.h"
46# include "systems.h"
47# include "gdbmconst.h"
48# include "gdbm.h"
49#else
50# include <gdbm.h>
51#endif
52
53
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
// Hand-over slots between doctextcallback and mgsearchclass::mgdocument:
// the callback stores the text pointer and length it is given here, and
// mgdocument clears both before each retrieval and reads them afterwards.
static char *tempdoc = NULL;
static int templen = 0;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77 // allocate working stem space
78 int maxstemlen = mgq_getmaxstemlen ();
79 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80 if (word_stem == NULL) return "";
81
82 // copy word to word_stem
83 int len = 0;
84 text_t::const_iterator here = word.begin();
85 text_t::const_iterator end = word.end();
86 while (len < maxstemlen && here != end) {
87 word_stem[len+1] = (unsigned char)(*here);
88 ++len; ++here;
89 }
90 word_stem[len+1] = '\0';
91 word_stem[0] = len;
92
93 mgq_stemword (word_stem);
94
95 // copy word_stem back to tempstr
96 text_t tempstr;
97 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99 delete [] word_stem;
100
101 return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114 float Weight, void *info) {
115
116
117 queryresultsclass *queryresults = (queryresultsclass * )info;
118
119 // append this entry to the document results
120 docresultclass docresult;
121 docresult.docnum = DocNum;
122 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125 queryresults->docs.docset[DocNum] = docresult;
126 queryresults->docs.docorder.push_back(DocNum);
127
128 return 0;
129}
130
131int termequivcallback(char *Word, int ULen, int /*Freq*/,
132 float /*Weight*/, void *info) {
133 text_tset *equivterms = (text_tset *)info;
134 if (equivterms == NULL) return 0;
135
136 text_t thisterm;
137 thisterm.setcarr(Word, ULen);
138
139 equivterms->insert(thisterm);
140
141 return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146 // allocate working stem space
147 int maxstemlen = mgq_getmaxstemlen ();
148 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149 if (word_stem == NULL) return;
150
151 // copy word to word_stem
152 int len = 0;
153 text_t::const_iterator here = word.begin();
154 text_t::const_iterator end = word.end();
155 while (len < maxstemlen && here != end) {
156 word_stem[len+1] = (unsigned char)(*here);
157 ++len; ++here;
158 }
159 word_stem[len+1] = '\0';
160 word_stem[0] = len;
161
162 // get the equivalent terms
163 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165 delete [] word_stem;
166
167 return;
168}
169
// NOTE(review): nothing visible in this file reads or writes this
// file-scope set; termfreqcallback fills the utf8equivterms member of
// termfreqclass instead. Confirm whether this definition is still
// needed by another translation unit before removing it.
text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen, int Freq,
175 float /*Weight*/, void *info) {
176 queryresultsclass *queryresults = (queryresultsclass *)info;
177 if (queryresults == NULL) return 0;
178
179 text_t term;
180 term.setcarr(Word, ULen);
181 termfreqclass termfreq;
182
183 termfreq.termstr = to_uni(term);
184 text_t utf8termstem = mgsearch_stemword (term);
185 termfreq.termstemstr = to_uni (utf8termstem);
186
187 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189 termfreq.termfreq = Freq;
190 queryresults->orgterms.push_back(termfreq);
191
192 return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197 float /*Weight*/, void *info) {
198
199 text_t term;
200 term.setcarr(Word, ULen);
201 queryresultsclass *queryresults = (queryresultsclass *)info;
202 queryresults->termvariants.insert(to_uni(term));
203
204 return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen, int /*Freq*/,
209 float /*Weight*/, void * /*info*/) {
210 tempdoc = Doc;
211 templen = ULen;
212
213 return 0;
214}
215
216
217text_t mgsearchclass::getindexsuffix (const text_t &collection,
218 const text_t &index) {
219
220 text_t indexsuffix = "index";
221 indexsuffix = filename_cat (indexsuffix, index);
222 if (indexstem.empty()) {
223 // no index stem, use the coll name
224 indexsuffix = filename_cat (indexsuffix, collection);
225 } else {
226 indexsuffix = filename_cat (indexsuffix, indexstem);
227 }
228 return indexsuffix;
229}
230
231
232
233
234////////////////////
235// mgsearch class //
236////////////////////
237
238mgsearchclass::mgsearchclass ()
239 : searchclass() {
240
241}
242
243mgsearchclass::~mgsearchclass ()
244{
245 if (cache != NULL)
246 {
247 delete cache;
248 cache = NULL;
249 }
250}
251
252void mgsearchclass::set_indexstem(const text_t &stem) {
253 indexstem = stem;
254
255}
256
257// you only need to use this function before doing any stemming
258// casefolding and stemming will be set if values for them are
259// provided (0 or 1).
260// makeindexcurrent returns true if it was able to load the database
261bool mgsearchclass::makeindexcurrent (const text_t &index,
262 const text_t &subcollection,
263 const text_t &language,
264 const text_t &collection,
265 int casefolding,
266 int stemming) {
267 bool databaseloaded = true;
268
269 // get the names of the collection, index and text suffixes
270 char *ccollection = collection.getcstr();
271 assert (ccollection != NULL);
272 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
273 assert (idxsuffix != NULL);
274 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
275 assert (txtsuffix != NULL);
276#ifdef __WIN32__
277 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
278#else
279 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
280#endif
281
282 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
283 if (casefolding == 0) mgq_ask(".set casefold off");
284 else if (casefolding > 0) mgq_ask(".set casefold on");
285 if (stemming == 0) mgq_ask(".set stem off");
286 else if (stemming > 0) mgq_ask(".set stem on");
287
288 } else databaseloaded = false;
289
290 // free up the c strings
291 delete []ccollection;
292 delete []idxsuffix;
293 delete []txtsuffix;
294 delete []ccollectdir;
295
296 return databaseloaded;
297}
298
299
300// stem word uses the values set in the last call to makeindexcurrent
301// to stem the word. It is assumed that word is in unicode
302text_t mgsearchclass::stemword (const text_t &word) {
303 return to_uni (mgsearch_stemword (to_utf8 (word)));
304}
305
306text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
307 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
308}
309
310/**
311 * search directs the whole execution of the search; a number of other
312 * functions in this class are called as a result, and precondition
313 * checks are also made
314 */
315bool mgsearchclass::search(const queryparamclass &queryparams,
316 queryresultsclass &queryresults) {
317 // assert (cache != NULL);
318
319 // clear any previous results
320 queryresults.clear();
321 // first check the cache
322 if (cache != NULL) {
323 if (cache->find(queryparams, queryresults)) return true;
324 }
325 // make sure there is a query to be processed
326 if (!has_unicode_letdig(queryparams.querystring)) return true;
327
328 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
329 queryparams.language, queryparams.collection)) {
330 // initialise the form of results
331 setsearchmode (queryparams);
332
333 // execute the query
334 submitquery (queryparams);
335
336 // retrieve the results
337 getresults (queryparams, queryresults);
338 unload_database(); // Important that local library doesn't leave any files open
339 return true;
340 }
341
342 return false;
343}
344
345/* accumulator_method has been changed to use array rather than list.
346list appears to be broken somewhat - for some ranked queries, it returned
347fewer results than it should have (eg 45 instead of 50). The three other
348methods (array, splay_tree, hash_table) all return the same number of
349documents, in the same order, with the same ranks. list returns what
350appears to be the same documents (but less of them), but with different ranks,
351and in a different order. Minimal time tests dont show any speed improvement
352of list over array (maybe because its broken??). [02/2001, kjm18]
353
354... [sjboddie, also 02/2001] turns out that changing the accumulator_method
355introduced a more serious bug than it fixed (i.e. occasionally when doing a
356ranked search for a very common word you get no results at all). I've
357changed it back to list for now, one day we should play with other
358accumulator_methods but for now I don't have time and don't want to risk
359introducing bugs (better the devil you know ;)
360*/
361void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
362{
363 mgq_ask(".set expert true");
364 mgq_ask(".set sorted_terms true");
365 mgq_ask(".set accumulator_method list");
366 mgq_ask(".set max_accumulators 500000");
367 mgq_ask(".set maxparas 500000");
368 mgq_ask(".set verbatim true");
369 mgq_ask(".unset skip_dump");
370 mgq_ask(".set mode docnums");
371
372 switch (queryparams.search_type)
373 {
374 case 0: mgq_ask(".set query boolean"); break;
375 case 1: mgq_ask(".set query ranked"); break;
376 }
377 switch (queryparams.casefolding)
378 {
379 case 1: mgq_ask(".set casefold on"); break;
380 case 0: mgq_ask(".set casefold off"); break;
381 }
382 switch (queryparams.stemming)
383 {
384 case 1: mgq_ask(".set stem on"); break;
385 case 0: mgq_ask(".set stem off"); break;
386 }
387 mgq_ask(".set heads_length 150");
388
389 if (queryparams.maxdocs == -1) {
390 mgq_ask(".set maxdocs all");
391 } else {
392 char maxdocstr[32];
393 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
394 mgq_ask(maxdocstr);
395 }
396
397 char maxnumericstr[32];
398 sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
399 mgq_ask(maxnumericstr);
400
401}
402
403/**
404 * submitquery constructs the query string (into UTF8 encoding)
405 * and submits it using mgq_ask to the mg search engine. Most
406 * of the processing will be done inside Greenstone
407 */
408void mgsearchclass::submitquery (const queryparamclass &queryparams)
409{
410 // sort out the query string; copy it, remove all special characters
411 // and then convert it to a string in UTF8 format
412 text_t ttquerystring = queryparams.querystring;
413 filterquery (ttquerystring);
414 char *querystring = to_utf8(ttquerystring).getcstr();
415
416 // submit the query
417 mgq_ask(querystring);
418
419 // destroy the temporary character array
420 delete []querystring;
421}
422
423/**
424 * getrults is called to retrieve the required data on the docs
425 * which responded to the query submitted in submitquery above.
426 *
427 * It calls the local mgquery (mgq) interface to MG several times,
428 * to obtain the document numbers, term frequencies, term variants
429 * etc. All processing of the query will be done by Greenstone
430 * thereafter
431 */
432void mgsearchclass::getresults (const queryparamclass &queryparams,
433 queryresultsclass &queryresults) {
434 // get the configuration for the maximum number of documents to
435 // retrieve
436 int howmany = queryparams.maxdocs;
437 if (howmany == -1) howmany = MAXNUMDOCS;
438 mgq_results(result_docnums, 0, howmany,
439 ourquerycallback, (void *)(&queryresults));
440
441 // get the term frequencies
442 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
443 termfreqcallback, (void *)(&queryresults));
444 queryresults.sortuniqqueryterms();
445
446 // get term variants
447 mgq_results(result_terms, 0, MAXNUMTERMS,
448 termvariantscallback, (void *)(&queryresults));
449
450 // get the number of documents retrieved
451 int total_retrieved = 0, is_approx = 0;
452 mgq_docsretrieved (&total_retrieved, &is_approx);
453
454 if (total_retrieved == 0) {
455 // not available (or really was zero)
456 queryresults.docs_matched = queryresults.docs.docset.size();
457 if ((queryparams.maxdocs == -1) ||
458 (queryresults.docs_matched < queryparams.maxdocs))
459 queryresults.is_approx = Exact;
460 else
461 queryresults.is_approx = MoreThan;
462 } else {
463 queryresults.docs_matched = total_retrieved;
464 if (is_approx) queryresults.is_approx = Approximate;
465 else queryresults.is_approx = Exact;
466 }
467}
468
469/**
470 * Tidies the given querystring, removing special characters
471 */
472void mgsearchclass::filterquery (text_t &ttquerystring) {
473 text_t::iterator ithere = ttquerystring.begin ();
474 text_t::iterator itend = ttquerystring.end ();
475
476 // remove all non alphanumeric characters (except
477 // boolean operators
478 while (ithere != itend) {
479 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
480 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
481 (*ithere != ')')) (*ithere) = ' ';
482 ++ithere;
483 }
484}
485
486
487// the document text for 'docnum' is placed in 'output'
488// docTargetDocument returns 'true' if it was able to
489// try to get a document
490// collection is needed to see if an index from the
491// collection is loaded. If no index has been loaded
492// defaultindex is needed to load one
493bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
494 const text_t &defaultsubcollection,
495 const text_t &defaultlanguage,
496 const text_t &collection,
497 int docnum,
498 text_t &output) {
499 output.clear();
500
501 // get the mg version of the document
502 char *mgdoc = NULL;
503 int doclen = 0;
504 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
505 collection, docnum, mgdoc, doclen)) return false;
506 if (mgdoc == NULL) return false;
507
508 // replace all control-Cs with spaces
509 char *mgdoc_here = mgdoc;
510 char *mgdoc_end = mgdoc + doclen;
511 while (mgdoc_here < mgdoc_end) {
512 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
513 ++mgdoc_here;
514 }
515
516 // convert this document to unicode
517 utf8inconvertclass inconvert;
518 convertclass::status_t status;
519 inconvert.reset ();
520 inconvert.setinput (mgdoc, doclen);
521 inconvert.convert (output, status);
522
523 return true;
524}
525
526
527bool mgsearchclass::mgdocument (const text_t &defaultindex,
528 const text_t &defaultsubcollection,
529 const text_t &defaultlanguage,
530 const text_t &collection,
531 int docnum,
532 char *&UDoc, int &ULen) {
533 int databaseloaded = 0;
534
535 UDoc = NULL; ULen = 0;
536
537 // see if we can make an appropriate database current
538// char *ccollection = collection.getcstr();
539// assert (ccollection != NULL);
540// databaseloaded = load_text_database (ccollection);
541// delete []ccollection;
542
543 // try and load the database
544// if (!databaseloaded)
545 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
546 defaultlanguage, collection);
547
548 if (databaseloaded) {
549 // retrieve the document from mg
550 char docstr[32];
551 sprintf(docstr, "%i", docnum);
552
553 mgq_ask(".set mode text");
554 mgq_ask(".set query docnums");
555 mgq_ask(docstr);
556
557 tempdoc = NULL;
558 templen = 0;
559 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
560 UDoc = tempdoc;
561 ULen = templen;
562 }
563
564 unload_database(); // Important that local library doesn't leave any files open
565 return (bool)databaseloaded;
566}
567
568// unload_database simply calls mgq's close_all_databases function to clear
569// any cached databases - this is useful when attempting to completely
570// remove all trace of a collectionserver at runtime (when using a
571// persistent version of Greenstone like the windows local library)
572void mgsearchclass::unload_database () {
573 close_all_databases();
574}
Note: See TracBrowser for help on using the repository browser.