source: gsdl/trunk/src/colservr/mgsearch.cpp@ 15757

Last change on this file since 15757 was 15590, checked in by mdewsnip, 16 years ago

Removed unnecessary inclusions of the gdbm headers.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.5 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43
44#include <assert.h>
45
46#include "mgq.h"
47// #include "locateinfo.h"
48#include "gsdlunicode.h"
49#include "unitool.h"
50
51
52/////////////
53// globals //
54/////////////
55
// Hand-off slots between doctextcallback and mgsearchclass::mgdocument.
// mgdocument resets both before asking mg for a document; doctextcallback
// stores a new[]-allocated copy of the document text in tempdoc and its
// length in templen; mgdocument then passes both on to its caller, which
// becomes responsible for delete[]-ing the buffer.
56static char *tempdoc = NULL;
57static int templen = 0;
58
59
60//////////////////////
61// useful functions //
62//////////////////////
63
64
65// input and output are in utf8
66text_t mgsearch_stemword (const text_t &word) {
67 // allocate working stem space
68 int maxstemlen = mgq_getmaxstemlen ();
69 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
70 if (word_stem == NULL) return "";
71
72 // copy word to word_stem
73 int len = 0;
74 text_t::const_iterator here = word.begin();
75 text_t::const_iterator end = word.end();
76 while (len < maxstemlen && here != end) {
77 word_stem[len+1] = (unsigned char)(*here);
78 ++len; ++here;
79 }
80 word_stem[len+1] = '\0';
81 word_stem[0] = len;
82
83 mgq_stemword (word_stem);
84
85 // copy word_stem back to tempstr
86 text_t tempstr;
87 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
88
89 delete [] word_stem;
90
91 return tempstr;
92}
93
94
95
96////////////////////////
97// callback functions //
98////////////////////////
99
100// This routine is called for each document found in a search
101// it assumes that cache_num is set up correctly to point to
102// a suitable result cache
103int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
104 float Weight, void *info) {
105
106
107 queryresultsclass *queryresults = (queryresultsclass * )info;
108
109 // append this entry to the document results
110 docresultclass docresult;
111 docresult.docnum = DocNum;
112 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
113 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
114
115 queryresults->docs.docset[DocNum] = docresult;
116 queryresults->docs.docorder.push_back(DocNum);
117
118 return 0;
119}
120
121int termequivcallback(char *Word, int ULen, int /*Freq*/,
122 float /*Weight*/, void *info) {
123 text_tset *equivterms = (text_tset *)info;
124 if (equivterms == NULL) return 0;
125
126 text_t thisterm;
127 thisterm.setcarr(Word, ULen);
128
129 equivterms->insert(thisterm);
130
131 return 0;
132}
133
134
135void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
136 // allocate working stem space
137 int maxstemlen = mgq_getmaxstemlen ();
138 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
139 if (word_stem == NULL) return;
140
141 // copy word to word_stem
142 int len = 0;
143 text_t::const_iterator here = word.begin();
144 text_t::const_iterator end = word.end();
145 while (len < maxstemlen && here != end) {
146 word_stem[len+1] = (unsigned char)(*here);
147 ++len; ++here;
148 }
149 word_stem[len+1] = '\0';
150 word_stem[0] = len;
151
152 // get the equivalent terms
153 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
154
155 delete [] word_stem;
156
157 return;
158}
159
// NOTE(review): this file-scope set is not referenced anywhere else in the
// visible source; the sets that termfreqcallback actually fills are the
// per-term termfreqclass::utf8equivterms members. Confirm nothing outside
// this file links against it before removing.
160 text_tset utf8equivterms; // kept as utf8 string for fast matching
161
162
163// This callback is called once for each term in the query
164int termfreqcallback(char *Word, int ULen, int Freq,
165 float /*Weight*/, void *info) {
166 queryresultsclass *queryresults = (queryresultsclass *)info;
167 if (queryresults == NULL) return 0;
168
169 text_t term;
170 term.setcarr(Word, ULen);
171 termfreqclass termfreq;
172
173 termfreq.termstr = to_uni(term);
174 text_t utf8termstem = mgsearch_stemword (term);
175 termfreq.termstemstr = to_uni (utf8termstem);
176
177 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
178
179 termfreq.termfreq = Freq;
180 queryresults->orgterms.push_back(termfreq);
181
182 return 0;
183}
184
185// this callback is called once for each variation of each term
186int termvariantscallback(char *Word, int ULen, int /*Freq*/,
187 float /*Weight*/, void *info) {
188
189 text_t term;
190 term.setcarr(Word, ULen);
191 queryresultsclass *queryresults = (queryresultsclass *)info;
192 queryresults->termvariants.insert(to_uni(term));
193
194 return 0;
195}
196
197// This callback is for getting document text
198int doctextcallback(char *Doc, int ULen, int /*Freq*/,
199 float /*Weight*/, void * /*info*/) {
200 if (Doc != NULL) {
201 // Make a copy of this string so we can unload the database without losing it
202 tempdoc = new char[ULen + 1];
203 strcpy(tempdoc, Doc);
204 }
205 templen = ULen;
206
207 return 0;
208}
209
210
211text_t mgsearchclass::getindexsuffix (const text_t &collection,
212 const text_t &index) {
213
214 text_t indexsuffix = "index";
215 indexsuffix = filename_cat (indexsuffix, index);
216 if (indexstem.empty()) {
217 // no index stem, use the coll name
218 indexsuffix = filename_cat (indexsuffix, collection);
219 } else {
220 indexsuffix = filename_cat (indexsuffix, indexstem);
221 }
222 return indexsuffix;
223}
224
225
226
227
228////////////////////
229// mgsearch class //
230////////////////////
231
232mgsearchclass::mgsearchclass ()
233 : searchclass() {
234
235}
236
237mgsearchclass::~mgsearchclass ()
238{
239 if (cache != NULL)
240 {
241 delete cache;
242 cache = NULL;
243 }
244}
245
246void mgsearchclass::set_indexstem(const text_t &stem) {
247 indexstem = stem;
248
249}
250
251// you only need to use this function before doing any stemming
252// casefolding and stemming will be set if values for them are
253// provided (0 or 1).
254// makeindexcurrent returns true if it was able to load the database
255bool mgsearchclass::makeindexcurrent (const text_t &index,
256 const text_t &subcollection,
257 const text_t &language,
258 const text_t &collection,
259 int casefolding,
260 int stemming) {
261 bool databaseloaded = true;
262
263 // get the names of the collection, index and text suffixes
264 char *ccollection = collection.getcstr();
265 assert (ccollection != NULL);
266 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
267 assert (idxsuffix != NULL);
268 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
269 assert (txtsuffix != NULL);
270#ifdef __WIN32__
271 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
272#else
273 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
274#endif
275
276 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
277 if (casefolding == 0) mgq_ask(".set casefold off");
278 else if (casefolding > 0) mgq_ask(".set casefold on");
279 if (stemming == 0) mgq_ask(".set stem off");
280 else if (stemming > 0) mgq_ask(".set stem on");
281
282 } else databaseloaded = false;
283
284 // free up the c strings
285 delete []ccollection;
286 delete []idxsuffix;
287 delete []txtsuffix;
288 delete []ccollectdir;
289
290 return databaseloaded;
291}
292
293
294// stem word uses the values set in the last call to makeindexcurrent
295// to stem the word. It is assumed that word is in unicode
296text_t mgsearchclass::stemword (const text_t &word) {
297 return to_uni (mgsearch_stemword (to_utf8 (word)));
298}
299
300text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
301 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
302}
303
304/**
305 * search directs the whole execution of the search; a number of other
306 * functions in this class are called as a result, and precondition
307 * checks are also made
308 */
309bool mgsearchclass::search(const queryparamclass &queryparams,
310 queryresultsclass &queryresults) {
311 // assert (cache != NULL);
312
313 // clear any previous results
314 queryresults.clear();
315 // first check the cache
316 if (cache != NULL) {
317 if (cache->find(queryparams, queryresults)) return true;
318 }
319 // make sure there is a query to be processed
320 if (!has_unicode_letdig(queryparams.querystring)) return true;
321
322 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
323 queryparams.language, queryparams.collection)) {
324 // initialise the form of results
325 setsearchmode (queryparams);
326
327 // execute the query
328 submitquery (queryparams);
329
330 // retrieve the results
331 getresults (queryparams, queryresults);
332 unload_database(); // Important that local library doesn't leave any files open
333 return true;
334 }
335
336 return false;
337}
338
339/* accumulator_method has been changed to use array rather than list.
340list appears to be broken somewhat - for some ranked queries, it returned
341fewer results than it should have (eg 45 instead of 50). The three other
342methods (array, splay_tree, hash_table) all return the same number of
343documents, in the same order, with the same ranks. list returns what
344appears to be the same documents (but less of them), but with different ranks,
345and in a different order. Minimal time tests dont show any speed improvement
346of list over array (maybe because its broken??). [02/2001, kjm18]
347
348... [sjboddie, also 02/2001] turns out that changing the accumulator_method
349introduced a more serious bug than it fixed (i.e. occasionally when doing a
350ranked search for a very common word you get no results at all). I've
351changed it back to list for now, one day we should play with other
352accumulator_methods but for now I don't have time and don't want to risk
353introducing bugs (better the devil you know ;)
354*/
355void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
356{
357 mgq_ask(".set expert true");
358 mgq_ask(".set sorted_terms true");
359 mgq_ask(".set accumulator_method list");
360 mgq_ask(".set max_accumulators 500000");
361 mgq_ask(".set maxparas 500000");
362 mgq_ask(".set verbatim true");
363 mgq_ask(".unset skip_dump");
364 mgq_ask(".set mode docnums");
365
366 switch (queryparams.search_type)
367 {
368 case 0: mgq_ask(".set query boolean"); break;
369 case 1: mgq_ask(".set query ranked"); break;
370 }
371 switch (queryparams.casefolding)
372 {
373 case 1: mgq_ask(".set casefold on"); break;
374 case 0: mgq_ask(".set casefold off"); break;
375 }
376 switch (queryparams.stemming)
377 {
378 case 1: mgq_ask(".set stem on"); break;
379 case 0: mgq_ask(".set stem off"); break;
380 }
381 mgq_ask(".set heads_length 150");
382
383 if (queryparams.maxdocs == -1) {
384 mgq_ask(".set maxdocs all");
385 } else {
386 char maxdocstr[32];
387 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
388 mgq_ask(maxdocstr);
389 }
390
391 char maxnumericstr[32];
392 sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
393 mgq_ask(maxnumericstr);
394
395}
396
397/**
398 * submitquery constructs the query string (into UTF8 encoding)
399 * and submits it using mgq_ask to the mg search engine. Most
400 * of the processing will be done inside Greenstone
401 */
402void mgsearchclass::submitquery (const queryparamclass &queryparams)
403{
404 // sort out the query string; copy it, remove all special characters
405 // and then convert it to a string in UTF8 format
406 text_t ttquerystring = queryparams.querystring;
407 filterquery (ttquerystring);
408 char *querystring = to_utf8(ttquerystring).getcstr();
409
410 // submit the query
411 mgq_ask(querystring);
412
413 // destroy the temporary character array
414 delete []querystring;
415}
416
417/**
418 * getrults is called to retrieve the required data on the docs
419 * which responded to the query submitted in submitquery above.
420 *
421 * It calls the local mgquery (mgq) interface to MG several times,
422 * to obtain the document numbers, term frequencies, term variants
423 * etc. All processing of the query will be done by Greenstone
424 * thereafter
425 */
426void mgsearchclass::getresults (const queryparamclass &queryparams,
427 queryresultsclass &queryresults) {
428 // get the configuration for the maximum number of documents to
429 // retrieve
430 int howmany = queryparams.maxdocs;
431 if (howmany == -1) howmany = MAXNUMDOCS;
432 mgq_results(result_docnums, 0, howmany,
433 ourquerycallback, (void *)(&queryresults));
434
435 // get the term frequencies
436 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
437 termfreqcallback, (void *)(&queryresults));
438 queryresults.sortuniqqueryterms();
439
440 // get term variants
441 mgq_results(result_terms, 0, MAXNUMTERMS,
442 termvariantscallback, (void *)(&queryresults));
443
444 // get the number of documents retrieved
445 int total_retrieved = 0, is_approx = 0;
446 mgq_docsretrieved (&total_retrieved, &is_approx);
447
448 if (total_retrieved == 0) {
449 // not available (or really was zero)
450 queryresults.docs_matched = queryresults.docs.docset.size();
451 if ((queryparams.maxdocs == -1) ||
452 (queryresults.docs_matched < queryparams.maxdocs))
453 queryresults.is_approx = Exact;
454 else
455 queryresults.is_approx = MoreThan;
456 } else {
457 queryresults.docs_matched = total_retrieved;
458 if (is_approx) queryresults.is_approx = Approximate;
459 else queryresults.is_approx = Exact;
460 }
461}
462
463/**
464 * Tidies the given querystring, removing special characters
465 */
466void mgsearchclass::filterquery (text_t &ttquerystring) {
467 text_t::iterator ithere = ttquerystring.begin ();
468 text_t::iterator itend = ttquerystring.end ();
469
470 // remove all non alphanumeric characters (except
471 // boolean operators
472 while (ithere != itend) {
473 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
474 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
475 (*ithere != ')')) (*ithere) = ' ';
476 ++ithere;
477 }
478}
479
480
481// the document text for 'docnum' is placed in 'output'
482// docTargetDocument returns 'true' if it was able to
483// try to get a document
484// collection is needed to see if an index from the
485// collection is loaded. If no index has been loaded
486// defaultindex is needed to load one
487bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
488 const text_t &defaultsubcollection,
489 const text_t &defaultlanguage,
490 const text_t &collection,
491 int docnum,
492 text_t &output) {
493 output.clear();
494
495 // get the mg version of the document
496 char *mgdoc = NULL;
497 int doclen = 0;
498 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
499 collection, docnum, mgdoc, doclen)) return false;
500 if (mgdoc == NULL) return false;
501
502 // replace all control-Cs with spaces
503 char *mgdoc_here = mgdoc;
504 char *mgdoc_end = mgdoc + doclen;
505 while (mgdoc_here < mgdoc_end) {
506 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
507 ++mgdoc_here;
508 }
509
510 // convert this document to unicode
511 utf8inconvertclass inconvert;
512 convertclass::status_t status;
513 inconvert.reset ();
514 inconvert.setinput (mgdoc, doclen);
515 inconvert.convert (output, status);
516
517 delete[] mgdoc;
518 return true;
519}
520
521
522bool mgsearchclass::mgdocument (const text_t &defaultindex,
523 const text_t &defaultsubcollection,
524 const text_t &defaultlanguage,
525 const text_t &collection,
526 int docnum,
527 char *&UDoc, int &ULen) {
528 int databaseloaded = 0;
529
530 UDoc = NULL; ULen = 0;
531
532 // see if we can make an appropriate database current
533// char *ccollection = collection.getcstr();
534// assert (ccollection != NULL);
535// databaseloaded = load_text_database (ccollection);
536// delete []ccollection;
537
538 // try and load the database
539// if (!databaseloaded)
540 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
541 defaultlanguage, collection);
542
543 if (databaseloaded) {
544 // retrieve the document from mg
545 char docstr[32];
546 sprintf(docstr, "%i", docnum);
547
548 mgq_ask(".set mode text");
549 mgq_ask(".set query docnums");
550 mgq_ask(docstr);
551
552 tempdoc = NULL;
553 templen = 0;
554 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
555 UDoc = tempdoc;
556 ULen = templen;
557 }
558
559 unload_database(); // Important that local library doesn't leave any files open
560 return (bool)databaseloaded;
561}
562
563// unload_database simply calls mgq's close_all_databases function to clear
564// any cached databases - this is useful when attempting to completely
565// remove all trace of a collectionserver at runtime (when using a
566// persistent version of Greenstone like the windows local library)
567void mgsearchclass::unload_database () {
568 close_all_databases();
569}
Note: See TracBrowser for help on using the repository browser.