source: trunk/gsdl/src/colservr/mgsearch.cpp@ 9620

Last change on this file since 9620 was 9620, checked in by kjdon, 19 years ago

added some x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45# include "autoconf.h"
46# include "systems.h"
47# include "gdbmconst.h"
48# include "gdbm.h"
49#else
50# include <gdbm.h>
51#endif
52
53
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
// Scratch state shared between doctextcallback and mgdocument:
// doctextcallback records the buffer and length handed to it by mg here,
// and mgdocument resets both before each fetch and reads them afterwards.
static int templen = 0;
static char *tempdoc = NULL;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77 // allocate working stem space
78 int maxstemlen = mgq_getmaxstemlen ();
79 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80 if (word_stem == NULL) return "";
81
82 // copy word to word_stem
83 int len = 0;
84 text_t::const_iterator here = word.begin();
85 text_t::const_iterator end = word.end();
86 while (len < maxstemlen && here != end) {
87 word_stem[len+1] = (unsigned char)(*here);
88 ++len; ++here;
89 }
90 word_stem[len+1] = '\0';
91 word_stem[0] = len;
92
93 mgq_stemword (word_stem);
94
95 // copy word_stem back to tempstr
96 text_t tempstr;
97 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99 delete [] word_stem;
100
101 return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114 float Weight, void *info) {
115
116
117 queryresultsclass *queryresults = (queryresultsclass * )info;
118
119 // append this entry to the document results
120 docresultclass docresult;
121 docresult.docnum = DocNum;
122 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125 queryresults->docs.docset[DocNum] = docresult;
126 queryresults->docs.docorder.push_back(DocNum);
127
128 return 0;
129}
130
131int termequivcallback(char *Word, int ULen, int /*Freq*/,
132 float /*Weight*/, void *info) {
133 text_tset *equivterms = (text_tset *)info;
134 if (equivterms == NULL) return 0;
135
136 text_t thisterm;
137 thisterm.setcarr(Word, ULen);
138
139 equivterms->insert(thisterm);
140
141 return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146 // allocate working stem space
147 int maxstemlen = mgq_getmaxstemlen ();
148 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149 if (word_stem == NULL) return;
150
151 // copy word to word_stem
152 int len = 0;
153 text_t::const_iterator here = word.begin();
154 text_t::const_iterator end = word.end();
155 while (len < maxstemlen && here != end) {
156 word_stem[len+1] = (unsigned char)(*here);
157 ++len; ++here;
158 }
159 word_stem[len+1] = '\0';
160 word_stem[0] = len;
161
162 // get the equivalent terms
163 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165 delete [] word_stem;
166
167 return;
168}
169
170 text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen, int Freq,
175 float /*Weight*/, void *info) {
176 queryresultsclass *queryresults = (queryresultsclass *)info;
177 if (queryresults == NULL) return 0;
178
179 text_t term;
180 term.setcarr(Word, ULen);
181 termfreqclass termfreq;
182
183 termfreq.termstr = to_uni(term);
184 text_t utf8termstem = mgsearch_stemword (term);
185 termfreq.termstemstr = to_uni (utf8termstem);
186
187 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189 termfreq.termfreq = Freq;
190 queryresults->orgterms.push_back(termfreq);
191
192 return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197 float /*Weight*/, void *info) {
198
199 text_t term;
200 term.setcarr(Word, ULen);
201 queryresultsclass *queryresults = (queryresultsclass *)info;
202 queryresults->termvariants.insert(to_uni(term));
203
204 return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen, int /*Freq*/,
209 float /*Weight*/, void * /*info*/) {
210 tempdoc = Doc;
211 templen = ULen;
212
213 return 0;
214}
215
216
217static text_t getindexsuffix (const text_t &collection,
218 const text_t &index) {
219
220 text_t indexsuffix = "index";
221 indexsuffix = filename_cat (indexsuffix, index);
222 indexsuffix = filename_cat (indexsuffix, collection);
223 return indexsuffix;
224}
225
226
227
228
229////////////////////
230// mgsearch class //
231////////////////////
232
233mgsearchclass::mgsearchclass ()
234 : searchclass() {
235
236}
237
238mgsearchclass::~mgsearchclass ()
239{
240 if (cache != NULL)
241 {
242 delete cache;
243 cache = NULL;
244 }
245}
246
247// you only need to use this function before doing any stemming
248// casefolding and stemming will be set if values for them are
249// provided (0 or 1).
250// makeindexcurrent returns true if it was able to load the database
251bool mgsearchclass::makeindexcurrent (const text_t &index,
252 const text_t &subcollection,
253 const text_t &language,
254 const text_t &collection,
255 int casefolding,
256 int stemming) {
257 bool databaseloaded = true;
258
259 // get the names of the collection, index and text suffixes
260 char *ccollection = collection.getcstr();
261 assert (ccollection != NULL);
262 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
263 assert (idxsuffix != NULL);
264 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
265 assert (txtsuffix != NULL);
266
267#ifdef __WIN32__
268 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
269#else
270 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
271#endif
272
273 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
274 if (casefolding == 0) mgq_ask(".set casefold off");
275 else if (casefolding > 0) mgq_ask(".set casefold on");
276 if (stemming == 0) mgq_ask(".set stem off");
277 else if (stemming > 0) mgq_ask(".set stem on");
278
279 } else databaseloaded = false;
280
281 // free up the c strings
282 delete ccollection;
283 delete idxsuffix;
284 delete txtsuffix;
285 delete ccollectdir;
286
287 return databaseloaded;
288}
289
290
291// stem word uses the values set in the last call to makeindexcurrent
292// to stem the word. It is assumed that word is in unicode
293text_t mgsearchclass::stemword (const text_t &word) {
294 return to_uni (mgsearch_stemword (to_utf8 (word)));
295}
296
297text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
298 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
299}
300
301/**
302 * search directs the whole execution of the search; a number of other
303 * functions in this class are called as a result, and precondition
304 * checks are also made
305 */
306bool mgsearchclass::search(const queryparamclass &queryparams,
307 queryresultsclass &queryresults) {
308 // assert (cache != NULL);
309
310 // clear any previous results
311 queryresults.clear();
312 // first check the cache
313 if (cache != NULL) {
314 if (cache->find(queryparams, queryresults)) return true;
315 }
316 // make sure there is a query to be processed
317 if (!has_unicode_letdig(queryparams.querystring)) return true;
318
319 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
320 queryparams.language, queryparams.collection)) {
321 // initialise the form of results
322 setsearchmode (queryparams);
323
324 // execute the query
325 submitquery (queryparams);
326
327 // retrieve the results
328 getresults (queryparams, queryresults);
329 return true;
330 }
331
332 return false;
333}
334
335/* accumulator_method has been changed to use array rather than list.
336list appears to be broken somewhat - for some ranked queries, it returned
337fewer results than it should have (eg 45 instead of 50). The three other
338methods (array, splay_tree, hash_table) all return the same number of
339documents, in the same order, with the same ranks. list returns what
340appears to be the same documents (but less of them), but with different ranks,
341and in a different order. Minimal time tests dont show any speed improvement
342of list over array (maybe because its broken??). [02/2001, kjm18]
343
344... [sjboddie, also 02/2001] turns out that changing the accumulator_method
345introduced a more serious bug than it fixed (i.e. occasionally when doing a
346ranked search for a very common word you get no results at all). I've
347changed it back to list for now, one day we should play with other
348accumulator_methods but for now I don't have time and don't want to risk
349introducing bugs (better the devil you know ;)
350*/
351void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
352{
353 mgq_ask(".set expert true");
354 mgq_ask(".set sorted_terms true");
355 mgq_ask(".set accumulator_method list");
356 mgq_ask(".set max_accumulators 500000");
357 mgq_ask(".set maxparas 500000");
358 mgq_ask(".set verbatim true");
359 mgq_ask(".unset skip_dump");
360 mgq_ask(".set mode docnums");
361
362 switch (queryparams.search_type)
363 {
364 case 0: mgq_ask(".set query boolean"); break;
365 case 1: mgq_ask(".set query ranked"); break;
366 }
367 switch (queryparams.casefolding)
368 {
369 case 1: mgq_ask(".set casefold on"); break;
370 case 0: mgq_ask(".set casefold off"); break;
371 }
372 switch (queryparams.stemming)
373 {
374 case 1: mgq_ask(".set stem on"); break;
375 case 0: mgq_ask(".set stem off"); break;
376 }
377 mgq_ask(".set heads_length 150");
378
379 if (queryparams.maxdocs == -1) {
380 mgq_ask(".set maxdocs all");
381 } else {
382 char maxdocstr[32];
383 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
384 mgq_ask(maxdocstr);
385 }
386
387 char maxnumericstr[32];
388 sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
389 mgq_ask(maxnumericstr);
390
391}
392
393/**
394 * submitquery constructs the query string (into UTF8 encoding)
395 * and submits it using mgq_ask to the mg search engine. Most
396 * of the processing will be done inside Greenstone
397 */
398void mgsearchclass::submitquery (const queryparamclass &queryparams)
399{
400 // sort out the query string; copy it, remove all special characters
401 // and then convert it to a string in UTF8 format
402 text_t ttquerystring = queryparams.querystring;
403 filterquery (ttquerystring);
404 char *querystring = to_utf8(ttquerystring).getcstr();
405
406 // submit the query
407 mgq_ask(querystring);
408
409 // destroy the temporary character array
410 delete querystring;
411}
412
413/**
414 * getrults is called to retrieve the required data on the docs
415 * which responded to the query submitted in submitquery above.
416 *
417 * It calls the local mgquery (mgq) interface to MG several times,
418 * to obtain the document numbers, term frequencies, term variants
419 * etc. All processing of the query will be done by Greenstone
420 * thereafter
421 */
422void mgsearchclass::getresults (const queryparamclass &queryparams,
423 queryresultsclass &queryresults) {
424 // get the configuration for the maximum number of documents to
425 // retrieve
426 int howmany = queryparams.maxdocs;
427 if (howmany == -1) howmany = MAXNUMDOCS;
428 mgq_results(result_docnums, 0, howmany,
429 ourquerycallback, (void *)(&queryresults));
430
431 // get the term frequencies
432 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
433 termfreqcallback, (void *)(&queryresults));
434 queryresults.sortuniqqueryterms();
435
436 // get term variants
437 mgq_results(result_terms, 0, MAXNUMTERMS,
438 termvariantscallback, (void *)(&queryresults));
439
440 // get the number of documents retrieved
441 int total_retrieved = 0, is_approx = 0;
442 mgq_docsretrieved (&total_retrieved, &is_approx);
443
444 if (total_retrieved == 0) {
445 // not available (or really was zero)
446 queryresults.docs_matched = queryresults.docs.docset.size();
447 if ((queryparams.maxdocs == -1) ||
448 (queryresults.docs_matched < queryparams.maxdocs))
449 queryresults.is_approx = Exact;
450 else
451 queryresults.is_approx = MoreThan;
452 } else {
453 queryresults.docs_matched = total_retrieved;
454 if (is_approx) queryresults.is_approx = Approximate;
455 else queryresults.is_approx = Exact;
456 }
457}
458
459/**
460 * Tidies the given querystring, removing special characters
461 */
462void mgsearchclass::filterquery (text_t &ttquerystring) {
463 text_t::iterator ithere = ttquerystring.begin ();
464 text_t::iterator itend = ttquerystring.end ();
465
466 // remove all non alphanumeric characters (except
467 // boolean operators
468 while (ithere != itend) {
469 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
470 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
471 (*ithere != ')')) (*ithere) = ' ';
472 ++ithere;
473 }
474}
475
476
477// the document text for 'docnum' is placed in 'output'
478// docTargetDocument returns 'true' if it was able to
479// try to get a document
480// collection is needed to see if an index from the
481// collection is loaded. If no index has been loaded
482// defaultindex is needed to load one
483bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
484 const text_t &defaultsubcollection,
485 const text_t &defaultlanguage,
486 const text_t &collection,
487 int docnum,
488 text_t &output) {
489 output.clear();
490
491 // get the mg version of the document
492 char *mgdoc = NULL;
493 int doclen = 0;
494 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
495 collection, docnum, mgdoc, doclen)) return false;
496 if (mgdoc == NULL) return false;
497
498 // replace all control-Cs with spaces
499 char *mgdoc_here = mgdoc;
500 char *mgdoc_end = mgdoc + doclen;
501 while (mgdoc_here < mgdoc_end) {
502 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
503 ++mgdoc_here;
504 }
505
506 // convert this document to unicode
507 utf8inconvertclass inconvert;
508 convertclass::status_t status;
509 inconvert.reset ();
510 inconvert.setinput (mgdoc, doclen);
511 inconvert.convert (output, status);
512
513 return true;
514}
515
516
517bool mgsearchclass::mgdocument (const text_t &defaultindex,
518 const text_t &defaultsubcollection,
519 const text_t &defaultlanguage,
520 const text_t &collection,
521 int docnum,
522 char *&UDoc, int &ULen) {
523 int databaseloaded = 0;
524
525 UDoc = NULL; ULen = 0;
526
527 // see if we can make an appropriate database current
528// char *ccollection = collection.getcstr();
529// assert (ccollection != NULL);
530// databaseloaded = load_text_database (ccollection);
531// delete ccollection;
532
533 // try and load the database
534// if (!databaseloaded)
535 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
536 defaultlanguage, collection);
537
538 if (databaseloaded) {
539 // retrieve the document from mg
540 char docstr[32];
541 sprintf(docstr, "%i", docnum);
542
543 mgq_ask(".set mode text");
544 mgq_ask(".set query docnums");
545 mgq_ask(docstr);
546
547 tempdoc = NULL;
548 templen = 0;
549 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
550 UDoc = tempdoc;
551 ULen = templen;
552 }
553
554 return (bool)databaseloaded;
555}
556
557// unload_database simply calls mgq's close_all_databases function to clear
558// any cached databases - this is useful when attempting to completely
559// remove all trace of a collectionserver at runtime (when using a
560// persistent version of Greenstone like the windows local library)
561void mgsearchclass::unload_database () {
562 close_all_databases();
563}
Note: See TracBrowser for help on using the repository browser.