root/trunk/gsdl/src/colservr/mgsearch.cpp @ 13789

Revision 13789, 16.6 KB (checked in by mdewsnip, 13 years ago)

Fixed a problem with my previous change causing the local library to often crash when displaying a document. This was due to the document text being in memory that was deleted when the database was unloaded.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36#  include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38#  include <iostream.h>
39#else
40#  include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45#  include "autoconf.h"
46#  include "systems.h"
47#  include "gdbmconst.h"
48#  include "gdbm.h"
49#else
50#  include <gdbm.h>
51#endif
52
53 
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
66static char *tempdoc = NULL;
67static int templen = 0;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77  // allocate working stem space
78  int maxstemlen = mgq_getmaxstemlen ();
79  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80  if (word_stem == NULL) return "";
81
82  // copy word to word_stem
83  int len = 0;
84  text_t::const_iterator here = word.begin();
85  text_t::const_iterator end = word.end();
86  while (len < maxstemlen && here != end) {
87    word_stem[len+1] = (unsigned char)(*here);
88    ++len; ++here;
89  }
90  word_stem[len+1] = '\0';
91  word_stem[0] = len;
92
93  mgq_stemword (word_stem);
94
95  // copy word_stem back to tempstr
96  text_t tempstr;
97  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99  delete [] word_stem;
100 
101  return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114             float Weight, void *info) {
115
116 
117  queryresultsclass *queryresults = (queryresultsclass * )info;
118
119  // append this entry to the document results
120  docresultclass docresult;
121  docresult.docnum = DocNum;
122  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123  docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124 
125  queryresults->docs.docset[DocNum] = docresult;
126  queryresults->docs.docorder.push_back(DocNum);
127 
128  return 0;
129}
130
131int termequivcallback(char *Word, int ULen,  int /*Freq*/,
132              float /*Weight*/,  void *info) {
133  text_tset *equivterms = (text_tset *)info;
134  if (equivterms == NULL) return 0;
135
136  text_t thisterm;
137  thisterm.setcarr(Word, ULen);
138
139  equivterms->insert(thisterm);
140 
141  return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146  // allocate working stem space
147  int maxstemlen = mgq_getmaxstemlen ();
148  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149  if (word_stem == NULL) return;
150
151  // copy word to word_stem
152  int len = 0;
153  text_t::const_iterator here = word.begin();
154  text_t::const_iterator end = word.end();
155  while (len < maxstemlen && here != end) {
156    word_stem[len+1] = (unsigned char)(*here);
157    ++len; ++here;
158  }
159  word_stem[len+1] = '\0';
160  word_stem[0] = len;
161
162  // get the equivalent terms
163  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164 
165  delete [] word_stem;
166
167  return;
168}
169
// NOTE(review): nothing in this file reads or writes this file-scope set
// (termfreqcallback fills the utf8equivterms member of termfreqclass
// instead) -- confirm whether another source file refers to it.
170  text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen,  int Freq,
175             float /*Weight*/,  void *info) {
176  queryresultsclass *queryresults = (queryresultsclass *)info;
177  if (queryresults == NULL) return 0;
178
179  text_t term;
180  term.setcarr(Word, ULen);
181  termfreqclass termfreq;
182
183  termfreq.termstr = to_uni(term);
184  text_t utf8termstem = mgsearch_stemword (term);
185  termfreq.termstemstr = to_uni (utf8termstem);
186
187  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188 
189  termfreq.termfreq = Freq;
190  queryresults->orgterms.push_back(termfreq);
191 
192  return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197             float /*Weight*/, void *info) {
198
199  text_t term;
200  term.setcarr(Word, ULen);
201  queryresultsclass *queryresults = (queryresultsclass *)info;
202  queryresults->termvariants.insert(to_uni(term));
203
204  return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen,  int /*Freq*/,
209            float /*Weight*/,  void * /*info*/) {
210  if (Doc != NULL) {
211    // Make a copy of this string so we can unload the database without losing it
212    tempdoc = new char[ULen + 1];
213    strcpy(tempdoc, Doc);
214  }
215  templen = ULen;
216 
217  return 0;
218}
219
220
221text_t mgsearchclass::getindexsuffix (const text_t &collection,
222                  const text_t &index) {
223
224  text_t indexsuffix = "index"; 
225  indexsuffix = filename_cat (indexsuffix, index);
226  if (indexstem.empty()) {
227    // no index stem, use the coll name
228    indexsuffix = filename_cat (indexsuffix, collection);
229  } else {
230    indexsuffix = filename_cat (indexsuffix, indexstem);
231  }
232  return indexsuffix;
233}
234
235
236
237
238////////////////////
239// mgsearch class //
240////////////////////
241
242mgsearchclass::mgsearchclass ()
243  : searchclass() {
244 
245}
246
247mgsearchclass::~mgsearchclass ()
248{
249  if (cache != NULL)
250    {
251      delete cache;
252      cache = NULL;
253    }
254}
255
256void mgsearchclass::set_indexstem(const text_t &stem) {
257  indexstem = stem;
258 
259}
260
261// you only need to use this function before doing any stemming
262// casefolding and stemming will be set if values for them are
263// provided (0 or 1).
264// makeindexcurrent returns true if it was able to load the database
265bool mgsearchclass::makeindexcurrent (const text_t &index,
266                      const text_t &subcollection,
267                      const text_t &language,
268                      const text_t &collection,
269                      int casefolding,
270                      int stemming) {
271  bool databaseloaded = true;
272
273  // get the names of the collection, index and text suffixes
274  char *ccollection = collection.getcstr();
275  assert (ccollection != NULL);
276  char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
277  assert (idxsuffix != NULL);
278  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
279  assert (txtsuffix != NULL);
280#ifdef __WIN32__
281  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
282#else
283  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
284#endif
285
286  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
287    if (casefolding == 0) mgq_ask(".set casefold off");
288    else if (casefolding > 0) mgq_ask(".set casefold on");
289    if (stemming == 0) mgq_ask(".set stem off");
290    else if (stemming > 0) mgq_ask(".set stem on");
291   
292  } else databaseloaded = false;
293
294  // free up the c strings
295  delete []ccollection;
296  delete []idxsuffix;
297  delete []txtsuffix;
298  delete []ccollectdir;
299
300  return databaseloaded;
301}
302
303
304// stem word uses the values set in the last call to makeindexcurrent
305// to stem the word. It is assumed that word is in unicode
306text_t mgsearchclass::stemword (const text_t &word) {
307  return to_uni (mgsearch_stemword (to_utf8 (word)));
308}
309
310text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
311  return to_uni (mgsearch_stemword (to_utf8 (here, end)));
312}
313
314/**
315 * search directs the whole execution of the search; a number of other
316 * functions in this class are called as a result, and precondition
317 * checks are also made
318 */
319bool mgsearchclass::search(const queryparamclass &queryparams,
320               queryresultsclass &queryresults) {
321  //  assert (cache != NULL);
322
323  // clear any previous results
324  queryresults.clear();
325  // first check the cache
326  if (cache != NULL) {
327    if (cache->find(queryparams, queryresults)) return true;
328  }
329  // make sure there is a query to be processed
330  if (!has_unicode_letdig(queryparams.querystring)) return true;
331
332  if (makeindexcurrent (queryparams.index, queryparams.subcollection,
333            queryparams.language, queryparams.collection)) {
334    // initialise the form of results
335    setsearchmode (queryparams);
336
337    // execute the query
338    submitquery (queryparams);
339
340    // retrieve the results
341    getresults (queryparams, queryresults);
342    unload_database();  // Important that local library doesn't leave any files open
343    return true;
344  }
345
346  return false;
347}
348
349/* accumulator_method has been changed to use array rather than list.
350list appears to be broken somewhat - for some ranked queries, it returned
351fewer results than it should have (eg 45 instead of 50). The three other
352methods (array, splay_tree, hash_table) all return the same number of
353documents, in the same order, with the same ranks. list returns what
354appears to be the same documents (but less of them), but with different ranks,
355and in a different order. Minimal time tests don't show any speed improvement
356of list over array (maybe because it's broken??).  [02/2001, kjm18]
357
358... [sjboddie, also 02/2001] turns out that changing the accumulator_method
359introduced a more serious bug than it fixed (i.e. occasionally when doing a
360ranked search for a very common word you get no results at all). I've
361changed it back to list for now, one day we should play with other
362accumulator_methods but for now I don't have time and don't want to risk
363introducing bugs (better the devil you know ;)
364*/
365void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
366{
367  mgq_ask(".set expert true");
368  mgq_ask(".set sorted_terms true");
369  mgq_ask(".set accumulator_method list");
370  mgq_ask(".set max_accumulators 500000");
371  mgq_ask(".set maxparas 500000");
372  mgq_ask(".set verbatim true");
373  mgq_ask(".unset skip_dump");
374  mgq_ask(".set mode docnums");
375
376  switch (queryparams.search_type)
377    {
378    case 0: mgq_ask(".set query boolean");  break;
379    case 1:  mgq_ask(".set query ranked"); break;
380    }
381  switch (queryparams.casefolding)
382    {
383    case 1: mgq_ask(".set casefold on");  break;
384    case 0: mgq_ask(".set casefold off"); break;
385    }
386  switch (queryparams.stemming)
387    {
388    case 1: mgq_ask(".set stem on");  break;
389    case 0: mgq_ask(".set stem off"); break;
390    }
391  mgq_ask(".set heads_length 150");
392 
393  if (queryparams.maxdocs == -1) {
394    mgq_ask(".set maxdocs all");
395  } else {
396    char maxdocstr[32];
397    sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
398    mgq_ask(maxdocstr);
399  }
400
401  char maxnumericstr[32];
402  sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
403  mgq_ask(maxnumericstr);
404 
405}
406
407/**
408 * submitquery constructs the query string (into UTF8 encoding)
409 * and submits it using mgq_ask to the mg search engine.  Most
410 * of the processing will be done inside Greenstone
411 */
412void mgsearchclass::submitquery (const queryparamclass &queryparams)
413{
414  // sort out the query string; copy it, remove all special characters
415  // and then convert it to a string in UTF8 format
416  text_t ttquerystring = queryparams.querystring;
417  filterquery (ttquerystring);
418  char *querystring = to_utf8(ttquerystring).getcstr();
419
420  // submit the query
421  mgq_ask(querystring);
422
423  // destroy the temporary character array
424  delete []querystring;
425}
426
427/**
428 * getrults is called to retrieve the required data on the docs
429 * which responded to the query submitted in submitquery above.
430 *
431 * It calls the local mgquery (mgq) interface to MG several times,
432 * to obtain the document numbers, term frequencies, term variants
433 * etc.  All processing of the query will be done by Greenstone
434 * thereafter
435 */
436void mgsearchclass::getresults (const queryparamclass &queryparams,
437                queryresultsclass &queryresults) {
438  // get the configuration for the maximum number of documents to
439  // retrieve
440  int howmany = queryparams.maxdocs;
441  if (howmany == -1) howmany = MAXNUMDOCS;
442  mgq_results(result_docnums, 0, howmany,
443          ourquerycallback, (void *)(&queryresults));
444 
445  // get the term frequencies
446  mgq_results(result_termfreqs, 0, MAXNUMTERMS,
447          termfreqcallback, (void *)(&queryresults));
448  queryresults.sortuniqqueryterms();
449
450  // get term variants
451  mgq_results(result_terms, 0, MAXNUMTERMS,
452              termvariantscallback, (void *)(&queryresults));
453
454  // get the number of documents retrieved
455  int total_retrieved = 0, is_approx = 0;
456  mgq_docsretrieved (&total_retrieved, &is_approx);
457
458  if (total_retrieved == 0) {
459    // not available (or really was zero)
460    queryresults.docs_matched = queryresults.docs.docset.size();
461    if ((queryparams.maxdocs == -1) ||
462    (queryresults.docs_matched < queryparams.maxdocs))
463      queryresults.is_approx = Exact;
464    else
465      queryresults.is_approx = MoreThan;
466  } else {
467    queryresults.docs_matched = total_retrieved;
468    if (is_approx) queryresults.is_approx = Approximate;
469    else queryresults.is_approx = Exact;
470  }
471}
472
473/**
474 * Tidies the given querystring, removing special characters
475 */
476void mgsearchclass::filterquery (text_t &ttquerystring) {
477  text_t::iterator ithere = ttquerystring.begin ();
478  text_t::iterator itend = ttquerystring.end ();
479 
480  // remove all non alphanumeric characters (except
481  // boolean operators
482  while (ithere != itend) {
483    if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
484    (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
485    (*ithere != ')')) (*ithere) = ' ';
486    ++ithere;
487  }
488}
489
490
491// the document text for 'docnum' is placed in 'output'
492// docTargetDocument returns 'true' if it was able to
493// try to get a document
494// collection is needed to see if an index from the
495// collection is loaded. If no index has been loaded
496// defaultindex is needed to load one
497bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
498                      const text_t &defaultsubcollection,
499                      const text_t &defaultlanguage,
500                      const text_t &collection,
501                      int docnum,
502                      text_t &output) {
503  output.clear();
504
505  // get the mg version of the document
506  char *mgdoc = NULL;
507  int doclen = 0;
508  if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
509           collection, docnum, mgdoc, doclen)) return false;
510  if (mgdoc == NULL) return false;
511
512  // replace all control-Cs with spaces
513  char *mgdoc_here = mgdoc;
514  char *mgdoc_end = mgdoc + doclen;
515  while (mgdoc_here < mgdoc_end) {
516    if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
517    ++mgdoc_here;
518  }
519
520  // convert this document to unicode
521  utf8inconvertclass inconvert;
522  convertclass::status_t status;
523  inconvert.reset ();
524  inconvert.setinput (mgdoc, doclen);
525  inconvert.convert (output, status);
526
527  delete[] mgdoc;
528  return true;
529}
530
531
532bool mgsearchclass::mgdocument (const text_t &defaultindex,
533                const text_t &defaultsubcollection,
534                const text_t &defaultlanguage,
535                const text_t &collection,
536                int docnum,
537                char *&UDoc, int &ULen) {
538  int databaseloaded = 0;
539
540  UDoc = NULL; ULen = 0;
541 
542  // see if we can make an appropriate database current
543//    char *ccollection = collection.getcstr();
544//    assert (ccollection != NULL);
545//    databaseloaded = load_text_database (ccollection);
546//    delete []ccollection;
547 
548  // try and load the database
549//    if (!databaseloaded)
550  databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
551                     defaultlanguage, collection);
552 
553  if (databaseloaded) {
554    // retrieve the document from mg
555    char docstr[32];
556    sprintf(docstr, "%i", docnum);
557   
558    mgq_ask(".set mode text");
559    mgq_ask(".set query docnums");
560    mgq_ask(docstr);
561
562    tempdoc = NULL;
563    templen = 0;
564    mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
565    UDoc = tempdoc;
566    ULen = templen;
567  }
568
569  unload_database();  // Important that local library doesn't leave any files open
570  return (bool)databaseloaded;
571}
572
573// unload_database simply calls mgq's close_all_databases function to clear
574// any cached databases - this is useful when attempting to completely
575// remove all trace of a collectionserver at runtime (when using a
576// persistent version of Greenstone like the windows local library)
577void mgsearchclass::unload_database () {
578  close_all_databases();
579}
Note: See TracBrowser for help on using the browser.