root/gsdl/trunk/src/colservr/mgsearch.cpp @ 15590

Revision 15590, 16.5 KB (checked in by mdewsnip, 12 years ago)

Removed unnecessary inclusions of the gdbm headers.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36#  include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38#  include <iostream.h>
39#else
40#  include <iostream>
41#endif
42
43 
44#include <assert.h>
45
46#include "mgq.h"
47// #include "locateinfo.h"
48#include "gsdlunicode.h"
49#include "unitool.h"
50
51
52/////////////
53// globals //
54/////////////
55
56static char *tempdoc = NULL;
57static int templen = 0;
58
59
60//////////////////////
61// useful functions //
62//////////////////////
63
64
65// input and output are in utf8
66text_t mgsearch_stemword (const text_t &word) {
67  // allocate working stem space
68  int maxstemlen = mgq_getmaxstemlen ();
69  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
70  if (word_stem == NULL) return "";
71
72  // copy word to word_stem
73  int len = 0;
74  text_t::const_iterator here = word.begin();
75  text_t::const_iterator end = word.end();
76  while (len < maxstemlen && here != end) {
77    word_stem[len+1] = (unsigned char)(*here);
78    ++len; ++here;
79  }
80  word_stem[len+1] = '\0';
81  word_stem[0] = len;
82
83  mgq_stemword (word_stem);
84
85  // copy word_stem back to tempstr
86  text_t tempstr;
87  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
88
89  delete [] word_stem;
90 
91  return tempstr;
92}
93
94
95
96////////////////////////
97// callback functions //
98////////////////////////
99
100// This routine is called for each document found in a search
101// it assumes that cache_num is set up correctly to point to
102// a suitable result cache
103int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
104             float Weight, void *info) {
105
106 
107  queryresultsclass *queryresults = (queryresultsclass * )info;
108
109  // append this entry to the document results
110  docresultclass docresult;
111  docresult.docnum = DocNum;
112  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
113  docresult.docweight = Weight - docresult.num_query_terms_matched*100;
114 
115  queryresults->docs.docset[DocNum] = docresult;
116  queryresults->docs.docorder.push_back(DocNum);
117 
118  return 0;
119}
120
121int termequivcallback(char *Word, int ULen,  int /*Freq*/,
122              float /*Weight*/,  void *info) {
123  text_tset *equivterms = (text_tset *)info;
124  if (equivterms == NULL) return 0;
125
126  text_t thisterm;
127  thisterm.setcarr(Word, ULen);
128
129  equivterms->insert(thisterm);
130 
131  return 0;
132}
133
134
135void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
136  // allocate working stem space
137  int maxstemlen = mgq_getmaxstemlen ();
138  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
139  if (word_stem == NULL) return;
140
141  // copy word to word_stem
142  int len = 0;
143  text_t::const_iterator here = word.begin();
144  text_t::const_iterator end = word.end();
145  while (len < maxstemlen && here != end) {
146    word_stem[len+1] = (unsigned char)(*here);
147    ++len; ++here;
148  }
149  word_stem[len+1] = '\0';
150  word_stem[0] = len;
151
152  // get the equivalent terms
153  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
154 
155  delete [] word_stem;
156
157  return;
158}
159
160  text_tset utf8equivterms; // kept as utf8 string for fast matching
161
162
163// This callback is called once for each term in the query
164int termfreqcallback(char *Word, int ULen,  int Freq,
165             float /*Weight*/,  void *info) {
166  queryresultsclass *queryresults = (queryresultsclass *)info;
167  if (queryresults == NULL) return 0;
168
169  text_t term;
170  term.setcarr(Word, ULen);
171  termfreqclass termfreq;
172
173  termfreq.termstr = to_uni(term);
174  text_t utf8termstem = mgsearch_stemword (term);
175  termfreq.termstemstr = to_uni (utf8termstem);
176
177  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
178 
179  termfreq.termfreq = Freq;
180  queryresults->orgterms.push_back(termfreq);
181 
182  return 0;
183}
184
185// this callback is called once for each variation of each term
186int termvariantscallback(char *Word, int ULen, int /*Freq*/,
187             float /*Weight*/, void *info) {
188
189  text_t term;
190  term.setcarr(Word, ULen);
191  queryresultsclass *queryresults = (queryresultsclass *)info;
192  queryresults->termvariants.insert(to_uni(term));
193
194  return 0;
195}
196
197// This callback is for getting document text
198int doctextcallback(char *Doc, int ULen,  int /*Freq*/,
199            float /*Weight*/,  void * /*info*/) {
200  if (Doc != NULL) {
201    // Make a copy of this string so we can unload the database without losing it
202    tempdoc = new char[ULen + 1];
203    strcpy(tempdoc, Doc);
204  }
205  templen = ULen;
206 
207  return 0;
208}
209
210
211text_t mgsearchclass::getindexsuffix (const text_t &collection,
212                  const text_t &index) {
213
214  text_t indexsuffix = "index"; 
215  indexsuffix = filename_cat (indexsuffix, index);
216  if (indexstem.empty()) {
217    // no index stem, use the coll name
218    indexsuffix = filename_cat (indexsuffix, collection);
219  } else {
220    indexsuffix = filename_cat (indexsuffix, indexstem);
221  }
222  return indexsuffix;
223}
224
225
226
227
228////////////////////
229// mgsearch class //
230////////////////////
231
232mgsearchclass::mgsearchclass ()
233  : searchclass() {
234 
235}
236
237mgsearchclass::~mgsearchclass ()
238{
239  if (cache != NULL)
240    {
241      delete cache;
242      cache = NULL;
243    }
244}
245
246void mgsearchclass::set_indexstem(const text_t &stem) {
247  indexstem = stem;
248 
249}
250
251// you only need to use this function before doing any stemming
252// casefolding and stemming will be set if values for them are
253// provided (0 or 1).
254// makeindexcurrent returns true if it was able to load the database
255bool mgsearchclass::makeindexcurrent (const text_t &index,
256                      const text_t &subcollection,
257                      const text_t &language,
258                      const text_t &collection,
259                      int casefolding,
260                      int stemming) {
261  bool databaseloaded = true;
262
263  // get the names of the collection, index and text suffixes
264  char *ccollection = collection.getcstr();
265  assert (ccollection != NULL);
266  char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
267  assert (idxsuffix != NULL);
268  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
269  assert (txtsuffix != NULL);
270#ifdef __WIN32__
271  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
272#else
273  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
274#endif
275
276  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
277    if (casefolding == 0) mgq_ask(".set casefold off");
278    else if (casefolding > 0) mgq_ask(".set casefold on");
279    if (stemming == 0) mgq_ask(".set stem off");
280    else if (stemming > 0) mgq_ask(".set stem on");
281   
282  } else databaseloaded = false;
283
284  // free up the c strings
285  delete []ccollection;
286  delete []idxsuffix;
287  delete []txtsuffix;
288  delete []ccollectdir;
289
290  return databaseloaded;
291}
292
293
294// stem word uses the values set in the last call to makeindexcurrent
295// to stem the word. It is assumed that word is in unicode
296text_t mgsearchclass::stemword (const text_t &word) {
297  return to_uni (mgsearch_stemword (to_utf8 (word)));
298}
299
300text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
301  return to_uni (mgsearch_stemword (to_utf8 (here, end)));
302}
303
304/**
305 * search directs the whole execution of the search; a number of other
306 * functions in this class are called as a result, and precondition
307 * checks are also made
308 */
309bool mgsearchclass::search(const queryparamclass &queryparams,
310               queryresultsclass &queryresults) {
311  //  assert (cache != NULL);
312
313  // clear any previous results
314  queryresults.clear();
315  // first check the cache
316  if (cache != NULL) {
317    if (cache->find(queryparams, queryresults)) return true;
318  }
319  // make sure there is a query to be processed
320  if (!has_unicode_letdig(queryparams.querystring)) return true;
321
322  if (makeindexcurrent (queryparams.index, queryparams.subcollection,
323            queryparams.language, queryparams.collection)) {
324    // initialise the form of results
325    setsearchmode (queryparams);
326
327    // execute the query
328    submitquery (queryparams);
329
330    // retrieve the results
331    getresults (queryparams, queryresults);
332    unload_database();  // Important that local library doesn't leave any files open
333    return true;
334  }
335
336  return false;
337}
338
339/* accumulator_method has been changed to use array rather than list.
340list appears to be broken somewhat - for some ranked queries, it returned
341fewer results than it should have (eg 45 instead of 50). The three other
342methods (array, splay_tree, hash_table) all return the same number of
343documents, in the same order, with the same ranks. list returns what
344appears to be the same documents (but less of them), but with different ranks,
345and in a different order. Minimal time tests dont show any speed improvement
346of list over array (maybe because its broken??).  [02/2001, kjm18]
347
348... [sjboddie, also 02/2001] turns out that changing the accumulator_method
349introduced a more serious bug than it fixed (i.e. occasionally when doing a
350ranked search for a very common word you get no results at all). I've
351changed it back to list for now, one day we should play with other
352accumulator_methods but for now I don't have time and don't want to risk
353introducing bugs (better the devil you know ;)
354*/
355void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
356{
357  mgq_ask(".set expert true");
358  mgq_ask(".set sorted_terms true");
359  mgq_ask(".set accumulator_method list");
360  mgq_ask(".set max_accumulators 500000");
361  mgq_ask(".set maxparas 500000");
362  mgq_ask(".set verbatim true");
363  mgq_ask(".unset skip_dump");
364  mgq_ask(".set mode docnums");
365
366  switch (queryparams.search_type)
367    {
368    case 0: mgq_ask(".set query boolean");  break;
369    case 1:  mgq_ask(".set query ranked"); break;
370    }
371  switch (queryparams.casefolding)
372    {
373    case 1: mgq_ask(".set casefold on");  break;
374    case 0: mgq_ask(".set casefold off"); break;
375    }
376  switch (queryparams.stemming)
377    {
378    case 1: mgq_ask(".set stem on");  break;
379    case 0: mgq_ask(".set stem off"); break;
380    }
381  mgq_ask(".set heads_length 150");
382 
383  if (queryparams.maxdocs == -1) {
384    mgq_ask(".set maxdocs all");
385  } else {
386    char maxdocstr[32];
387    sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
388    mgq_ask(maxdocstr);
389  }
390
391  char maxnumericstr[32];
392  sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
393  mgq_ask(maxnumericstr);
394 
395}
396
397/**
398 * submitquery constructs the query string (into UTF8 encoding)
399 * and submits it using mgq_ask to the mg search engine.  Most
400 * of the processing will be done inside Greenstone
401 */
402void mgsearchclass::submitquery (const queryparamclass &queryparams)
403{
404  // sort out the query string; copy it, remove all special characters
405  // and then convert it to a string in UTF8 format
406  text_t ttquerystring = queryparams.querystring;
407  filterquery (ttquerystring);
408  char *querystring = to_utf8(ttquerystring).getcstr();
409
410  // submit the query
411  mgq_ask(querystring);
412
413  // destroy the temporary character array
414  delete []querystring;
415}
416
417/**
418 * getrults is called to retrieve the required data on the docs
419 * which responded to the query submitted in submitquery above.
420 *
421 * It calls the local mgquery (mgq) interface to MG several times,
422 * to obtain the document numbers, term frequencies, term variants
423 * etc.  All processing of the query will be done by Greenstone
424 * thereafter
425 */
426void mgsearchclass::getresults (const queryparamclass &queryparams,
427                queryresultsclass &queryresults) {
428  // get the configuration for the maximum number of documents to
429  // retrieve
430  int howmany = queryparams.maxdocs;
431  if (howmany == -1) howmany = MAXNUMDOCS;
432  mgq_results(result_docnums, 0, howmany,
433          ourquerycallback, (void *)(&queryresults));
434 
435  // get the term frequencies
436  mgq_results(result_termfreqs, 0, MAXNUMTERMS,
437          termfreqcallback, (void *)(&queryresults));
438  queryresults.sortuniqqueryterms();
439
440  // get term variants
441  mgq_results(result_terms, 0, MAXNUMTERMS,
442              termvariantscallback, (void *)(&queryresults));
443
444  // get the number of documents retrieved
445  int total_retrieved = 0, is_approx = 0;
446  mgq_docsretrieved (&total_retrieved, &is_approx);
447
448  if (total_retrieved == 0) {
449    // not available (or really was zero)
450    queryresults.docs_matched = queryresults.docs.docset.size();
451    if ((queryparams.maxdocs == -1) ||
452    (queryresults.docs_matched < queryparams.maxdocs))
453      queryresults.is_approx = Exact;
454    else
455      queryresults.is_approx = MoreThan;
456  } else {
457    queryresults.docs_matched = total_retrieved;
458    if (is_approx) queryresults.is_approx = Approximate;
459    else queryresults.is_approx = Exact;
460  }
461}
462
463/**
464 * Tidies the given querystring, removing special characters
465 */
466void mgsearchclass::filterquery (text_t &ttquerystring) {
467  text_t::iterator ithere = ttquerystring.begin ();
468  text_t::iterator itend = ttquerystring.end ();
469 
470  // remove all non alphanumeric characters (except
471  // boolean operators
472  while (ithere != itend) {
473    if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
474    (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
475    (*ithere != ')')) (*ithere) = ' ';
476    ++ithere;
477  }
478}
479
480
481// the document text for 'docnum' is placed in 'output'
482// docTargetDocument returns 'true' if it was able to
483// try to get a document
484// collection is needed to see if an index from the
485// collection is loaded. If no index has been loaded
486// defaultindex is needed to load one
487bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
488                      const text_t &defaultsubcollection,
489                      const text_t &defaultlanguage,
490                      const text_t &collection,
491                      int docnum,
492                      text_t &output) {
493  output.clear();
494
495  // get the mg version of the document
496  char *mgdoc = NULL;
497  int doclen = 0;
498  if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
499           collection, docnum, mgdoc, doclen)) return false;
500  if (mgdoc == NULL) return false;
501
502  // replace all control-Cs with spaces
503  char *mgdoc_here = mgdoc;
504  char *mgdoc_end = mgdoc + doclen;
505  while (mgdoc_here < mgdoc_end) {
506    if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
507    ++mgdoc_here;
508  }
509
510  // convert this document to unicode
511  utf8inconvertclass inconvert;
512  convertclass::status_t status;
513  inconvert.reset ();
514  inconvert.setinput (mgdoc, doclen);
515  inconvert.convert (output, status);
516
517  delete[] mgdoc;
518  return true;
519}
520
521
522bool mgsearchclass::mgdocument (const text_t &defaultindex,
523                const text_t &defaultsubcollection,
524                const text_t &defaultlanguage,
525                const text_t &collection,
526                int docnum,
527                char *&UDoc, int &ULen) {
528  int databaseloaded = 0;
529
530  UDoc = NULL; ULen = 0;
531 
532  // see if we can make an appropriate database current
533//    char *ccollection = collection.getcstr();
534//    assert (ccollection != NULL);
535//    databaseloaded = load_text_database (ccollection);
536//    delete []ccollection;
537 
538  // try and load the database
539//    if (!databaseloaded)
540  databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
541                     defaultlanguage, collection);
542 
543  if (databaseloaded) {
544    // retrieve the document from mg
545    char docstr[32];
546    sprintf(docstr, "%i", docnum);
547   
548    mgq_ask(".set mode text");
549    mgq_ask(".set query docnums");
550    mgq_ask(docstr);
551
552    tempdoc = NULL;
553    templen = 0;
554    mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
555    UDoc = tempdoc;
556    ULen = templen;
557  }
558
559  unload_database();  // Important that local library doesn't leave any files open
560  return (bool)databaseloaded;
561}
562
563// unload_database simply calls mgq's close_all_databases function to clear
564// any cached databases - this is useful when attempting to completely
565// remove all trace of a collectionserver at runtime (when using a
566// persistent version of Greenstone like the windows local library)
567void mgsearchclass::unload_database () {
568  close_all_databases();
569}
Note: See TracBrowser for help on using the browser.