source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1285

Last change on this file since 1285 was 1285, checked in by sjboddie, 24 years ago

Removed CVS logging information from source files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45# include "autoconf.h"
46# include "systems.h"
47# include "gdbmconst.h"
48# include "gdbm.h"
49#else
50# include <gdbm.h>
51#endif
52
53
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
// Hand-off slots for document retrieval: doctextcallback stores the
// document pointer and length it is given here, and
// mgsearchclass::mgdocument resets them before calling mgq_results and
// copies them out afterwards.
66static char *tempdoc = NULL;
67static int templen = 0;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77 // allocate working stem space
78 int maxstemlen = mgq_getmaxstemlen ();
79 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80 if (word_stem == NULL) return "";
81
82 // copy word to word_stem
83 int len = 0;
84 text_t::const_iterator here = word.begin();
85 text_t::const_iterator end = word.end();
86 while (len < maxstemlen && here != end) {
87 word_stem[len+1] = (unsigned char)(*here);
88 len++; here++;
89 }
90 word_stem[len+1] = '\0';
91 word_stem[0] = len;
92
93 mgq_stemword (word_stem);
94
95 // copy word_stem back to tempstr
96 text_t tempstr;
97 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99 delete [] word_stem;
100
101 return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114 float Weight, void *info) {
115
116
117 queryresultsclass *queryresults = (queryresultsclass * )info;
118
119 // append this entry to the document results
120 docresultclass docresult;
121 docresult.docnum = DocNum;
122 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125 queryresults->docs.docset[DocNum] = docresult;
126 queryresults->docs.docorder.push_back(DocNum);
127
128 return 0;
129}
130
131int termequivcallback(char *Word, int ULen, int /*Freq*/,
132 float /*Weight*/, void *info) {
133 text_tset *equivterms = (text_tset *)info;
134 if (equivterms == NULL) return 0;
135
136 text_t thisterm;
137 thisterm.setcarr(Word, ULen);
138
139 equivterms->insert(thisterm);
140
141 return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146 // allocate working stem space
147 int maxstemlen = mgq_getmaxstemlen ();
148 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149 if (word_stem == NULL) return;
150
151 // copy word to word_stem
152 int len = 0;
153 text_t::const_iterator here = word.begin();
154 text_t::const_iterator end = word.end();
155 while (len < maxstemlen && here != end) {
156 word_stem[len+1] = (unsigned char)(*here);
157 len++; here++;
158 }
159 word_stem[len+1] = '\0';
160 word_stem[0] = len;
161
162 // get the equivalent terms
163 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165 delete [] word_stem;
166
167 return;
168}
169
// NOTE(review): nothing in this file reads or writes this file-scope
// set; termfreqcallback fills the utf8equivterms member of each
// termfreqclass instead. Confirm it is not referenced from another
// file before removing it.
170 text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen, int Freq,
175 float /*Weight*/, void *info) {
176 queryresultsclass *queryresults = (queryresultsclass *)info;
177 if (queryresults == NULL) return 0;
178
179 text_t term;
180 term.setcarr(Word, ULen);
181 termfreqclass termfreq;
182
183 termfreq.termstr = to_uni(term);
184 text_t utf8termstem = mgsearch_stemword (term);
185 termfreq.termstemstr = to_uni (utf8termstem);
186
187 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189 termfreq.termfreq = Freq;
190 queryresults->orgterms.push_back(termfreq);
191
192 return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197 float /*Weight*/, void *info) {
198
199 text_t term;
200 term.setcarr(Word, ULen);
201 queryresultsclass *queryresults = (queryresultsclass *)info;
202 queryresults->termvariants.insert(to_uni(term));
203
204 return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen, int /*Freq*/,
209 float /*Weight*/, void * /*info*/) {
210 tempdoc = Doc;
211 templen = ULen;
212
213 return 0;
214}
215
216
217static text_t getindexsuffix (const text_t &collection,
218 const text_t &index) {
219
220 text_t indexsuffix = "index";
221 indexsuffix = filename_cat (indexsuffix, index);
222 indexsuffix = filename_cat (indexsuffix, collection);
223 return indexsuffix;
224}
225
226
227
228
229////////////////////
230// mgsearch class //
231////////////////////
232
233mgsearchclass::mgsearchclass ()
234{
235 cache = new querycache (RESULTCACHESIZE);
236}
237
238mgsearchclass::~mgsearchclass ()
239{
240 if (cache != NULL)
241 {
242 delete cache;
243 cache = NULL;
244 }
245}
246
247
248void mgsearchclass::setcollectdir (const text_t &thecollectdir)
249{
250 collectdir = thecollectdir;
251}
252
253// you only need to use this function before doing any stemming
254// casefolding and stemming will be set if values for them are
255// provided (0 or 1).
256// makeindexcurrent returns true if it was able to load the database
257bool mgsearchclass::makeindexcurrent (const text_t &index,
258 const text_t &subcollection,
259 const text_t &language,
260 const text_t &collection,
261 int casefolding,
262 int stemming) {
263 bool databaseloaded = true;
264
265 // get the names of the collection, index and text suffixes
266 char *ccollection = collection.getcstr();
267 assert (ccollection != NULL);
268 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
269 assert (idxsuffix != NULL);
270 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
271 assert (txtsuffix != NULL);
272
273#ifdef __WIN32__
274 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
275#else
276 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
277#endif
278
279 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
280 if (casefolding == 0) mgq_ask(".set casefold off");
281 else if (casefolding > 0) mgq_ask(".set casefold on");
282 if (stemming == 0) mgq_ask(".set stem off");
283 else if (stemming > 0) mgq_ask(".set stem on");
284
285 } else databaseloaded = false;
286
287 // free up the c strings
288 delete ccollection;
289 delete idxsuffix;
290 delete txtsuffix;
291 delete ccollectdir;
292
293 return databaseloaded;
294}
295
296
297// stem word uses the values set in the last call to makeindexcurrent
298// to stem the word. It is assumed that word is in unicode
299text_t mgsearchclass::stemword (const text_t &word) {
300 return to_uni (mgsearch_stemword (to_utf8 (word)));
301}
302
303text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
304 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
305}
306
307
308bool mgsearchclass::search(const queryparamclass &queryparams,
309 queryresultsclass &queryresults) {
310 assert (cache != NULL);
311
312 queryresults.clear();
313
314 // first check the cache
315 if (cache->find(queryparams, queryresults)) return true;
316
317 // make sure there is a query to be processed
318 if (!has_unicode_letdig(queryparams.querystring)) return true;
319
320 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
321 queryparams.language, queryparams.collection)) {
322 setsearchmode (queryparams);
323 submitquery (queryparams);
324 getresults (queryparams, queryresults);
325 return true;
326 }
327
328 return false;
329}
330
331
332void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
333{
334 mgq_ask(".set expert true");
335 mgq_ask(".set sorted_terms true");
336 mgq_ask(".set accumulator_method list");
337 mgq_ask(".set max_accumulators 500000");
338 mgq_ask(".set maxparas 500000");
339 mgq_ask(".set verbatim true");
340 // mgq_ask(".unset skip_dump");
341 mgq_ask(".set mode docnums");
342
343 switch (queryparams.search_type)
344 {
345 case 0: mgq_ask(".set query boolean"); break;
346 case 1: mgq_ask(".set query ranked"); break;
347 }
348 switch (queryparams.casefolding)
349 {
350 case 1: mgq_ask(".set casefold on"); break;
351 case 0: mgq_ask(".set casefold off"); break;
352 }
353 switch (queryparams.stemming)
354 {
355 case 1: mgq_ask(".set stem on"); break;
356 case 0: mgq_ask(".set stem off"); break;
357 }
358 mgq_ask(".set heads_length 150");
359
360 if (queryparams.maxdocs == -1) {
361 mgq_ask(".set maxdocs all");
362 } else {
363 char maxdocstr[32];
364 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
365 mgq_ask(maxdocstr);
366 }
367}
368
369
370void mgsearchclass::submitquery (const queryparamclass &queryparams)
371{
372 // sort out the query string
373 text_t ttquerystring = queryparams.querystring;
374 filterquery (ttquerystring);
375 char *querystring = to_utf8(ttquerystring).getcstr();
376
377 // submit the query
378 mgq_ask(querystring);
379
380 delete querystring;
381}
382
383
384void mgsearchclass::getresults (const queryparamclass &queryparams,
385 queryresultsclass &queryresults) {
386
387 int howmany = queryparams.maxdocs;
388 if (howmany == -1) howmany = MAXNUMDOCS;
389 mgq_results(result_docnums, 0, howmany,
390 ourquerycallback, (void *)(&queryresults));
391
392 // get the term frequencies
393 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
394 termfreqcallback, (void *)(&queryresults));
395 queryresults.sortuniqqueryterms();
396
397 // get term variants
398 mgq_results(result_terms, 0, MAXNUMTERMS,
399 termvariantscallback, (void *)(&queryresults));
400
401 // get the number of documents retrieved
402 int total_retrieved = 0, is_approx = 0;
403 mgq_docsretrieved (&total_retrieved, &is_approx);
404
405 if (total_retrieved == 0) {
406 // not available (or really was zero)
407 queryresults.docs_matched = queryresults.docs.docset.size();
408 if ((queryparams.maxdocs == -1) ||
409 (queryresults.docs_matched < queryparams.maxdocs))
410 queryresults.is_approx = Exact;
411 else
412 queryresults.is_approx = MoreThan;
413 } else {
414 queryresults.docs_matched = total_retrieved;
415 if (is_approx) queryresults.is_approx = Approximate;
416 else queryresults.is_approx = Exact;
417 }
418}
419
420void mgsearchclass::filterquery (text_t &ttquerystring) {
421 text_t::iterator ithere = ttquerystring.begin ();
422 text_t::iterator itend = ttquerystring.end ();
423
424 // remove all non alphanumeric characters (except
425 // boolean operators
426 while (ithere != itend) {
427 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
428 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
429 (*ithere != ')')) (*ithere) = ' ';
430 ithere++;
431 }
432}
433
434
435// the document text for 'docnum' is placed in 'output'
436// docTargetDocument returns 'true' if it was able to
437// try to get a document
438// collection is needed to see if an index from the
439// collection is loaded. If no index has been loaded
440// defaultindex is needed to load one
441bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
442 const text_t &defaultsubcollection,
443 const text_t &defaultlanguage,
444 const text_t &collection,
445 int docnum,
446 text_t &output) {
447 output.clear();
448
449 // get the mg version of the document
450 char *mgdoc = NULL;
451 int doclen = 0;
452 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
453 collection, docnum, mgdoc, doclen)) return false;
454 if (mgdoc == NULL) return false;
455
456 // replace all control-Cs with spaces
457 char *mgdoc_here = mgdoc;
458 char *mgdoc_end = mgdoc + doclen;
459 while (mgdoc_here < mgdoc_end) {
460 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
461 mgdoc_here++;
462 }
463
464 // convert this document to unicode
465 utf8inconvertclass inconvert;
466 convertclass::status_t status;
467 inconvert.reset ();
468 inconvert.setinput (mgdoc, doclen);
469 inconvert.convert (output, status);
470
471 return true;
472}
473
474
475bool mgsearchclass::mgdocument (const text_t &defaultindex,
476 const text_t &defaultsubcollection,
477 const text_t &defaultlanguage,
478 const text_t &collection,
479 int docnum,
480 char *&UDoc, int &ULen) {
481 int databaseloaded = 0;
482
483 UDoc = NULL; ULen = 0;
484
485 // see if we can make an appropriate database current
486// char *ccollection = collection.getcstr();
487// assert (ccollection != NULL);
488// databaseloaded = load_text_database (ccollection);
489// delete ccollection;
490
491 // try and load the database
492// if (!databaseloaded)
493 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
494 defaultlanguage, collection);
495
496 if (databaseloaded) {
497 // retrieve the document from mg
498 char docstr[32];
499 sprintf(docstr, "%i", docnum);
500
501 mgq_ask(".set mode text");
502 mgq_ask(".set query docnums");
503 mgq_ask(docstr);
504
505 tempdoc = NULL;
506 templen = 0;
507 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
508 UDoc = tempdoc;
509 ULen = templen;
510 }
511
512 return (bool)databaseloaded;
513}
Note: See TracBrowser for help on using the repository browser.