source: trunk/gsdl/src/colservr/mgsearch.cpp@ 1497

Last change on this file since 1497 was 1497, checked in by sjboddie, 24 years ago

removed some debugging stuff

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.5 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45# include "autoconf.h"
46# include "systems.h"
47# include "gdbmconst.h"
48# include "gdbm.h"
49#else
50# include <gdbm.h>
51#endif
52
53
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
// Hand-off slots for document retrieval: doctextcallback stores the
// buffer pointer and length it is given here, and
// mgsearchclass::mgdocument resets them before a request and copies
// them out afterwards.
static char *tempdoc = NULL;
static int templen = 0;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77 // allocate working stem space
78 int maxstemlen = mgq_getmaxstemlen ();
79 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80 if (word_stem == NULL) return "";
81
82 // copy word to word_stem
83 int len = 0;
84 text_t::const_iterator here = word.begin();
85 text_t::const_iterator end = word.end();
86 while (len < maxstemlen && here != end) {
87 word_stem[len+1] = (unsigned char)(*here);
88 len++; here++;
89 }
90 word_stem[len+1] = '\0';
91 word_stem[0] = len;
92
93 mgq_stemword (word_stem);
94
95 // copy word_stem back to tempstr
96 text_t tempstr;
97 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99 delete [] word_stem;
100
101 return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114 float Weight, void *info) {
115
116
117 queryresultsclass *queryresults = (queryresultsclass * )info;
118
119 // append this entry to the document results
120 docresultclass docresult;
121 docresult.docnum = DocNum;
122 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125 queryresults->docs.docset[DocNum] = docresult;
126 queryresults->docs.docorder.push_back(DocNum);
127
128 return 0;
129}
130
131int termequivcallback(char *Word, int ULen, int /*Freq*/,
132 float /*Weight*/, void *info) {
133 text_tset *equivterms = (text_tset *)info;
134 if (equivterms == NULL) return 0;
135
136 text_t thisterm;
137 thisterm.setcarr(Word, ULen);
138
139 equivterms->insert(thisterm);
140
141 return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146 // allocate working stem space
147 int maxstemlen = mgq_getmaxstemlen ();
148 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149 if (word_stem == NULL) return;
150
151 // copy word to word_stem
152 int len = 0;
153 text_t::const_iterator here = word.begin();
154 text_t::const_iterator end = word.end();
155 while (len < maxstemlen && here != end) {
156 word_stem[len+1] = (unsigned char)(*here);
157 len++; here++;
158 }
159 word_stem[len+1] = '\0';
160 word_stem[0] = len;
161
162 // get the equivalent terms
163 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165 delete [] word_stem;
166
167 return;
168}
169
170 text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen, int Freq,
175 float /*Weight*/, void *info) {
176 queryresultsclass *queryresults = (queryresultsclass *)info;
177 if (queryresults == NULL) return 0;
178
179 text_t term;
180 term.setcarr(Word, ULen);
181 termfreqclass termfreq;
182
183 termfreq.termstr = to_uni(term);
184 text_t utf8termstem = mgsearch_stemword (term);
185 termfreq.termstemstr = to_uni (utf8termstem);
186
187 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189 termfreq.termfreq = Freq;
190 queryresults->orgterms.push_back(termfreq);
191
192 return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197 float /*Weight*/, void *info) {
198
199 text_t term;
200 term.setcarr(Word, ULen);
201 queryresultsclass *queryresults = (queryresultsclass *)info;
202 queryresults->termvariants.insert(to_uni(term));
203
204 return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen, int /*Freq*/,
209 float /*Weight*/, void * /*info*/) {
210 tempdoc = Doc;
211 templen = ULen;
212
213 return 0;
214}
215
216
217static text_t getindexsuffix (const text_t &collection,
218 const text_t &index) {
219
220 text_t indexsuffix = "index";
221 indexsuffix = filename_cat (indexsuffix, index);
222 indexsuffix = filename_cat (indexsuffix, collection);
223 return indexsuffix;
224}
225
226
227
228
229////////////////////
230// mgsearch class //
231////////////////////
232
233mgsearchclass::mgsearchclass ()
234 : searchclass() {
235
236}
237
238mgsearchclass::~mgsearchclass ()
239{
240 if (cache != NULL)
241 {
242 delete cache;
243 cache = NULL;
244 }
245}
246
247// you only need to use this function before doing any stemming
248// casefolding and stemming will be set if values for them are
249// provided (0 or 1).
250// makeindexcurrent returns true if it was able to load the database
251bool mgsearchclass::makeindexcurrent (const text_t &index,
252 const text_t &subcollection,
253 const text_t &language,
254 const text_t &collection,
255 int casefolding,
256 int stemming) {
257 bool databaseloaded = true;
258
259 // get the names of the collection, index and text suffixes
260 char *ccollection = collection.getcstr();
261 assert (ccollection != NULL);
262 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
263 assert (idxsuffix != NULL);
264 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
265 assert (txtsuffix != NULL);
266
267#ifdef __WIN32__
268 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
269#else
270 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
271#endif
272
273 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
274 if (casefolding == 0) mgq_ask(".set casefold off");
275 else if (casefolding > 0) mgq_ask(".set casefold on");
276 if (stemming == 0) mgq_ask(".set stem off");
277 else if (stemming > 0) mgq_ask(".set stem on");
278
279 } else databaseloaded = false;
280
281 // free up the c strings
282 delete ccollection;
283 delete idxsuffix;
284 delete txtsuffix;
285 delete ccollectdir;
286
287 return databaseloaded;
288}
289
290
291// stem word uses the values set in the last call to makeindexcurrent
292// to stem the word. It is assumed that word is in unicode
293text_t mgsearchclass::stemword (const text_t &word) {
294 return to_uni (mgsearch_stemword (to_utf8 (word)));
295}
296
297text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
298 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
299}
300
301
302bool mgsearchclass::search(const queryparamclass &queryparams,
303 queryresultsclass &queryresults) {
304 // assert (cache != NULL);
305
306 queryresults.clear();
307 // first check the cache
308 if (cache != NULL) {
309 if (cache->find(queryparams, queryresults)) return true;
310 }
311 // make sure there is a query to be processed
312 if (!has_unicode_letdig(queryparams.querystring)) return true;
313
314 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
315 queryparams.language, queryparams.collection)) {
316 setsearchmode (queryparams);
317 submitquery (queryparams);
318 getresults (queryparams, queryresults);
319 return true;
320 }
321
322 return false;
323}
324
325
326void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
327{
328 mgq_ask(".set expert true");
329 mgq_ask(".set sorted_terms true");
330 mgq_ask(".set accumulator_method list");
331 mgq_ask(".set max_accumulators 500000");
332 mgq_ask(".set maxparas 500000");
333 mgq_ask(".set verbatim true");
334 mgq_ask(".unset skip_dump");
335 mgq_ask(".set mode docnums");
336
337 switch (queryparams.search_type)
338 {
339 case 0: mgq_ask(".set query boolean"); break;
340 case 1: mgq_ask(".set query ranked"); break;
341 }
342 switch (queryparams.casefolding)
343 {
344 case 1: mgq_ask(".set casefold on"); break;
345 case 0: mgq_ask(".set casefold off"); break;
346 }
347 switch (queryparams.stemming)
348 {
349 case 1: mgq_ask(".set stem on"); break;
350 case 0: mgq_ask(".set stem off"); break;
351 }
352 mgq_ask(".set heads_length 150");
353
354 if (queryparams.maxdocs == -1) {
355 mgq_ask(".set maxdocs all");
356 } else {
357 char maxdocstr[32];
358 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
359 mgq_ask(maxdocstr);
360 }
361}
362
363
364void mgsearchclass::submitquery (const queryparamclass &queryparams)
365{
366 // sort out the query string
367 text_t ttquerystring = queryparams.querystring;
368 filterquery (ttquerystring);
369 char *querystring = to_utf8(ttquerystring).getcstr();
370
371 // submit the query
372 mgq_ask(querystring);
373
374 delete querystring;
375}
376
377
378void mgsearchclass::getresults (const queryparamclass &queryparams,
379 queryresultsclass &queryresults) {
380
381 int howmany = queryparams.maxdocs;
382 if (howmany == -1) howmany = MAXNUMDOCS;
383 mgq_results(result_docnums, 0, howmany,
384 ourquerycallback, (void *)(&queryresults));
385
386 // get the term frequencies
387 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
388 termfreqcallback, (void *)(&queryresults));
389 queryresults.sortuniqqueryterms();
390
391 // get term variants
392 mgq_results(result_terms, 0, MAXNUMTERMS,
393 termvariantscallback, (void *)(&queryresults));
394
395 // get the number of documents retrieved
396 int total_retrieved = 0, is_approx = 0;
397 mgq_docsretrieved (&total_retrieved, &is_approx);
398
399 if (total_retrieved == 0) {
400 // not available (or really was zero)
401 queryresults.docs_matched = queryresults.docs.docset.size();
402 if ((queryparams.maxdocs == -1) ||
403 (queryresults.docs_matched < queryparams.maxdocs))
404 queryresults.is_approx = Exact;
405 else
406 queryresults.is_approx = MoreThan;
407 } else {
408 queryresults.docs_matched = total_retrieved;
409 if (is_approx) queryresults.is_approx = Approximate;
410 else queryresults.is_approx = Exact;
411 }
412}
413
414void mgsearchclass::filterquery (text_t &ttquerystring) {
415 text_t::iterator ithere = ttquerystring.begin ();
416 text_t::iterator itend = ttquerystring.end ();
417
418 // remove all non alphanumeric characters (except
419 // boolean operators
420 while (ithere != itend) {
421 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
422 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
423 (*ithere != ')')) (*ithere) = ' ';
424 ithere++;
425 }
426}
427
428
429// the document text for 'docnum' is placed in 'output'
430// docTargetDocument returns 'true' if it was able to
431// try to get a document
432// collection is needed to see if an index from the
433// collection is loaded. If no index has been loaded
434// defaultindex is needed to load one
435bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
436 const text_t &defaultsubcollection,
437 const text_t &defaultlanguage,
438 const text_t &collection,
439 int docnum,
440 text_t &output) {
441 output.clear();
442
443 // get the mg version of the document
444 char *mgdoc = NULL;
445 int doclen = 0;
446 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
447 collection, docnum, mgdoc, doclen)) return false;
448 if (mgdoc == NULL) return false;
449
450 // replace all control-Cs with spaces
451 char *mgdoc_here = mgdoc;
452 char *mgdoc_end = mgdoc + doclen;
453 while (mgdoc_here < mgdoc_end) {
454 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
455 mgdoc_here++;
456 }
457
458 // convert this document to unicode
459 utf8inconvertclass inconvert;
460 convertclass::status_t status;
461 inconvert.reset ();
462 inconvert.setinput (mgdoc, doclen);
463 inconvert.convert (output, status);
464
465 return true;
466}
467
468
469bool mgsearchclass::mgdocument (const text_t &defaultindex,
470 const text_t &defaultsubcollection,
471 const text_t &defaultlanguage,
472 const text_t &collection,
473 int docnum,
474 char *&UDoc, int &ULen) {
475 int databaseloaded = 0;
476
477 UDoc = NULL; ULen = 0;
478
479 // see if we can make an appropriate database current
480// char *ccollection = collection.getcstr();
481// assert (ccollection != NULL);
482// databaseloaded = load_text_database (ccollection);
483// delete ccollection;
484
485 // try and load the database
486// if (!databaseloaded)
487 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
488 defaultlanguage, collection);
489
490 if (databaseloaded) {
491 // retrieve the document from mg
492 char docstr[32];
493 sprintf(docstr, "%i", docnum);
494
495 mgq_ask(".set mode text");
496 mgq_ask(".set query docnums");
497 mgq_ask(docstr);
498
499 tempdoc = NULL;
500 templen = 0;
501 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
502 UDoc = tempdoc;
503 ULen = templen;
504 }
505
506 return (bool)databaseloaded;
507}
508
Note: See TracBrowser for help on using the repository browser.