source: main/tags/2.25/gsdl/src/colservr/mgsearch.cpp@ 24204

Last change on this file since 24204 was 1324, checked in by kjm18, 24 years ago

mgpp incorporated. the old mgsearchclass and queryfilterclass are changed.
Have a base searchclass, from which mgsearchclass and mgppsearchclass inherit.
Have a base queryfilterclass, from which mgqueryfilterclass and
mgppqueryfilterclass inherit. librarymain in recpt should choose the appropriate
type (mg vs mgpp) for each collection.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlconf.h"
27#include "mgsearch.h"
28#include "fileutil.h"
29
30#include <string.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <ctype.h>
34
35#if defined(GSDL_USE_OBJECTSPACE)
36# include <ospace\std\iostream>
37#elif defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#if defined(__WIN32__)
44// gdbm stuff
45# include "autoconf.h"
46# include "systems.h"
47# include "gdbmconst.h"
48# include "gdbm.h"
49#else
50# include <gdbm.h>
51#endif
52
53
54#include <assert.h>
55
56#include "mgq.h"
57// #include "locateinfo.h"
58#include "gsdlunicode.h"
59#include "unitool.h"
60
61
62/////////////
63// globals //
64/////////////
65
// Scratch state shared between doctextcallback and mgdocument:
// doctextcallback stores the document pointer and length it is handed
// here, and mgdocument clears both before calling mgq_results and copies
// them out afterwards.
66static char *tempdoc = NULL;
67static int templen = 0;
68
69
70//////////////////////
71// useful functions //
72//////////////////////
73
74
75// input and output are in utf8
76text_t mgsearch_stemword (const text_t &word) {
77 // allocate working stem space
78 int maxstemlen = mgq_getmaxstemlen ();
79 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
80 if (word_stem == NULL) return "";
81
82 // copy word to word_stem
83 int len = 0;
84 text_t::const_iterator here = word.begin();
85 text_t::const_iterator end = word.end();
86 while (len < maxstemlen && here != end) {
87 word_stem[len+1] = (unsigned char)(*here);
88 len++; here++;
89 }
90 word_stem[len+1] = '\0';
91 word_stem[0] = len;
92
93 mgq_stemword (word_stem);
94
95 // copy word_stem back to tempstr
96 text_t tempstr;
97 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
98
99 delete [] word_stem;
100
101 return tempstr;
102}
103
104
105
106////////////////////////
107// callback functions //
108////////////////////////
109
110// This routine is called for each document found in a search
111// it assumes that cache_num is set up correctly to point to
112// a suitable result cache
113int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
114 float Weight, void *info) {
115
116
117 queryresultsclass *queryresults = (queryresultsclass * )info;
118
119 // append this entry to the document results
120 docresultclass docresult;
121 docresult.docnum = DocNum;
122 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
123 docresult.docweight = Weight - docresult.num_query_terms_matched*100;
124
125 queryresults->docs.docset[DocNum] = docresult;
126 queryresults->docs.docorder.push_back(DocNum);
127
128 return 0;
129}
130
131int termequivcallback(char *Word, int ULen, int /*Freq*/,
132 float /*Weight*/, void *info) {
133 text_tset *equivterms = (text_tset *)info;
134 if (equivterms == NULL) return 0;
135
136 text_t thisterm;
137 thisterm.setcarr(Word, ULen);
138
139 equivterms->insert(thisterm);
140
141 return 0;
142}
143
144
145void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
146 // allocate working stem space
147 int maxstemlen = mgq_getmaxstemlen ();
148 unsigned char *word_stem = new unsigned char [maxstemlen + 2];
149 if (word_stem == NULL) return;
150
151 // copy word to word_stem
152 int len = 0;
153 text_t::const_iterator here = word.begin();
154 text_t::const_iterator end = word.end();
155 while (len < maxstemlen && here != end) {
156 word_stem[len+1] = (unsigned char)(*here);
157 len++; here++;
158 }
159 word_stem[len+1] = '\0';
160 word_stem[0] = len;
161
162 // get the equivalent terms
163 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
164
165 delete [] word_stem;
166
167 return;
168}
169
// NOTE(review): file-scope set that no code visible in this file reads or
// writes (termfreqcallback fills the utf8equivterms member of its own
// termfreqclass object instead) -- confirm whether anything else refers
// to it before relying on or removing it.
170 text_tset utf8equivterms; // kept as utf8 string for fast matching
171
172
173// This callback is called once for each term in the query
174int termfreqcallback(char *Word, int ULen, int Freq,
175 float /*Weight*/, void *info) {
176 queryresultsclass *queryresults = (queryresultsclass *)info;
177 if (queryresults == NULL) return 0;
178
179 text_t term;
180 term.setcarr(Word, ULen);
181 termfreqclass termfreq;
182
183 termfreq.termstr = to_uni(term);
184 text_t utf8termstem = mgsearch_stemword (term);
185 termfreq.termstemstr = to_uni (utf8termstem);
186
187 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
188
189 termfreq.termfreq = Freq;
190 queryresults->orgterms.push_back(termfreq);
191
192 return 0;
193}
194
195// this callback is called once for each variation of each term
196int termvariantscallback(char *Word, int ULen, int /*Freq*/,
197 float /*Weight*/, void *info) {
198
199 text_t term;
200 term.setcarr(Word, ULen);
201 queryresultsclass *queryresults = (queryresultsclass *)info;
202 queryresults->termvariants.insert(to_uni(term));
203
204 return 0;
205}
206
207// This callback is for getting document text
208int doctextcallback(char *Doc, int ULen, int /*Freq*/,
209 float /*Weight*/, void * /*info*/) {
210 tempdoc = Doc;
211 templen = ULen;
212
213 return 0;
214}
215
216
217static text_t getindexsuffix (const text_t &collection,
218 const text_t &index) {
219
220 text_t indexsuffix = "index";
221 indexsuffix = filename_cat (indexsuffix, index);
222 indexsuffix = filename_cat (indexsuffix, collection);
223 return indexsuffix;
224}
225
226
227
228
229////////////////////
230// mgsearch class //
231////////////////////
232
233mgsearchclass::mgsearchclass ()
234 : searchclass() {
235
236}
237
238mgsearchclass::~mgsearchclass ()
239{
240 if (cache != NULL)
241 {
242 delete cache;
243 cache = NULL;
244 }
245}
246
247// you only need to use this function before doing any stemming
248// casefolding and stemming will be set if values for them are
249// provided (0 or 1).
250// makeindexcurrent returns true if it was able to load the database
251bool mgsearchclass::makeindexcurrent (const text_t &index,
252 const text_t &subcollection,
253 const text_t &language,
254 const text_t &collection,
255 int casefolding,
256 int stemming) {
257 bool databaseloaded = true;
258
259 // get the names of the collection, index and text suffixes
260 char *ccollection = collection.getcstr();
261 assert (ccollection != NULL);
262 char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
263 assert (idxsuffix != NULL);
264 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
265 assert (txtsuffix != NULL);
266
267#ifdef __WIN32__
268 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
269#else
270 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
271#endif
272
273 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
274 if (casefolding == 0) mgq_ask(".set casefold off");
275 else if (casefolding > 0) mgq_ask(".set casefold on");
276 if (stemming == 0) mgq_ask(".set stem off");
277 else if (stemming > 0) mgq_ask(".set stem on");
278
279 } else databaseloaded = false;
280
281 // free up the c strings
282 delete ccollection;
283 delete idxsuffix;
284 delete txtsuffix;
285 delete ccollectdir;
286
287 return databaseloaded;
288}
289
290
291// stem word uses the values set in the last call to makeindexcurrent
292// to stem the word. It is assumed that word is in unicode
293text_t mgsearchclass::stemword (const text_t &word) {
294 return to_uni (mgsearch_stemword (to_utf8 (word)));
295}
296
297text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
298 return to_uni (mgsearch_stemword (to_utf8 (here, end)));
299}
300
301
302bool mgsearchclass::search(const queryparamclass &queryparams,
303 queryresultsclass &queryresults) {
304 // assert (cache != NULL);
305
306 queryresults.clear();
307 cerr << "mgsearch start of search"<<endl;
308 // first check the cache
309 if (cache != NULL) {
310 if (cache->find(queryparams, queryresults)) return true;
311 }
312 // make sure there is a query to be processed
313 if (!has_unicode_letdig(queryparams.querystring)) return true;
314
315 if (makeindexcurrent (queryparams.index, queryparams.subcollection,
316 queryparams.language, queryparams.collection)) {
317 cerr << "made index current "<<endl;
318 setsearchmode (queryparams);
319 submitquery (queryparams);
320 getresults (queryparams, queryresults);
321 cerr << "got results"<<endl;
322 return true;
323 }
324
325 return false;
326}
327
328
329void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
330{
331 mgq_ask(".set expert true");
332 mgq_ask(".set sorted_terms true");
333 mgq_ask(".set accumulator_method list");
334 mgq_ask(".set max_accumulators 500000");
335 mgq_ask(".set maxparas 500000");
336 mgq_ask(".set verbatim true");
337 mgq_ask(".unset skip_dump");
338 mgq_ask(".set mode docnums");
339
340 switch (queryparams.search_type)
341 {
342 case 0: mgq_ask(".set query boolean"); break;
343 case 1: mgq_ask(".set query ranked"); break;
344 }
345 switch (queryparams.casefolding)
346 {
347 case 1: mgq_ask(".set casefold on"); break;
348 case 0: mgq_ask(".set casefold off"); break;
349 }
350 switch (queryparams.stemming)
351 {
352 case 1: mgq_ask(".set stem on"); break;
353 case 0: mgq_ask(".set stem off"); break;
354 }
355 mgq_ask(".set heads_length 150");
356
357 if (queryparams.maxdocs == -1) {
358 mgq_ask(".set maxdocs all");
359 } else {
360 char maxdocstr[32];
361 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
362 mgq_ask(maxdocstr);
363 }
364}
365
366
367void mgsearchclass::submitquery (const queryparamclass &queryparams)
368{
369 // sort out the query string
370 text_t ttquerystring = queryparams.querystring;
371 filterquery (ttquerystring);
372 char *querystring = to_utf8(ttquerystring).getcstr();
373
374 // submit the query
375 mgq_ask(querystring);
376
377 delete querystring;
378}
379
380
381void mgsearchclass::getresults (const queryparamclass &queryparams,
382 queryresultsclass &queryresults) {
383
384 int howmany = queryparams.maxdocs;
385 if (howmany == -1) howmany = MAXNUMDOCS;
386 mgq_results(result_docnums, 0, howmany,
387 ourquerycallback, (void *)(&queryresults));
388
389 // get the term frequencies
390 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
391 termfreqcallback, (void *)(&queryresults));
392 queryresults.sortuniqqueryterms();
393
394 // get term variants
395 mgq_results(result_terms, 0, MAXNUMTERMS,
396 termvariantscallback, (void *)(&queryresults));
397
398 // get the number of documents retrieved
399 int total_retrieved = 0, is_approx = 0;
400 mgq_docsretrieved (&total_retrieved, &is_approx);
401
402 if (total_retrieved == 0) {
403 // not available (or really was zero)
404 queryresults.docs_matched = queryresults.docs.docset.size();
405 if ((queryparams.maxdocs == -1) ||
406 (queryresults.docs_matched < queryparams.maxdocs))
407 queryresults.is_approx = Exact;
408 else
409 queryresults.is_approx = MoreThan;
410 } else {
411 queryresults.docs_matched = total_retrieved;
412 if (is_approx) queryresults.is_approx = Approximate;
413 else queryresults.is_approx = Exact;
414 }
415}
416
417void mgsearchclass::filterquery (text_t &ttquerystring) {
418 text_t::iterator ithere = ttquerystring.begin ();
419 text_t::iterator itend = ttquerystring.end ();
420
421 // remove all non alphanumeric characters (except
422 // boolean operators
423 while (ithere != itend) {
424 if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
425 (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
426 (*ithere != ')')) (*ithere) = ' ';
427 ithere++;
428 }
429}
430
431
432// the document text for 'docnum' is placed in 'output'
433// docTargetDocument returns 'true' if it was able to
434// try to get a document
435// collection is needed to see if an index from the
436// collection is loaded. If no index has been loaded
437// defaultindex is needed to load one
438bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
439 const text_t &defaultsubcollection,
440 const text_t &defaultlanguage,
441 const text_t &collection,
442 int docnum,
443 text_t &output) {
444 output.clear();
445
446 // get the mg version of the document
447 char *mgdoc = NULL;
448 int doclen = 0;
449 if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
450 collection, docnum, mgdoc, doclen)) return false;
451 if (mgdoc == NULL) return false;
452
453 // replace all control-Cs with spaces
454 char *mgdoc_here = mgdoc;
455 char *mgdoc_end = mgdoc + doclen;
456 while (mgdoc_here < mgdoc_end) {
457 if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
458 mgdoc_here++;
459 }
460
461 // convert this document to unicode
462 utf8inconvertclass inconvert;
463 convertclass::status_t status;
464 inconvert.reset ();
465 inconvert.setinput (mgdoc, doclen);
466 inconvert.convert (output, status);
467
468 return true;
469}
470
471
472bool mgsearchclass::mgdocument (const text_t &defaultindex,
473 const text_t &defaultsubcollection,
474 const text_t &defaultlanguage,
475 const text_t &collection,
476 int docnum,
477 char *&UDoc, int &ULen) {
478 int databaseloaded = 0;
479
480 UDoc = NULL; ULen = 0;
481
482 // see if we can make an appropriate database current
483// char *ccollection = collection.getcstr();
484// assert (ccollection != NULL);
485// databaseloaded = load_text_database (ccollection);
486// delete ccollection;
487
488 // try and load the database
489// if (!databaseloaded)
490 databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
491 defaultlanguage, collection);
492
493 if (databaseloaded) {
494 // retrieve the document from mg
495 char docstr[32];
496 sprintf(docstr, "%i", docnum);
497
498 mgq_ask(".set mode text");
499 mgq_ask(".set query docnums");
500 mgq_ask(docstr);
501
502 tempdoc = NULL;
503 templen = 0;
504 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
505 UDoc = tempdoc;
506 ULen = templen;
507 }
508
509 return (bool)databaseloaded;
510}
511
Note: See TracBrowser for help on using the repository browser.