source: trunk/gsdl/src/colservr/mgsearch.cpp@ 301

Last change on this file since 301 was 301, checked in by sjboddie, 25 years ago

got rid of all the old functions for dealing with dir indexes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: mgsearch.cpp 301 1999-06-27 22:07:27Z sjboddie $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.7 1999/06/27 22:07:27 sjboddie
15 got rid of all the old functions for dealing with dir indexes
16
17 Revision 1.6 1999/06/09 00:41:32 sjboddie
18 phrase searching now uses case-folding if it's turned on
19
20 Revision 1.5 1999/02/21 22:31:35 rjmcnab
21
22 Removed locateinfo.
23
24 Revision 1.4 1999/02/03 01:13:27 sjboddie
25
26 Got interface to handle subcollections and language subcollections -
27 committed changes made to some of the collections
28
29 Revision 1.3 1999/01/19 01:38:17 rjmcnab
30
31 Made the source more portable.
32
33 Revision 1.2 1999/01/12 01:51:02 rjmcnab
34
35 Standard header.
36
37 Revision 1.1 1999/01/08 09:02:16 rjmcnab
38
39 Moved from src/library.
40
41 */
42
43
44#include "gsdlconf.h"
45#include "mgsearch.h"
46#include "fileutil.h"
47
48#include <string.h>
49#include <stdio.h>
50#include <stdlib.h>
51#include <ctype.h>
52
53#if defined(GSDL_USE_OBJECTSPACE)
54# include <ospace\std\iostream>
55#elif defined(GSDL_USE_IOS_H)
56# include <iostream.h>
57#else
58# include <iostream>
59#endif
60
61#if defined(__WIN32__)
62// gdbm stuff
63# include "autoconf.h"
64# include "systems.h"
65# include "gdbmconst.h"
66# include "gdbm.h"
67#else
68# include <gdbm.h>
69#endif
70
71
72#include <assert.h>
73
74#include "mgq.h"
75// #include "locateinfo.h"
76#include "gsdlunicode.h"
77#include "unitool.h"
78
79
80/////////////
81// globals //
82/////////////
83
// UTF-8 C string holding the quoted phrase of the current query.  It is
// assigned in mgsearchclass::submitquery() (which first frees the previous
// value) and read by ourquerycallback() and mgsearchclass::getresults().
// ourquerycallback() lower-cases it in place when 'casefold' is set.
84static char *quotedquery = NULL;
// Copy of queryparams.casefolding, stored by mgsearchclass::search() and
// read by ourquerycallback() to decide whether phrase matching folds case.
85static int casefold;
86
87
88////////////////////////
89// callback functions //
90////////////////////////
91
92// This routine is called for each document found in a search
93// it assumes that cache_num is set up correctly to point to
94// a suitable result cache
95int ourquerycallback(char *UDoc, int /*ULen*/, int DocNum,
96 float Weight, void *info) {
97
98
99 queryresultsclass *queryresults = (queryresultsclass * )info;
100
101 // check the returned document for the presence of the
102 // quoted part of the query, if there was one
103
104 // if (UDoc != NULL && quotedquery != NULL &&
105 // quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
106
107
108 if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0') {
109
110 if (casefold) {
111 int len;
112 for (len = 0; quotedquery[len] != '\0'; len ++)
113 quotedquery[len] = tolower (quotedquery[len]);
114 for (len = 0; UDoc[len] != '\0'; len ++)
115 UDoc[len] = tolower (UDoc[len]);
116 }
117 if (strstr (UDoc, quotedquery) == NULL) return 0;
118 }
119
120 // append this entry to the document results
121 docresultclass docresult;
122 docresult.docnum = DocNum;
123 docresult.docweight = Weight;
124
125 queryresults->docs.push_back(docresult);
126
127 return 0;
128}
129
130// This callback is called once for each term in the query
131int termfreqcallback(char *Word, int ULen, int Freq,
132 float /*Weight*/, void *info) {
133 queryresultsclass *queryresults = (queryresultsclass *)info;
134
135 text_t term;
136 term.setcarr(Word, ULen);
137 termfreqclass termfreq;
138 termfreq.termstr = to_uni(term);
139 termfreq.termfreq = Freq;
140 queryresults->terms.push_back(termfreq);
141
142 return 0;
143}
144
145// this callback is called once for each variation of each term
146int termscallback(char *Word, int ULen, int /*Freq*/,
147 float /*Weight*/, void *info) {
148
149 text_t term;
150 term.setcarr(Word, ULen);
151 queryresultsclass *queryresults = (queryresultsclass *)info;
152 queryresults->termvariants.push_back(to_uni(term));
153
154 return 0;
155}
156
157// This callback is for getting document text
158int doctextcallback(char *Word, int ULen, int /*Freq*/,
159 float /*Weight*/, void *info) {
160 text_t *output = (text_t *)info;
161 if (output == NULL) return 0;
162 output->clear();
163
164 utf8inconvertclass inconvert;
165 convertclass::status_t status;
166 inconvert.reset ();
167 inconvert.setinput (Word, ULen);
168 inconvert.convert (*output, status);
169
170 // replace all control-Cs with spaces
171 text_t::iterator here = output->begin();
172 text_t::iterator end = output->end();
173 while (here != end) {
174 if (*here == '\x3') *here = ' ';
175 here++;
176 }
177
178 return 0;
179}
180
181
182static text_t getindexsuffix (const text_t &collection,
183 const text_t &index) {
184 text_t indexsuffix = "index";
185 indexsuffix = filename_cat (indexsuffix, index);
186 indexsuffix = filename_cat (indexsuffix, collection);
187 return indexsuffix;
188}
189
190
191
192
193////////////////////
194// mgsearch class //
195////////////////////
196
197mgsearchclass::mgsearchclass ()
198{
199 cache = new querycache (RESULTCACHESIZE);
200}
201
202mgsearchclass::~mgsearchclass ()
203{
204 if (cache != NULL)
205 {
206 delete cache;
207 cache = NULL;
208 }
209}
210
211
212void mgsearchclass::setcollectdir (const text_t &thecollectdir)
213{
214 collectdir = thecollectdir;
215}
216
217
218bool mgsearchclass::search(const queryparamclass &queryparams,
219 queryresultsclass &queryresults)
220{
221 bool databaseloaded = true;
222
223 assert (cache != NULL);
224
225 queryresults.clear();
226
227 // first check the cache
228 if (cache->find(queryparams, queryresults))
229 return true;
230
231 // make sure there is a query to be processed
232 text_t::const_iterator queryhere = queryparams.querystring.begin();
233 text_t::const_iterator queryend = queryparams.querystring.end();
234 while (queryhere != queryend) {
235 if (is_unicode_letdig (*queryhere)) break;
236 queryhere++;
237 }
238
239 // if we reached the end of the query string without finding
240 // any alphanumeric characters then return no results (and say
241 // the database was loaded)
242 if (queryhere == queryend) return true;
243
244 casefold = queryparams.casefolding;
245
246 // get the names of the collection, index and text suffixes
247 char *ccollection = queryparams.collection.getcstr();
248 assert (ccollection != NULL);
249 char *idxsuffix = (getindexsuffix (queryparams.collection,
250 queryparams.search_index)).getcstr();
251 assert (idxsuffix != NULL);
252 char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
253 assert (txtsuffix != NULL);
254
255#ifdef __WIN32__
256 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
257#else
258 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
259#endif
260
261 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
262 {
263 setsearchmode (queryparams);
264 submitquery (queryparams);
265 getresults (queryresults);
266 }
267 else databaseloaded = false;
268
269 // free up the c strings
270 delete ccollection;
271 delete idxsuffix;
272 delete txtsuffix;
273 delete ccollectdir;
274
275 return databaseloaded;
276}
277
278
279void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
280{
281 mgq_ask(".set expert true");
282 mgq_ask(".set accumulator_method list");
283 mgq_ask(".set max_accumulators 50000");
284 mgq_ask(".set verbatim true");
285 mgq_ask(".unset skip_dump");
286 mgq_ask(".set mode docnums");
287
288 switch (queryparams.search_type)
289 {
290 case 0: mgq_ask(".set query boolean"); break;
291 case 1: mgq_ask(".set query ranked"); break;
292 }
293 switch (queryparams.casefolding)
294 {
295 case 1: mgq_ask(".set casefold on"); break;
296 case 0: mgq_ask(".set casefold off"); break;
297 }
298 switch (queryparams.stemming)
299 {
300 case 1: mgq_ask(".set stem on"); break;
301 case 0: mgq_ask(".set stem off"); break;
302 }
303 mgq_ask(".set heads_length 150");
304
305 char maxdocstr[32];
306 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
307 mgq_ask(maxdocstr);
308}
309
310
311void mgsearchclass::submitquery (const queryparamclass &queryparams)
312{
313 // sort out the query string
314 text_t ttquerystring = queryparams.querystring;
315 text_t ttquotedquery;
316 extractquoted (ttquerystring, ttquotedquery);
317 filterquery (ttquerystring);
318
319 // turn the strings into c strings for mg
320 if (quotedquery != NULL) // quotedquery is a global
321 {
322 delete quotedquery;
323 quotedquery = NULL;
324 }
325
326 // quotedquery will be deleted on the next call to this function
327 quotedquery = to_utf8(ttquotedquery).getcstr ();
328 char *querystring = to_utf8(ttquerystring).getcstr();
329
330 // submit the query
331 mgq_ask(querystring);
332
333 delete querystring;
334}
335
336
337void mgsearchclass::getresults (queryresultsclass &queryresults)
338{
339 if (quotedquery[0] == '\0')
340 {
341 // don't need the text
342 mgq_results(result_docnums, 0, MAXNUMDOCS,
343 ourquerycallback, (void *)(&queryresults));
344 }
345 else
346 {
347 // we need the text for this one
348 mgq_results(result_docs, 0, MAXNUMDOCS,
349 ourquerycallback, (void *)(&queryresults));
350 }
351
352 // get the term frequencies
353 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
354 termfreqcallback, (void *)(&queryresults));
355 mgq_results(result_terms, 0, MAXNUMTERMS,
356 termscallback, (void *)(&queryresults));
357 queryresults.sortqueryterms();
358 queryresults.uniqqueryterms();
359}
360
361
362void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
363{
364 ttquotedquery.clear();
365
366 text_t::iterator ithere = ttquerystring.begin ();
367 text_t::iterator itend = ttquerystring.end ();
368
369 bool inquote = false;
370
371 while (ithere != itend)
372 {
373 if ((*ithere) == '\"')
374 {
375 if (!inquote) ttquotedquery.clear ();
376 inquote = !inquote;
377 *ithere = ' '; // delete the quote
378 }
379 else if (inquote)
380 {
381 ttquotedquery.push_back(*ithere);
382 *ithere = ' ';
383 }
384
385 ithere++;
386 }
387}
388
389
390void mgsearchclass::filterquery (text_t &ttquerystring) {
391 text_t::iterator ithere = ttquerystring.begin ();
392 text_t::iterator itend = ttquerystring.end ();
393
394 // remove all non alphanumeric characters
395 while (ithere != itend) {
396 if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
397 ithere++;
398 }
399}
400
401
402// the document text for 'docnum' is placed in 'output'
403// docTargetDocument returns 'true' if it was able to
404// try to get a document
405// collection is needed to see if an index from the
406// collection is loaded. If no index has been loaded
407// defaultindex is needed to load one
408bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
409 const text_t &collection,
410 int docnum,
411 text_t &output)
412{
413 int databaseloaded = 0;
414
415 output.clear();
416
417 char *ccollection = collection.getcstr();
418 assert (ccollection != NULL);
419
420 // see if we can make an appropriate database current
421 databaseloaded = load_text_database (ccollection);
422
423 // try and load the database
424 if (!databaseloaded)
425 {
426 // get the names of the index and text suffixes
427 char *idxsuffix = (getindexsuffix (collection,
428 defaultindex)).getcstr();
429 assert (idxsuffix != NULL);
430 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
431 assert (txtsuffix != NULL);
432
433#ifdef __WIN32__
434 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
435#else
436 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
437#endif
438
439 databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
440
441 // free up the c strings
442 delete idxsuffix;
443 delete txtsuffix;
444 delete ccollectdir;
445 }
446
447 // free up the c collection string
448 delete ccollection;
449
450 if (databaseloaded)
451 {
452 // retrieve the document from mg
453 char docstr[32];
454 sprintf(docstr, "%i", docnum);
455
456 mgq_ask(".set mode text");
457 mgq_ask(".set query docnums");
458 mgq_ask(docstr);
459 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
460 }
461
462 return databaseloaded;
463}
464
Note: See TracBrowser for help on using the repository browser.