source: trunk/gsdl/src/library/mgsearch.cpp@ 94

Last change on this file since 94 was 94, checked in by rjmcnab, 25 years ago

Wrote general map-file-based in and out converters. Fixed bugs related
to Chinese character searching. text_t now has an encoding attribute. Added
an encoding option to the preferences.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1#include <string.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <ctype.h>
5
6#ifdef __GNUG__
7# include <iostream.h>
8# include <gdbm.h>
9
10#else
11# ifndef USE_OBJECTSPACE
12# include <iostream>
13# else
14# include <ospace\std\iostream>
15# endif
16
17// gdbm stuff
18# include "autoconf.h"
19# include "systems.h"
20# include "gdbmconst.h"
21# include "gdbm.h"
22#endif
23
24#include <assert.h>
25
26#include "mgq.h"
27#include "mgsearch.h"
28#include "locateinfo.h"
29#include "gsdlunicode.h"
30#include "unitool.h"
31
32
33/////////////
34// globals //
35/////////////
36
// UTF-8 C string holding the quoted phrase of the most recent query.
// It is assigned in mgsearchclass::submitquery (which also frees the
// previous value) and read by ourquerycallback and
// mgsearchclass::getresults. NULL until the first query is submitted.
37static char *quotedquery = NULL;
38
39
40
41////////////////////////
42// callback functions //
43////////////////////////
44
45// This routine is called for each document found in a search
46// it assumes that cache_num is set up correctly to point to
47// a suitable result cache
48int ourquerycallback(char *UDoc, int ULen, int DocNum,
49 float Weight, void *info) {
50
51
52 queryresultsclass *queryresults = (queryresultsclass * )info;
53
54 // check the returned document for the presence of the
55 // quoted part of the query, if there was one
56
57 if (UDoc != NULL && quotedquery != NULL &&
58 quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
59
60 // append this entry to the document results
61 docresultclass docresult;
62 docresult.docnum = DocNum;
63 docresult.docweight = Weight;
64
65 queryresults->docs.push_back(docresult);
66
67 return 0;
68}
69
70// This callback is called once for each term in the query
71int termfreqcallback(char *Word, int ULen, int Freq,
72 float Weight, void *info) {
73 queryresultsclass *queryresults = (queryresultsclass *)info;
74
75 text_t term;
76 term.setcarr(Word, ULen);
77 termfreqclass termfreq;
78 termfreq.termstr = to_uni(term);
79 termfreq.termfreq = Freq;
80 queryresults->terms.push_back(termfreq);
81
82 return 0;
83}
84
85// this callback is called once for each variation of each term
86int termscallback(char *Word, int ULen, int Freq,
87 float Weight, void *info) {
88
89 text_t term;
90 term.setcarr(Word, ULen);
91 queryresultsclass *queryresults = (queryresultsclass *)info;
92 queryresults->termvariants.push_back(to_uni(term));
93
94 return 0;
95}
96
97// This callback is for getting document text
98int doctextcallback(char *Word, int ULen, int Freq,
99 float Weight, void *info) {
100 text_t *output = (text_t *)info;
101 if (output == NULL) return 0;
102 output->clear();
103
104 utf8inconvertclass inconvert;
105 convertclass::status_t status;
106 inconvert.reset ();
107 inconvert.setinput (Word, ULen);
108 inconvert.convert (*output, status);
109
110 // replace all control-Cs with spaces
111 text_t::iterator here = output->begin();
112 text_t::iterator end = output->end();
113 while (here != end) {
114 if (*here == '\x3') *here = ' ';
115 here++;
116 }
117
118 return 0;
119}
120
121
122
123////////////////////
124// mgsearch class //
125////////////////////
126
127mgsearchclass::mgsearchclass ()
128{
129 cache = new querycache (RESULTCACHESIZE);
130}
131
132mgsearchclass::~mgsearchclass ()
133{
134 if (cache != NULL)
135 {
136 delete cache;
137 cache = NULL;
138 }
139}
140
141
142void mgsearchclass::setcollectdir (const text_t &thecollectdir)
143{
144 collectdir = thecollectdir;
145}
146
147
148bool mgsearchclass::search(const queryparamclass &queryparams,
149 queryresultsclass &queryresults)
150{
151 bool databaseloaded = true;
152
153 assert (cache != NULL);
154
155 queryresults.clear();
156
157 // first check the cache
158 if (cache->find(queryparams, queryresults))
159 return true;
160
161 // make sure there is a query to be processed
162 text_t::const_iterator queryhere = queryparams.querystring.begin();
163 text_t::const_iterator queryend = queryparams.querystring.end();
164 while (queryhere != queryend) {
165 if (is_unicode_letdig (*queryhere)) break;
166 queryhere++;
167 }
168
169 // if we reached the end of the query string without finding
170 // any alphanumeric characters then return no results (and say
171 // the database was loaded)
172 if (queryhere == queryend) return true;
173
174
175 // get the names of the index and text suffixes
176 char *idxsuffix = (getindexsuffix (queryparams.collection,
177 queryparams.search_index)).getcstr();
178 assert (idxsuffix != NULL);
179 char *txtsuffix = (gettextsuffix (queryparams.collection)).getcstr();
180 assert (txtsuffix != NULL);
181
182#ifdef __WIN32__
183 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
184#else
185 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
186#endif
187
188 if (load_database(ccollectdir, idxsuffix, txtsuffix))
189 {
190 setsearchmode (queryparams);
191 submitquery (queryparams);
192 getresults (queryresults);
193 }
194 else databaseloaded = false;
195
196 // free up the c strings
197 delete idxsuffix;
198 delete txtsuffix;
199 delete ccollectdir;
200
201 return databaseloaded;
202}
203
204
205void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
206{
207 mgq_ask(".set expert true");
208 mgq_ask(".set accumulator_method list");
209 mgq_ask(".set max_accumulators 50000");
210 mgq_ask(".set verbatim true");
211 mgq_ask(".unset skip_dump");
212 mgq_ask(".set mode docnums");
213
214 switch (queryparams.search_type)
215 {
216 case 0: mgq_ask(".set query boolean"); break;
217 case 1: mgq_ask(".set query ranked"); break;
218 }
219 switch (queryparams.casefolding)
220 {
221 case 1: mgq_ask(".set casefold on"); break;
222 case 0: mgq_ask(".set casefold off"); break;
223 }
224 switch (queryparams.stemming)
225 {
226 case 1: mgq_ask(".set stem on"); break;
227 case 0: mgq_ask(".set stem off"); break;
228 }
229 mgq_ask(".set heads_length 150");
230
231 char maxdocstr[32];
232 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
233 mgq_ask(maxdocstr);
234}
235
236
237void mgsearchclass::submitquery (const queryparamclass &queryparams)
238{
239 // sort out the query string
240 text_t ttquerystring = queryparams.querystring;
241 text_t ttquotedquery;
242 extractquoted (ttquerystring, ttquotedquery);
243 filterquery (ttquerystring);
244
245 // turn the strings into c strings for mg
246 if (quotedquery != NULL) // quotedquery is a global
247 {
248 delete quotedquery;
249 quotedquery = NULL;
250 }
251
252 // quotedquery will be deleted on the next call to this function
253 quotedquery = to_utf8(ttquotedquery).getcstr ();
254 char *querystring = to_utf8(ttquerystring).getcstr();
255
256 // submit the query
257 mgq_ask(querystring);
258
259 delete querystring;
260}
261
262
263void mgsearchclass::getresults (queryresultsclass &queryresults)
264{
265 if (quotedquery[0] == '\0')
266 {
267 // don't need the text
268 mgq_results(result_docnums, 0, MAXNUMDOCS,
269 ourquerycallback, (void *)(&queryresults));
270 }
271 else
272 {
273 // we need the text for this one
274 mgq_results(result_docs, 0, MAXNUMDOCS,
275 ourquerycallback, (void *)(&queryresults));
276 }
277
278 // get the term frequencies
279 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
280 termfreqcallback, (void *)(&queryresults));
281 mgq_results(result_terms, 0, MAXNUMTERMS,
282 termscallback, (void *)(&queryresults));
283 queryresults.sortqueryterms();
284 queryresults.uniqqueryterms();
285}
286
287
288void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
289{
290 ttquotedquery.clear();
291
292 text_t::iterator ithere = ttquerystring.begin ();
293 text_t::iterator itend = ttquerystring.end ();
294
295 bool inquote = false;
296
297 while (ithere != itend)
298 {
299 if ((*ithere) == '\"')
300 {
301 if (!inquote) ttquotedquery.clear ();
302 inquote = !inquote;
303 *ithere = ' '; // delete the quote
304 }
305 else if (inquote)
306 {
307 ttquotedquery.push_back(*ithere);
308 *ithere = ' ';
309 }
310
311 ithere++;
312 }
313}
314
315
316void mgsearchclass::filterquery (text_t &ttquerystring) {
317 text_t::iterator ithere = ttquerystring.begin ();
318 text_t::iterator itend = ttquerystring.end ();
319
320 // remove all non alphanumeric characters
321 while (ithere != itend) {
322 if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
323 ithere++;
324 }
325}
326
327
328// the document text for 'docnum' is placed in 'output'
329// docTargetDocument returns 'true' if it was able to
330// try to get a document
331// collection is needed to see if an index from the
332// collection is loaded. If no index has been loaded
333// defaultindex is needed to load one
334bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
335 const text_t &collection,
336 int docnum,
337 text_t &output)
338{
339 bool databaseloaded = true;
340
341 output.clear();
342
343
344 // make sure index is level 2
345
346 ////// this changed with new naming scheme in new building software
347 ///// i.e paragraph level index no longer contain number '3' but begin
348 ///// with letter 'p'
349
350 text_t db_loaded = db_loaded_name;
351
352 if (!db_loaded.empty()) {
353 text_t::const_iterator here = db_loaded.begin();
354 text_t::const_iterator end = db_loaded.end();
355
356
357 //while (here != end) {
358 // if (*here == '3')
359 // databaseloaded = false;
360 // here ++;
361 //}
362
363 char separator = '/';
364 text_t db;
365 int found = 0;
366#ifdef __WIN32__
367 separator = '\\';
368#endif;
369 // strip away path to db and following collection name
370 end --;
371 while (end != here) {
372 if (*end == separator) {
373 if (found) break;
374 else {db.clear(); found = 1; end--; continue;}
375 }
376 db.push_back(*end);
377 end --;
378 }
379
380 // string will have been reversed above so see if last
381 // character is 'p'
382 if (db[db.size()-1] == 'p') databaseloaded = false;
383 }
384
385 // find out if the database is already loaded
386 // this is needed because a different index (but valid one)
387 // might be already loaded.
388 // this comparison is needed because 'load_database'
389 // is now more oriented towards indexes
390 if (databaseloaded == true) {
391 text_t::const_iterator here = collection.begin();
392 text_t::const_iterator end = collection.end();
393 char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name
394 while (here != end)
395 {
396 if (*here != *dbhere)
397 {
398 databaseloaded = false;
399 break;
400 }
401 here++;
402 dbhere++;
403 }
404 }
405
406 // try and load the database
407 if (!databaseloaded)
408 {
409 // get the names of the index and text suffixes
410 char *idxsuffix = (getindexsuffix (collection,
411 defaultindex)).getcstr();
412 assert (idxsuffix != NULL);
413 char *txtsuffix = (gettextsuffix (collection)).getcstr();
414 assert (txtsuffix != NULL);
415
416#ifdef __WIN32__
417 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
418#else
419 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
420#endif
421
422 if (load_database(ccollectdir, idxsuffix, txtsuffix))
423 databaseloaded = true;
424 else
425 databaseloaded = false;
426
427 // free up the c strings
428 delete idxsuffix;
429 delete txtsuffix;
430 delete ccollectdir;
431 }
432
433 if (databaseloaded)
434 {
435 // retrieve the document from mg
436 char docstr[32];
437 sprintf(docstr, "%i", docnum);
438
439 mgq_ask(".set mode text");
440 mgq_ask(".set query docnums");
441 mgq_ask(docstr);
442 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
443 }
444
445 return databaseloaded;
446}
447
Note: See TracBrowser for help on using the repository browser.