source: trunk/gsdl/src/colservr/mgsearch.cpp@ 163

Last change on this file since 163 was 163, checked in by rjmcnab, 25 years ago

Removed locateinfo.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: mgsearch.cpp 163 1999-02-21 22:31:35Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.5 1999/02/21 22:31:35 rjmcnab
15
16 Removed locateinfo.
17
18 Revision 1.4 1999/02/03 01:13:27 sjboddie
19
20 Got interface to handle subcollections and language subcollections -
21 committed changes made to some of the collections
22
23 Revision 1.3 1999/01/19 01:38:17 rjmcnab
24
25 Made the source more portable.
26
27 Revision 1.2 1999/01/12 01:51:02 rjmcnab
28
29 Standard header.
30
31 Revision 1.1 1999/01/08 09:02:16 rjmcnab
32
33 Moved from src/library.
34
35 */
36
37
38#include "gsdlconf.h"
39#include "mgsearch.h"
40#include "fileutil.h"
41
42#include <string.h>
43#include <stdio.h>
44#include <stdlib.h>
45#include <ctype.h>
46
47#if defined(GSDL_USE_OBJECTSPACE)
48# include <ospace\std\iostream>
49#elif defined(GSDL_USE_IOS_H)
50# include <iostream.h>
51#else
52# include <iostream>
53#endif
54
55#if defined(__WIN32__)
56// gdbm stuff
57# include "autoconf.h"
58# include "systems.h"
59# include "gdbmconst.h"
60# include "gdbm.h"
61#else
62# include <gdbm.h>
63#endif
64
65
66#include <assert.h>
67
68#include "mgq.h"
69// #include "locateinfo.h"
70#include "gsdlunicode.h"
71#include "unitool.h"
72
73
74/////////////
75// globals //
76/////////////
77
// UTF-8 C string holding the quoted part of the most recent query.
// It is (re)assigned by mgsearchclass::submitquery and read by
// ourquerycallback and mgsearchclass::getresults; it stays NULL until
// the first query has been submitted.
static char *quotedquery = NULL;
79
80
81/////////////////////////
82// index map functions //
83/////////////////////////
84
85void getrealdir (const text_t &map, text_t &realpart, text_t &dirpart) {
86 realpart.clear ();
87 dirpart.clear();
88
89 text_t::const_iterator here = map.begin();
90 text_t::const_iterator end = map.end();
91
92 // get the real index
93 while (here != end && *here != '-') {
94 realpart.push_back(*here);
95 here++;
96 }
97
98 if (here != end) here++;
99 if (here != end && *here == '>') here++;
100
101 // get the dir index
102 while (here != end) {
103 dirpart.push_back(*here);
104 here++;
105 }
106}
107
108void getrealdirindex (const text_t &indexmap, const text_t &subcollectionmap,
109 const text_t &languagemap, text_t &realindex,
110 text_t &dirindex) {
111 text_t real, dir;
112 realindex.clear();
113 dirindex.clear();
114
115 getrealdir (indexmap, real, dir);
116 realindex += real;
117 dirindex += dir;
118
119 getrealdir (subcollectionmap, real, dir);
120 realindex += real;
121 dirindex += dir;
122
123 getrealdir (languagemap, real, dir);
124 realindex += real;
125 dirindex += dir;
126}
127
128//bool isdirindex (const text_tarray &indexmap, const text_t &dirindex) {
129// text_tarray::const_iterator here = indexmap.begin();
130// text_tarray::const_iterator end = indexmap.end();
131// text_t maprealindex, mapdirindex;
132
133// while (here != end) {
134// getrealdirindex (*here, maprealindex, mapdirindex);
135// if (mapdirindex == dirindex) return true;
136// here++;
137// }
138
139// return false;
140//}
141
142void getrealindexparts (const text_tarray &/*indexmap*/, const text_tarray &/*subcollectionmap*/,
143 const text_tarray &languagemap, const text_t &realindex,
144 text_t &index, text_t &subcollection, text_t &language) {
145
146 index.clear();
147 subcollection.clear();
148 language.clear();
149
150 text_tarray parts;
151 splitchar (realindex.begin(), realindex.end(), ':', parts);
152 int numparts = parts.size();
153
154 if (numparts >= 2) {
155 index = parts[0] + ":" + parts[1];
156
157 if (numparts == 3) {
158 if (languagemap.empty())
159 subcollection = parts[2];
160 else
161 language = parts[2];
162 } else if (numparts == 4) {
163 subcollection = parts[2];
164 language = parts[3];
165 }
166 }
167}
168
169
170void getdirindexparts (const text_tarray &/*indexmap*/, const text_tarray &/*subcollectionmap*/,
171 const text_tarray &languagemap, const text_t &dirindex,
172 text_t &index, text_t &subcollection, text_t &language) {
173
174 index.clear();
175 subcollection.clear();
176 language.clear();
177
178 int indexsize = dirindex.size();
179 if (indexsize != 3 && indexsize != 5 &&
180 indexsize != 7) return;
181
182 text_t::const_iterator dibegin = dirindex.begin();
183 text_t::const_iterator diend = dirindex.end();
184
185 // first three characters make up index part
186 index = substr(dibegin, dibegin+3);
187
188 if (indexsize == 5) {
189 if (languagemap.empty())
190 subcollection = substr(dibegin+3, dibegin+5);
191 else
192 language = substr(dibegin+3, dibegin+5);
193 } else if (indexsize == 7) {
194 subcollection = substr(dibegin+3, dibegin+5);
195 language = substr(dibegin+5, diend);
196 }
197}
198
199
200bool isrealindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
201 const text_tarray &languagemap, const text_t &realindex) {
202
203 text_t index, subcollection, language, realpart, dirpart;
204 getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
205 index, subcollection, language);
206
207 // check index part
208 text_tarray::const_iterator here = indexmap.begin();
209 text_tarray::const_iterator end = indexmap.end();
210 bool exists = false;
211 while (here != end) {
212 getrealdir (*here, realpart, dirpart);
213 if (realpart == index) {exists = true; break;}
214 here++;
215 }
216 if (!exists) return false;
217
218 // check subcollection part if there is one
219 if (!subcollection.empty()) {
220 here = subcollectionmap.begin();
221 end = subcollectionmap.end();
222 exists = false;
223 while (here != end) {
224 getrealdir (*here, realpart, dirpart);
225 if (realpart == subcollection) {exists = true; break;}
226 here++;
227 }
228 if (!exists) return false;
229 }
230
231 // check language part if there is one
232 if (!language.empty()) {
233 here = languagemap.begin();
234 end = languagemap.end();
235 exists = false;
236 while (here != end) {
237 getrealdir (*here, realpart, dirpart);
238 if (realpart == language) {exists = true; break;}
239 here++;
240 }
241 if (!exists) return false;
242 }
243 return true;
244}
245
246text_t dir2realindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
247 const text_tarray &languagemap, const text_t &dirindex) {
248
249 text_t index, subcollection, language, realpart, dirpart, realindex;
250 getdirindexparts (indexmap, subcollectionmap, languagemap, dirindex,
251 index, subcollection, language);
252
253 // get index part
254 text_tarray::const_iterator here = indexmap.begin();
255 text_tarray::const_iterator end = indexmap.end();
256 while (here != end) {
257 getrealdir (*here, realpart, dirpart);
258 if (dirpart == index) {realindex += realpart; break;}
259 here++;
260 }
261
262 if (realindex.empty()) return "";
263
264 // get subcollection part
265 here = subcollectionmap.begin();
266 end = subcollectionmap.end();
267 while (here != end) {
268 getrealdir (*here, realpart, dirpart);
269 if (dirpart == subcollection) {realindex += ":" + realpart; break;}
270 here++;
271 }
272
273 // get language part
274 here = languagemap.begin();
275 end = languagemap.end();
276 while (here != end) {
277 getrealdir (*here, realpart, dirpart);
278 if (dirpart == language) {realindex += ":" + realpart; break;}
279 here++;
280 }
281 return realindex;
282}
283
284text_t real2dirindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
285 const text_tarray &languagemap, const text_t &realindex) {
286
287 text_t index, subcollection, language, realpart, dirpart, dirindex;
288 getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
289 index, subcollection, language);
290
291 // get index part
292 text_tarray::const_iterator here = indexmap.begin();
293 text_tarray::const_iterator end = indexmap.end();
294 while (here != end) {
295 getrealdir (*here, realpart, dirpart);
296 if (realpart == index) {dirindex += dirpart; break;}
297 here++;
298 }
299
300 if (dirindex.empty()) return "";
301
302 // get subcollection part
303 here = subcollectionmap.begin();
304 end = subcollectionmap.end();
305 while (here != end) {
306 getrealdir (*here, realpart, dirpart);
307 if (realpart == subcollection) {dirindex += dirpart; break;}
308 here++;
309 }
310
311 // get language part
312 here = languagemap.begin();
313 end = languagemap.end();
314 while (here != end) {
315 getrealdir (*here, realpart, dirpart);
316 if (realpart == language) {dirindex += dirpart; break;}
317 here++;
318 }
319 return dirindex;
320}
321
322text_t real2macroindex (const text_t &realindex) {
323 text_t macroindex;
324 text_t::const_iterator here = realindex.begin();
325 text_t::const_iterator end = realindex.end();
326 unsigned short c;
327
328 while (here != end) {
329 c = *here;
330 if ((c >= '0' && c <= '9') ||
331 (c >= 'A' && c <= 'Z') ||
332 (c >= 'a' && c <= 'z'))
333 macroindex.push_back (*here);
334 here++;
335 }
336
337 return macroindex;
338}
339
340bool isdoclevelindex (const text_t &realindex) {
341 char *docstr = "document";
342 text_t::const_iterator here = realindex.begin ();
343 text_t::const_iterator end = realindex.end ();
344
345 while (here != end) {
346 if (*docstr == '\0') return true;
347 if (*docstr != (char)(*here)) return false;
348 docstr++;
349 here++;
350 }
351
352 return false;
353}
354
355text_t getdoclevelindex (const text_tarray &/*indexmap*/) {
356 //text_tarray::const_iterator here = indexmap.begin();
357 //text_tarray::const_iterator end = indexmap.end();
358 //text_t maprealindex, mapdirindex;
359
360 // while (here != end) {
361 // getrealdirindex (*here, maprealindex, mapdirindex);
362 // if (isdoclevelindex (maprealindex)) return maprealindex;
363 // here++;
364 //}
365
366 return "";
367}
368
369
370
371
372////////////////////////
373// callback functions //
374////////////////////////
375
376// This routine is called for each document found in a search
377// it assumes that cache_num is set up correctly to point to
378// a suitable result cache
379int ourquerycallback(char *UDoc, int /*ULen*/, int DocNum,
380 float Weight, void *info) {
381
382
383 queryresultsclass *queryresults = (queryresultsclass * )info;
384
385 // check the returned document for the presence of the
386 // quoted part of the query, if there was one
387
388 if (UDoc != NULL && quotedquery != NULL &&
389 quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
390
391 // append this entry to the document results
392 docresultclass docresult;
393 docresult.docnum = DocNum;
394 docresult.docweight = Weight;
395
396 queryresults->docs.push_back(docresult);
397
398 return 0;
399}
400
401// This callback is called once for each term in the query
402int termfreqcallback(char *Word, int ULen, int Freq,
403 float /*Weight*/, void *info) {
404 queryresultsclass *queryresults = (queryresultsclass *)info;
405
406 text_t term;
407 term.setcarr(Word, ULen);
408 termfreqclass termfreq;
409 termfreq.termstr = to_uni(term);
410 termfreq.termfreq = Freq;
411 queryresults->terms.push_back(termfreq);
412
413 return 0;
414}
415
416// this callback is called once for each variation of each term
417int termscallback(char *Word, int ULen, int /*Freq*/,
418 float /*Weight*/, void *info) {
419
420 text_t term;
421 term.setcarr(Word, ULen);
422 queryresultsclass *queryresults = (queryresultsclass *)info;
423 queryresults->termvariants.push_back(to_uni(term));
424
425 return 0;
426}
427
428// This callback is for getting document text
429int doctextcallback(char *Word, int ULen, int /*Freq*/,
430 float /*Weight*/, void *info) {
431 text_t *output = (text_t *)info;
432 if (output == NULL) return 0;
433 output->clear();
434
435 utf8inconvertclass inconvert;
436 convertclass::status_t status;
437 inconvert.reset ();
438 inconvert.setinput (Word, ULen);
439 inconvert.convert (*output, status);
440
441 // replace all control-Cs with spaces
442 text_t::iterator here = output->begin();
443 text_t::iterator end = output->end();
444 while (here != end) {
445 if (*here == '\x3') *here = ' ';
446 here++;
447 }
448
449 return 0;
450}
451
452
453static text_t getindexsuffix (const text_t &collection,
454 const text_t &index) {
455 text_t indexsuffix = "index";
456 indexsuffix = filename_cat (indexsuffix, index);
457 indexsuffix = filename_cat (indexsuffix, collection);
458 return indexsuffix;
459}
460
461
462
463
464////////////////////
465// mgsearch class //
466////////////////////
467
468mgsearchclass::mgsearchclass ()
469{
470 cache = new querycache (RESULTCACHESIZE);
471}
472
473mgsearchclass::~mgsearchclass ()
474{
475 if (cache != NULL)
476 {
477 delete cache;
478 cache = NULL;
479 }
480}
481
482
483void mgsearchclass::setcollectdir (const text_t &thecollectdir)
484{
485 collectdir = thecollectdir;
486}
487
488
489bool mgsearchclass::search(const queryparamclass &queryparams,
490 queryresultsclass &queryresults)
491{
492 bool databaseloaded = true;
493
494 assert (cache != NULL);
495
496 queryresults.clear();
497
498 // first check the cache
499 if (cache->find(queryparams, queryresults))
500 return true;
501
502 // make sure there is a query to be processed
503 text_t::const_iterator queryhere = queryparams.querystring.begin();
504 text_t::const_iterator queryend = queryparams.querystring.end();
505 while (queryhere != queryend) {
506 if (is_unicode_letdig (*queryhere)) break;
507 queryhere++;
508 }
509
510 // if we reached the end of the query string without finding
511 // any alphanumeric characters then return no results (and say
512 // the database was loaded)
513 if (queryhere == queryend) return true;
514
515
516 // get the names of the collection, index and text suffixes
517 char *ccollection = queryparams.collection.getcstr();
518 assert (ccollection != NULL);
519 char *idxsuffix = (getindexsuffix (queryparams.collection,
520 queryparams.search_index)).getcstr();
521 assert (idxsuffix != NULL);
522 char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
523 assert (txtsuffix != NULL);
524
525#ifdef __WIN32__
526 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
527#else
528 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
529#endif
530
531 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
532 {
533 setsearchmode (queryparams);
534 submitquery (queryparams);
535 getresults (queryresults);
536 }
537 else databaseloaded = false;
538
539 // free up the c strings
540 delete ccollection;
541 delete idxsuffix;
542 delete txtsuffix;
543 delete ccollectdir;
544
545 return databaseloaded;
546}
547
548
549void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
550{
551 mgq_ask(".set expert true");
552 mgq_ask(".set accumulator_method list");
553 mgq_ask(".set max_accumulators 50000");
554 mgq_ask(".set verbatim true");
555 mgq_ask(".unset skip_dump");
556 mgq_ask(".set mode docnums");
557
558 switch (queryparams.search_type)
559 {
560 case 0: mgq_ask(".set query boolean"); break;
561 case 1: mgq_ask(".set query ranked"); break;
562 }
563 switch (queryparams.casefolding)
564 {
565 case 1: mgq_ask(".set casefold on"); break;
566 case 0: mgq_ask(".set casefold off"); break;
567 }
568 switch (queryparams.stemming)
569 {
570 case 1: mgq_ask(".set stem on"); break;
571 case 0: mgq_ask(".set stem off"); break;
572 }
573 mgq_ask(".set heads_length 150");
574
575 char maxdocstr[32];
576 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
577 mgq_ask(maxdocstr);
578}
579
580
581void mgsearchclass::submitquery (const queryparamclass &queryparams)
582{
583 // sort out the query string
584 text_t ttquerystring = queryparams.querystring;
585 text_t ttquotedquery;
586 extractquoted (ttquerystring, ttquotedquery);
587 filterquery (ttquerystring);
588
589 // turn the strings into c strings for mg
590 if (quotedquery != NULL) // quotedquery is a global
591 {
592 delete quotedquery;
593 quotedquery = NULL;
594 }
595
596 // quotedquery will be deleted on the next call to this function
597 quotedquery = to_utf8(ttquotedquery).getcstr ();
598 char *querystring = to_utf8(ttquerystring).getcstr();
599
600 // submit the query
601 mgq_ask(querystring);
602
603 delete querystring;
604}
605
606
607void mgsearchclass::getresults (queryresultsclass &queryresults)
608{
609 if (quotedquery[0] == '\0')
610 {
611 // don't need the text
612 mgq_results(result_docnums, 0, MAXNUMDOCS,
613 ourquerycallback, (void *)(&queryresults));
614 }
615 else
616 {
617 // we need the text for this one
618 mgq_results(result_docs, 0, MAXNUMDOCS,
619 ourquerycallback, (void *)(&queryresults));
620 }
621
622 // get the term frequencies
623 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
624 termfreqcallback, (void *)(&queryresults));
625 mgq_results(result_terms, 0, MAXNUMTERMS,
626 termscallback, (void *)(&queryresults));
627 queryresults.sortqueryterms();
628 queryresults.uniqqueryterms();
629}
630
631
632void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
633{
634 ttquotedquery.clear();
635
636 text_t::iterator ithere = ttquerystring.begin ();
637 text_t::iterator itend = ttquerystring.end ();
638
639 bool inquote = false;
640
641 while (ithere != itend)
642 {
643 if ((*ithere) == '\"')
644 {
645 if (!inquote) ttquotedquery.clear ();
646 inquote = !inquote;
647 *ithere = ' '; // delete the quote
648 }
649 else if (inquote)
650 {
651 ttquotedquery.push_back(*ithere);
652 *ithere = ' ';
653 }
654
655 ithere++;
656 }
657}
658
659
660void mgsearchclass::filterquery (text_t &ttquerystring) {
661 text_t::iterator ithere = ttquerystring.begin ();
662 text_t::iterator itend = ttquerystring.end ();
663
664 // remove all non alphanumeric characters
665 while (ithere != itend) {
666 if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
667 ithere++;
668 }
669}
670
671
672// the document text for 'docnum' is placed in 'output'
673// docTargetDocument returns 'true' if it was able to
674// try to get a document
675// collection is needed to see if an index from the
676// collection is loaded. If no index has been loaded
677// defaultindex is needed to load one
678bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
679 const text_t &collection,
680 int docnum,
681 text_t &output)
682{
683 int databaseloaded = 0;
684
685 output.clear();
686
687 char *ccollection = collection.getcstr();
688 assert (ccollection != NULL);
689
690 // see if we can make an appropriate database current
691 databaseloaded = load_text_database (ccollection);
692
693 // try and load the database
694 if (!databaseloaded)
695 {
696 // get the names of the index and text suffixes
697 char *idxsuffix = (getindexsuffix (collection,
698 defaultindex)).getcstr();
699 assert (idxsuffix != NULL);
700 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
701 assert (txtsuffix != NULL);
702
703#ifdef __WIN32__
704 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
705#else
706 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
707#endif
708
709 databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
710
711 // free up the c strings
712 delete idxsuffix;
713 delete txtsuffix;
714 delete ccollectdir;
715 }
716
717 // free up the c collection string
718 delete ccollection;
719
720 if (databaseloaded)
721 {
722 // retrieve the document from mg
723 char docstr[32];
724 sprintf(docstr, "%i", docnum);
725
726 mgq_ask(".set mode text");
727 mgq_ask(".set query docnums");
728 mgq_ask(docstr);
729 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
730 }
731
732 return databaseloaded;
733}
734
Note: See TracBrowser for help on using the repository browser.