source: trunk/gsdl/src/colservr/mgsearch.cpp@ 265

Last change on this file since 265 was 265, checked in by sjboddie, 25 years ago

phrase searching now uses case-folding if it's turned on

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1/**********************************************************************
2 *
3 * mgsearch.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: mgsearch.cpp 265 1999-06-09 00:41:32Z sjboddie $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.6 1999/06/09 00:41:32 sjboddie
15 phrase searching now uses case-folding if it's turned on
16
17 Revision 1.5 1999/02/21 22:31:35 rjmcnab
18
19 Removed locateinfo.
20
21 Revision 1.4 1999/02/03 01:13:27 sjboddie
22
23 Got interface to handle subcollections and language subcollections -
24 committed changes made to some of the collections
25
26 Revision 1.3 1999/01/19 01:38:17 rjmcnab
27
28 Made the source more portable.
29
30 Revision 1.2 1999/01/12 01:51:02 rjmcnab
31
32 Standard header.
33
34 Revision 1.1 1999/01/08 09:02:16 rjmcnab
35
36 Moved from src/library.
37
38 */
39
40
41#include "gsdlconf.h"
42#include "mgsearch.h"
43#include "fileutil.h"
44
45#include <string.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <ctype.h>
49
50#if defined(GSDL_USE_OBJECTSPACE)
51# include <ospace\std\iostream>
52#elif defined(GSDL_USE_IOS_H)
53# include <iostream.h>
54#else
55# include <iostream>
56#endif
57
58#if defined(__WIN32__)
59// gdbm stuff
60# include "autoconf.h"
61# include "systems.h"
62# include "gdbmconst.h"
63# include "gdbm.h"
64#else
65# include <gdbm.h>
66#endif
67
68
69#include <assert.h>
70
71#include "mgq.h"
72// #include "locateinfo.h"
73#include "gsdlunicode.h"
74#include "unitool.h"
75
76
77/////////////
78// globals //
79/////////////
80
// Quoted-phrase part of the most recently submitted query, as a C string.
// Assigned in mgsearchclass::submitquery; read by mgsearchclass::getresults
// and by ourquerycallback.  NULL until the first query is submitted.
static char *quotedquery = NULL;

// Case-folding setting of the current query, copied from
// queryparams.casefolding in mgsearchclass::search.  When non-zero,
// ourquerycallback lower-cases both strings before the phrase comparison.
static int casefold = 0;
83
84
85/////////////////////////
86// index map functions //
87/////////////////////////
88
89void getrealdir (const text_t &map, text_t &realpart, text_t &dirpart) {
90 realpart.clear ();
91 dirpart.clear();
92
93 text_t::const_iterator here = map.begin();
94 text_t::const_iterator end = map.end();
95
96 // get the real index
97 while (here != end && *here != '-') {
98 realpart.push_back(*here);
99 here++;
100 }
101
102 if (here != end) here++;
103 if (here != end && *here == '>') here++;
104
105 // get the dir index
106 while (here != end) {
107 dirpart.push_back(*here);
108 here++;
109 }
110}
111
112void getrealdirindex (const text_t &indexmap, const text_t &subcollectionmap,
113 const text_t &languagemap, text_t &realindex,
114 text_t &dirindex) {
115 text_t real, dir;
116 realindex.clear();
117 dirindex.clear();
118
119 getrealdir (indexmap, real, dir);
120 realindex += real;
121 dirindex += dir;
122
123 getrealdir (subcollectionmap, real, dir);
124 realindex += real;
125 dirindex += dir;
126
127 getrealdir (languagemap, real, dir);
128 realindex += real;
129 dirindex += dir;
130}
131
132//bool isdirindex (const text_tarray &indexmap, const text_t &dirindex) {
133// text_tarray::const_iterator here = indexmap.begin();
134// text_tarray::const_iterator end = indexmap.end();
135// text_t maprealindex, mapdirindex;
136
137// while (here != end) {
138// getrealdirindex (*here, maprealindex, mapdirindex);
139// if (mapdirindex == dirindex) return true;
140// here++;
141// }
142
143// return false;
144//}
145
146void getrealindexparts (const text_tarray &/*indexmap*/, const text_tarray &/*subcollectionmap*/,
147 const text_tarray &languagemap, const text_t &realindex,
148 text_t &index, text_t &subcollection, text_t &language) {
149
150 index.clear();
151 subcollection.clear();
152 language.clear();
153
154 text_tarray parts;
155 splitchar (realindex.begin(), realindex.end(), ':', parts);
156 int numparts = parts.size();
157
158 if (numparts >= 2) {
159 index = parts[0] + ":" + parts[1];
160
161 if (numparts == 3) {
162 if (languagemap.empty())
163 subcollection = parts[2];
164 else
165 language = parts[2];
166 } else if (numparts == 4) {
167 subcollection = parts[2];
168 language = parts[3];
169 }
170 }
171}
172
173
174void getdirindexparts (const text_tarray &/*indexmap*/, const text_tarray &/*subcollectionmap*/,
175 const text_tarray &languagemap, const text_t &dirindex,
176 text_t &index, text_t &subcollection, text_t &language) {
177
178 index.clear();
179 subcollection.clear();
180 language.clear();
181
182 int indexsize = dirindex.size();
183 if (indexsize != 3 && indexsize != 5 &&
184 indexsize != 7) return;
185
186 text_t::const_iterator dibegin = dirindex.begin();
187 text_t::const_iterator diend = dirindex.end();
188
189 // first three characters make up index part
190 index = substr(dibegin, dibegin+3);
191
192 if (indexsize == 5) {
193 if (languagemap.empty())
194 subcollection = substr(dibegin+3, dibegin+5);
195 else
196 language = substr(dibegin+3, dibegin+5);
197 } else if (indexsize == 7) {
198 subcollection = substr(dibegin+3, dibegin+5);
199 language = substr(dibegin+5, diend);
200 }
201}
202
203
204bool isrealindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
205 const text_tarray &languagemap, const text_t &realindex) {
206
207 text_t index, subcollection, language, realpart, dirpart;
208 getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
209 index, subcollection, language);
210
211 // check index part
212 text_tarray::const_iterator here = indexmap.begin();
213 text_tarray::const_iterator end = indexmap.end();
214 bool exists = false;
215 while (here != end) {
216 getrealdir (*here, realpart, dirpart);
217 if (realpart == index) {exists = true; break;}
218 here++;
219 }
220 if (!exists) return false;
221
222 // check subcollection part if there is one
223 if (!subcollection.empty()) {
224 here = subcollectionmap.begin();
225 end = subcollectionmap.end();
226 exists = false;
227 while (here != end) {
228 getrealdir (*here, realpart, dirpart);
229 if (realpart == subcollection) {exists = true; break;}
230 here++;
231 }
232 if (!exists) return false;
233 }
234
235 // check language part if there is one
236 if (!language.empty()) {
237 here = languagemap.begin();
238 end = languagemap.end();
239 exists = false;
240 while (here != end) {
241 getrealdir (*here, realpart, dirpart);
242 if (realpart == language) {exists = true; break;}
243 here++;
244 }
245 if (!exists) return false;
246 }
247 return true;
248}
249
250text_t dir2realindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
251 const text_tarray &languagemap, const text_t &dirindex) {
252
253 text_t index, subcollection, language, realpart, dirpart, realindex;
254 getdirindexparts (indexmap, subcollectionmap, languagemap, dirindex,
255 index, subcollection, language);
256
257 // get index part
258 text_tarray::const_iterator here = indexmap.begin();
259 text_tarray::const_iterator end = indexmap.end();
260 while (here != end) {
261 getrealdir (*here, realpart, dirpart);
262 if (dirpart == index) {realindex += realpart; break;}
263 here++;
264 }
265
266 if (realindex.empty()) return "";
267
268 // get subcollection part
269 here = subcollectionmap.begin();
270 end = subcollectionmap.end();
271 while (here != end) {
272 getrealdir (*here, realpart, dirpart);
273 if (dirpart == subcollection) {realindex += ":" + realpart; break;}
274 here++;
275 }
276
277 // get language part
278 here = languagemap.begin();
279 end = languagemap.end();
280 while (here != end) {
281 getrealdir (*here, realpart, dirpart);
282 if (dirpart == language) {realindex += ":" + realpart; break;}
283 here++;
284 }
285 return realindex;
286}
287
288text_t real2dirindex (const text_tarray &indexmap, const text_tarray &subcollectionmap,
289 const text_tarray &languagemap, const text_t &realindex) {
290
291 text_t index, subcollection, language, realpart, dirpart, dirindex;
292 getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
293 index, subcollection, language);
294
295 // get index part
296 text_tarray::const_iterator here = indexmap.begin();
297 text_tarray::const_iterator end = indexmap.end();
298 while (here != end) {
299 getrealdir (*here, realpart, dirpart);
300 if (realpart == index) {dirindex += dirpart; break;}
301 here++;
302 }
303
304 if (dirindex.empty()) return "";
305
306 // get subcollection part
307 here = subcollectionmap.begin();
308 end = subcollectionmap.end();
309 while (here != end) {
310 getrealdir (*here, realpart, dirpart);
311 if (realpart == subcollection) {dirindex += dirpart; break;}
312 here++;
313 }
314
315 // get language part
316 here = languagemap.begin();
317 end = languagemap.end();
318 while (here != end) {
319 getrealdir (*here, realpart, dirpart);
320 if (realpart == language) {dirindex += dirpart; break;}
321 here++;
322 }
323 return dirindex;
324}
325
326text_t real2macroindex (const text_t &realindex) {
327 text_t macroindex;
328 text_t::const_iterator here = realindex.begin();
329 text_t::const_iterator end = realindex.end();
330 unsigned short c;
331
332 while (here != end) {
333 c = *here;
334 if ((c >= '0' && c <= '9') ||
335 (c >= 'A' && c <= 'Z') ||
336 (c >= 'a' && c <= 'z'))
337 macroindex.push_back (*here);
338 here++;
339 }
340
341 return macroindex;
342}
343
344bool isdoclevelindex (const text_t &realindex) {
345 char *docstr = "document";
346 text_t::const_iterator here = realindex.begin ();
347 text_t::const_iterator end = realindex.end ();
348
349 while (here != end) {
350 if (*docstr == '\0') return true;
351 if (*docstr != (char)(*here)) return false;
352 docstr++;
353 here++;
354 }
355
356 return false;
357}
358
359text_t getdoclevelindex (const text_tarray &/*indexmap*/) {
360 //text_tarray::const_iterator here = indexmap.begin();
361 //text_tarray::const_iterator end = indexmap.end();
362 //text_t maprealindex, mapdirindex;
363
364 // while (here != end) {
365 // getrealdirindex (*here, maprealindex, mapdirindex);
366 // if (isdoclevelindex (maprealindex)) return maprealindex;
367 // here++;
368 //}
369
370 return "";
371}
372
373
374
375
376////////////////////////
377// callback functions //
378////////////////////////
379
380// This routine is called for each document found in a search
381// it assumes that cache_num is set up correctly to point to
382// a suitable result cache
383int ourquerycallback(char *UDoc, int /*ULen*/, int DocNum,
384 float Weight, void *info) {
385
386
387 queryresultsclass *queryresults = (queryresultsclass * )info;
388
389 // check the returned document for the presence of the
390 // quoted part of the query, if there was one
391
392 // if (UDoc != NULL && quotedquery != NULL &&
393 // quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;
394
395
396 if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0') {
397
398 if (casefold) {
399 int len;
400 for (len = 0; quotedquery[len] != '\0'; len ++)
401 quotedquery[len] = tolower (quotedquery[len]);
402 for (len = 0; UDoc[len] != '\0'; len ++)
403 UDoc[len] = tolower (UDoc[len]);
404 }
405 if (strstr (UDoc, quotedquery) == NULL) return 0;
406 }
407
408 // append this entry to the document results
409 docresultclass docresult;
410 docresult.docnum = DocNum;
411 docresult.docweight = Weight;
412
413 queryresults->docs.push_back(docresult);
414
415 return 0;
416}
417
418// This callback is called once for each term in the query
419int termfreqcallback(char *Word, int ULen, int Freq,
420 float /*Weight*/, void *info) {
421 queryresultsclass *queryresults = (queryresultsclass *)info;
422
423 text_t term;
424 term.setcarr(Word, ULen);
425 termfreqclass termfreq;
426 termfreq.termstr = to_uni(term);
427 termfreq.termfreq = Freq;
428 queryresults->terms.push_back(termfreq);
429
430 return 0;
431}
432
433// this callback is called once for each variation of each term
434int termscallback(char *Word, int ULen, int /*Freq*/,
435 float /*Weight*/, void *info) {
436
437 text_t term;
438 term.setcarr(Word, ULen);
439 queryresultsclass *queryresults = (queryresultsclass *)info;
440 queryresults->termvariants.push_back(to_uni(term));
441
442 return 0;
443}
444
445// This callback is for getting document text
446int doctextcallback(char *Word, int ULen, int /*Freq*/,
447 float /*Weight*/, void *info) {
448 text_t *output = (text_t *)info;
449 if (output == NULL) return 0;
450 output->clear();
451
452 utf8inconvertclass inconvert;
453 convertclass::status_t status;
454 inconvert.reset ();
455 inconvert.setinput (Word, ULen);
456 inconvert.convert (*output, status);
457
458 // replace all control-Cs with spaces
459 text_t::iterator here = output->begin();
460 text_t::iterator end = output->end();
461 while (here != end) {
462 if (*here == '\x3') *here = ' ';
463 here++;
464 }
465
466 return 0;
467}
468
469
470static text_t getindexsuffix (const text_t &collection,
471 const text_t &index) {
472 text_t indexsuffix = "index";
473 indexsuffix = filename_cat (indexsuffix, index);
474 indexsuffix = filename_cat (indexsuffix, collection);
475 return indexsuffix;
476}
477
478
479
480
481////////////////////
482// mgsearch class //
483////////////////////
484
485mgsearchclass::mgsearchclass ()
486{
487 cache = new querycache (RESULTCACHESIZE);
488}
489
490mgsearchclass::~mgsearchclass ()
491{
492 if (cache != NULL)
493 {
494 delete cache;
495 cache = NULL;
496 }
497}
498
499
500void mgsearchclass::setcollectdir (const text_t &thecollectdir)
501{
502 collectdir = thecollectdir;
503}
504
505
506bool mgsearchclass::search(const queryparamclass &queryparams,
507 queryresultsclass &queryresults)
508{
509 bool databaseloaded = true;
510
511 assert (cache != NULL);
512
513 queryresults.clear();
514
515 // first check the cache
516 if (cache->find(queryparams, queryresults))
517 return true;
518
519 // make sure there is a query to be processed
520 text_t::const_iterator queryhere = queryparams.querystring.begin();
521 text_t::const_iterator queryend = queryparams.querystring.end();
522 while (queryhere != queryend) {
523 if (is_unicode_letdig (*queryhere)) break;
524 queryhere++;
525 }
526
527 // if we reached the end of the query string without finding
528 // any alphanumeric characters then return no results (and say
529 // the database was loaded)
530 if (queryhere == queryend) return true;
531
532 casefold = queryparams.casefolding;
533
534 // get the names of the collection, index and text suffixes
535 char *ccollection = queryparams.collection.getcstr();
536 assert (ccollection != NULL);
537 char *idxsuffix = (getindexsuffix (queryparams.collection,
538 queryparams.search_index)).getcstr();
539 assert (idxsuffix != NULL);
540 char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
541 assert (txtsuffix != NULL);
542
543#ifdef __WIN32__
544 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
545#else
546 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
547#endif
548
549 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
550 {
551 setsearchmode (queryparams);
552 submitquery (queryparams);
553 getresults (queryresults);
554 }
555 else databaseloaded = false;
556
557 // free up the c strings
558 delete ccollection;
559 delete idxsuffix;
560 delete txtsuffix;
561 delete ccollectdir;
562
563 return databaseloaded;
564}
565
566
567void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
568{
569 mgq_ask(".set expert true");
570 mgq_ask(".set accumulator_method list");
571 mgq_ask(".set max_accumulators 50000");
572 mgq_ask(".set verbatim true");
573 mgq_ask(".unset skip_dump");
574 mgq_ask(".set mode docnums");
575
576 switch (queryparams.search_type)
577 {
578 case 0: mgq_ask(".set query boolean"); break;
579 case 1: mgq_ask(".set query ranked"); break;
580 }
581 switch (queryparams.casefolding)
582 {
583 case 1: mgq_ask(".set casefold on"); break;
584 case 0: mgq_ask(".set casefold off"); break;
585 }
586 switch (queryparams.stemming)
587 {
588 case 1: mgq_ask(".set stem on"); break;
589 case 0: mgq_ask(".set stem off"); break;
590 }
591 mgq_ask(".set heads_length 150");
592
593 char maxdocstr[32];
594 sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
595 mgq_ask(maxdocstr);
596}
597
598
599void mgsearchclass::submitquery (const queryparamclass &queryparams)
600{
601 // sort out the query string
602 text_t ttquerystring = queryparams.querystring;
603 text_t ttquotedquery;
604 extractquoted (ttquerystring, ttquotedquery);
605 filterquery (ttquerystring);
606
607 // turn the strings into c strings for mg
608 if (quotedquery != NULL) // quotedquery is a global
609 {
610 delete quotedquery;
611 quotedquery = NULL;
612 }
613
614 // quotedquery will be deleted on the next call to this function
615 quotedquery = to_utf8(ttquotedquery).getcstr ();
616 char *querystring = to_utf8(ttquerystring).getcstr();
617
618 // submit the query
619 mgq_ask(querystring);
620
621 delete querystring;
622}
623
624
625void mgsearchclass::getresults (queryresultsclass &queryresults)
626{
627 if (quotedquery[0] == '\0')
628 {
629 // don't need the text
630 mgq_results(result_docnums, 0, MAXNUMDOCS,
631 ourquerycallback, (void *)(&queryresults));
632 }
633 else
634 {
635 // we need the text for this one
636 mgq_results(result_docs, 0, MAXNUMDOCS,
637 ourquerycallback, (void *)(&queryresults));
638 }
639
640 // get the term frequencies
641 mgq_results(result_termfreqs, 0, MAXNUMTERMS,
642 termfreqcallback, (void *)(&queryresults));
643 mgq_results(result_terms, 0, MAXNUMTERMS,
644 termscallback, (void *)(&queryresults));
645 queryresults.sortqueryterms();
646 queryresults.uniqqueryterms();
647}
648
649
650void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery)
651{
652 ttquotedquery.clear();
653
654 text_t::iterator ithere = ttquerystring.begin ();
655 text_t::iterator itend = ttquerystring.end ();
656
657 bool inquote = false;
658
659 while (ithere != itend)
660 {
661 if ((*ithere) == '\"')
662 {
663 if (!inquote) ttquotedquery.clear ();
664 inquote = !inquote;
665 *ithere = ' '; // delete the quote
666 }
667 else if (inquote)
668 {
669 ttquotedquery.push_back(*ithere);
670 *ithere = ' ';
671 }
672
673 ithere++;
674 }
675}
676
677
678void mgsearchclass::filterquery (text_t &ttquerystring) {
679 text_t::iterator ithere = ttquerystring.begin ();
680 text_t::iterator itend = ttquerystring.end ();
681
682 // remove all non alphanumeric characters
683 while (ithere != itend) {
684 if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
685 ithere++;
686 }
687}
688
689
690// the document text for 'docnum' is placed in 'output'
691// docTargetDocument returns 'true' if it was able to
692// try to get a document
693// collection is needed to see if an index from the
694// collection is loaded. If no index has been loaded
695// defaultindex is needed to load one
696bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
697 const text_t &collection,
698 int docnum,
699 text_t &output)
700{
701 int databaseloaded = 0;
702
703 output.clear();
704
705 char *ccollection = collection.getcstr();
706 assert (ccollection != NULL);
707
708 // see if we can make an appropriate database current
709 databaseloaded = load_text_database (ccollection);
710
711 // try and load the database
712 if (!databaseloaded)
713 {
714 // get the names of the index and text suffixes
715 char *idxsuffix = (getindexsuffix (collection,
716 defaultindex)).getcstr();
717 assert (idxsuffix != NULL);
718 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
719 assert (txtsuffix != NULL);
720
721#ifdef __WIN32__
722 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
723#else
724 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
725#endif
726
727 databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
728
729 // free up the c strings
730 delete idxsuffix;
731 delete txtsuffix;
732 delete ccollectdir;
733 }
734
735 // free up the c collection string
736 delete ccollection;
737
738 if (databaseloaded)
739 {
740 // retrieve the document from mg
741 char docstr[32];
742 sprintf(docstr, "%i", docnum);
743
744 mgq_ask(".set mode text");
745 mgq_ask(".set query docnums");
746 mgq_ask(docstr);
747 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
748 }
749
750 return databaseloaded;
751}
752
Note: See TracBrowser for help on using the repository browser.