[110] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mgsearch.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
[534] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[110] | 9 | *
|
---|
[534] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[110] | 24 | * $Id: mgsearch.cpp 539 1999-09-07 22:52:52Z rjmcnab $
|
---|
| 25 | *
|
---|
| 26 | *********************************************************************/
|
---|
| 27 |
|
---|
| 28 | /*
|
---|
| 29 | $Log$
|
---|
[539] | 30 | Revision 1.19 1999/09/07 22:52:52 rjmcnab
|
---|
| 31 | Seems to be an error in mg for retrieving documents using a paragraph
|
---|
| 32 | based index for some cases. Just added a work around (loads the default
|
---|
| 33 | index every time).
|
---|
| 34 |
|
---|
[534] | 35 | Revision 1.18 1999/09/07 04:57:22 sjboddie
|
---|
| 36 | added gpl notice
|
---|
| 37 |
|
---|
[497] | 38 | Revision 1.17 1999/08/31 22:42:41 rjmcnab
|
---|
| 39 | A couple of minor things.
|
---|
| 40 |
|
---|
[473] | 41 | Revision 1.16 1999/08/25 04:51:06 sjboddie
|
---|
| 42 | small change to allow for searching using boolean operators
|
---|
| 43 |
|
---|
[401] | 44 | Revision 1.15 1999/07/16 08:35:03 rjmcnab
|
---|
| 45 | Fixed a weird bug to do with a faulty case statement.
|
---|
| 46 |
|
---|
[398] | 47 | Revision 1.14 1999/07/16 03:42:22 sjboddie
|
---|
| 48 | changed isApprox
|
---|
| 49 |
|
---|
[393] | 50 | Revision 1.13 1999/07/16 00:12:46 sjboddie
|
---|
| 51 | removed all the old post-processing stuff
|
---|
| 52 |
|
---|
[350] | 53 | Revision 1.12 1999/07/07 06:17:47 rjmcnab
|
---|
| 54 | broke search_index into index+subcollection+language
|
---|
| 55 | within mgsearch
|
---|
| 56 |
|
---|
[343] | 57 | Revision 1.11 1999/07/05 21:06:43 rjmcnab
|
---|
| 58 | Disabled quoted strings.
|
---|
| 59 |
|
---|
[334] | 60 | Revision 1.10 1999/07/01 09:29:19 rjmcnab
|
---|
| 61 | Changes for better reporting of number documents which match a query. Changes
|
---|
| 62 | should still work as before with older versions of mg.
|
---|
| 63 |
|
---|
[325] | 64 | Revision 1.9 1999/07/01 03:54:48 rjmcnab
|
---|
| 65 | Added code to plug in the equivalent terms of each of the query terms.
|
---|
| 66 | Also added a function to get a raw utf8 encoded mg document (for speeding
|
---|
| 67 | up a phrase matching function)
|
---|
| 68 |
|
---|
[319] | 69 | Revision 1.8 1999/06/30 04:04:12 rjmcnab
|
---|
| 70 | made stemming functions available from mgsearch and made the stems
|
---|
| 71 | for the query terms available in queryinfo
|
---|
| 72 |
|
---|
[301] | 73 | Revision 1.7 1999/06/27 22:07:27 sjboddie
|
---|
| 74 | got rid of all the old functions for dealing with dir indexes
|
---|
| 75 |
|
---|
[265] | 76 | Revision 1.6 1999/06/09 00:41:32 sjboddie
|
---|
| 77 | phrase searching now uses case-folding if it's turned on
|
---|
| 78 |
|
---|
[163] | 79 | Revision 1.5 1999/02/21 22:31:35 rjmcnab
|
---|
| 80 |
|
---|
| 81 | Removed locateinfo.
|
---|
| 82 |
|
---|
[138] | 83 | Revision 1.4 1999/02/03 01:13:27 sjboddie
|
---|
| 84 |
|
---|
| 85 | Got interface to handle subcollections and language subcollections -
|
---|
| 86 | committed changes made to some of the collections
|
---|
| 87 |
|
---|
[114] | 88 | Revision 1.3 1999/01/19 01:38:17 rjmcnab
|
---|
| 89 |
|
---|
| 90 | Made the source more portable.
|
---|
| 91 |
|
---|
[112] | 92 | Revision 1.2 1999/01/12 01:51:02 rjmcnab
|
---|
| 93 |
|
---|
| 94 | Standard header.
|
---|
| 95 |
|
---|
[110] | 96 | Revision 1.1 1999/01/08 09:02:16 rjmcnab
|
---|
| 97 |
|
---|
| 98 | Moved from src/library.
|
---|
| 99 |
|
---|
| 100 | */
|
---|
| 101 |
|
---|
| 102 |
|
---|
[114] | 103 | #include "gsdlconf.h"
|
---|
[110] | 104 | #include "mgsearch.h"
|
---|
[163] | 105 | #include "fileutil.h"
|
---|
[110] | 106 |
|
---|
| 107 | #include <string.h>
|
---|
| 108 | #include <stdio.h>
|
---|
| 109 | #include <stdlib.h>
|
---|
| 110 | #include <ctype.h>
|
---|
| 111 |
|
---|
[114] | 112 | #if defined(GSDL_USE_OBJECTSPACE)
|
---|
| 113 | # include <ospace\std\iostream>
|
---|
| 114 | #elif defined(GSDL_USE_IOS_H)
|
---|
[110] | 115 | # include <iostream.h>
|
---|
| 116 | #else
|
---|
[114] | 117 | # include <iostream>
|
---|
| 118 | #endif
|
---|
[110] | 119 |
|
---|
[114] | 120 | #if defined(__WIN32__)
|
---|
[110] | 121 | // gdbm stuff
|
---|
| 122 | # include "autoconf.h"
|
---|
| 123 | # include "systems.h"
|
---|
| 124 | # include "gdbmconst.h"
|
---|
| 125 | # include "gdbm.h"
|
---|
[114] | 126 | #else
|
---|
| 127 | # include <gdbm.h>
|
---|
[110] | 128 | #endif
|
---|
[114] | 129 |
|
---|
[110] | 130 |
|
---|
| 131 | #include <assert.h>
|
---|
| 132 |
|
---|
| 133 | #include "mgq.h"
|
---|
[163] | 134 | // #include "locateinfo.h"
|
---|
[110] | 135 | #include "gsdlunicode.h"
|
---|
| 136 | #include "unitool.h"
|
---|
| 137 |
|
---|
| 138 |
|
---|
| 139 | /////////////
|
---|
| 140 | // globals //
|
---|
| 141 | /////////////
|
---|
| 142 |
|
---|
// Scratch slots for the most recently retrieved document text:
// doctextcallback stores the buffer pointer and length it is handed here,
// and mgsearchclass::mgdocument resets both before asking mg for a
// document and copies them out afterwards.
// NOTE(review): the pointed-to buffer is not allocated in this file;
// presumably it is owned by mg -- confirm its lifetime before caching it.
[325] | 143 | static char *tempdoc = NULL;
|
---|
| 144 | static int templen = 0;
|
---|
[110] | 145 |
|
---|
[325] | 146 |
|
---|
[319] | 147 | //////////////////////
|
---|
| 148 | // useful functions //
|
---|
| 149 | //////////////////////
|
---|
| 150 |
|
---|
| 151 |
|
---|
| 152 | // input and output are in utf8
|
---|
| 153 | text_t mgsearch_stemword (const text_t &word) {
|
---|
| 154 | // allocate working stem space
|
---|
| 155 | int maxstemlen = mgq_getmaxstemlen ();
|
---|
| 156 | unsigned char *word_stem = new unsigned char [maxstemlen + 2];
|
---|
| 157 | if (word_stem == NULL) return "";
|
---|
| 158 |
|
---|
| 159 | // copy word to word_stem
|
---|
| 160 | int len = 0;
|
---|
| 161 | text_t::const_iterator here = word.begin();
|
---|
| 162 | text_t::const_iterator end = word.end();
|
---|
| 163 | while (len < maxstemlen && here != end) {
|
---|
| 164 | word_stem[len+1] = (unsigned char)(*here);
|
---|
| 165 | len++; here++;
|
---|
| 166 | }
|
---|
| 167 | word_stem[len+1] = '\0';
|
---|
| 168 | word_stem[0] = len;
|
---|
| 169 |
|
---|
| 170 | mgq_stemword (word_stem);
|
---|
| 171 |
|
---|
| 172 | // copy word_stem back to tempstr
|
---|
| 173 | text_t tempstr;
|
---|
| 174 | tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
|
---|
| 175 |
|
---|
[325] | 176 | delete [] word_stem;
|
---|
| 177 |
|
---|
[319] | 178 | return tempstr;
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 |
|
---|
| 182 |
|
---|
[110] | 183 | ////////////////////////
|
---|
| 184 | // callback functions //
|
---|
| 185 | ////////////////////////
|
---|
| 186 |
|
---|
| 187 | // This routine is called for each document found in a search
|
---|
| 188 | // it assumes that cache_num is set up correctly to point to
|
---|
| 189 | // a suitable result cache
|
---|
[497] | 190 | int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
|
---|
[110] | 191 | float Weight, void *info) {
|
---|
| 192 |
|
---|
| 193 |
|
---|
| 194 | queryresultsclass *queryresults = (queryresultsclass * )info;
|
---|
| 195 |
|
---|
| 196 | // append this entry to the document results
|
---|
| 197 | docresultclass docresult;
|
---|
| 198 | docresult.docnum = DocNum;
|
---|
[319] | 199 | docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
|
---|
| 200 | docresult.docweight = Weight - docresult.num_query_terms_matched*100;
|
---|
| 201 |
|
---|
[350] | 202 | queryresults->docs.docset[DocNum] = docresult;
|
---|
| 203 | queryresults->docs.docorder.push_back(DocNum);
|
---|
[110] | 204 |
|
---|
| 205 | return 0;
|
---|
| 206 | }
|
---|
| 207 |
|
---|
[325] | 208 | int termequivcallback(char *Word, int ULen, int /*Freq*/,
|
---|
| 209 | float /*Weight*/, void *info) {
|
---|
| 210 | text_tset *equivterms = (text_tset *)info;
|
---|
| 211 | if (equivterms == NULL) return 0;
|
---|
| 212 |
|
---|
| 213 | text_t thisterm;
|
---|
| 214 | thisterm.setcarr(Word, ULen);
|
---|
| 215 |
|
---|
| 216 | equivterms->insert(thisterm);
|
---|
| 217 |
|
---|
| 218 | return 0;
|
---|
| 219 | }
|
---|
| 220 |
|
---|
| 221 |
|
---|
| 222 | void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
|
---|
| 223 | // allocate working stem space
|
---|
| 224 | int maxstemlen = mgq_getmaxstemlen ();
|
---|
| 225 | unsigned char *word_stem = new unsigned char [maxstemlen + 2];
|
---|
| 226 | if (word_stem == NULL) return;
|
---|
| 227 |
|
---|
| 228 | // copy word to word_stem
|
---|
| 229 | int len = 0;
|
---|
| 230 | text_t::const_iterator here = word.begin();
|
---|
| 231 | text_t::const_iterator end = word.end();
|
---|
| 232 | while (len < maxstemlen && here != end) {
|
---|
| 233 | word_stem[len+1] = (unsigned char)(*here);
|
---|
| 234 | len++; here++;
|
---|
| 235 | }
|
---|
| 236 | word_stem[len+1] = '\0';
|
---|
| 237 | word_stem[0] = len;
|
---|
| 238 |
|
---|
| 239 | // get the equivalent terms
|
---|
| 240 | mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
|
---|
| 241 |
|
---|
| 242 | delete [] word_stem;
|
---|
| 243 |
|
---|
| 244 | return;
|
---|
| 245 | }
|
---|
| 246 |
|
---|
// NOTE(review): no reader or writer of this file-scope set is visible in
// this file; termfreqcallback fills the member termfreq.utf8equivterms
// instead -- confirm whether anything outside this file still refers to it.
| 247 | text_tset utf8equivterms; // kept as utf8 string for fast matching
|
---|
| 248 |
|
---|
| 249 |
|
---|
[110] | 250 | // This callback is called once for each term in the query
|
---|
| 251 | int termfreqcallback(char *Word, int ULen, int Freq,
|
---|
[114] | 252 | float /*Weight*/, void *info) {
|
---|
[110] | 253 | queryresultsclass *queryresults = (queryresultsclass *)info;
|
---|
[325] | 254 | if (queryresults == NULL) return 0;
|
---|
[110] | 255 |
|
---|
| 256 | text_t term;
|
---|
| 257 | term.setcarr(Word, ULen);
|
---|
| 258 | termfreqclass termfreq;
|
---|
[325] | 259 |
|
---|
[110] | 260 | termfreq.termstr = to_uni(term);
|
---|
[325] | 261 | text_t utf8termstem = mgsearch_stemword (term);
|
---|
| 262 | termfreq.termstemstr = to_uni (utf8termstem);
|
---|
| 263 |
|
---|
| 264 | mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
|
---|
| 265 |
|
---|
[110] | 266 | termfreq.termfreq = Freq;
|
---|
[319] | 267 | queryresults->orgterms.push_back(termfreq);
|
---|
[110] | 268 |
|
---|
| 269 | return 0;
|
---|
| 270 | }
|
---|
| 271 |
|
---|
| 272 | // this callback is called once for each variation of each term
|
---|
[319] | 273 | int termvariantscallback(char *Word, int ULen, int /*Freq*/,
|
---|
| 274 | float /*Weight*/, void *info) {
|
---|
[110] | 275 |
|
---|
| 276 | text_t term;
|
---|
| 277 | term.setcarr(Word, ULen);
|
---|
| 278 | queryresultsclass *queryresults = (queryresultsclass *)info;
|
---|
[350] | 279 | queryresults->termvariants.insert(to_uni(term));
|
---|
[110] | 280 |
|
---|
| 281 | return 0;
|
---|
| 282 | }
|
---|
| 283 |
|
---|
| 284 | // This callback is for getting document text
|
---|
[325] | 285 | int doctextcallback(char *Doc, int ULen, int /*Freq*/,
|
---|
[497] | 286 | float /*Weight*/, void * /*info*/) {
|
---|
[325] | 287 | tempdoc = Doc;
|
---|
| 288 | templen = ULen;
|
---|
[110] | 289 |
|
---|
| 290 | return 0;
|
---|
| 291 | }
|
---|
| 292 |
|
---|
| 293 |
|
---|
[163] | 294 | static text_t getindexsuffix (const text_t &collection,
|
---|
| 295 | const text_t &index) {
|
---|
[393] | 296 |
|
---|
| 297 | text_t indexsuffix = "index";
|
---|
[163] | 298 | indexsuffix = filename_cat (indexsuffix, index);
|
---|
| 299 | indexsuffix = filename_cat (indexsuffix, collection);
|
---|
| 300 | return indexsuffix;
|
---|
| 301 | }
|
---|
[110] | 302 |
|
---|
[163] | 303 |
|
---|
| 304 |
|
---|
| 305 |
|
---|
[110] | 306 | ////////////////////
|
---|
| 307 | // mgsearch class //
|
---|
| 308 | ////////////////////
|
---|
| 309 |
|
---|
| 310 | mgsearchclass::mgsearchclass ()
|
---|
| 311 | {
|
---|
| 312 | cache = new querycache (RESULTCACHESIZE);
|
---|
| 313 | }
|
---|
| 314 |
|
---|
| 315 | mgsearchclass::~mgsearchclass ()
|
---|
| 316 | {
|
---|
| 317 | if (cache != NULL)
|
---|
| 318 | {
|
---|
| 319 | delete cache;
|
---|
| 320 | cache = NULL;
|
---|
| 321 | }
|
---|
| 322 | }
|
---|
| 323 |
|
---|
| 324 |
|
---|
| 325 | void mgsearchclass::setcollectdir (const text_t &thecollectdir)
|
---|
| 326 | {
|
---|
| 327 | collectdir = thecollectdir;
|
---|
| 328 | }
|
---|
| 329 |
|
---|
[319] | 330 | // you only need to use this function before doing any stemming
|
---|
| 331 | // casefolding and stemming will be set if values for them are
|
---|
| 332 | // provided (0 or 1).
|
---|
| 333 | // makeindexcurrent returns true if it was able to load the database
|
---|
| 334 | bool mgsearchclass::makeindexcurrent (const text_t &index,
|
---|
[350] | 335 | const text_t &subcollection,
|
---|
| 336 | const text_t &language,
|
---|
[319] | 337 | const text_t &collection,
|
---|
| 338 | int casefolding,
|
---|
| 339 | int stemming) {
|
---|
| 340 | bool databaseloaded = true;
|
---|
[110] | 341 |
|
---|
[319] | 342 | // get the names of the collection, index and text suffixes
|
---|
| 343 | char *ccollection = collection.getcstr();
|
---|
| 344 | assert (ccollection != NULL);
|
---|
[350] | 345 | char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
|
---|
[319] | 346 | assert (idxsuffix != NULL);
|
---|
| 347 | char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
|
---|
| 348 | assert (txtsuffix != NULL);
|
---|
| 349 |
|
---|
| 350 | #ifdef __WIN32__
|
---|
| 351 | char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
|
---|
| 352 | #else
|
---|
| 353 | char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
|
---|
| 354 | #endif
|
---|
| 355 |
|
---|
| 356 | if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
|
---|
| 357 | if (casefolding == 0) mgq_ask(".set casefold off");
|
---|
| 358 | else if (casefolding > 0) mgq_ask(".set casefold on");
|
---|
| 359 | if (stemming == 0) mgq_ask(".set stem off");
|
---|
| 360 | else if (stemming > 0) mgq_ask(".set stem on");
|
---|
| 361 |
|
---|
| 362 | } else databaseloaded = false;
|
---|
| 363 |
|
---|
| 364 | // free up the c strings
|
---|
| 365 | delete ccollection;
|
---|
| 366 | delete idxsuffix;
|
---|
| 367 | delete txtsuffix;
|
---|
| 368 | delete ccollectdir;
|
---|
| 369 |
|
---|
| 370 | return databaseloaded;
|
---|
| 371 | }
|
---|
| 372 |
|
---|
| 373 |
|
---|
| 374 | // stem word uses the values set in the last call to makeindexcurrent
|
---|
| 375 | // to stem the word. It is assumed that word is in unicode
|
---|
| 376 | text_t mgsearchclass::stemword (const text_t &word) {
|
---|
| 377 | return to_uni (mgsearch_stemword (to_utf8 (word)));
|
---|
| 378 | }
|
---|
| 379 |
|
---|
[325] | 380 | text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
|
---|
| 381 | return to_uni (mgsearch_stemword (to_utf8 (here, end)));
|
---|
| 382 | }
|
---|
| 383 |
|
---|
| 384 |
|
---|
[110] | 385 | bool mgsearchclass::search(const queryparamclass &queryparams,
|
---|
[319] | 386 | queryresultsclass &queryresults) {
|
---|
[110] | 387 | assert (cache != NULL);
|
---|
| 388 |
|
---|
| 389 | queryresults.clear();
|
---|
| 390 |
|
---|
| 391 | // first check the cache
|
---|
[319] | 392 | if (cache->find(queryparams, queryresults)) return true;
|
---|
[110] | 393 |
|
---|
| 394 | // make sure there is a query to be processed
|
---|
| 395 | text_t::const_iterator queryhere = queryparams.querystring.begin();
|
---|
| 396 | text_t::const_iterator queryend = queryparams.querystring.end();
|
---|
| 397 | while (queryhere != queryend) {
|
---|
| 398 | if (is_unicode_letdig (*queryhere)) break;
|
---|
| 399 | queryhere++;
|
---|
| 400 | }
|
---|
| 401 |
|
---|
| 402 | // if we reached the end of the query string without finding
|
---|
| 403 | // any alphanumeric characters then return no results (and say
|
---|
| 404 | // the database was loaded)
|
---|
| 405 | if (queryhere == queryend) return true;
|
---|
| 406 |
|
---|
[350] | 407 | if (makeindexcurrent (queryparams.index, queryparams.subcollection,
|
---|
| 408 | queryparams.language, queryparams.collection)) {
|
---|
[319] | 409 | setsearchmode (queryparams);
|
---|
| 410 | submitquery (queryparams);
|
---|
[334] | 411 | getresults (queryparams, queryresults);
|
---|
[319] | 412 | return true;
|
---|
| 413 | }
|
---|
[110] | 414 |
|
---|
[319] | 415 | return false;
|
---|
[110] | 416 | }
|
---|
| 417 |
|
---|
| 418 |
|
---|
| 419 | void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
|
---|
| 420 | {
|
---|
| 421 | mgq_ask(".set expert true");
|
---|
[319] | 422 | mgq_ask(".set sorted_terms true");
|
---|
[110] | 423 | mgq_ask(".set accumulator_method list");
|
---|
[497] | 424 | mgq_ask(".set max_accumulators 500000");
|
---|
| 425 | mgq_ask(".set maxparas 500000");
|
---|
[110] | 426 | mgq_ask(".set verbatim true");
|
---|
| 427 | mgq_ask(".unset skip_dump");
|
---|
| 428 | mgq_ask(".set mode docnums");
|
---|
| 429 |
|
---|
| 430 | switch (queryparams.search_type)
|
---|
| 431 | {
|
---|
| 432 | case 0: mgq_ask(".set query boolean"); break;
|
---|
| 433 | case 1: mgq_ask(".set query ranked"); break;
|
---|
| 434 | }
|
---|
| 435 | switch (queryparams.casefolding)
|
---|
| 436 | {
|
---|
| 437 | case 1: mgq_ask(".set casefold on"); break;
|
---|
| 438 | case 0: mgq_ask(".set casefold off"); break;
|
---|
| 439 | }
|
---|
| 440 | switch (queryparams.stemming)
|
---|
| 441 | {
|
---|
| 442 | case 1: mgq_ask(".set stem on"); break;
|
---|
| 443 | case 0: mgq_ask(".set stem off"); break;
|
---|
| 444 | }
|
---|
| 445 | mgq_ask(".set heads_length 150");
|
---|
| 446 |
|
---|
[350] | 447 | if (queryparams.maxdocs == -1) {
|
---|
| 448 | mgq_ask(".set maxdocs all");
|
---|
| 449 | } else {
|
---|
| 450 | char maxdocstr[32];
|
---|
| 451 | sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
|
---|
| 452 | mgq_ask(maxdocstr);
|
---|
| 453 | }
|
---|
[110] | 454 | }
|
---|
| 455 |
|
---|
| 456 |
|
---|
| 457 | void mgsearchclass::submitquery (const queryparamclass &queryparams)
|
---|
| 458 | {
|
---|
| 459 | // sort out the query string
|
---|
| 460 | text_t ttquerystring = queryparams.querystring;
|
---|
| 461 | filterquery (ttquerystring);
|
---|
| 462 | char *querystring = to_utf8(ttquerystring).getcstr();
|
---|
| 463 |
|
---|
| 464 | // submit the query
|
---|
| 465 | mgq_ask(querystring);
|
---|
| 466 |
|
---|
| 467 | delete querystring;
|
---|
| 468 | }
|
---|
| 469 |
|
---|
| 470 |
|
---|
[334] | 471 | void mgsearchclass::getresults (const queryparamclass &queryparams,
|
---|
| 472 | queryresultsclass &queryresults) {
|
---|
[393] | 473 |
|
---|
| 474 | mgq_results(result_docnums, 0, MAXNUMDOCS,
|
---|
| 475 | ourquerycallback, (void *)(&queryresults));
|
---|
[110] | 476 |
|
---|
| 477 | // get the term frequencies
|
---|
| 478 | mgq_results(result_termfreqs, 0, MAXNUMTERMS,
|
---|
| 479 | termfreqcallback, (void *)(&queryresults));
|
---|
[319] | 480 | queryresults.sortuniqqueryterms();
|
---|
| 481 |
|
---|
| 482 | // get term variants
|
---|
[110] | 483 | mgq_results(result_terms, 0, MAXNUMTERMS,
|
---|
[319] | 484 | termvariantscallback, (void *)(&queryresults));
|
---|
[334] | 485 |
|
---|
| 486 | // get the number of documents retrieved
|
---|
| 487 | int total_retrieved = 0, is_approx = 0;
|
---|
| 488 | mgq_docsretrieved (&total_retrieved, &is_approx);
|
---|
| 489 |
|
---|
| 490 | if (total_retrieved == 0) {
|
---|
| 491 | // not available (or really was zero)
|
---|
[350] | 492 | queryresults.docs_matched = queryresults.docs.docset.size();
|
---|
[334] | 493 | if (queryresults.docs_matched < queryparams.maxdocs)
|
---|
[398] | 494 | queryresults.is_approx = Exact;
|
---|
[334] | 495 | else
|
---|
[398] | 496 | queryresults.is_approx = MoreThan;
|
---|
[334] | 497 | } else {
|
---|
| 498 | queryresults.docs_matched = total_retrieved;
|
---|
[401] | 499 | if (is_approx) queryresults.is_approx = Approximate;
|
---|
| 500 | else queryresults.is_approx = Exact;
|
---|
[334] | 501 | }
|
---|
[110] | 502 | }
|
---|
| 503 |
|
---|
| 504 | void mgsearchclass::filterquery (text_t &ttquerystring) {
|
---|
| 505 | text_t::iterator ithere = ttquerystring.begin ();
|
---|
| 506 | text_t::iterator itend = ttquerystring.end ();
|
---|
| 507 |
|
---|
[473] | 508 | // remove all non alphanumeric characters (except
|
---|
| 509 | // boolean operators
|
---|
[110] | 510 | while (ithere != itend) {
|
---|
[473] | 511 | if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
|
---|
| 512 | (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
|
---|
| 513 | (*ithere != ')')) (*ithere) = ' ';
|
---|
[110] | 514 | ithere++;
|
---|
| 515 | }
|
---|
| 516 | }
|
---|
| 517 |
|
---|
| 518 |
|
---|
| 519 | // the document text for 'docnum' is placed in 'output'
|
---|
| 520 | // docTargetDocument returns 'true' if it was able to
|
---|
| 521 | // try to get a document
|
---|
| 522 | // collection is needed to see if an index from the
|
---|
| 523 | // collection is loaded. If no index has been loaded
|
---|
| 524 | // defaultindex is needed to load one
|
---|
[350] | 525 | bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
|
---|
| 526 | const text_t &defaultsubcollection,
|
---|
| 527 | const text_t &defaultlanguage,
|
---|
[110] | 528 | const text_t &collection,
|
---|
| 529 | int docnum,
|
---|
[325] | 530 | text_t &output) {
|
---|
[110] | 531 | output.clear();
|
---|
| 532 |
|
---|
[325] | 533 | // get the mg version of the document
|
---|
| 534 | char *mgdoc = NULL;
|
---|
| 535 | int doclen = 0;
|
---|
[350] | 536 | if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
|
---|
| 537 | collection, docnum, mgdoc, doclen)) return false;
|
---|
[325] | 538 | if (mgdoc == NULL) return false;
|
---|
[110] | 539 |
|
---|
[325] | 540 | // replace all control-Cs with spaces
|
---|
| 541 | char *mgdoc_here = mgdoc;
|
---|
| 542 | char *mgdoc_end = mgdoc + doclen;
|
---|
| 543 | while (mgdoc_here < mgdoc_end) {
|
---|
| 544 | if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
|
---|
| 545 | mgdoc_here++;
|
---|
| 546 | }
|
---|
[110] | 547 |
|
---|
[325] | 548 | // convert this document to unicode
|
---|
| 549 | utf8inconvertclass inconvert;
|
---|
| 550 | convertclass::status_t status;
|
---|
| 551 | inconvert.reset ();
|
---|
| 552 | inconvert.setinput (mgdoc, doclen);
|
---|
| 553 | inconvert.convert (output, status);
|
---|
[110] | 554 |
|
---|
[325] | 555 | return true;
|
---|
| 556 | }
|
---|
[110] | 557 |
|
---|
[325] | 558 |
|
---|
| 559 | bool mgsearchclass::mgdocument (const text_t &defaultindex,
|
---|
[350] | 560 | const text_t &defaultsubcollection,
|
---|
| 561 | const text_t &defaultlanguage,
|
---|
[325] | 562 | const text_t &collection,
|
---|
| 563 | int docnum,
|
---|
| 564 | char *&UDoc, int &ULen) {
|
---|
[497] | 565 | int databaseloaded = 0;
|
---|
[325] | 566 |
|
---|
| 567 | UDoc = NULL; ULen = 0;
|
---|
| 568 |
|
---|
| 569 | // see if we can make an appropriate database current
|
---|
[539] | 570 | // char *ccollection = collection.getcstr();
|
---|
| 571 | // assert (ccollection != NULL);
|
---|
| 572 | // databaseloaded = load_text_database (ccollection);
|
---|
| 573 | // delete ccollection;
|
---|
[110] | 574 |
|
---|
[325] | 575 | // try and load the database
|
---|
[539] | 576 | // if (!databaseloaded)
|
---|
| 577 | databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
|
---|
| 578 | defaultlanguage, collection);
|
---|
[325] | 579 |
|
---|
| 580 | if (databaseloaded) {
|
---|
| 581 | // retrieve the document from mg
|
---|
| 582 | char docstr[32];
|
---|
| 583 | sprintf(docstr, "%i", docnum);
|
---|
| 584 |
|
---|
| 585 | mgq_ask(".set mode text");
|
---|
| 586 | mgq_ask(".set query docnums");
|
---|
| 587 | mgq_ask(docstr);
|
---|
[110] | 588 |
|
---|
[325] | 589 | tempdoc = NULL;
|
---|
| 590 | templen = 0;
|
---|
| 591 | mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
|
---|
| 592 | UDoc = tempdoc;
|
---|
| 593 | ULen = templen;
|
---|
| 594 | }
|
---|
[110] | 595 |
|
---|
[497] | 596 | return (bool)databaseloaded;
|
---|
[110] | 597 | }
|
---|
| 598 |
|
---|