[110] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mgsearch.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
[534] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[110] | 9 | *
|
---|
[534] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[110] | 24 | *********************************************************************/
|
---|
| 25 |
|
---|
[114] | 26 | #include "gsdlconf.h"
|
---|
[110] | 27 | #include "mgsearch.h"
|
---|
[163] | 28 | #include "fileutil.h"
|
---|
[110] | 29 |
|
---|
| 30 | #include <string.h>
|
---|
| 31 | #include <stdio.h>
|
---|
| 32 | #include <stdlib.h>
|
---|
| 33 | #include <ctype.h>
|
---|
| 34 |
|
---|
[114] | 35 | #if defined(GSDL_USE_OBJECTSPACE)
|
---|
| 36 | # include <ospace\std\iostream>
|
---|
| 37 | #elif defined(GSDL_USE_IOS_H)
|
---|
[110] | 38 | # include <iostream.h>
|
---|
| 39 | #else
|
---|
[114] | 40 | # include <iostream>
|
---|
| 41 | #endif
|
---|
[110] | 42 |
|
---|
[114] | 43 | #if defined(__WIN32__)
|
---|
[110] | 44 | // gdbm stuff
|
---|
| 45 | # include "autoconf.h"
|
---|
| 46 | # include "systems.h"
|
---|
| 47 | # include "gdbmconst.h"
|
---|
| 48 | # include "gdbm.h"
|
---|
[114] | 49 | #else
|
---|
| 50 | # include <gdbm.h>
|
---|
[110] | 51 | #endif
|
---|
[114] | 52 |
|
---|
[110] | 53 |
|
---|
| 54 | #include <assert.h>
|
---|
| 55 |
|
---|
| 56 | #include "mgq.h"
|
---|
[163] | 57 | // #include "locateinfo.h"
|
---|
[110] | 58 | #include "gsdlunicode.h"
|
---|
| 59 | #include "unitool.h"
|
---|
| 60 |
|
---|
| 61 |
|
---|
| 62 | /////////////
|
---|
| 63 | // globals //
|
---|
| 64 | /////////////
|
---|
| 65 |
|
---|
// Hand-over slot between doctextcallback, which fills it in, and
// mgdocument, which resets it before a request and reads it afterwards.
static char *tempdoc = NULL;  // copy of the document text most recently fetched
static int templen = 0;       // length reported for that text
|
---|
[110] | 68 |
|
---|
[325] | 69 |
|
---|
[319] | 70 | //////////////////////
|
---|
| 71 | // useful functions //
|
---|
| 72 | //////////////////////
|
---|
| 73 |
|
---|
| 74 |
|
---|
| 75 | // input and output are in utf8
|
---|
| 76 | text_t mgsearch_stemword (const text_t &word) {
|
---|
| 77 | // allocate working stem space
|
---|
| 78 | int maxstemlen = mgq_getmaxstemlen ();
|
---|
| 79 | unsigned char *word_stem = new unsigned char [maxstemlen + 2];
|
---|
| 80 | if (word_stem == NULL) return "";
|
---|
| 81 |
|
---|
| 82 | // copy word to word_stem
|
---|
| 83 | int len = 0;
|
---|
| 84 | text_t::const_iterator here = word.begin();
|
---|
| 85 | text_t::const_iterator end = word.end();
|
---|
| 86 | while (len < maxstemlen && here != end) {
|
---|
| 87 | word_stem[len+1] = (unsigned char)(*here);
|
---|
[9620] | 88 | ++len; ++here;
|
---|
[319] | 89 | }
|
---|
| 90 | word_stem[len+1] = '\0';
|
---|
| 91 | word_stem[0] = len;
|
---|
| 92 |
|
---|
| 93 | mgq_stemword (word_stem);
|
---|
| 94 |
|
---|
| 95 | // copy word_stem back to tempstr
|
---|
| 96 | text_t tempstr;
|
---|
| 97 | tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
|
---|
| 98 |
|
---|
[325] | 99 | delete [] word_stem;
|
---|
| 100 |
|
---|
[319] | 101 | return tempstr;
|
---|
| 102 | }
|
---|
| 103 |
|
---|
| 104 |
|
---|
| 105 |
|
---|
[110] | 106 | ////////////////////////
|
---|
| 107 | // callback functions //
|
---|
| 108 | ////////////////////////
|
---|
| 109 |
|
---|
| 110 | // This routine is called for each document found in a search
|
---|
| 111 | // it assumes that cache_num is set up correctly to point to
|
---|
| 112 | // a suitable result cache
|
---|
[497] | 113 | int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
|
---|
[110] | 114 | float Weight, void *info) {
|
---|
| 115 |
|
---|
| 116 |
|
---|
| 117 | queryresultsclass *queryresults = (queryresultsclass * )info;
|
---|
| 118 |
|
---|
| 119 | // append this entry to the document results
|
---|
| 120 | docresultclass docresult;
|
---|
| 121 | docresult.docnum = DocNum;
|
---|
[319] | 122 | docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
|
---|
| 123 | docresult.docweight = Weight - docresult.num_query_terms_matched*100;
|
---|
| 124 |
|
---|
[350] | 125 | queryresults->docs.docset[DocNum] = docresult;
|
---|
| 126 | queryresults->docs.docorder.push_back(DocNum);
|
---|
[110] | 127 |
|
---|
| 128 | return 0;
|
---|
| 129 | }
|
---|
| 130 |
|
---|
[325] | 131 | int termequivcallback(char *Word, int ULen, int /*Freq*/,
|
---|
| 132 | float /*Weight*/, void *info) {
|
---|
| 133 | text_tset *equivterms = (text_tset *)info;
|
---|
| 134 | if (equivterms == NULL) return 0;
|
---|
| 135 |
|
---|
| 136 | text_t thisterm;
|
---|
| 137 | thisterm.setcarr(Word, ULen);
|
---|
| 138 |
|
---|
| 139 | equivterms->insert(thisterm);
|
---|
| 140 |
|
---|
| 141 | return 0;
|
---|
| 142 | }
|
---|
| 143 |
|
---|
| 144 |
|
---|
| 145 | void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
|
---|
| 146 | // allocate working stem space
|
---|
| 147 | int maxstemlen = mgq_getmaxstemlen ();
|
---|
| 148 | unsigned char *word_stem = new unsigned char [maxstemlen + 2];
|
---|
| 149 | if (word_stem == NULL) return;
|
---|
| 150 |
|
---|
| 151 | // copy word to word_stem
|
---|
| 152 | int len = 0;
|
---|
| 153 | text_t::const_iterator here = word.begin();
|
---|
| 154 | text_t::const_iterator end = word.end();
|
---|
| 155 | while (len < maxstemlen && here != end) {
|
---|
| 156 | word_stem[len+1] = (unsigned char)(*here);
|
---|
[9620] | 157 | ++len; ++here;
|
---|
[325] | 158 | }
|
---|
| 159 | word_stem[len+1] = '\0';
|
---|
| 160 | word_stem[0] = len;
|
---|
| 161 |
|
---|
| 162 | // get the equivalent terms
|
---|
| 163 | mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
|
---|
| 164 |
|
---|
| 165 | delete [] word_stem;
|
---|
| 166 |
|
---|
| 167 | return;
|
---|
| 168 | }
|
---|
| 169 |
|
---|
| 170 | text_tset utf8equivterms; // kept as utf8 string for fast matching
|
---|
| 171 |
|
---|
| 172 |
|
---|
[110] | 173 | // This callback is called once for each term in the query
|
---|
| 174 | int termfreqcallback(char *Word, int ULen, int Freq,
|
---|
[114] | 175 | float /*Weight*/, void *info) {
|
---|
[110] | 176 | queryresultsclass *queryresults = (queryresultsclass *)info;
|
---|
[325] | 177 | if (queryresults == NULL) return 0;
|
---|
[110] | 178 |
|
---|
| 179 | text_t term;
|
---|
| 180 | term.setcarr(Word, ULen);
|
---|
| 181 | termfreqclass termfreq;
|
---|
[325] | 182 |
|
---|
[110] | 183 | termfreq.termstr = to_uni(term);
|
---|
[325] | 184 | text_t utf8termstem = mgsearch_stemword (term);
|
---|
| 185 | termfreq.termstemstr = to_uni (utf8termstem);
|
---|
| 186 |
|
---|
| 187 | mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
|
---|
| 188 |
|
---|
[110] | 189 | termfreq.termfreq = Freq;
|
---|
[319] | 190 | queryresults->orgterms.push_back(termfreq);
|
---|
[110] | 191 |
|
---|
| 192 | return 0;
|
---|
| 193 | }
|
---|
| 194 |
|
---|
| 195 | // this callback is called once for each variation of each term
|
---|
[319] | 196 | int termvariantscallback(char *Word, int ULen, int /*Freq*/,
|
---|
| 197 | float /*Weight*/, void *info) {
|
---|
[110] | 198 |
|
---|
| 199 | text_t term;
|
---|
| 200 | term.setcarr(Word, ULen);
|
---|
| 201 | queryresultsclass *queryresults = (queryresultsclass *)info;
|
---|
[350] | 202 | queryresults->termvariants.insert(to_uni(term));
|
---|
[110] | 203 |
|
---|
| 204 | return 0;
|
---|
| 205 | }
|
---|
| 206 |
|
---|
| 207 | // This callback is for getting document text
|
---|
[325] | 208 | int doctextcallback(char *Doc, int ULen, int /*Freq*/,
|
---|
[497] | 209 | float /*Weight*/, void * /*info*/) {
|
---|
[13789] | 210 | if (Doc != NULL) {
|
---|
| 211 | // Make a copy of this string so we can unload the database without losing it
|
---|
| 212 | tempdoc = new char[ULen + 1];
|
---|
| 213 | strcpy(tempdoc, Doc);
|
---|
| 214 | }
|
---|
[325] | 215 | templen = ULen;
|
---|
[110] | 216 |
|
---|
| 217 | return 0;
|
---|
| 218 | }
|
---|
| 219 |
|
---|
| 220 |
|
---|
[9937] | 221 | text_t mgsearchclass::getindexsuffix (const text_t &collection,
|
---|
[163] | 222 | const text_t &index) {
|
---|
[393] | 223 |
|
---|
| 224 | text_t indexsuffix = "index";
|
---|
[163] | 225 | indexsuffix = filename_cat (indexsuffix, index);
|
---|
[9937] | 226 | if (indexstem.empty()) {
|
---|
| 227 | // no index stem, use the coll name
|
---|
| 228 | indexsuffix = filename_cat (indexsuffix, collection);
|
---|
| 229 | } else {
|
---|
| 230 | indexsuffix = filename_cat (indexsuffix, indexstem);
|
---|
| 231 | }
|
---|
[163] | 232 | return indexsuffix;
|
---|
| 233 | }
|
---|
[110] | 234 |
|
---|
[163] | 235 |
|
---|
| 236 |
|
---|
| 237 |
|
---|
[110] | 238 | ////////////////////
|
---|
| 239 | // mgsearch class //
|
---|
| 240 | ////////////////////
|
---|
| 241 |
|
---|
| 242 | mgsearchclass::mgsearchclass ()
|
---|
[1324] | 243 | : searchclass() {
|
---|
| 244 |
|
---|
[110] | 245 | }
|
---|
| 246 |
|
---|
| 247 | mgsearchclass::~mgsearchclass ()
|
---|
| 248 | {
|
---|
| 249 | if (cache != NULL)
|
---|
| 250 | {
|
---|
| 251 | delete cache;
|
---|
| 252 | cache = NULL;
|
---|
| 253 | }
|
---|
| 254 | }
|
---|
| 255 |
|
---|
[9937] | 256 | void mgsearchclass::set_indexstem(const text_t &stem) {
|
---|
| 257 | indexstem = stem;
|
---|
| 258 |
|
---|
| 259 | }
|
---|
| 260 |
|
---|
[319] | 261 | // you only need to use this function before doing any stemming
|
---|
| 262 | // casefolding and stemming will be set if values for them are
|
---|
| 263 | // provided (0 or 1).
|
---|
| 264 | // makeindexcurrent returns true if it was able to load the database
|
---|
| 265 | bool mgsearchclass::makeindexcurrent (const text_t &index,
|
---|
[350] | 266 | const text_t &subcollection,
|
---|
| 267 | const text_t &language,
|
---|
[319] | 268 | const text_t &collection,
|
---|
| 269 | int casefolding,
|
---|
| 270 | int stemming) {
|
---|
| 271 | bool databaseloaded = true;
|
---|
[110] | 272 |
|
---|
[319] | 273 | // get the names of the collection, index and text suffixes
|
---|
| 274 | char *ccollection = collection.getcstr();
|
---|
| 275 | assert (ccollection != NULL);
|
---|
[350] | 276 | char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
|
---|
[319] | 277 | assert (idxsuffix != NULL);
|
---|
| 278 | char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
|
---|
| 279 | assert (txtsuffix != NULL);
|
---|
| 280 | #ifdef __WIN32__
|
---|
| 281 | char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
|
---|
| 282 | #else
|
---|
| 283 | char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
|
---|
| 284 | #endif
|
---|
| 285 |
|
---|
| 286 | if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
|
---|
| 287 | if (casefolding == 0) mgq_ask(".set casefold off");
|
---|
| 288 | else if (casefolding > 0) mgq_ask(".set casefold on");
|
---|
| 289 | if (stemming == 0) mgq_ask(".set stem off");
|
---|
| 290 | else if (stemming > 0) mgq_ask(".set stem on");
|
---|
| 291 |
|
---|
| 292 | } else databaseloaded = false;
|
---|
| 293 |
|
---|
| 294 | // free up the c strings
|
---|
[9631] | 295 | delete []ccollection;
|
---|
| 296 | delete []idxsuffix;
|
---|
| 297 | delete []txtsuffix;
|
---|
| 298 | delete []ccollectdir;
|
---|
[319] | 299 |
|
---|
| 300 | return databaseloaded;
|
---|
| 301 | }
|
---|
| 302 |
|
---|
| 303 |
|
---|
| 304 | // stem word uses the values set in the last call to makeindexcurrent
|
---|
| 305 | // to stem the word. It is assumed that word is in unicode
|
---|
| 306 | text_t mgsearchclass::stemword (const text_t &word) {
|
---|
| 307 | return to_uni (mgsearch_stemword (to_utf8 (word)));
|
---|
| 308 | }
|
---|
| 309 |
|
---|
[325] | 310 | text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
|
---|
| 311 | return to_uni (mgsearch_stemword (to_utf8 (here, end)));
|
---|
| 312 | }
|
---|
| 313 |
|
---|
[1860] | 314 | /**
|
---|
| 315 | * search directs the whole execution of the search; a number of other
|
---|
| 316 | * functions in this class are called as a result, and precondition
|
---|
| 317 | * checks are also made
|
---|
| 318 | */
|
---|
[110] | 319 | bool mgsearchclass::search(const queryparamclass &queryparams,
|
---|
[319] | 320 | queryresultsclass &queryresults) {
|
---|
[1324] | 321 | // assert (cache != NULL);
|
---|
[110] | 322 |
|
---|
[1860] | 323 | // clear any previous results
|
---|
[110] | 324 | queryresults.clear();
|
---|
| 325 | // first check the cache
|
---|
[1324] | 326 | if (cache != NULL) {
|
---|
| 327 | if (cache->find(queryparams, queryresults)) return true;
|
---|
| 328 | }
|
---|
[110] | 329 | // make sure there is a query to be processed
|
---|
[633] | 330 | if (!has_unicode_letdig(queryparams.querystring)) return true;
|
---|
[110] | 331 |
|
---|
[350] | 332 | if (makeindexcurrent (queryparams.index, queryparams.subcollection,
|
---|
| 333 | queryparams.language, queryparams.collection)) {
|
---|
[1860] | 334 | // initialise the form of results
|
---|
[319] | 335 | setsearchmode (queryparams);
|
---|
[1860] | 336 |
|
---|
| 337 | // execute the query
|
---|
[319] | 338 | submitquery (queryparams);
|
---|
[1860] | 339 |
|
---|
| 340 | // retrieve the results
|
---|
[334] | 341 | getresults (queryparams, queryresults);
|
---|
[13780] | 342 | unload_database(); // Important that local library doesn't leave any files open
|
---|
[319] | 343 | return true;
|
---|
| 344 | }
|
---|
[110] | 345 |
|
---|
[319] | 346 | return false;
|
---|
[110] | 347 | }
|
---|
| 348 |
|
---|
[1990] | 349 | /* accumulator_method has been changed to use array rather than list.
|
---|
| 350 | list appears to be broken somewhat - for some ranked queries, it returned
|
---|
| 351 | fewer results than it should have (eg 45 instead of 50). The three other
|
---|
| 352 | methods (array, splay_tree, hash_table) all return the same number of
|
---|
| 353 | documents, in the same order, with the same ranks. list returns what
|
---|
| 354 | appears to be the same documents (but less of them), but with different ranks,
|
---|
| 355 | and in a different order. Minimal time tests dont show any speed improvement
|
---|
| 356 | of list over array (maybe because its broken??). [02/2001, kjm18]
|
---|
[2011] | 357 |
|
---|
| 358 | ... [sjboddie, also 02/2001] turns out that changing the accumulator_method
|
---|
| 359 | introduced a more serious bug than it fixed (i.e. occasionally when doing a
|
---|
| 360 | ranked search for a very common word you get no results at all). I've
|
---|
| 361 | changed it back to list for now, one day we should play with other
|
---|
| 362 | accumulator_methods but for now I don't have time and don't want to risk
|
---|
| 363 | introducing bugs (better the devil you know ;)
|
---|
[1990] | 364 | */
|
---|
[110] | 365 | void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
|
---|
| 366 | {
|
---|
| 367 | mgq_ask(".set expert true");
|
---|
[319] | 368 | mgq_ask(".set sorted_terms true");
|
---|
[2011] | 369 | mgq_ask(".set accumulator_method list");
|
---|
[497] | 370 | mgq_ask(".set max_accumulators 500000");
|
---|
| 371 | mgq_ask(".set maxparas 500000");
|
---|
[110] | 372 | mgq_ask(".set verbatim true");
|
---|
[1306] | 373 | mgq_ask(".unset skip_dump");
|
---|
[110] | 374 | mgq_ask(".set mode docnums");
|
---|
| 375 |
|
---|
| 376 | switch (queryparams.search_type)
|
---|
| 377 | {
|
---|
| 378 | case 0: mgq_ask(".set query boolean"); break;
|
---|
| 379 | case 1: mgq_ask(".set query ranked"); break;
|
---|
| 380 | }
|
---|
| 381 | switch (queryparams.casefolding)
|
---|
| 382 | {
|
---|
| 383 | case 1: mgq_ask(".set casefold on"); break;
|
---|
| 384 | case 0: mgq_ask(".set casefold off"); break;
|
---|
| 385 | }
|
---|
| 386 | switch (queryparams.stemming)
|
---|
| 387 | {
|
---|
| 388 | case 1: mgq_ask(".set stem on"); break;
|
---|
| 389 | case 0: mgq_ask(".set stem off"); break;
|
---|
| 390 | }
|
---|
| 391 | mgq_ask(".set heads_length 150");
|
---|
| 392 |
|
---|
[350] | 393 | if (queryparams.maxdocs == -1) {
|
---|
| 394 | mgq_ask(".set maxdocs all");
|
---|
| 395 | } else {
|
---|
| 396 | char maxdocstr[32];
|
---|
| 397 | sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
|
---|
| 398 | mgq_ask(maxdocstr);
|
---|
| 399 | }
|
---|
[4193] | 400 |
|
---|
| 401 | char maxnumericstr[32];
|
---|
| 402 | sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
|
---|
| 403 | mgq_ask(maxnumericstr);
|
---|
| 404 |
|
---|
[110] | 405 | }
|
---|
| 406 |
|
---|
[1860] | 407 | /**
|
---|
| 408 | * submitquery constructs the query string (into UTF8 encoding)
|
---|
| 409 | * and submits it using mgq_ask to the mg search engine. Most
|
---|
| 410 | * of the processing will be done inside Greenstone
|
---|
| 411 | */
|
---|
[110] | 412 | void mgsearchclass::submitquery (const queryparamclass &queryparams)
|
---|
| 413 | {
|
---|
[1860] | 414 | // sort out the query string; copy it, remove all special characters
|
---|
| 415 | // and then convert it to a string in UTF8 format
|
---|
[110] | 416 | text_t ttquerystring = queryparams.querystring;
|
---|
| 417 | filterquery (ttquerystring);
|
---|
| 418 | char *querystring = to_utf8(ttquerystring).getcstr();
|
---|
[2211] | 419 |
|
---|
[110] | 420 | // submit the query
|
---|
| 421 | mgq_ask(querystring);
|
---|
| 422 |
|
---|
[1860] | 423 | // destroy the temporary character array
|
---|
[9631] | 424 | delete []querystring;
|
---|
[110] | 425 | }
|
---|
| 426 |
|
---|
[1860] | 427 | /**
|
---|
| 428 | * getrults is called to retrieve the required data on the docs
|
---|
| 429 | * which responded to the query submitted in submitquery above.
|
---|
| 430 | *
|
---|
| 431 | * It calls the local mgquery (mgq) interface to MG several times,
|
---|
| 432 | * to obtain the document numbers, term frequencies, term variants
|
---|
| 433 | * etc. All processing of the query will be done by Greenstone
|
---|
| 434 | * thereafter
|
---|
| 435 | */
|
---|
[334] | 436 | void mgsearchclass::getresults (const queryparamclass &queryparams,
|
---|
| 437 | queryresultsclass &queryresults) {
|
---|
[1860] | 438 | // get the configuration for the maximum number of documents to
|
---|
| 439 | // retrieve
|
---|
[612] | 440 | int howmany = queryparams.maxdocs;
|
---|
[615] | 441 | if (howmany == -1) howmany = MAXNUMDOCS;
|
---|
[612] | 442 | mgq_results(result_docnums, 0, howmany,
|
---|
[393] | 443 | ourquerycallback, (void *)(&queryresults));
|
---|
[110] | 444 |
|
---|
| 445 | // get the term frequencies
|
---|
| 446 | mgq_results(result_termfreqs, 0, MAXNUMTERMS,
|
---|
| 447 | termfreqcallback, (void *)(&queryresults));
|
---|
[319] | 448 | queryresults.sortuniqqueryterms();
|
---|
| 449 |
|
---|
| 450 | // get term variants
|
---|
[110] | 451 | mgq_results(result_terms, 0, MAXNUMTERMS,
|
---|
[319] | 452 | termvariantscallback, (void *)(&queryresults));
|
---|
[334] | 453 |
|
---|
| 454 | // get the number of documents retrieved
|
---|
| 455 | int total_retrieved = 0, is_approx = 0;
|
---|
| 456 | mgq_docsretrieved (&total_retrieved, &is_approx);
|
---|
| 457 |
|
---|
| 458 | if (total_retrieved == 0) {
|
---|
| 459 | // not available (or really was zero)
|
---|
[350] | 460 | queryresults.docs_matched = queryresults.docs.docset.size();
|
---|
[612] | 461 | if ((queryparams.maxdocs == -1) ||
|
---|
| 462 | (queryresults.docs_matched < queryparams.maxdocs))
|
---|
[398] | 463 | queryresults.is_approx = Exact;
|
---|
[334] | 464 | else
|
---|
[398] | 465 | queryresults.is_approx = MoreThan;
|
---|
[334] | 466 | } else {
|
---|
| 467 | queryresults.docs_matched = total_retrieved;
|
---|
[401] | 468 | if (is_approx) queryresults.is_approx = Approximate;
|
---|
| 469 | else queryresults.is_approx = Exact;
|
---|
[334] | 470 | }
|
---|
[110] | 471 | }
|
---|
| 472 |
|
---|
[1860] | 473 | /**
|
---|
| 474 | * Tidies the given querystring, removing special characters
|
---|
| 475 | */
|
---|
[110] | 476 | void mgsearchclass::filterquery (text_t &ttquerystring) {
|
---|
| 477 | text_t::iterator ithere = ttquerystring.begin ();
|
---|
| 478 | text_t::iterator itend = ttquerystring.end ();
|
---|
| 479 |
|
---|
[473] | 480 | // remove all non alphanumeric characters (except
|
---|
| 481 | // boolean operators
|
---|
[110] | 482 | while (ithere != itend) {
|
---|
[473] | 483 | if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
|
---|
| 484 | (*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
|
---|
| 485 | (*ithere != ')')) (*ithere) = ' ';
|
---|
[9620] | 486 | ++ithere;
|
---|
[110] | 487 | }
|
---|
| 488 | }
|
---|
| 489 |
|
---|
| 490 |
|
---|
| 491 | // the document text for 'docnum' is placed in 'output'
|
---|
| 492 | // docTargetDocument returns 'true' if it was able to
|
---|
| 493 | // try to get a document
|
---|
| 494 | // collection is needed to see if an index from the
|
---|
| 495 | // collection is loaded. If no index has been loaded
|
---|
| 496 | // defaultindex is needed to load one
|
---|
[350] | 497 | bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
|
---|
| 498 | const text_t &defaultsubcollection,
|
---|
| 499 | const text_t &defaultlanguage,
|
---|
[110] | 500 | const text_t &collection,
|
---|
| 501 | int docnum,
|
---|
[325] | 502 | text_t &output) {
|
---|
[110] | 503 | output.clear();
|
---|
| 504 |
|
---|
[325] | 505 | // get the mg version of the document
|
---|
| 506 | char *mgdoc = NULL;
|
---|
| 507 | int doclen = 0;
|
---|
[350] | 508 | if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
|
---|
| 509 | collection, docnum, mgdoc, doclen)) return false;
|
---|
[325] | 510 | if (mgdoc == NULL) return false;
|
---|
[110] | 511 |
|
---|
[325] | 512 | // replace all control-Cs with spaces
|
---|
| 513 | char *mgdoc_here = mgdoc;
|
---|
| 514 | char *mgdoc_end = mgdoc + doclen;
|
---|
| 515 | while (mgdoc_here < mgdoc_end) {
|
---|
| 516 | if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
|
---|
[9620] | 517 | ++mgdoc_here;
|
---|
[325] | 518 | }
|
---|
[110] | 519 |
|
---|
[325] | 520 | // convert this document to unicode
|
---|
| 521 | utf8inconvertclass inconvert;
|
---|
| 522 | convertclass::status_t status;
|
---|
| 523 | inconvert.reset ();
|
---|
| 524 | inconvert.setinput (mgdoc, doclen);
|
---|
| 525 | inconvert.convert (output, status);
|
---|
[110] | 526 |
|
---|
[13789] | 527 | delete[] mgdoc;
|
---|
[325] | 528 | return true;
|
---|
| 529 | }
|
---|
[110] | 530 |
|
---|
[325] | 531 |
|
---|
| 532 | bool mgsearchclass::mgdocument (const text_t &defaultindex,
|
---|
[350] | 533 | const text_t &defaultsubcollection,
|
---|
| 534 | const text_t &defaultlanguage,
|
---|
[325] | 535 | const text_t &collection,
|
---|
| 536 | int docnum,
|
---|
| 537 | char *&UDoc, int &ULen) {
|
---|
[497] | 538 | int databaseloaded = 0;
|
---|
[325] | 539 |
|
---|
| 540 | UDoc = NULL; ULen = 0;
|
---|
| 541 |
|
---|
| 542 | // see if we can make an appropriate database current
|
---|
[539] | 543 | // char *ccollection = collection.getcstr();
|
---|
| 544 | // assert (ccollection != NULL);
|
---|
| 545 | // databaseloaded = load_text_database (ccollection);
|
---|
[9631] | 546 | // delete []ccollection;
|
---|
[110] | 547 |
|
---|
[325] | 548 | // try and load the database
|
---|
[539] | 549 | // if (!databaseloaded)
|
---|
| 550 | databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
|
---|
| 551 | defaultlanguage, collection);
|
---|
[325] | 552 |
|
---|
| 553 | if (databaseloaded) {
|
---|
| 554 | // retrieve the document from mg
|
---|
| 555 | char docstr[32];
|
---|
| 556 | sprintf(docstr, "%i", docnum);
|
---|
| 557 |
|
---|
| 558 | mgq_ask(".set mode text");
|
---|
| 559 | mgq_ask(".set query docnums");
|
---|
| 560 | mgq_ask(docstr);
|
---|
[110] | 561 |
|
---|
[325] | 562 | tempdoc = NULL;
|
---|
| 563 | templen = 0;
|
---|
| 564 | mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
|
---|
| 565 | UDoc = tempdoc;
|
---|
| 566 | ULen = templen;
|
---|
| 567 | }
|
---|
[110] | 568 |
|
---|
[13780] | 569 | unload_database(); // Important that local library doesn't leave any files open
|
---|
[497] | 570 | return (bool)databaseloaded;
|
---|
[110] | 571 | }
|
---|
[1324] | 572 |
|
---|
[2545] | 573 | // unload_database simply calls mgq's close_all_databases function to clear
|
---|
| 574 | // any cached databases - this is useful when attempting to completely
|
---|
| 575 | // remove all trace of a collectionserver at runtime (when using a
|
---|
| 576 | // persistent version of Greenstone like the windows local library)
|
---|
| 577 | void mgsearchclass::unload_database () {
|
---|
| 578 | close_all_databases();
|
---|
| 579 | }
|
---|