Changeset 325 for trunk/gsdl/src/colservr/mgsearch.cpp
- Timestamp: 1999-07-01T15:54:49+12:00 (25 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/colservr/mgsearch.cpp
r319 r325 12 12 /* 13 13 $Log$ 14 Revision 1.9 1999/07/01 03:54:48 rjmcnab 15 Added code to plug in the equivalent terms of each of the query terms. 16 Also added a function to get a raw utf8 encoded mg document (for speeding 17 up a phrase matching function) 18 14 19 Revision 1.8 1999/06/30 04:04:12 rjmcnab 15 20 made stemming functions available from mgsearch and made the stems … … 89 94 static int casefold; 90 95 96 static char *tempdoc = NULL; 97 static int templen = 0; 98 91 99 92 100 ////////////////////// … … 119 127 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]); 120 128 129 delete [] word_stem; 130 121 131 return tempstr; 122 132 } … … 167 177 } 168 178 179 int termequivcallback(char *Word, int ULen, int /*Freq*/, 180 float /*Weight*/, void *info) { 181 text_tset *equivterms = (text_tset *)info; 182 if (equivterms == NULL) return 0; 183 184 text_t thisterm; 185 thisterm.setcarr(Word, ULen); 186 187 equivterms->insert(thisterm); 188 189 return 0; 190 } 191 192 193 void mgsearch_equivterms (const text_t &word, text_tset &equivterms) { 194 // allocate working stem space 195 int maxstemlen = mgq_getmaxstemlen (); 196 unsigned char *word_stem = new unsigned char [maxstemlen + 2]; 197 if (word_stem == NULL) return; 198 199 // copy word to word_stem 200 int len = 0; 201 text_t::const_iterator here = word.begin(); 202 text_t::const_iterator end = word.end(); 203 while (len < maxstemlen && here != end) { 204 word_stem[len+1] = (unsigned char)(*here); 205 len++; here++; 206 } 207 word_stem[len+1] = '\0'; 208 word_stem[0] = len; 209 210 // get the equivalent terms 211 mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms)); 212 213 delete [] word_stem; 214 215 return; 216 } 217 218 text_tset utf8equivterms; // kept as utf8 string for fast matching 219 220 169 221 // This callback is called once for each term in the query 170 222 int termfreqcallback(char *Word, int ULen, int Freq, 171 223 float /*Weight*/, void *info) { 172 224 queryresultsclass 
*queryresults = (queryresultsclass *)info; 225 if (queryresults == NULL) return 0; 173 226 174 227 text_t term; 175 228 term.setcarr(Word, ULen); 176 229 termfreqclass termfreq; 230 177 231 termfreq.termstr = to_uni(term); 178 termfreq.termstemstr = to_uni (mgsearch_stemword (term)); 232 text_t utf8termstem = mgsearch_stemword (term); 233 termfreq.termstemstr = to_uni (utf8termstem); 234 235 mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms); 236 179 237 termfreq.termfreq = Freq; 180 238 queryresults->orgterms.push_back(termfreq); … … 196 254 197 255 // This callback is for getting document text 198 int doctextcallback(char *Word, int ULen, int /*Freq*/, 199 float /*Weight*/, void *info) { 200 text_t *output = (text_t *)info; 201 if (output == NULL) return 0; 202 output->clear(); 203 204 utf8inconvertclass inconvert; 205 convertclass::status_t status; 206 inconvert.reset (); 207 inconvert.setinput (Word, ULen); 208 inconvert.convert (*output, status); 209 210 // replace all control-Cs with spaces 211 text_t::iterator here = output->begin(); 212 text_t::iterator end = output->end(); 213 while (here != end) { 214 if (*here == '\x3') *here = ' '; 215 here++; 216 } 256 int doctextcallback(char *Doc, int ULen, int /*Freq*/, 257 float /*Weight*/, void */*info*/) { 258 tempdoc = Doc; 259 templen = ULen; 217 260 218 261 return 0; … … 302 345 return to_uni (mgsearch_stemword (to_utf8 (word))); 303 346 } 304 347 348 text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) { 349 return to_uni (mgsearch_stemword (to_utf8 (here, end))); 350 } 351 305 352 306 353 bool mgsearchclass::search(const queryparamclass &queryparams, … … 473 520 const text_t &collection, 474 521 int docnum, 475 text_t &output) 476 { 477 int databaseloaded = 0; 478 522 text_t &output) { 479 523 output.clear(); 480 524 525 // get the mg version of the document 526 char *mgdoc = NULL; 527 int doclen = 0; 528 if (!mgdocument (defaultindex, collection, docnum, mgdoc, 
doclen)) return false; 529 if (mgdoc == NULL) return false; 530 531 // replace all control-Cs with spaces 532 char *mgdoc_here = mgdoc; 533 char *mgdoc_end = mgdoc + doclen; 534 while (mgdoc_here < mgdoc_end) { 535 if (*mgdoc_here == '\x3') *mgdoc_here = ' '; 536 mgdoc_here++; 537 } 538 539 // convert this document to unicode 540 utf8inconvertclass inconvert; 541 convertclass::status_t status; 542 inconvert.reset (); 543 inconvert.setinput (mgdoc, doclen); 544 inconvert.convert (output, status); 545 546 return true; 547 } 548 549 550 bool mgsearchclass::mgdocument (const text_t &defaultindex, 551 const text_t &collection, 552 int docnum, 553 char *&UDoc, int &ULen) { 554 bool databaseloaded = 0; 555 556 UDoc = NULL; ULen = 0; 557 558 // see if we can make an appropriate database current 481 559 char *ccollection = collection.getcstr(); 482 560 assert (ccollection != NULL); 483 484 // see if we can make an appropriate database current485 561 databaseloaded = load_text_database (ccollection); 486 562 delete ccollection; 563 487 564 // try and load the database 488 if (!databaseloaded) 489 { 490 // get the names of the index and text suffixes 491 char *idxsuffix = (getindexsuffix (collection, 492 defaultindex)).getcstr(); 493 assert (idxsuffix != NULL); 494 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr(); 495 assert (txtsuffix != NULL); 496 497 #ifdef __WIN32__ 498 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); 499 #else 500 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); 501 #endif 502 503 databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix); 504 505 // free up the c strings 506 delete idxsuffix; 507 delete txtsuffix; 508 delete ccollectdir; 509 } 510 511 // free up the c collection string 512 delete ccollection; 513 514 if (databaseloaded) 515 { 516 // retrieve the document from mg 517 char docstr[32]; 518 sprintf(docstr, "%i", docnum); 519 520 mgq_ask(".set mode 
text"); 521 mgq_ask(".set query docnums"); 522 mgq_ask(docstr); 523 mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output); 524 } 565 if (!databaseloaded) databaseloaded = makeindexcurrent (defaultindex, collection); 566 567 if (databaseloaded) { 568 // retrieve the document from mg 569 char docstr[32]; 570 sprintf(docstr, "%i", docnum); 571 572 mgq_ask(".set mode text"); 573 mgq_ask(".set query docnums"); 574 mgq_ask(docstr); 575 576 tempdoc = NULL; 577 templen = 0; 578 mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL); 579 UDoc = tempdoc; 580 ULen = templen; 581 } 525 582 526 583 return databaseloaded;
Note: See TracChangeset for help on using the changeset viewer.