Ignore:
Timestamp:
1999-07-01T15:54:49+12:00 (25 years ago)
Author:
rjmcnab
Message:

Added code to plug in the equivalent terms of each of the query terms.
Also added a function to get a raw utf8 encoded mg document (for speeding
up a phrase matching function)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/colservr/mgsearch.cpp

    r319 r325  
    1212/*
    1313   $Log$
     14   Revision 1.9  1999/07/01 03:54:48  rjmcnab
     15   Added code to plug in the equivalent terms of each of the query terms.
     16   Also added a function to get a raw utf8 encoded mg document (for speeding
     17   up a phrase matching function)
     18
    1419   Revision 1.8  1999/06/30 04:04:12  rjmcnab
    1520   made stemming functions available from mgsearch and made the stems
     
    8994static int casefold;
    9095
     96static char *tempdoc = NULL;
     97static int templen = 0;
     98
    9199
    92100//////////////////////
     
    119127  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
    120128
     129  delete [] word_stem;
     130 
    121131  return tempstr;
    122132}
     
    167177}
    168178
     179int termequivcallback(char *Word, int ULen,  int /*Freq*/,
     180              float /*Weight*/,  void *info) {
     181  text_tset *equivterms = (text_tset *)info;
     182  if (equivterms == NULL) return 0;
     183
     184  text_t thisterm;
     185  thisterm.setcarr(Word, ULen);
     186
     187  equivterms->insert(thisterm);
     188 
     189  return 0;
     190}
     191
     192
     193void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
     194  // allocate working stem space
     195  int maxstemlen = mgq_getmaxstemlen ();
     196  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
     197  if (word_stem == NULL) return;
     198
     199  // copy word to word_stem
     200  int len = 0;
     201  text_t::const_iterator here = word.begin();
     202  text_t::const_iterator end = word.end();
     203  while (len < maxstemlen && here != end) {
     204    word_stem[len+1] = (unsigned char)(*here);
     205    len++; here++;
     206  }
     207  word_stem[len+1] = '\0';
     208  word_stem[0] = len;
     209
     210  // get the equivalent terms
     211  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
     212 
     213  delete [] word_stem;
     214
     215  return;
     216}
     217
     218  text_tset utf8equivterms; // kept as utf8 string for fast matching
     219
     220
    169221// This callback is called once for each term in the query
    170222int termfreqcallback(char *Word, int ULen,  int Freq,
    171223             float /*Weight*/,  void *info) {
    172224  queryresultsclass *queryresults = (queryresultsclass *)info;
     225  if (queryresults == NULL) return 0;
    173226
    174227  text_t term;
    175228  term.setcarr(Word, ULen);
    176229  termfreqclass termfreq;
     230
    177231  termfreq.termstr = to_uni(term);
    178   termfreq.termstemstr = to_uni (mgsearch_stemword (term));
     232  text_t utf8termstem = mgsearch_stemword (term);
     233  termfreq.termstemstr = to_uni (utf8termstem);
     234
     235  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
     236 
    179237  termfreq.termfreq = Freq;
    180238  queryresults->orgterms.push_back(termfreq);
     
    196254
    197255// This callback is for getting document text
    198 int doctextcallback(char *Word, int ULen,  int /*Freq*/,
    199             float /*Weight*/,  void *info) {
    200   text_t *output = (text_t *)info;
    201   if (output == NULL) return 0;
    202   output->clear();
    203 
    204   utf8inconvertclass inconvert;
    205   convertclass::status_t status;
    206   inconvert.reset ();
    207   inconvert.setinput (Word, ULen);
    208   inconvert.convert (*output, status);
    209  
    210   // replace all control-Cs with spaces
    211   text_t::iterator here = output->begin();
    212   text_t::iterator end = output->end();
    213   while (here != end) {
    214     if (*here == '\x3') *here = ' ';
    215     here++;
    216   }
     256int doctextcallback(char *Doc, int ULen,  int /*Freq*/,
     257            float /*Weight*/,  void */*info*/) {
     258  tempdoc = Doc;
     259  templen = ULen;
    217260 
    218261  return 0;
     
    302345  return to_uni (mgsearch_stemword (to_utf8 (word)));
    303346}
    304  
     347
     348text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
     349  return to_uni (mgsearch_stemword (to_utf8 (here, end)));
     350}
     351
    305352
    306353bool mgsearchclass::search(const queryparamclass &queryparams,
     
    473520                      const text_t &collection,
    474521                      int docnum,
    475                       text_t &output)
    476 {
    477   int databaseloaded = 0;
    478 
     522                      text_t &output) {
    479523  output.clear();
    480524
     525  // get the mg version of the document
     526  char *mgdoc = NULL;
     527  int doclen = 0;
     528  if (!mgdocument (defaultindex, collection, docnum, mgdoc, doclen)) return false;
     529  if (mgdoc == NULL) return false;
     530
     531  // replace all control-Cs with spaces
     532  char *mgdoc_here = mgdoc;
     533  char *mgdoc_end = mgdoc + doclen;
     534  while (mgdoc_here < mgdoc_end) {
     535    if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
     536    mgdoc_here++;
     537  }
     538
     539  // convert this document to unicode
     540  utf8inconvertclass inconvert;
     541  convertclass::status_t status;
     542  inconvert.reset ();
     543  inconvert.setinput (mgdoc, doclen);
     544  inconvert.convert (output, status);
     545
     546  return true;
     547}
     548
     549
     550bool mgsearchclass::mgdocument (const text_t &defaultindex,
     551                const text_t &collection,
     552                int docnum,
     553                char *&UDoc, int &ULen) {
     554  bool databaseloaded = 0;
     555
     556  UDoc = NULL; ULen = 0;
     557 
     558  // see if we can make an appropriate database current
    481559  char *ccollection = collection.getcstr();
    482560  assert (ccollection != NULL);
    483 
    484   // see if we can make an appropriate database current
    485561  databaseloaded = load_text_database (ccollection);
    486 
     562  delete ccollection;
     563 
    487564  // try and load the database
    488   if (!databaseloaded)
    489     {
    490       // get the names of the index and text suffixes
    491       char *idxsuffix = (getindexsuffix (collection,
    492                      defaultindex)).getcstr();
    493       assert (idxsuffix != NULL);
    494       char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
    495       assert (txtsuffix != NULL);
    496 
    497 #ifdef __WIN32__
    498       char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
    499 #else
    500       char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
    501 #endif
    502      
    503       databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix);
    504      
    505       // free up the c strings
    506       delete idxsuffix;
    507       delete txtsuffix;
    508       delete ccollectdir;
    509     }
    510 
    511   // free up the c collection string
    512   delete ccollection;
    513  
    514   if (databaseloaded)
    515     {
    516       // retrieve the document from mg
    517       char docstr[32];
    518       sprintf(docstr, "%i", docnum);
    519 
    520       mgq_ask(".set mode text");
    521       mgq_ask(".set query docnums");
    522       mgq_ask(docstr);
    523       mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output);
    524     }
     565  if (!databaseloaded) databaseloaded = makeindexcurrent (defaultindex, collection);
     566 
     567  if (databaseloaded) {
     568    // retrieve the document from mg
     569    char docstr[32];
     570    sprintf(docstr, "%i", docnum);
     571   
     572    mgq_ask(".set mode text");
     573    mgq_ask(".set query docnums");
     574    mgq_ask(docstr);
     575
     576    tempdoc = NULL;
     577    templen = 0;
     578    mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
     579    UDoc = tempdoc;
     580    ULen = templen;
     581  }
    525582
    526583  return databaseloaded;
Note: See TracChangeset for help on using the changeset viewer.