Ignore:
Timestamp:
1999-06-30T16:04:14+12:00 (25 years ago)
Author:
rjmcnab
Message:

made stemming functions available from mgsearch and made the stems
for the query terms available in queryinfo

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/colservr/mgsearch.cpp

    r301 r319  
    1212/*
    1313   $Log$
     14   Revision 1.8  1999/06/30 04:04:12  rjmcnab
     15   made stemming functions available from mgsearch and made the stems
     16   for the query terms available in queryinfo
     17
    1418   Revision 1.7  1999/06/27 22:07:27  sjboddie
    1519   got rid of all the old functions for dealing with dir indexes
     
    8690
    8791
     92//////////////////////
     93// useful functions //
     94//////////////////////
     95
     96
     97// input and output are in utf8
     98text_t mgsearch_stemword (const text_t &word) {
     99  // allocate working stem space
     100  int maxstemlen = mgq_getmaxstemlen ();
     101  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
     102  if (word_stem == NULL) return "";
     103
     104  // copy word to word_stem
     105  int len = 0;
     106  text_t::const_iterator here = word.begin();
     107  text_t::const_iterator end = word.end();
     108  while (len < maxstemlen && here != end) {
     109    word_stem[len+1] = (unsigned char)(*here);
     110    len++; here++;
     111  }
     112  word_stem[len+1] = '\0';
     113  word_stem[0] = len;
     114
     115  mgq_stemword (word_stem);
     116
     117  // copy word_stem back to tempstr
     118  text_t tempstr;
     119  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
     120
     121  return tempstr;
     122}
     123
     124
     125
    88126////////////////////////
    89127// callback functions //
     
    121159  docresultclass docresult;
    122160  docresult.docnum = DocNum;
    123   docresult.docweight = Weight;
    124 
     161  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
     162  docresult.docweight = Weight - docresult.num_query_terms_matched*100;
     163 
    125164  queryresults->docs.push_back(docresult);
    126165 
     
    137176  termfreqclass termfreq;
    138177  termfreq.termstr = to_uni(term);
     178  termfreq.termstemstr = to_uni (mgsearch_stemword (term));
    139179  termfreq.termfreq = Freq;
    140   queryresults->terms.push_back(termfreq);
     180  queryresults->orgterms.push_back(termfreq);
    141181 
    142182  return 0;
     
    144184
    145185// this callback is called once for each variation of each term
    146 int termscallback(char *Word, int ULen, int /*Freq*/,
    147           float /*Weight*/, void *info) {
     186int termvariantscallback(char *Word, int ULen, int /*Freq*/,
     187            float /*Weight*/, void *info) {
    148188
    149189  text_t term;
     
    215255}
    216256
     257// you only need to use this function before doing any stemming
     258// casefolding and stemming will be set if values for them are
     259// provided (0 or 1).
     260// makeindexcurrent returns true if it was able to load the database
     261bool mgsearchclass::makeindexcurrent (const text_t &index,
     262                      const text_t &collection,
     263                      int casefolding,
     264                      int stemming) {
     265  bool databaseloaded = true;
     266
     267  // get the names of the collection, index and text suffixes
     268  char *ccollection = collection.getcstr();
     269  assert (ccollection != NULL);
     270  char *idxsuffix = (getindexsuffix (collection, index)).getcstr();
     271  assert (idxsuffix != NULL);
     272  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
     273  assert (txtsuffix != NULL);
     274
     275#ifdef __WIN32__
     276  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
     277#else
     278  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
     279#endif
     280
     281  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
     282    if (casefolding == 0) mgq_ask(".set casefold off");
     283    else if (casefolding > 0) mgq_ask(".set casefold on");
     284    if (stemming == 0) mgq_ask(".set stem off");
     285    else if (stemming > 0) mgq_ask(".set stem on");
     286   
     287  } else databaseloaded = false;
     288
     289  // free up the c strings
     290  delete ccollection;
     291  delete idxsuffix;
     292  delete txtsuffix;
     293  delete ccollectdir;
     294
     295  return databaseloaded;
     296}
     297
     298
     299// stem word uses the values set in the last call to makeindexcurrent
     300// to stem the word. It is assumed that word is in unicode
     301text_t mgsearchclass::stemword (const text_t &word) {
     302  return to_uni (mgsearch_stemword (to_utf8 (word)));
     303}
     304 
    217305
    218306bool mgsearchclass::search(const queryparamclass &queryparams,
    219                queryresultsclass &queryresults)
    220 {
    221   bool databaseloaded = true;
    222 
     307               queryresultsclass &queryresults) {
    223308  assert (cache != NULL);
    224309
     
    226311
    227312  // first check the cache
    228   if (cache->find(queryparams, queryresults))
    229     return true;
     313  if (cache->find(queryparams, queryresults)) return true;
    230314
    231315  // make sure there is a query to be processed
     
    244328  casefold = queryparams.casefolding;
    245329
    246   // get the names of the collection, index and text suffixes
    247   char *ccollection = queryparams.collection.getcstr();
    248   assert (ccollection != NULL);
    249   char *idxsuffix = (getindexsuffix (queryparams.collection,
    250                      queryparams.search_index)).getcstr();
    251   assert (idxsuffix != NULL);
    252   char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
    253   assert (txtsuffix != NULL);
    254 
    255 #ifdef __WIN32__
    256   char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
    257 #else
    258   char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
    259 #endif
    260 
    261   if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
    262     {
    263       setsearchmode (queryparams);
    264       submitquery (queryparams);
    265       getresults (queryresults);
    266     }
    267   else databaseloaded = false;
    268 
    269   // free up the c strings
    270   delete ccollection;
    271   delete idxsuffix;
    272   delete txtsuffix;
    273   delete ccollectdir;
    274 
    275   return databaseloaded;
     330  if (makeindexcurrent (queryparams.search_index, queryparams.collection)) {
     331    setsearchmode (queryparams);
     332    submitquery (queryparams);
     333    getresults (queryresults);
     334    return true;
     335  }
     336
     337  return false;
    276338}
    277339
     
    280342{
    281343  mgq_ask(".set expert true");
     344  mgq_ask(".set sorted_terms true");
    282345  mgq_ask(".set accumulator_method list");
    283346  mgq_ask(".set max_accumulators 50000");
     
    353416  mgq_results(result_termfreqs, 0, MAXNUMTERMS,
    354417          termfreqcallback, (void *)(&queryresults));
     418  queryresults.sortuniqqueryterms();
     419
     420  // get term variants
    355421  mgq_results(result_terms, 0, MAXNUMTERMS,
    356               termscallback, (void *)(&queryresults));
    357   queryresults.sortqueryterms();
    358   queryresults.uniqqueryterms();
     422              termvariantscallback, (void *)(&queryresults));
    359423}
    360424
Note: See TracChangeset for help on using the changeset viewer.