Ignore:
Timestamp:
1999-06-30T16:04:14+12:00 (25 years ago)
Author:
rjmcnab
Message:

made stemming functions available from mgsearch and made the stems
for the query terms available in queryinfo

Location:
trunk/gsdl/src/colservr
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/colservr/mgq.c

    r308 r319  
    1212/*
    1313   $Log$
     14   Revision 1.5  1999/06/30 04:04:11  rjmcnab
     15   made stemming functions available from mgsearch and made the stems
     16   for the query terms available in queryinfo
     17
    1418   Revision 1.4  1999/06/28 08:56:29  rjmcnab
    1519   A bit of hacking to remove the restriction that the index to get
     
    6064#include "term_lists.h"
    6165#include "local_strings.h"
     66
     67#include "words.h"
     68#include "stemmer.h"
    6269 
    6370#ifdef __cplusplus
     
    561568
    562569
     570/* use mgq_getmaxstemlen to determine the length of the word stems to pass */
     571/* to mgq_stemword */
     572int mgq_getmaxstemlen () {
     573  return MAXSTEMLEN;
     574}
     575
     576/* note: the stemming method and the stemmer come from the last query */
     577/* "word" should be at least maxstemlen+1 long and it is a string that */
     578/* starts with the string length */
     579void mgq_stemword (unsigned char *word) {
     580  int stem_method = 0;
     581  query_data *qd = NULL;
     582
     583  if (cur_cachenum == -1) return;
     584  qd = dbcache[cur_cachenum].qd;
     585  if (qd == NULL || word == NULL) return;
     586
     587  if (qd->sd->sdh.indexed) {
     588    stem_method = BooleanEnv(GetEnv("casefold"),0) | (BooleanEnv(GetEnv("stem"),0) << 1);
     589  } else {
     590    stem_method = qd->sd->sdh.stem_method;
     591  }
     592
     593  stemmer (stem_method, qd->sd->sdh.stemmer_num, word);
     594}
     595
     596
     597
    563598int is_dbcache_full (void) {
    564599  init_dbcache ();
  • trunk/gsdl/src/colservr/mgq.h

    r115 r319  
    1818#endif
    1919
     20 
    2021enum result_kinds {
    21     result_docs,      /* Return the documents found in last search */
    22     result_docnums,   /* Return document id numbers and weights */
    23     result_termfreqs, /* Return terms and frequencies */
    24     result_terms      /* Return matching query terms */
    25     };
     22  result_docs,      /* Return the documents found in last search */
     23  result_docnums,   /* Return document id numbers and weights */
     24  result_termfreqs, /* Return terms and frequencies */
     25  result_terms      /* Return matching query terms */
     26};
    2627
    2728int mgq_ask(char *line);
    28 int mgq_results(enum result_kinds kind,int skip, int howmany, int (*sender)(char *, int, int, float, void *), void *ptr);
     29int mgq_results(enum result_kinds kind,int skip, int howmany,
     30        int (*sender)(char *, int, int, float, void *), void *ptr);
    2931int mgq_numdocs(void);
    3032int mgq_numterms(void);
    3133
     34/* use mgq_getmaxstemlen to determine the length of the word stems to pass */
     35/* to mgq_stemword */
     36int mgq_getmaxstemlen ();
    3237
     38/* note: the stemming method and the stemmer come from the last query */
     39/* "word" should be at least maxstemlen+1 long and it is a string that */
     40/* starts with the string length */
     41void mgq_stemword (unsigned char *word);
     42
     43 
    3344int is_dbcache_full (void);
    3445int load_database (char *collection, char *mgdir, char *gensuffix, char *textsuffix);
  • trunk/gsdl/src/colservr/mgsearch.cpp

    r301 r319  
    1212/*
    1313   $Log$
     14   Revision 1.8  1999/06/30 04:04:12  rjmcnab
     15   made stemming functions available from mgsearch and made the stems
     16   for the query terms available in queryinfo
     17
    1418   Revision 1.7  1999/06/27 22:07:27  sjboddie
    1519   got rid of all the old functions for dealing with dir indexes
     
    8690
    8791
     92//////////////////////
     93// useful functions //
     94//////////////////////
     95
     96
     97// input and output are in utf8
     98text_t mgsearch_stemword (const text_t &word) {
     99  // allocate working stem space
     100  int maxstemlen = mgq_getmaxstemlen ();
     101  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
     102  if (word_stem == NULL) return "";
     103
     104  // copy word to word_stem
     105  int len = 0;
     106  text_t::const_iterator here = word.begin();
     107  text_t::const_iterator end = word.end();
     108  while (len < maxstemlen && here != end) {
     109    word_stem[len+1] = (unsigned char)(*here);
     110    len++; here++;
     111  }
     112  word_stem[len+1] = '\0';
     113  word_stem[0] = len;
     114
     115  mgq_stemword (word_stem);
     116
     117  // copy word_stem back to tempstr
     118  text_t tempstr;
     119  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);
     120
     121  return tempstr;
     122}
     123
     124
     125
    88126////////////////////////
    89127// callback functions //
     
    121159  docresultclass docresult;
    122160  docresult.docnum = DocNum;
    123   docresult.docweight = Weight;
    124 
     161  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
     162  docresult.docweight = Weight - docresult.num_query_terms_matched*100;
     163 
    125164  queryresults->docs.push_back(docresult);
    126165 
     
    137176  termfreqclass termfreq;
    138177  termfreq.termstr = to_uni(term);
     178  termfreq.termstemstr = to_uni (mgsearch_stemword (term));
    139179  termfreq.termfreq = Freq;
    140   queryresults->terms.push_back(termfreq);
     180  queryresults->orgterms.push_back(termfreq);
    141181 
    142182  return 0;
     
    144184
    145185// this callback is called once for each variation of each term
    146 int termscallback(char *Word, int ULen, int /*Freq*/,
    147           float /*Weight*/, void *info) {
     186int termvariantscallback(char *Word, int ULen, int /*Freq*/,
     187            float /*Weight*/, void *info) {
    148188
    149189  text_t term;
     
    215255}
    216256
     257// you only need to use this function before doing any stemming
     258// casefolding and stemming will be set if values for them are
     259// provided (0 or 1).
     260// makeindexcurrent returns true if it was able to load the database
     261bool mgsearchclass::makeindexcurrent (const text_t &index,
     262                      const text_t &collection,
     263                      int casefolding,
     264                      int stemming) {
     265  bool databaseloaded = true;
     266
     267  // get the names of the collection, index and text suffixes
     268  char *ccollection = collection.getcstr();
     269  assert (ccollection != NULL);
     270  char *idxsuffix = (getindexsuffix (collection, index)).getcstr();
     271  assert (idxsuffix != NULL);
     272  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
     273  assert (txtsuffix != NULL);
     274
     275#ifdef __WIN32__
     276  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
     277#else
     278  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
     279#endif
     280
     281  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
     282    if (casefolding == 0) mgq_ask(".set casefold off");
     283    else if (casefolding > 0) mgq_ask(".set casefold on");
     284    if (stemming == 0) mgq_ask(".set stem off");
     285    else if (stemming > 0) mgq_ask(".set stem on");
     286   
     287  } else databaseloaded = false;
     288
     289  // free up the c strings
     290  delete ccollection;
     291  delete idxsuffix;
     292  delete txtsuffix;
     293  delete ccollectdir;
     294
     295  return databaseloaded;
     296}
     297
     298
     299// stem word uses the values set in the last call to makeindexcurrent
     300// to stem the word. It is assumed that word is in unicode
     301text_t mgsearchclass::stemword (const text_t &word) {
     302  return to_uni (mgsearch_stemword (to_utf8 (word)));
     303}
     304 
    217305
    218306bool mgsearchclass::search(const queryparamclass &queryparams,
    219                queryresultsclass &queryresults)
    220 {
    221   bool databaseloaded = true;
    222 
     307               queryresultsclass &queryresults) {
    223308  assert (cache != NULL);
    224309
     
    226311
    227312  // first check the cache
    228   if (cache->find(queryparams, queryresults))
    229     return true;
     313  if (cache->find(queryparams, queryresults)) return true;
    230314
    231315  // make sure there is a query to be processed
     
    244328  casefold = queryparams.casefolding;
    245329
    246   // get the names of the collection, index and text suffixes
    247   char *ccollection = queryparams.collection.getcstr();
    248   assert (ccollection != NULL);
    249   char *idxsuffix = (getindexsuffix (queryparams.collection,
    250                      queryparams.search_index)).getcstr();
    251   assert (idxsuffix != NULL);
    252   char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr();
    253   assert (txtsuffix != NULL);
    254 
    255 #ifdef __WIN32__
    256   char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
    257 #else
    258   char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
    259 #endif
    260 
    261   if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix))
    262     {
    263       setsearchmode (queryparams);
    264       submitquery (queryparams);
    265       getresults (queryresults);
    266     }
    267   else databaseloaded = false;
    268 
    269   // free up the c strings
    270   delete ccollection;
    271   delete idxsuffix;
    272   delete txtsuffix;
    273   delete ccollectdir;
    274 
    275   return databaseloaded;
     330  if (makeindexcurrent (queryparams.search_index, queryparams.collection)) {
     331    setsearchmode (queryparams);
     332    submitquery (queryparams);
     333    getresults (queryresults);
     334    return true;
     335  }
     336
     337  return false;
    276338}
    277339
     
    280342{
    281343  mgq_ask(".set expert true");
     344  mgq_ask(".set sorted_terms true");
    282345  mgq_ask(".set accumulator_method list");
    283346  mgq_ask(".set max_accumulators 50000");
     
    353416  mgq_results(result_termfreqs, 0, MAXNUMTERMS,
    354417          termfreqcallback, (void *)(&queryresults));
     418  queryresults.sortuniqqueryterms();
     419
     420  // get term variants
    355421  mgq_results(result_terms, 0, MAXNUMTERMS,
    356               termscallback, (void *)(&queryresults));
    357   queryresults.sortqueryterms();
    358   queryresults.uniqqueryterms();
     422              termvariantscallback, (void *)(&queryresults));
    359423}
    360424
  • trunk/gsdl/src/colservr/mgsearch.h

    r301 r319  
    2626  mgsearchclass ();
    2727  virtual ~mgsearchclass ();
    28  
     28
    2929  // the index directory must be set before any searching
    3030  // is done
    3131  void setcollectdir (const text_t &thecollectdir);
     32
     33  // you only need to use this function before doing any stemming
     34  // casefolding and stemming will be set if values for them are
     35  // provided (0 or 1).
     36  // makeindexcurrent returns true if it was able to load the database
     37  bool makeindexcurrent (const text_t &index, const text_t &collection,
     38             int casefolding = -1, int stemming = -1);
     39
     40  // stem word uses the values set in the last call to makeindexcurrent
     41  // to stem the word. It is assumed that word is in unicode
     42  text_t stemword (const text_t &word);
    3243 
    3344  // the search results are returned in queryresults
  • trunk/gsdl/src/colservr/queryfilter.cpp

    r311 r319  
    1212/*
    1313   $Log$
     14   Revision 1.7  1999/06/30 04:04:13  rjmcnab
     15   made stemming functions available from mgsearch and made the stems
     16   for the query terms available in queryinfo
     17
    1418   Revision 1.6  1999/06/29 22:06:23  rjmcnab
    1519   Added a couple of fields to queryinfo to handle a special version
     
    347351  // assemble the term results
    348352  if ((request.filterResultOptions & FRtermFreq) || (request.filterResultOptions & FRmatchTerms)) {
    349     queryresults.sortqueryterms();
    350     queryresults.uniqqueryterms();
     353    // note: the terms have already been sorted and uniqued
    351354
    352355    TermInfo_t terminfo;
  • trunk/gsdl/src/colservr/queryinfo.cpp

    r311 r319  
    1212/*
    1313   $Log$
     14   Revision 1.4  1999/06/30 04:04:13  rjmcnab
     15   made stemming functions available from mgsearch and made the stems
     16   for the query terms available in queryinfo
     17
    1418   Revision 1.3  1999/06/29 22:06:23  rjmcnab
    1519   Added a couple of fields to queryinfo to handle a special version
     
    8892{
    8993  termstr = t.termstr;
     94  termstemstr = t.termstemstr;
    9095  termfreq = t.termfreq;
    9196
     
    96101{
    97102  return ((x.termstr == y.termstr) &&
     103      (x.termstemstr == y.termstemstr) &&
    98104      (x.termfreq == y.termfreq));
    99105}
     
    108114{
    109115  return ((x.termfreq < y.termfreq) ||
    110       ((x.termfreq == y.termfreq) && (x.termstr < y.termstr)));
    111 
     116      ((x.termfreq == y.termfreq) && (x.termstemstr < y.termstemstr)) ||
     117      ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr < y.termstr)));
    112118}
    113119
     
    115121{
    116122  return ((x.termfreq > y.termfreq) ||
    117       ((x.termfreq == y.termfreq) && (x.termstr > y.termstr)));
    118 
     123      ((x.termfreq == y.termfreq) && (x.termstemstr > y.termstemstr)) ||
     124      ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr > y.termstr)));
    119125}
    120126
     
    125131
    126132  outs << text_t2ascii << " t:\"" << t.termstr << "\"";
     133  outs << text_t2ascii << " s:\"" << t.termstemstr << "\"";
    127134  outs << " f:" << t.termfreq << "\n";
    128135
     
    145152// query results
    146153
    147 void queryresultsclass::clear ()
    148 {
     154void queryresultsclass::clear () {
    149155  docs_matched_set = false;;
    150156  docs_matched = 0;
     
    152158 
    153159  docs.erase(docs.begin(),docs.end());
     160  orgterms.erase(orgterms.begin(),orgterms.end());
    154161  terms.erase(terms.begin(),terms.end());
    155162}
     
    168175}
    169176
    170 void queryresultsclass::sortqueryterms()
    171 {
     177void queryresultsclass::sortuniqqueryterms() {
     178  terms = orgterms;
     179
     180  // sort the terms
    172181  sort (terms.begin(), terms.end());
    173 }
    174 
    175 void queryresultsclass::uniqqueryterms()
    176 {
     182
     183  // and then unique them
    177184  vector<termfreqclass>::iterator new_end = unique (terms.begin(), terms.end());
    178185  terms.erase(new_end, terms.end());
    179186}
    180 
    181187
    182188
     
    192198    outs << (*docshere);
    193199    docshere++;
     200  }
     201
     202  outs << "orgterms\n";
     203  vector<termfreqclass>::iterator orgtermshere = q.orgterms.begin();
     204  vector<termfreqclass>::iterator orgtermsend = q.orgterms.end();
     205  while (orgtermshere != orgtermsend) {
     206    outs << (*orgtermshere);
     207    orgtermshere++;
    194208  }
    195209
  • trunk/gsdl/src/colservr/queryinfo.h

    r311 r319  
    6363public:
    6464  text_t termstr;
     65  text_t termstemstr;
    6566  unsigned int termfreq;
    6667 
     
    110111 
    111112  vector<docresultclass> docs;
     113  vector<termfreqclass> orgterms; // terms before they are sorted and uniqued
    112114  vector<termfreqclass> terms;
    113115  text_tarray termvariants;
     
    122124  int getnumterms () {return terms.size();}
    123125 
    124   void sortqueryterms();
    125   void uniqqueryterms();
     126  void sortuniqqueryterms();
    126127};
    127128
Note: See TracChangeset for help on using the changeset viewer.