Changeset 319 for trunk/gsdl/src/colservr
- Timestamp:
- 1999-06-30T16:04:14+12:00 (25 years ago)
- Location:
- trunk/gsdl/src/colservr
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/colservr/mgq.c
r308 r319 12 12 /* 13 13 $Log$ 14 Revision 1.5 1999/06/30 04:04:11 rjmcnab 15 made stemming functions available from mgsearch and made the stems 16 for the query terms available in queryinfo 17 14 18 Revision 1.4 1999/06/28 08:56:29 rjmcnab 15 19 A bit of hacking to remove the restriction that the index to get … … 60 64 #include "term_lists.h" 61 65 #include "local_strings.h" 66 67 #include "words.h" 68 #include "stemmer.h" 62 69 63 70 #ifdef __cplusplus … … 561 568 562 569 570 /* use mgq_getmaxstemlen to determine the length of the word stems to pass */ 571 /* to mgq_stemword */ 572 int mgq_getmaxstemlen () { 573 return MAXSTEMLEN; 574 } 575 576 /* note: the stemming method and the stemmer come from the last query */ 577 /* "word" should be at least maxstemlen+1 long and it is a string that */ 578 /* starts with the string length */ 579 void mgq_stemword (unsigned char *word) { 580 int stem_method = 0; 581 query_data *qd = NULL; 582 583 if (cur_cachenum == -1) return; 584 qd = dbcache[cur_cachenum].qd; 585 if (qd == NULL || word == NULL) return; 586 587 if (qd->sd->sdh.indexed) { 588 stem_method = BooleanEnv(GetEnv("casefold"),0) | (BooleanEnv(GetEnv("stem"),0) << 1); 589 } else { 590 stem_method = qd->sd->sdh.stem_method; 591 } 592 593 stemmer (stem_method, qd->sd->sdh.stemmer_num, word); 594 } 595 596 597 563 598 int is_dbcache_full (void) { 564 599 init_dbcache (); -
trunk/gsdl/src/colservr/mgq.h
r115 r319 18 18 #endif 19 19 20 20 21 enum result_kinds { 21 22 23 24 25 22 result_docs, /* Return the documents found in last search */ 23 result_docnums, /* Return document id numbers and weights */ 24 result_termfreqs, /* Return terms and frequencies */ 25 result_terms /* Return matching query terms */ 26 }; 26 27 27 28 int mgq_ask(char *line); 28 int mgq_results(enum result_kinds kind,int skip, int howmany, int (*sender)(char *, int, int, float, void *), void *ptr); 29 int mgq_results(enum result_kinds kind,int skip, int howmany, 30 int (*sender)(char *, int, int, float, void *), void *ptr); 29 31 int mgq_numdocs(void); 30 32 int mgq_numterms(void); 31 33 34 /* use mgq_getmaxstemlen to determine the length of the word stems to pass */ 35 /* to mgq_stemword */ 36 int mgq_getmaxstemlen (); 32 37 38 /* note: the stemming method and the stemmer come from the last query */ 39 /* "word" should be at least maxstemlen+1 long and it is a string that */ 40 /* starts with the string length */ 41 void mgq_stemword (unsigned char *word); 42 43 33 44 int is_dbcache_full (void); 34 45 int load_database (char *collection, char *mgdir, char *gensuffix, char *textsuffix); -
trunk/gsdl/src/colservr/mgsearch.cpp
r301 r319 12 12 /* 13 13 $Log$ 14 Revision 1.8 1999/06/30 04:04:12 rjmcnab 15 made stemming functions available from mgsearch and made the stems 16 for the query terms available in queryinfo 17 14 18 Revision 1.7 1999/06/27 22:07:27 sjboddie 15 19 got rid of all the old functions for dealing with dir indexes … … 86 90 87 91 92 ////////////////////// 93 // useful functions // 94 ////////////////////// 95 96 97 // input and output are in utf8 98 text_t mgsearch_stemword (const text_t &word) { 99 // allocate working stem space 100 int maxstemlen = mgq_getmaxstemlen (); 101 unsigned char *word_stem = new unsigned char [maxstemlen + 2]; 102 if (word_stem == NULL) return ""; 103 104 // copy word to word_stem 105 int len = 0; 106 text_t::const_iterator here = word.begin(); 107 text_t::const_iterator end = word.end(); 108 while (len < maxstemlen && here != end) { 109 word_stem[len+1] = (unsigned char)(*here); 110 len++; here++; 111 } 112 word_stem[len+1] = '\0'; 113 word_stem[0] = len; 114 115 mgq_stemword (word_stem); 116 117 // copy word_stem back to tempstr 118 text_t tempstr; 119 tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]); 120 121 return tempstr; 122 } 123 124 125 88 126 //////////////////////// 89 127 // callback functions // … … 121 159 docresultclass docresult; 122 160 docresult.docnum = DocNum; 123 docresult.docweight = Weight; 124 161 docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg... 
162 docresult.docweight = Weight - docresult.num_query_terms_matched*100; 163 125 164 queryresults->docs.push_back(docresult); 126 165 … … 137 176 termfreqclass termfreq; 138 177 termfreq.termstr = to_uni(term); 178 termfreq.termstemstr = to_uni (mgsearch_stemword (term)); 139 179 termfreq.termfreq = Freq; 140 queryresults-> terms.push_back(termfreq);180 queryresults->orgterms.push_back(termfreq); 141 181 142 182 return 0; … … 144 184 145 185 // this callback is called once for each variation of each term 146 int term scallback(char *Word, int ULen, int /*Freq*/,147 186 int termvariantscallback(char *Word, int ULen, int /*Freq*/, 187 float /*Weight*/, void *info) { 148 188 149 189 text_t term; … … 215 255 } 216 256 257 // you only need to use this function before doing any stemming 258 // casefolding and stemming will be set if values for them are 259 // provided (0 or 1). 260 // makeindexcurrent returns true if it was able to load the database 261 bool mgsearchclass::makeindexcurrent (const text_t &index, 262 const text_t &collection, 263 int casefolding, 264 int stemming) { 265 bool databaseloaded = true; 266 267 // get the names of the collection, index and text suffixes 268 char *ccollection = collection.getcstr(); 269 assert (ccollection != NULL); 270 char *idxsuffix = (getindexsuffix (collection, index)).getcstr(); 271 assert (idxsuffix != NULL); 272 char *txtsuffix = (getindexsuffix (collection, "text")).getcstr(); 273 assert (txtsuffix != NULL); 274 275 #ifdef __WIN32__ 276 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); 277 #else 278 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); 279 #endif 280 281 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) { 282 if (casefolding == 0) mgq_ask(".set casefold off"); 283 else if (casefolding > 0) mgq_ask(".set casefold on"); 284 if (stemming == 0) mgq_ask(".set stem off"); 285 else if (stemming > 0) mgq_ask(".set stem on"); 286 287 } else 
databaseloaded = false; 288 289 // free up the c strings 290 delete ccollection; 291 delete idxsuffix; 292 delete txtsuffix; 293 delete ccollectdir; 294 295 return databaseloaded; 296 } 297 298 299 // stem word uses the values set in the last call to makeindexcurrent 300 // to stem the word. It is assumed that word is in unicode 301 text_t mgsearchclass::stemword (const text_t &word) { 302 return to_uni (mgsearch_stemword (to_utf8 (word))); 303 } 304 217 305 218 306 bool mgsearchclass::search(const queryparamclass &queryparams, 219 queryresultsclass &queryresults) 220 { 221 bool databaseloaded = true; 222 307 queryresultsclass &queryresults) { 223 308 assert (cache != NULL); 224 309 … … 226 311 227 312 // first check the cache 228 if (cache->find(queryparams, queryresults)) 229 return true; 313 if (cache->find(queryparams, queryresults)) return true; 230 314 231 315 // make sure there is a query to be processed … … 244 328 casefold = queryparams.casefolding; 245 329 246 // get the names of the collection, index and text suffixes 247 char *ccollection = queryparams.collection.getcstr(); 248 assert (ccollection != NULL); 249 char *idxsuffix = (getindexsuffix (queryparams.collection, 250 queryparams.search_index)).getcstr(); 251 assert (idxsuffix != NULL); 252 char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr(); 253 assert (txtsuffix != NULL); 254 255 #ifdef __WIN32__ 256 char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); 257 #else 258 char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); 259 #endif 260 261 if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) 262 { 263 setsearchmode (queryparams); 264 submitquery (queryparams); 265 getresults (queryresults); 266 } 267 else databaseloaded = false; 268 269 // free up the c strings 270 delete ccollection; 271 delete idxsuffix; 272 delete txtsuffix; 273 delete ccollectdir; 274 275 return databaseloaded; 330 if (makeindexcurrent 
(queryparams.search_index, queryparams.collection)) { 331 setsearchmode (queryparams); 332 submitquery (queryparams); 333 getresults (queryresults); 334 return true; 335 } 336 337 return false; 276 338 } 277 339 … … 280 342 { 281 343 mgq_ask(".set expert true"); 344 mgq_ask(".set sorted_terms true"); 282 345 mgq_ask(".set accumulator_method list"); 283 346 mgq_ask(".set max_accumulators 50000"); … … 353 416 mgq_results(result_termfreqs, 0, MAXNUMTERMS, 354 417 termfreqcallback, (void *)(&queryresults)); 418 queryresults.sortuniqqueryterms(); 419 420 // get term variants 355 421 mgq_results(result_terms, 0, MAXNUMTERMS, 356 termscallback, (void *)(&queryresults)); 357 queryresults.sortqueryterms(); 358 queryresults.uniqqueryterms(); 422 termvariantscallback, (void *)(&queryresults)); 359 423 } 360 424 -
trunk/gsdl/src/colservr/mgsearch.h
r301 r319 26 26 mgsearchclass (); 27 27 virtual ~mgsearchclass (); 28 28 29 29 // the index directory must be set before any searching 30 30 // is done 31 31 void setcollectdir (const text_t &thecollectdir); 32 33 // you only need to use this function before doing any stemming 34 // casefolding and stemming will be set if values for them are 35 // provided (0 or 1). 36 // makeindexcurrent returns true if it was able to load the database 37 bool makeindexcurrent (const text_t &index, const text_t &collection, 38 int casefolding = -1, int stemming = -1); 39 40 // stem word uses the values set in the last call to makeindexcurrent 41 // to stem the word. It is assumed that word is in unicode 42 text_t stemword (const text_t &word); 32 43 33 44 // the search results are returned in queryresults -
trunk/gsdl/src/colservr/queryfilter.cpp
r311 r319 12 12 /* 13 13 $Log$ 14 Revision 1.7 1999/06/30 04:04:13 rjmcnab 15 made stemming functions available from mgsearch and made the stems 16 for the query terms available in queryinfo 17 14 18 Revision 1.6 1999/06/29 22:06:23 rjmcnab 15 19 Added a couple of fields to queryinfo to handle a special version … … 347 351 // assemble the term results 348 352 if ((request.filterResultOptions & FRtermFreq) || (request.filterResultOptions & FRmatchTerms)) { 349 queryresults.sortqueryterms(); 350 queryresults.uniqqueryterms(); 353 // note: the terms have already been sorted and uniqued 351 354 352 355 TermInfo_t terminfo; -
trunk/gsdl/src/colservr/queryinfo.cpp
r311 r319 12 12 /* 13 13 $Log$ 14 Revision 1.4 1999/06/30 04:04:13 rjmcnab 15 made stemming functions available from mgsearch and made the stems 16 for the query terms available in queryinfo 17 14 18 Revision 1.3 1999/06/29 22:06:23 rjmcnab 15 19 Added a couple of fields to queryinfo to handle a special version … … 88 92 { 89 93 termstr = t.termstr; 94 termstemstr = t.termstemstr; 90 95 termfreq = t.termfreq; 91 96 … … 96 101 { 97 102 return ((x.termstr == y.termstr) && 103 (x.termstemstr == y.termstemstr) && 98 104 (x.termfreq == y.termfreq)); 99 105 } … … 108 114 { 109 115 return ((x.termfreq < y.termfreq) || 110 ((x.termfreq == y.termfreq) && (x.termst r < y.termstr)));111 116 ((x.termfreq == y.termfreq) && (x.termstemstr < y.termstemstr)) || 117 ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr < y.termstr))); 112 118 } 113 119 … … 115 121 { 116 122 return ((x.termfreq > y.termfreq) || 117 ((x.termfreq == y.termfreq) && (x.termst r > y.termstr)));118 123 ((x.termfreq == y.termfreq) && (x.termstemstr > y.termstemstr)) || 124 ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr > y.termstr))); 119 125 } 120 126 … … 125 131 126 132 outs << text_t2ascii << " t:\"" << t.termstr << "\""; 133 outs << text_t2ascii << " s:\"" << t.termstemstr << "\""; 127 134 outs << " f:" << t.termfreq << "\n"; 128 135 … … 145 152 // query results 146 153 147 void queryresultsclass::clear () 148 { 154 void queryresultsclass::clear () { 149 155 docs_matched_set = false;; 150 156 docs_matched = 0; … … 152 158 153 159 docs.erase(docs.begin(),docs.end()); 160 orgterms.erase(orgterms.begin(),orgterms.end()); 154 161 terms.erase(terms.begin(),terms.end()); 155 162 } … … 168 175 } 169 176 170 void queryresultsclass::sortqueryterms() 171 { 177 void queryresultsclass::sortuniqqueryterms() { 178 terms = orgterms; 179 180 // sort the terms 172 181 sort (terms.begin(), terms.end()); 173 } 174 175 void queryresultsclass::uniqqueryterms() 176 { 
182 183 // and then unique them 177 184 vector<termfreqclass>::iterator new_end = unique (terms.begin(), terms.end()); 178 185 terms.erase(new_end, terms.end()); 179 186 } 180 181 187 182 188 … … 192 198 outs << (*docshere); 193 199 docshere++; 200 } 201 202 outs << "orgterms\n"; 203 vector<termfreqclass>::iterator orgtermshere = q.orgterms.begin(); 204 vector<termfreqclass>::iterator orgtermsend = q.orgterms.end(); 205 while (orgtermshere != orgtermsend) { 206 outs << (*orgtermshere); 207 orgtermshere++; 194 208 } 195 209 -
trunk/gsdl/src/colservr/queryinfo.h
r311 r319 63 63 public: 64 64 text_t termstr; 65 text_t termstemstr; 65 66 unsigned int termfreq; 66 67 … … 110 111 111 112 vector<docresultclass> docs; 113 vector<termfreqclass> orgterms; // terms before they are sorted and uniqued 112 114 vector<termfreqclass> terms; 113 115 text_tarray termvariants; … … 122 124 int getnumterms () {return terms.size();} 123 125 124 void sortqueryterms(); 125 void uniqqueryterms(); 126 void sortuniqqueryterms(); 126 127 }; 127 128
Note: See TracChangeset for help on using the changeset viewer.