Changeset 8244
- Timestamp:
- 2004-10-08T13:32:30+13:00 (20 years ago)
- Location:
- trunk/gsdl/src/mgpp/text
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp
r6121 r8244 185 185 return true; 186 186 187 } else if (c == '*') { 188 el.lexType = StarE; 189 AddNChar (here, el.text, charLen); 190 return true; 191 187 192 } else if (c == '^') { 188 193 el.lexType = RangeE; -
trunk/gsdl/src/mgpp/text/GSDLQueryLex.h
r6121 r8244 30 30 NotOpE, NearOpE, WithinOpE, QuoteE, IntegerE, TermWeightE, 31 31 StemMethodE, RangeE, AtE, TagE, OpenSquareBracketE, 32 CloseSquareBracketE, UnknownE};32 CloseSquareBracketE, StarE, UnknownE}; 33 33 34 34 struct LexEl { -
trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp
r6129 r8244 178 178 179 179 termNode.stemMethod = defaultStemMethod; 180 180 bool partial_match = false; 181 181 LexEl el; 182 182 UCArray::const_iterator oldHere = here; … … 202 202 } else if (el.lexType == AtE) { 203 203 termNode.startRange = termNode.endRange = ParseInt (here, end); 204 204 } else if (el.lexType == StarE) { 205 partial_match = true; 205 206 } else { 206 207 // no term modifiers … … 209 210 } 210 211 212 if (partial_match) { 213 termNode.stemMethod |= 4; // set bit 2 to 1 214 termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching. 215 } 211 216 oldHere = here; 212 217 } -
trunk/gsdl/src/mgpp/text/Terms.cpp
r3008 r8244 205 205 206 206 207 208 209 207 void FindWordNumbers (IndexData &indexData, 210 208 const UCArray &term, … … 213 211 equivWords.erase (equivWords.begin(), equivWords.end()); 214 212 215 if (stemMethod == 0 ) {213 if (stemMethod == 0 || stemMethod==4 || stemMethod==5) { 216 214 // don't need to stem the word, 217 // find the word number for this term215 // find the word number(s) for this term 218 216 unsigned long wordElNum = 0; 219 217 unsigned long numLevels = indexData.bdh.num_levels; 220 218 word_block_dict_el wordDictEl; 221 219 wordDictEl.SetNumLevels (numLevels); 222 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords, 223 indexData.bdh.entries_per_wblk, 224 indexData.bdh.word_dict_size, 225 numLevels, term, wordDictEl, wordElNum)) 226 equivWords.push_back (wordElNum); 227 228 return; 229 230 } 231 232 220 if (stemMethod ==0) { 221 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords, 222 indexData.bdh.entries_per_wblk, 223 indexData.bdh.word_dict_size, 224 numLevels, term, wordDictEl, wordElNum)) 225 equivWords.push_back (wordElNum); 226 227 return; 228 } else { 229 // partial matching, 230 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) ); 231 return; 232 } 233 } 234 233 235 // need to stem this word and find it in the blocked stem index 234 236 … … 236 238 UCArray stemTerm; 237 239 unsigned long stemmerNum = 0; 238 239 240 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num; 240 241 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num; 241 242 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num; 242 243 243 244 244 // convert the word to an "mg word" 245 245 mgWord[0] = term.size(); -
trunk/gsdl/src/mgpp/text/UCArray.cpp
r7416 r8244 244 244 } 245 245 246 // does the first string start with the second? 247 bool StartsWith (const UCArray &a1, const UCArray &a2) { 248 unsigned int l1 = a1.size(); 249 unsigned int l2 = a2.size(); 250 if (l2 > l1) { 251 // if the prefix is longer than the string, it can't start with it 252 return false; 253 } 254 unsigned int l =l2; 255 UCArray::const_iterator a1Here = a1.begin(); 256 UCArray::const_iterator a2Here = a2.begin(); 257 258 while (l--) { 259 if ((*a1Here != *a2Here)) 260 return false; 261 ++a1Here; 262 ++a2Here; 263 } 264 return true; // we have successfully matched the whole way 265 266 } 267 268 // does the first string start with the second, ignoring case? 269 bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) { 270 unsigned int l1 = a1.size(); 271 unsigned int l2 = a2.size(); 272 if (l2 > l1) { 273 // if the prefix is longer than the string, it can't start with it 274 return false; 275 } 276 unsigned int l =l2; 277 UCArray::const_iterator a1Here = a1.begin(); 278 UCArray::const_iterator a2Here = a2.begin(); 279 280 while (l--) { 281 if (casecharmap[*a1Here] != casecharmap[*a2Here]) 282 return false; 283 ++a1Here; 284 ++a2Here; 285 } 286 return true; // we have successfully matched the whole way 287 288 } 289 290 246 291 unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) { 247 292 unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size(); -
trunk/gsdl/src/mgpp/text/UCArray.h
r7413 r8244 97 97 int DictCompare (const UCArray &a1, const UCArray &a2); 98 98 99 // does the first string start with the second? 100 bool StartsWith(const UCArray &a1, const UCArray &a2); 101 // does the first string start with the second, ignoring case? 102 bool StartsWithCasefold(const UCArray &a1, const UCArray &a2); 103 99 104 struct LTUCArray { 100 105 bool operator()(const UCArray &a1, const UCArray &a2) const -
trunk/gsdl/src/mgpp/text/invf.cpp
r2468 r8244 691 691 } 692 692 693 // ------------------------------------------ 694 // functions for partial term matching 695 // ie find all words that start with xxx 696 bool PartialMatchSearchWordBlockDictEl (FILE *dictFile, 697 const block_idx &bIdx, 698 unsigned long entriesPerBlock, 699 unsigned long dictSize, 700 unsigned long numLevels, 701 const UCArray &el, 702 word_block_dict_el &dictEl, 703 vector<unsigned long> &elNumList, 704 bool casefold) { 705 706 UCArrayClear (dictEl.el); 707 elNumList.erase (elNumList.begin(), elNumList.end()); 708 unsigned long elNum; 709 // find the block that contains the element 710 unsigned long blockIdxNum; 711 // will this work?? 712 if (!SearchEl (bIdx, entriesPerBlock, el, 713 blockIdxNum, elNum)) { 714 return false; 715 } 716 unsigned long blockEndElNum = elNum + entriesPerBlock; 717 if (blockEndElNum > dictSize) blockEndElNum = dictSize; 718 719 bool still_looking = true; 720 // look for the block 721 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); 722 // test each element 723 while (elNum < blockEndElNum) { 724 dictEl.Read (dictFile, numLevels); 725 if (StartsWithCasefold(dictEl.el, el)) { 726 if (casefold || StartsWith(dictEl.el, el)) { 727 elNumList.push_back(elNum); 728 } 729 still_looking=false; // we have found one now, so stop the next time we don't have a match 730 } else if (!still_looking) { 731 // we have found a match previously, and now this doesn't match 732 return true; 733 } // else keep looking 734 735 elNum++; 736 } 737 // if we get here, we are either still searching for the first 738 // case, or we are collecting terms. 739 if (still_looking) { 740 //we haven't found a match yet, just check the next element, 741 dictEl.Read (dictFile, numLevels); 742 if (!StartsWithCasefold(dictEl.el, el)) { 743 // the first element of the next block is not a match, so there are no matches 744 return false; 745 } else { 746 if (casefold || StartsWith(dictEl.el, el)) { 747 elNumList.push_back(elNum); 748 elNum++; 749 } 750 751 } 752 } 753 // just keep accumulating matches until there are no more 754 dictEl.Read (dictFile, numLevels); 755 while (StartsWithCasefold(dictEl.el, el)) { 756 if (casefold || StartsWith(dictEl.el, el)) { 757 elNumList.push_back(elNum); 758 } 759 elNum++; 760 dictEl.Read (dictFile, numLevels); 761 } 762 return true; 763 764 } 765 766 693 767 //---------------------------------------------------------------- 694 768 // functions for full text browse -
trunk/gsdl/src/mgpp/text/invf.h
r1300 r8244 301 301 stem_block_dict_el &dictEl, 302 302 unsigned long &elNum); 303 304 //----------------------------------------------------- 305 // New functions for partial matching 306 // returns a list of word numbers from the main block index whose 307 // words start with el. 308 bool PartialMatchSearchWordBlockDictEl (FILE *dictFile, 309 const block_idx &bIdx, 310 unsigned long entriesPerBlock, 311 unsigned long dictSize, 312 unsigned long numLevels, 313 const UCArray &el, 314 word_block_dict_el &dictEl, 315 vector<unsigned long> &elNumList, 316 bool casefold); 303 317 304 318 //----------------------------------------------------------
Note:
See TracChangeset
for help on using the changeset viewer.