Ignore:
Timestamp:
2006-12-11T11:22:20+13:00 (17 years ago)
Author:
shaoqun
Message:

Added code for accent folding.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/mgpp/text/Terms.cpp

    r8692 r13477  
    210210              vector<unsigned long> &equivWords) {
    211211  equivWords.erase (equivWords.begin(), equivWords.end());
    212  
    213   if (stemMethod == 0 || stemMethod==4 || stemMethod==5) {
     212
     213  // If the specified stem method is not valid (i.e. no appropriate stem index was built), we set it to 0,
     214  // unless we are doing partial matching, in which case stem indexes are not used anyway.
     215  if (!(stemMethod & STEM_PARTIAL_MATCH) && indexData.stemFile[stemMethod-1] == NULL) {
     216    cerr << "Stem index for method "<<stemMethod<< " was not built, so not doing stemming\n";
     217    stemMethod = 0;
     218  }
     219  /* [JFG - Mar 06: Accent folding patch] */
     220  /* use flag PARTIAL_MATCH */ 
     221  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    214222    // don't need to stem the word,
    215223    // find the word number(s) for this term
     
    218226    word_block_dict_el wordDictEl;
    219227    wordDictEl.SetNumLevels (numLevels);
    220     if (stemMethod ==0) {
     228    if (stemMethod == 0) {
    221229      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
    222230                 indexData.bdh.entries_per_wblk,
     
    228236    } else {
    229237      // partial matching,
    230       PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) );
     238      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod & STEM_CaseFolding)? true : false);
     239      // TODO: Accent Folding is not handled here!!
    231240      return;
    232241    }
     
    234243             
    235244  // need to stem this word and find it in the blocked stem index
    236  
    237   unsigned char  mgWord[MAXSTEMLEN + 1];
     245  unsigned char mgWord[MAXSTEMLEN + 1];
    238246  UCArray stemTerm;
    239247  unsigned long stemmerNum = 0;
    240   if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
    241   else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
    242   else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
    243    
     248
     249  /* [JFG - Mar 06: Accent folding patch] */
     250  if(stemMethod > STEM_MAX) {
     251    return;
     252    //TODO: throw an error here
     253  }
     254
     255  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;
     256 
    244257  // convert the word to an "mg word"
    245258  mgWord[0] = term.size();
     
    247260 
    248261  // stem the word
    249   stemmer (stemMethod, stemmerNum, mgWord);
    250 
     262  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
    251263  // convert the result back to a UCArray
    252264  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
     
    256268  unsigned long stemElNum;
    257269  bool result = false;
    258   if (stemMethod == 1) {
    259     result = SearchStemBlockDictEl (indexData.stem1File,
    260                indexData.sii1,
    261                indexData.sih1.entries_per_block,
    262                indexData.sih1.dict_size,
     270 
     271  /* [JFG - Mar 06: Accent folding patch] */
     272  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
     273               indexData.sii[stemMethod-1],
     274               indexData.sih[stemMethod-1].entries_per_block,
     275               indexData.sih[stemMethod-1].dict_size,
    263276               stemTerm,
    264277               stemDictEl,
    265278               stemElNum);
    266 
    267   } else if (stemMethod == 2) {
    268     result = SearchStemBlockDictEl (indexData.stem2File,
    269                indexData.sii2,
    270                indexData.sih2.entries_per_block,
    271                indexData.sih2.dict_size,
    272                stemTerm,
    273                stemDictEl,
    274                stemElNum);
    275 
    276   } else if (stemMethod == 3) {
    277     result = SearchStemBlockDictEl (indexData.stem3File,
    278                indexData.sii3,
    279                indexData.sih3.entries_per_block,
    280                indexData.sih3.dict_size,
    281                stemTerm,
    282                stemDictEl,
    283                stemElNum);
    284   }
    285 
     279 
    286280  if (result) {
    287281    equivWords = stemDictEl.equivWords; 
Note: See TracChangeset for help on using the changeset viewer.