Changeset 13653 for trunk


Ignore:
Timestamp:
2007-01-17T11:21:18+13:00 (17 years ago)
Author:
kjdon
Message:

Accent folding patch, thanks to Juan Grigera. Parsing of the stem/case/accent
term modifiers now uses the defines from mg_files.h.

Turned off accent folding when partial matching is being done - the two can't
be used together because of the way the index works. Also, the accent-fold
cases in the switch in GetStemMethod are compiled only if ENABLE_ACCENTFOLD is
defined.
Changed line 528 to avoid a compile warning on Windows (the chained comparison
"4 < maxnumeric < 512" is now written as "4 < maxnumeric && maxnumeric < 512").

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mgpp/text/GSDLQueryParser.cpp

    r12321 r13653  
    141141
    142142static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
    143   // here expect el to contain some of c,s,i,u
    144   // stem method 0 = c,u  00
    145   // stem method 1 = i,u  01  - default for DL
    146   // stem method 2 = c, s  10
    147   // stem method 3 = i,s  11
    148 
     143  // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
    149144  unsigned long stem = (unsigned long)defaultStemMethod;
    150145     
     
    152147  UCArray::const_iterator end = el.text.end();
    153148
    154   unsigned char c1 = *here;
    155   if (!(c1 == 'c'|| c1 == 'i' || c1 == 'u' || c1 == 's'))
    156     return 4; // incorrect format
    157  
    158   ++here;
    159   unsigned char c2 = 'a';
    160   if (here !=end) {
    161     c2 = *here;
    162     if (!(c2 == 'c'|| c2 == 'i' || c2 == 'u' || c2 == 's'))
    163       return 4; // incorrect format
    164   }
    165  
    166   if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
    167   if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
    168   if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
    169   if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
     149  /* [JFG - Mar 06: Accent folding patch] */
     150  /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
     151  while(here != end) {
     152    unsigned char ch = *here;
     153    if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
     154      return STEM_INVALID; // incorrect format
     155   
     156    switch(ch) {
     157    case CHAR_FLAG_STEM_CaseFold:       // ignore case (fold)
     158      stem |= STEM_CaseFolding;
     159      break;
     160    case CHAR_FLAG_STEM_NoCaseFold:     // case sensitive
     161      stem &= (~STEM_CaseFolding);
     162      break;
     163    case CHAR_FLAG_STEM_Stemming:       // stem words
     164      stem |= STEM_Stemming;
     165      break;
     166    case CHAR_FLAG_STEM_NoStemming:     // do not stem words
     167      stem &= (~STEM_Stemming);
     168      break;
     169#ifdef ENABLE_ACCENTFOLD
     170   case CHAR_FLAG_STEM_AccentFold:      // accent fold
     171      stem |= STEM_AccentFolding;
     172      break;
     173    case CHAR_FLAG_STEM_NoAccentFold:   // do no accent folding
     174      stem &= (~STEM_AccentFolding);
     175      break;
     176#endif
     177    };
     178   
     179    ++here;     
     180  }
    170181  return stem;
    171182}
     
    190201      if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
    191202    termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
    192     if (termNode.stemMethod == 4) { // error so backtrack
     203    /* [JFG - Mar 06: Accent folding patch] */
     204    /* use STEM_INVALID instead of hardcoded 4 */
     205    if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
    193206      here = oldHere;
    194207      termNode.stemMethod = (unsigned long)defaultStemMethod;
    195208    }
    196       }else here = oldHere; //ignore - wrong syntax
     209      } else here = oldHere; //ignore - wrong syntax
    197210
    198211    } else if (el.lexType == RangeE) {
     
    211224   
    212225    if (partial_match) {
    213       termNode.stemMethod |= 4; // set bit 2 to 1
    214       termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching.
     226      /* [JFG - Mar 06: Accent folding patch] */
     227      /* use STEM_PARTIAL_MATCH flag */
     228      termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
     229      termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
     230      termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
    215231    }
    216232    oldHere = here;
     
    510526QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
    511527               int defaultStemMethod, int maxnumeric) {
    512   if (4 < maxnumeric < 512) {
     528  if (4 < maxnumeric && maxnumeric < 512) {
    513529    MAXNUMERIC = maxnumeric;
    514530  }
  • trunk/mgpp/text/GSDLQueryParser.cpp

    r12321 r13653  
    141141
    142142static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
    143   // here expect el to contain some of c,s,i,u
    144   // stem method 0 = c,u  00
    145   // stem method 1 = i,u  01  - default for DL
    146   // stem method 2 = c, s  10
    147   // stem method 3 = i,s  11
    148 
     143  // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
    149144  unsigned long stem = (unsigned long)defaultStemMethod;
    150145     
     
    152147  UCArray::const_iterator end = el.text.end();
    153148
    154   unsigned char c1 = *here;
    155   if (!(c1 == 'c'|| c1 == 'i' || c1 == 'u' || c1 == 's'))
    156     return 4; // incorrect format
    157  
    158   ++here;
    159   unsigned char c2 = 'a';
    160   if (here !=end) {
    161     c2 = *here;
    162     if (!(c2 == 'c'|| c2 == 'i' || c2 == 'u' || c2 == 's'))
    163       return 4; // incorrect format
    164   }
    165  
    166   if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
    167   if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
    168   if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
    169   if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
     149  /* [JFG - Mar 06: Accent folding patch] */
     150  /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
     151  while(here != end) {
     152    unsigned char ch = *here;
     153    if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
     154      return STEM_INVALID; // incorrect format
     155   
     156    switch(ch) {
     157    case CHAR_FLAG_STEM_CaseFold:       // ignore case (fold)
     158      stem |= STEM_CaseFolding;
     159      break;
     160    case CHAR_FLAG_STEM_NoCaseFold:     // case sensitive
     161      stem &= (~STEM_CaseFolding);
     162      break;
     163    case CHAR_FLAG_STEM_Stemming:       // stem words
     164      stem |= STEM_Stemming;
     165      break;
     166    case CHAR_FLAG_STEM_NoStemming:     // do not stem words
     167      stem &= (~STEM_Stemming);
     168      break;
     169#ifdef ENABLE_ACCENTFOLD
     170   case CHAR_FLAG_STEM_AccentFold:      // accent fold
     171      stem |= STEM_AccentFolding;
     172      break;
     173    case CHAR_FLAG_STEM_NoAccentFold:   // do no accent folding
     174      stem &= (~STEM_AccentFolding);
     175      break;
     176#endif
     177    };
     178   
     179    ++here;     
     180  }
    170181  return stem;
    171182}
     
    190201      if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
    191202    termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
    192     if (termNode.stemMethod == 4) { // error so backtrack
     203    /* [JFG - Mar 06: Accent folding patch] */
     204    /* use STEM_INVALID instead of hardcoded 4 */
     205    if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
    193206      here = oldHere;
    194207      termNode.stemMethod = (unsigned long)defaultStemMethod;
    195208    }
    196       }else here = oldHere; //ignore - wrong syntax
     209      } else here = oldHere; //ignore - wrong syntax
    197210
    198211    } else if (el.lexType == RangeE) {
     
    211224   
    212225    if (partial_match) {
    213       termNode.stemMethod |= 4; // set bit 2 to 1
    214       termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching.
     226      /* [JFG - Mar 06: Accent folding patch] */
     227      /* use STEM_PARTIAL_MATCH flag */
     228      termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
     229      termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
     230      termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
    215231    }
    216232    oldHere = here;
     
    510526QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
    511527               int defaultStemMethod, int maxnumeric) {
    512   if (4 < maxnumeric < 512) {
     528  if (4 < maxnumeric && maxnumeric < 512) {
    513529    MAXNUMERIC = maxnumeric;
    514530  }
Note: See TracChangeset for help on using the changeset viewer.