Ignore:
Timestamp:
2006-12-11T11:22:20+13:00 (17 years ago)
Author:
shaoqun
Message:

added code for accentfolding

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mgpp/text/stemmer.cpp

    r9613 r13477  
    2222#include "sysfuncs.h"
    2323#include "stemmer.h"
    24 
    2524#include "lovinstem.h"
    2625#include "simplefrenchstem.h"
    2726#include "unitool.h"
     27
     28#ifdef ENABLE_ACCENTFOLD
     29/* [JFG - Mar 06: Accent folding patch] */
     30#include "unac.h"
     31#endif
    2832
    2933#define LOVINSTEMMER        0
     
    3438 * making sure the final length doesn't exceed the original
    3539 * length */
    36 static void unicode_casefold (u_char *word) {
     40static void mgpp_unicode_casefold (u_char *word) {
    3741  unsigned short out[256]; /* temp space */
    3842  int i;
     
    5256}
    5357
     58#ifdef ENABLE_ACCENTFOLD
     59/* [JFG - Mar 06: Accent folding patch] */
     60/* =========================================================================
     61 * Function: mgpp_unicode_accentfold
     62 
     63 * Description: remove accents from characters
     64 * Input: a word string with the length in the first byte
     65 * Output: the unaccented word
     66 * ========================================================================= */
     67void mgpp_unicode_accentfold (unsigned char *word) {     
     68  size_t unac_size = 0;
     69  char *unac = NULL;
    5470
    55 int stemmernumber (u_char *stemmerdescription) {
     71
     72  unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
     73  strncpy((char*)word+1, unac, word[0]+1);
     74  word[0] = unac_size;
     75 
     76  free(unac);
     77  return;
     78}
     79#endif
     80     
     81int mgpp_stemmernumber (u_char *stemmerdescription) {
    5682  u_char descript[MAX_STEM_DESCRIPTION_LEN];
    5783  int i;
     
    85111 * Method 2 - Stem.
    86112 * Method 3 - Case fold and stem.
    87  *
     113 * Method 4 - Accent fold
     114 * Method 5 - Accent fold and case fold
     115 * Method 6 - Accent fold and stem
     116 * Method 7 - Accent fold, stem and case fold
     117
    88118 * The stemmer number should be obtained using
    89119 * the mgpp_stemmernumber function above.
    90120 */
    91121void
    92 stemmer (int method, int stemmer, u_char *word) {
    93   if (method & 1) {
    94     unicode_casefold (word);
     122mgpp_stemmer (int method, int stemmer, u_char *word) {
     123  if (method & STEM_CaseFolding) {
     124    mgpp_unicode_casefold (word);
    95125  }
    96126
    97   if (method & 2) {
     127#ifdef ENABLE_ACCENTFOLD
     128  if (method & STEM_AccentFolding) {
     129    mgpp_unicode_accentfold (word);
     130  }
     131#endif
     132
     133  if (method & STEM_Stemming) {
    98134    switch (stemmer) {
    99135    case LOVINSTEMMER: lovinstem (word);
Note: See TracChangeset for help on using the changeset viewer.