Changeset 13477 for trunk/indexers/mgpp/text/stemmer.cpp
- Timestamp: 2006-12-11T11:22:20+13:00 (17 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/indexers/mgpp/text/stemmer.cpp
r9613 r13477 22 22 #include "sysfuncs.h" 23 23 #include "stemmer.h" 24 25 24 #include "lovinstem.h" 26 25 #include "simplefrenchstem.h" 27 26 #include "unitool.h" 27 28 #ifdef ENABLE_ACCENTFOLD 29 /* [JFG - Mar 06: Accent folding patch] */ 30 #include "unac.h" 31 #endif 28 32 29 33 #define LOVINSTEMMER 0 … … 34 38 * making sure the final length doesn't exceed the original 35 39 * length */ 36 static void unicode_casefold (u_char *word) {40 static void mgpp_unicode_casefold (u_char *word) { 37 41 unsigned short out[256]; /* temp space */ 38 42 int i; … … 52 56 } 53 57 58 #ifdef ENABLE_ACCENTFOLD 59 /* [JFG - Mar 06: Accent folding patch] */ 60 /* ========================================================================= 61 * Function: unicode_accentfold 62 63 * Description: remove accents from characters 64 * Input: a word string with the length in the first byte 65 * Output: the unaccented word 66 * ========================================================================= */ 67 void mgpp_unicode_accentfold (unsigned char *word) { 68 size_t unac_size = 0; 69 char *unac = NULL; 54 70 55 int stemmernumber (u_char *stemmerdescription) { 71 72 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size); 73 strncpy((char*)word+1, unac, word[0]+1); 74 word[0] = unac_size; 75 76 free(unac); 77 return; 78 } 79 #endif 80 81 int mgpp_stemmernumber (u_char *stemmerdescription) { 56 82 u_char descript[MAX_STEM_DESCRIPTION_LEN]; 57 83 int i; … … 85 111 * Method 2 - Stem. 86 112 * Method 3 - Case fold and stem. 87 * 113 * Method 4 - Accent fold 114 * Method 5 - Accent fold and case fold 115 * Method 6 - Accent fold and stem 116 * Method 7 - Accent fold, stem and case fold 117 88 118 * The stemmer number should be obtained using 89 119 * the stemmernumber function above. 
90 120 */ 91 121 void 92 stemmer (int method, int stemmer, u_char *word) {93 if (method & 1) {94 unicode_casefold (word);122 mgpp_stemmer (int method, int stemmer, u_char *word) { 123 if (method & STEM_CaseFolding) { 124 mgpp_unicode_casefold (word); 95 125 } 96 126 97 if (method & 2) { 127 #ifdef ENABLE_ACCENTFOLD 128 if (method & STEM_AccentFolding) { 129 mgpp_unicode_accentfold (word); 130 } 131 #endif 132 133 if (method & STEM_Stemming) { 98 134 switch (stemmer) { 99 135 case LOVINSTEMMER: lovinstem (word);
Note: See TracChangeset for help on using the changeset viewer.