Changeset 12879
- Timestamp: 2006-09-28T10:23:36+12:00 (18 years ago)
- Location: trunk/gsdl/src/mgpp/text
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/src/mgpp/text/stemmer.cpp
r9611 r12879 22 22 #include "sysfuncs.h" 23 23 #include "stemmer.h" 24 24 #include "mg_files.h" 25 25 #include "lovinstem.h" 26 26 #include "simplefrenchstem.h" 27 27 #include "unitool.h" 28 29 /* [JFG - Mar 06: Accent folding patch] */ 30 #include "unac.h" 28 31 29 32 #define LOVINSTEMMER 0 … … 34 37 * making sure the final length doesn't exceed the original 35 38 * length */ 36 static void unicode_casefold (u_char *word) {39 static void mgpp_unicode_casefold (u_char *word) { 37 40 unsigned short out[256]; /* temp space */ 38 41 int i; … … 52 55 } 53 56 57 /* [JFG - Mar 06: Accent folding patch] */ 58 /* ========================================================================= 59 * Function: unicode_accentfold 60 61 * Description: remove accents from characters 62 * Input: a word string with the length in the first byte 63 * Output: the unaccented word 64 * ========================================================================= */ 65 void mgpp_unicode_accentfold (unsigned char *word) { 66 size_t unac_size = 0; 67 char *unac = NULL; 54 68 55 int stemmernumber (u_char *stemmerdescription) { 69 70 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size); 71 strncpy((char*)word+1, unac, word[0]+1); 72 word[0] = unac_size; 73 74 free(unac); 75 return; 76 } 77 78 79 int mgpp_stemmernumber (u_char *stemmerdescription) { 56 80 u_char descript[MAX_STEM_DESCRIPTION_LEN]; 57 81 int i; … … 85 109 * Method 2 - Stem. 86 110 * Method 3 - Case fold and stem. 87 * 111 * Method 4 - Accent fold 112 * Method 5 - Accent fold and case fold 113 * Method 6 - Accent fold and stem 114 * Method 7 - Accent fold, stem and case fold 115 88 116 * The stemmer number should be obtained using 89 117 * the stemmernumber function above. 
90 118 */ 91 119 void 92 stemmer (int method, int stemmer, u_char *word) {93 if (method & 1) {94 unicode_casefold (word);120 mgpp_stemmer (int method, int stemmer, u_char *word) { 121 if (method & STEM_CaseFolding) { 122 mgpp_unicode_casefold (word); 95 123 } 96 124 97 if (method & 2) { 125 if (method & STEM_AccentFolding) { 126 mgpp_unicode_accentfold (word); 127 } 128 129 if (method & STEM_Stemming) { 98 130 switch (stemmer) { 99 131 case LOVINSTEMMER: lovinstem (word); -
trunk/gsdl/src/mgpp/text/stemmer.h
r2468 r12879 47 47 * stemmer description. 48 48 */ 49 int stemmernumber (u_char *stemmerdescription);49 int mgpp_stemmernumber (u_char *stemmerdescription); 50 50 51 51 /* … … 54 54 * Method 2 - Stem. 55 55 * Method 3 - Case fold and stem. 56 * 56 * Method 4 - Accent fold 57 * Method 5 - Case fold and accent fold 58 * Method 6 - Stem and accent fold 59 * Method 7 - Case fold, stem and accent fold 57 60 * The stemmer number should be obtained using function 58 61 * stemmernumber above. … … 61 64 extern "C" 62 65 #endif 63 void stemmer (int method, int stemmer, u_char * word);66 void mgpp_stemmer (int method, int stemmer, u_char * word); 64 67 65 68 #endif
Note: See TracChangeset for help on using the changeset viewer.