source: main/trunk/greenstone2/common-src/indexers/mgpp/text/stemmer.cpp@ 26162

Last change on this file since 26162 was 26162, checked in by kjdon, 12 years ago

In unicode_to_utf8_word, max_output_length is the maximum length of the string itself, not counting the leading length byte, so one should not be added to it.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.0 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23#include "stemmer.h"
24#include "lovinstem.h"
25#include "simplefrenchstem.h"
26#include "unitool.h"
27
28#ifdef ENABLE_ACCENTFOLD
29/* [JFG - Mar 06: Accent folding patch] */
30#include "unac.h"
31#endif
32
33#define LOVINSTEMMER 0
34#define SIMPLEFRENCHSTEMMER 1
35
36
37/* decode the utf-8 encoded unicode, casefold and then recode
38 * making sure the final length doesn't exceed the original
39 * length */
40static void mgpp_unicode_casefold (u_char *word) {
41 unsigned short out[256]; /* temp space */
42 int i;
43 int len;
44
45 /* decode */
46 utf8_word_to_unicode (word, out, 255);
47 len = out[0];
48
49 /* casefold and simplify-fold */
50 for (i=0; i<len; ++i) {
51 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
52 }
53
54 /* re-code */
55 //unicode_to_utf8_word (out, word, word[0]+1);
56 // The max_output_length (3rd param) for unicode_to_utf8 is the max length of
57 // the string, not including the length
58 unicode_to_utf8_word (out, word, word[0]);
59}
60
61#ifdef ENABLE_ACCENTFOLD
62/* [JFG - Mar 06: Accent folding patch] */
63/* =========================================================================
64 * Function: unicode_accentfold
65
66 * Description: remove accents from characters
67 * Input: a word string with the length in the first byte
68 * Output: the unaccented word
69 * ========================================================================= */
70void mgpp_unicode_accentfold (unsigned char *word) {
71 size_t unac_size = 0;
72 char *unac = NULL;
73
74
75 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
76 strncpy((char*)word+1, unac, word[0]+1);
77 word[0] = unac_size;
78
79 free(unac);
80 return;
81}
82#endif
83
84int mgpp_stemmernumber (u_char *stemmerdescription) {
85 u_char descript[MAX_STEM_DESCRIPTION_LEN];
86 int i;
87
88 /* copy and case-fold the description */
89 for (i=0; (stemmerdescription[i] != '\0') &&
90 (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
91 descript[i] = tolower (stemmerdescription[i]);
92 descript[i] = '\0';
93
94 /* map the description to its number */
95
96 if ((strcmp ((char *) descript, "0") == 0) ||
97 (strcmp ((char *) descript, "english") == 0) ||
98 (strcmp ((char *) descript, "lovin") == 0))
99 return LOVINSTEMMER;
100
101 if ((strcmp ((char *) descript, "1") == 1) ||
102 (strcmp ((char *) descript, "french") == 0) ||
103 (strcmp ((char *) descript, "simplefrench") == 0))
104 return SIMPLEFRENCHSTEMMER;
105
106 return -1;
107}
108
109
110
111/*
112 * Method 0 - Do not stem or case fold.
113 * Method 1 - Case fold.
114 * Method 2 - Stem.
115 * Method 3 - Case fold and stem.
116 * Method 4 - Accent fold
117 * Method 5 - Accent fold and case fold
118 * Method 6 - Accent fold and stem
119 * Method 7 - Accent fold, stem and case fold
120
121 * The stemmer number should be obtained using
122 * the stemmernumber function above.
123 */
124void
125mgpp_stemmer (int method, int stemmer, u_char *word) {
126 if (method & STEM_CaseFolding) {
127 mgpp_unicode_casefold (word);
128 }
129
130#ifdef ENABLE_ACCENTFOLD
131 if (method & STEM_AccentFolding) {
132 mgpp_unicode_accentfold (word);
133 }
134#endif
135
136 if (method & STEM_Stemming) {
137 switch (stemmer) {
138 case LOVINSTEMMER: lovinstem (word);
139 break;
140 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
141 break;
142 }
143 }
144}
Note: See TracBrowser for help on using the repository browser.