source: trunk/gsdl/src/mgpp/text/stemmer.cpp@ 12915

Last change on this file since 12915 was 12915, checked in by kjdon, 18 years ago

the compilation of the accent folding stuff (which needs unac package and iconv library) is now under the control of ENABLE_ACCENTFOLD, which gets defined in the CFLAGS if we want accent folding. turn it off in the mgpp/src/text/Makefile

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23#include "stemmer.h"
24#include "mg_files.h"
25#include "lovinstem.h"
26#include "simplefrenchstem.h"
27#include "unitool.h"
28
29#ifdef ENABLE_ACCENTFOLD
30/* [JFG - Mar 06: Accent folding patch] */
31#include "unac.h"
32#endif
33
34#define LOVINSTEMMER 0
35#define SIMPLEFRENCHSTEMMER 1
36
37
38/* decode the utf-8 encoded unicode, casefold and then recode
39 * making sure the final length doesn't exceed the original
40 * length */
41static void mgpp_unicode_casefold (u_char *word) {
42 unsigned short out[256]; /* temp space */
43 int i;
44 int len;
45
46 /* decode */
47 utf8_word_to_unicode (word, out, 255);
48 len = out[0];
49
50 /* casefold and simplify-fold */
51 for (i=0; i<len; ++i) {
52 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
53 }
54
55 /* re-code */
56 unicode_to_utf8_word (out, word, word[0]+1);
57}
58
59#ifdef ENABLE_ACCENTFOLD
60/* [JFG - Mar 06: Accent folding patch] */
61/* =========================================================================
62 * Function: unicode_accentfold
63
64 * Description: remove accents from characters
65 * Input: a word string with the length in the first byte
66 * Output: the unaccented word
67 * ========================================================================= */
68void mgpp_unicode_accentfold (unsigned char *word) {
69 size_t unac_size = 0;
70 char *unac = NULL;
71
72
73 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
74 strncpy((char*)word+1, unac, word[0]+1);
75 word[0] = unac_size;
76
77 free(unac);
78 return;
79}
80#endif
81
82int mgpp_stemmernumber (u_char *stemmerdescription) {
83 u_char descript[MAX_STEM_DESCRIPTION_LEN];
84 int i;
85
86 /* copy and case-fold the description */
87 for (i=0; (stemmerdescription[i] != '\0') &&
88 (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
89 descript[i] = tolower (stemmerdescription[i]);
90 descript[i] = '\0';
91
92 /* map the description to its number */
93
94 if ((strcmp ((char *) descript, "0") == 0) ||
95 (strcmp ((char *) descript, "english") == 0) ||
96 (strcmp ((char *) descript, "lovin") == 0))
97 return LOVINSTEMMER;
98
99 if ((strcmp ((char *) descript, "1") == 1) ||
100 (strcmp ((char *) descript, "french") == 0) ||
101 (strcmp ((char *) descript, "simplefrench") == 0))
102 return SIMPLEFRENCHSTEMMER;
103
104 return -1;
105}
106
107
108
109/*
110 * Method 0 - Do not stem or case fold.
111 * Method 1 - Case fold.
112 * Method 2 - Stem.
113 * Method 3 - Case fold and stem.
114 * Method 4 - Accent fold
115 * Method 5 - Accent fold and case fold
116 * Method 6 - Accent fold and stem
117 * Method 7 - Accent fold, stem and case fold
118
119 * The stemmer number should be obtained using
120 * the stemmernumber function above.
121 */
122void
123mgpp_stemmer (int method, int stemmer, u_char *word) {
124 if (method & STEM_CaseFolding) {
125 mgpp_unicode_casefold (word);
126 }
127
128#ifdef ENABLE_ACCENTFOLD
129 if (method & STEM_AccentFolding) {
130 mgpp_unicode_accentfold (word);
131 }
132#endif
133
134 if (method & STEM_Stemming) {
135 switch (stemmer) {
136 case LOVINSTEMMER: lovinstem (word);
137 break;
138 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
139 break;
140 }
141 }
142}
Note: See TracBrowser for help on using the repository browser.