source: gsdl/trunk/trunk/mgpp/text/stemmer.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
RevLine 
[3365]1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23#include "stemmer.h"
24#include "lovinstem.h"
25#include "simplefrenchstem.h"
26#include "unitool.h"
27
[13477]28#ifdef ENABLE_ACCENTFOLD
29/* [JFG - Mar 06: Accent folding patch] */
30#include "unac.h"
31#endif
32
[3365]33#define LOVINSTEMMER 0
34#define SIMPLEFRENCHSTEMMER 1
35
36
37/* decode the utf-8 encoded unicode, casefold and then recode
38 * making sure the final length doesn't exceed the original
39 * length */
[13477]40static void mgpp_unicode_casefold (u_char *word) {
[3365]41 unsigned short out[256]; /* temp space */
42 int i;
43 int len;
44
45 /* decode */
46 utf8_word_to_unicode (word, out, 255);
47 len = out[0];
48
49 /* casefold and simplify-fold */
[9613]50 for (i=0; i<len; ++i) {
[3365]51 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
52 }
53
54 /* re-code */
55 unicode_to_utf8_word (out, word, word[0]+1);
56}
57
[13477]58#ifdef ENABLE_ACCENTFOLD
59/* [JFG - Mar 06: Accent folding patch] */
60/* =========================================================================
61 * Function: unicode_accentfold
62
63 * Description: remove accents from characters
64 * Input: a word string with the length in the first byte
65 * Output: the unaccented word
66 * ========================================================================= */
67void mgpp_unicode_accentfold (unsigned char *word) {
68 size_t unac_size = 0;
69 char *unac = NULL;
[3365]70
[13477]71
72 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
73 strncpy((char*)word+1, unac, word[0]+1);
74 word[0] = unac_size;
75
76 free(unac);
77 return;
78}
79#endif
80
81int mgpp_stemmernumber (u_char *stemmerdescription) {
[3365]82 u_char descript[MAX_STEM_DESCRIPTION_LEN];
83 int i;
84
85 /* copy and case-fold the description */
86 for (i=0; (stemmerdescription[i] != '\0') &&
[9613]87 (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
[3365]88 descript[i] = tolower (stemmerdescription[i]);
89 descript[i] = '\0';
90
91 /* map the description to its number */
92
93 if ((strcmp ((char *) descript, "0") == 0) ||
94 (strcmp ((char *) descript, "english") == 0) ||
95 (strcmp ((char *) descript, "lovin") == 0))
96 return LOVINSTEMMER;
97
98 if ((strcmp ((char *) descript, "1") == 1) ||
99 (strcmp ((char *) descript, "french") == 0) ||
100 (strcmp ((char *) descript, "simplefrench") == 0))
101 return SIMPLEFRENCHSTEMMER;
102
103 return -1;
104}
105
106
107
108/*
109 * Method 0 - Do not stem or case fold.
110 * Method 1 - Case fold.
111 * Method 2 - Stem.
112 * Method 3 - Case fold and stem.
[13477]113 * Method 4 - Accent fold
114 * Method 5 - Accent fold and case fold
115 * Method 6 - Accent fold and stem
116 * Method 7 - Accent fold, stem and case fold
117
[3365]118 * The stemmer number should be obtained using
119 * the stemmernumber function above.
120 */
121void
[13477]122mgpp_stemmer (int method, int stemmer, u_char *word) {
123 if (method & STEM_CaseFolding) {
124 mgpp_unicode_casefold (word);
[3365]125 }
126
[13477]127#ifdef ENABLE_ACCENTFOLD
128 if (method & STEM_AccentFolding) {
129 mgpp_unicode_accentfold (word);
130 }
131#endif
132
133 if (method & STEM_Stemming) {
[3365]134 switch (stemmer) {
135 case LOVINSTEMMER: lovinstem (word);
136 break;
137 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
138 break;
139 }
140 }
141}
Note: See TracBrowser for help on using the repository browser.