root/gsdl/trunk/trunk/mgpp/text/stemmer.cpp @ 16583

Revision 16583, 3.9 KB (checked in by davidb, 12 years ago)

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994  Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23#include "stemmer.h"
24#include "lovinstem.h"
25#include "simplefrenchstem.h"
26#include "unitool.h"
27
28#ifdef ENABLE_ACCENTFOLD
29/* [JFG - Mar 06: Accent folding patch] */
30#include "unac.h"
31#endif
32
33#define LOVINSTEMMER        0
34#define SIMPLEFRENCHSTEMMER 1
35
36
37/* decode the utf-8 encoded unicode, casefold and then recode
38 * making sure the final length doesn't exceed the original
39 * length */
40static void mgpp_unicode_casefold (u_char *word) {
41  unsigned short out[256]; /* temp space */
42  int i;
43  int len;
44
45  /* decode */
46  utf8_word_to_unicode (word, out, 255);
47  len = out[0];
48
49  /* casefold and simplify-fold */
50  for (i=0; i<len; ++i) {
51    out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
52  }
53
54  /* re-code */
55  unicode_to_utf8_word (out, word, word[0]+1);
56}
57
58#ifdef ENABLE_ACCENTFOLD
59/* [JFG - Mar 06: Accent folding patch] */
60/* =========================================================================
61 * Function: unicode_accentfold
62 
63 * Description: remove accents from characters
64 * Input: a word string with the length in the first byte
65 * Output: the unaccented word
66 * ========================================================================= */
67void mgpp_unicode_accentfold (unsigned char *word) {     
68  size_t unac_size = 0;
69  char *unac = NULL;
70
71
72  unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
73  strncpy((char*)word+1, unac, word[0]+1);
74  word[0] = unac_size;
75 
76  free(unac);
77  return;
78}
79#endif
80     
81int mgpp_stemmernumber (u_char *stemmerdescription) {
82  u_char descript[MAX_STEM_DESCRIPTION_LEN];
83  int i;
84
85  /* copy and case-fold the description */
86  for (i=0; (stemmerdescription[i] != '\0') &&
87     (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
88    descript[i] = tolower (stemmerdescription[i]);
89  descript[i] = '\0';
90
91  /* map the description to its number */
92
93  if ((strcmp ((char *) descript,  "0") == 0) ||
94      (strcmp ((char *) descript, "english") == 0) ||
95      (strcmp ((char *) descript, "lovin") == 0))
96    return LOVINSTEMMER;
97
98  if ((strcmp ((char *) descript, "1") == 1) ||
99      (strcmp ((char *) descript, "french") == 0) ||
100      (strcmp ((char *) descript, "simplefrench") == 0))
101    return SIMPLEFRENCHSTEMMER;
102
103  return -1;
104}
105
106
107
108/*
109 * Method 0 - Do not stem or case fold.
110 * Method 1 - Case fold.
111 * Method 2 - Stem.
112 * Method 3 - Case fold and stem.
113 * Method 4 - Accent fold
114 * Method 5 - Accent fold and case fold
115 * Method 6 - Accent fold and stem
116 * Method 7 - Accent fold, stem and case fold
117
118 * The stemmer number should be obtained using
119 * the stemmernumber function above.
120 */
121void
122mgpp_stemmer (int method, int stemmer, u_char *word) {
123  if (method & STEM_CaseFolding) {
124    mgpp_unicode_casefold (word);
125  }
126
127#ifdef ENABLE_ACCENTFOLD
128  if (method & STEM_AccentFolding) {
129    mgpp_unicode_accentfold (word);
130  }
131#endif
132
133  if (method & STEM_Stemming) {
134    switch (stemmer) {
135    case LOVINSTEMMER: lovinstem (word);
136      break;
137    case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
138      break;
139    }
140  }
141}
Note: See TracBrowser for help on using the browser.