source: trunk/mgpp/text/stemmer.cpp@ 9613

Last change on this file since 9613 was 9613, checked in by kjdon, 19 years ago

added in x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 2.8 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23#include "stemmer.h"
24
25#include "lovinstem.h"
26#include "simplefrenchstem.h"
27#include "unitool.h"
28
/* Stemmer numbers: the values returned by stemmernumber() and
 * dispatched on by stemmer() in this file.
 *   LOVINSTEMMER        - selects lovinstem()
 *   SIMPLEFRENCHSTEMMER - selects simplefrenchstem()
 */
29#define LOVINSTEMMER 0
30#define SIMPLEFRENCHSTEMMER 1
31
32
33/* decode the utf-8 encoded unicode, casefold and then recode
34 * making sure the final length doesn't exceed the original
35 * length */
36static void unicode_casefold (u_char *word) {
37 unsigned short out[256]; /* temp space */
38 int i;
39 int len;
40
41 /* decode */
42 utf8_word_to_unicode (word, out, 255);
43 len = out[0];
44
45 /* casefold and simplify-fold */
46 for (i=0; i<len; ++i) {
47 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
48 }
49
50 /* re-code */
51 unicode_to_utf8_word (out, word, word[0]+1);
52}
53
54
55int stemmernumber (u_char *stemmerdescription) {
56 u_char descript[MAX_STEM_DESCRIPTION_LEN];
57 int i;
58
59 /* copy and case-fold the description */
60 for (i=0; (stemmerdescription[i] != '\0') &&
61 (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
62 descript[i] = tolower (stemmerdescription[i]);
63 descript[i] = '\0';
64
65 /* map the description to its number */
66
67 if ((strcmp ((char *) descript, "0") == 0) ||
68 (strcmp ((char *) descript, "english") == 0) ||
69 (strcmp ((char *) descript, "lovin") == 0))
70 return LOVINSTEMMER;
71
72 if ((strcmp ((char *) descript, "1") == 1) ||
73 (strcmp ((char *) descript, "french") == 0) ||
74 (strcmp ((char *) descript, "simplefrench") == 0))
75 return SIMPLEFRENCHSTEMMER;
76
77 return -1;
78}
79
80
81
82/*
83 * Method 0 - Do not stem or case fold.
84 * Method 1 - Case fold.
85 * Method 2 - Stem.
86 * Method 3 - Case fold and stem.
87 *
88 * The stemmer number should be obtained using
89 * the stemmernumber function above.
90 */
91void
92stemmer (int method, int stemmer, u_char *word) {
93 if (method & 1) {
94 unicode_casefold (word);
95 }
96
97 if (method & 2) {
98 switch (stemmer) {
99 case LOVINSTEMMER: lovinstem (word);
100 break;
101 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
102 break;
103 }
104 }
105}
Note: See TracBrowser for help on using the repository browser.