source: trunk/gsdl/src/mgpp/text/stemmer.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.cpp -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stemmer.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "stemmer.h"
26
27#include "lovinstem.h"
28#include "simplefrenchstem.h"
29#include "unitool.h"
30
31/*
32 $Log$
33 Revision 1.1 2000/01/14 02:26:22 sjboddie
34 Rodgers new C++ mg
35
36 Revision 1.1 1999/10/11 02:58:34 cs025
37 Base install of MG-PP
38
39 Revision 1.1 1999/08/10 21:18:23 sjboddie
40 renamed mg-1.3d directory mg
41
42 Revision 1.3 1998/12/17 09:12:54 rjmcnab
43
44 Altered mg to process utf-8 encoded Unicode. The main changes
45 are in the parsing of the input, the casefolding, and the stemming.
46
47 Revision 1.2 1998/11/25 07:55:51 rjmcnab
48
49 Modified mg so that you can specify the stemmer you want
50 to use via a command line option. You specify it to
51 mg_passes during the build process. The number of the
52 stemmer that you used is stored within the inverted
53 dictionary header and the stemmed dictionary header so
54 the correct stemmer is used in later stages of building
55 and querying.
56
57 Revision 1.1 1998/11/17 09:35:42 rjmcnab
58 *** empty log message ***
59
60 * Revision 1.3 1994/10/20 03:57:05 tes
61 * I have rewritten the boolean query optimiser and abstracted out the
62 * components of the boolean query.
63 *
64 * Revision 1.2 1994/09/20 04:42:10 tes
65 * For version 1.1
66 *
67 */
68
69
/* Stemmer numbers: the values returned by stemmernumber() and accepted
 * by stemmer() to choose which stemming routine is run. */
enum {
  LOVINSTEMMER = 0,        /* dispatches to lovinstem() */
  SIMPLEFRENCHSTEMMER = 1  /* dispatches to simplefrenchstem() */
};
72
73
74/* decode the utf-8 encoded unicode, casefold and then recode
75 * making sure the final length doesn't exceed the original
76 * length */
77static void unicode_casefold (u_char *word) {
78 unsigned short out[256]; /* temp space */
79 int i;
80 int len;
81
82 /* decode */
83 utf8_word_to_unicode (word, out, 255);
84 len = out[0];
85
86 /* casefold and simplify-fold */
87 for (i=0; i<len; i++) {
88 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
89 }
90
91 /* re-code */
92 unicode_to_utf8_word (out, word, word[0]+1);
93}
94
95
96int stemmernumber (u_char *stemmerdescription) {
97 u_char descript[MAX_STEM_DESCRIPTION_LEN];
98 int i;
99
100 /* copy and case-fold the description */
101 for (i=0; (stemmerdescription[i] != '\0') &&
102 (i < MAX_STEM_DESCRIPTION_LEN-1); i++)
103 descript[i] = tolower (stemmerdescription[i]);
104 descript[i] = '\0';
105
106 /* map the description to its number */
107
108 if ((strcmp ((char *) descript, "0") == 0) ||
109 (strcmp ((char *) descript, "english") == 0) ||
110 (strcmp ((char *) descript, "lovin") == 0))
111 return LOVINSTEMMER;
112
113 if ((strcmp ((char *) descript, "1") == 1) ||
114 (strcmp ((char *) descript, "french") == 0) ||
115 (strcmp ((char *) descript, "simplefrench") == 0))
116 return SIMPLEFRENCHSTEMMER;
117
118 return -1;
119}
120
121
122
123/*
124 * Method 0 - Do not stem or case fold.
125 * Method 1 - Case fold.
126 * Method 2 - Stem.
127 * Method 3 - Case fold and stem.
128 *
129 * The stemmer number should be obtained using
130 * the stemmernumber function above.
131 */
132void
133stemmer (int method, int stemmer, u_char *word) {
134 if (method & 1) {
135 unicode_casefold (word);
136 }
137
138 if (method & 2) {
139 switch (stemmer) {
140 case LOVINSTEMMER: lovinstem (word);
141 break;
142 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
143 break;
144 }
145 }
146}
Note: See TracBrowser for help on using the repository browser.