1 | /*
|
---|
2 | * LovinsStemmer.java
|
---|
3 | * Copyright (C) 2001 Eibe Frank
|
---|
4 | *
|
---|
5 | * This program is free software; you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation; either version 2 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program; if not, write to the Free Software
|
---|
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
18 | */
|
---|
19 |
|
---|
20 | import java.util.*;
|
---|
21 | import java.io.*;
|
---|
22 |
|
---|
23 | /**
|
---|
24 | * Implements the Lovins stemmer.
|
---|
25 | *
|
---|
26 | * @author Eibe Frank ([email protected])
|
---|
27 | * @version 1.0
|
---|
28 | */
|
---|
29 | public class LovinsStemmer extends Stemmer implements Serializable {
|
---|
30 |
|
---|
31 | /** C version compatibility mode (emulates bugs
|
---|
32 | in original C implementation) */
|
---|
33 | private static boolean m_CompMode = true;
|
---|
34 |
|
---|
35 | /** The hash tables containing the list of endings. */
|
---|
36 | private static HashMap m_l11 = null;
|
---|
37 | private static HashMap m_l10 = null;
|
---|
38 | private static HashMap m_l9 = null;
|
---|
39 | private static HashMap m_l8 = null;
|
---|
40 | private static HashMap m_l7 = null;
|
---|
41 | private static HashMap m_l6 = null;
|
---|
42 | private static HashMap m_l5 = null;
|
---|
43 | private static HashMap m_l4 = null;
|
---|
44 | private static HashMap m_l3 = null;
|
---|
45 | private static HashMap m_l2 = null;
|
---|
46 | private static HashMap m_l1 = null;
|
---|
47 |
|
---|
48 | static {
|
---|
49 |
|
---|
50 | m_l11 = new HashMap();
|
---|
51 | m_l11.put("alistically", "B");
|
---|
52 | m_l11.put("arizability", "A");
|
---|
53 | m_l11.put("izationally", "B");
|
---|
54 | m_l10 = new HashMap();
|
---|
55 | m_l10.put("antialness", "A");
|
---|
56 | m_l10.put("arisations", "A");
|
---|
57 | m_l10.put("arizations", "A");
|
---|
58 | m_l10.put("entialness", "A");
|
---|
59 | m_l9 = new HashMap();
|
---|
60 | m_l9.put("allically", "C");
|
---|
61 | m_l9.put("antaneous", "A");
|
---|
62 | m_l9.put("antiality", "A");
|
---|
63 | m_l9.put("arisation", "A");
|
---|
64 | m_l9.put("arization", "A");
|
---|
65 | m_l9.put("ationally", "B");
|
---|
66 | m_l9.put("ativeness", "A");
|
---|
67 | m_l9.put("eableness", "E");
|
---|
68 | m_l9.put("entations", "A");
|
---|
69 | m_l9.put("entiality", "A");
|
---|
70 | m_l9.put("entialize", "A");
|
---|
71 | m_l9.put("entiation", "A");
|
---|
72 | m_l9.put("ionalness", "A");
|
---|
73 | m_l9.put("istically", "A");
|
---|
74 | m_l9.put("itousness", "A");
|
---|
75 | m_l9.put("izability", "A");
|
---|
76 | m_l9.put("izational", "A");
|
---|
77 | m_l8 = new HashMap();
|
---|
78 | m_l8.put("ableness", "A");
|
---|
79 | m_l8.put("arizable", "A");
|
---|
80 | m_l8.put("entation", "A");
|
---|
81 | m_l8.put("entially", "A");
|
---|
82 | m_l8.put("eousness", "A");
|
---|
83 | m_l8.put("ibleness", "A");
|
---|
84 | m_l8.put("icalness", "A");
|
---|
85 | m_l8.put("ionalism", "A");
|
---|
86 | m_l8.put("ionality", "A");
|
---|
87 | m_l8.put("ionalize", "A");
|
---|
88 | m_l8.put("iousness", "A");
|
---|
89 | m_l8.put("izations", "A");
|
---|
90 | m_l8.put("lessness", "A");
|
---|
91 | m_l7 = new HashMap();
|
---|
92 | m_l7.put("ability", "A");
|
---|
93 | m_l7.put("aically", "A");
|
---|
94 | m_l7.put("alistic", "B");
|
---|
95 | m_l7.put("alities", "A");
|
---|
96 | m_l7.put("ariness", "E");
|
---|
97 | m_l7.put("aristic", "A");
|
---|
98 | m_l7.put("arizing", "A");
|
---|
99 | m_l7.put("ateness", "A");
|
---|
100 | m_l7.put("atingly", "A");
|
---|
101 | m_l7.put("ational", "B");
|
---|
102 | m_l7.put("atively", "A");
|
---|
103 | m_l7.put("ativism", "A");
|
---|
104 | m_l7.put("elihood", "E");
|
---|
105 | m_l7.put("encible", "A");
|
---|
106 | m_l7.put("entally", "A");
|
---|
107 | m_l7.put("entials", "A");
|
---|
108 | m_l7.put("entiate", "A");
|
---|
109 | m_l7.put("entness", "A");
|
---|
110 | m_l7.put("fulness", "A");
|
---|
111 | m_l7.put("ibility", "A");
|
---|
112 | m_l7.put("icalism", "A");
|
---|
113 | m_l7.put("icalist", "A");
|
---|
114 | m_l7.put("icality", "A");
|
---|
115 | m_l7.put("icalize", "A");
|
---|
116 | m_l7.put("ication", "G");
|
---|
117 | m_l7.put("icianry", "A");
|
---|
118 | m_l7.put("ination", "A");
|
---|
119 | m_l7.put("ingness", "A");
|
---|
120 | m_l7.put("ionally", "A");
|
---|
121 | m_l7.put("isation", "A");
|
---|
122 | m_l7.put("ishness", "A");
|
---|
123 | m_l7.put("istical", "A");
|
---|
124 | m_l7.put("iteness", "A");
|
---|
125 | m_l7.put("iveness", "A");
|
---|
126 | m_l7.put("ivistic", "A");
|
---|
127 | m_l7.put("ivities", "A");
|
---|
128 | m_l7.put("ization", "F");
|
---|
129 | m_l7.put("izement", "A");
|
---|
130 | m_l7.put("oidally", "A");
|
---|
131 | m_l7.put("ousness", "A");
|
---|
132 | m_l6 = new HashMap();
|
---|
133 | m_l6.put("aceous", "A");
|
---|
134 | m_l6.put("acious", "B");
|
---|
135 | m_l6.put("action", "G");
|
---|
136 | m_l6.put("alness", "A");
|
---|
137 | m_l6.put("ancial", "A");
|
---|
138 | m_l6.put("ancies", "A");
|
---|
139 | m_l6.put("ancing", "B");
|
---|
140 | m_l6.put("ariser", "A");
|
---|
141 | m_l6.put("arized", "A");
|
---|
142 | m_l6.put("arizer", "A");
|
---|
143 | m_l6.put("atable", "A");
|
---|
144 | m_l6.put("ations", "B");
|
---|
145 | m_l6.put("atives", "A");
|
---|
146 | m_l6.put("eature", "Z");
|
---|
147 | m_l6.put("efully", "A");
|
---|
148 | m_l6.put("encies", "A");
|
---|
149 | m_l6.put("encing", "A");
|
---|
150 | m_l6.put("ential", "A");
|
---|
151 | m_l6.put("enting", "C");
|
---|
152 | m_l6.put("entist", "A");
|
---|
153 | m_l6.put("eously", "A");
|
---|
154 | m_l6.put("ialist", "A");
|
---|
155 | m_l6.put("iality", "A");
|
---|
156 | m_l6.put("ialize", "A");
|
---|
157 | m_l6.put("ically", "A");
|
---|
158 | m_l6.put("icance", "A");
|
---|
159 | m_l6.put("icians", "A");
|
---|
160 | m_l6.put("icists", "A");
|
---|
161 | m_l6.put("ifully", "A");
|
---|
162 | m_l6.put("ionals", "A");
|
---|
163 | m_l6.put("ionate", "D");
|
---|
164 | m_l6.put("ioning", "A");
|
---|
165 | m_l6.put("ionist", "A");
|
---|
166 | m_l6.put("iously", "A");
|
---|
167 | m_l6.put("istics", "A");
|
---|
168 | m_l6.put("izable", "E");
|
---|
169 | m_l6.put("lessly", "A");
|
---|
170 | m_l6.put("nesses", "A");
|
---|
171 | m_l6.put("oidism", "A");
|
---|
172 | m_l5 = new HashMap();
|
---|
173 | m_l5.put("acies", "A");
|
---|
174 | m_l5.put("acity", "A");
|
---|
175 | m_l5.put("aging", "B");
|
---|
176 | m_l5.put("aical", "A");
|
---|
177 | if (!m_CompMode) {
|
---|
178 | m_l5.put("alist", "A");
|
---|
179 | }
|
---|
180 | m_l5.put("alism", "B");
|
---|
181 | m_l5.put("ality", "A");
|
---|
182 | m_l5.put("alize", "A");
|
---|
183 | m_l5.put("allic", "b");
|
---|
184 | m_l5.put("anced", "B");
|
---|
185 | m_l5.put("ances", "B");
|
---|
186 | m_l5.put("antic", "C");
|
---|
187 | m_l5.put("arial", "A");
|
---|
188 | m_l5.put("aries", "A");
|
---|
189 | m_l5.put("arily", "A");
|
---|
190 | m_l5.put("arity", "B");
|
---|
191 | m_l5.put("arize", "A");
|
---|
192 | m_l5.put("aroid", "A");
|
---|
193 | m_l5.put("ately", "A");
|
---|
194 | m_l5.put("ating", "I");
|
---|
195 | m_l5.put("ation", "B");
|
---|
196 | m_l5.put("ative", "A");
|
---|
197 | m_l5.put("ators", "A");
|
---|
198 | m_l5.put("atory", "A");
|
---|
199 | m_l5.put("ature", "E");
|
---|
200 | m_l5.put("early", "Y");
|
---|
201 | m_l5.put("ehood", "A");
|
---|
202 | m_l5.put("eless", "A");
|
---|
203 | if (!m_CompMode) {
|
---|
204 | m_l5.put("elily", "A");
|
---|
205 | } else {
|
---|
206 | m_l5.put("elity", "A");
|
---|
207 | }
|
---|
208 | m_l5.put("ement", "A");
|
---|
209 | m_l5.put("enced", "A");
|
---|
210 | m_l5.put("ences", "A");
|
---|
211 | m_l5.put("eness", "E");
|
---|
212 | m_l5.put("ening", "E");
|
---|
213 | m_l5.put("ental", "A");
|
---|
214 | m_l5.put("ented", "C");
|
---|
215 | m_l5.put("ently", "A");
|
---|
216 | m_l5.put("fully", "A");
|
---|
217 | m_l5.put("ially", "A");
|
---|
218 | m_l5.put("icant", "A");
|
---|
219 | m_l5.put("ician", "A");
|
---|
220 | m_l5.put("icide", "A");
|
---|
221 | m_l5.put("icism", "A");
|
---|
222 | m_l5.put("icist", "A");
|
---|
223 | m_l5.put("icity", "A");
|
---|
224 | m_l5.put("idine", "I");
|
---|
225 | m_l5.put("iedly", "A");
|
---|
226 | m_l5.put("ihood", "A");
|
---|
227 | m_l5.put("inate", "A");
|
---|
228 | m_l5.put("iness", "A");
|
---|
229 | m_l5.put("ingly", "B");
|
---|
230 | m_l5.put("inism", "J");
|
---|
231 | m_l5.put("inity", "c");
|
---|
232 | m_l5.put("ional", "A");
|
---|
233 | m_l5.put("ioned", "A");
|
---|
234 | m_l5.put("ished", "A");
|
---|
235 | m_l5.put("istic", "A");
|
---|
236 | m_l5.put("ities", "A");
|
---|
237 | m_l5.put("itous", "A");
|
---|
238 | m_l5.put("ively", "A");
|
---|
239 | m_l5.put("ivity", "A");
|
---|
240 | m_l5.put("izers", "F");
|
---|
241 | m_l5.put("izing", "F");
|
---|
242 | m_l5.put("oidal", "A");
|
---|
243 | m_l5.put("oides", "A");
|
---|
244 | m_l5.put("otide", "A");
|
---|
245 | m_l5.put("ously", "A");
|
---|
246 | m_l4 = new HashMap();
|
---|
247 | m_l4.put("able", "A");
|
---|
248 | m_l4.put("ably", "A");
|
---|
249 | m_l4.put("ages", "B");
|
---|
250 | m_l4.put("ally", "B");
|
---|
251 | m_l4.put("ance", "B");
|
---|
252 | m_l4.put("ancy", "B");
|
---|
253 | m_l4.put("ants", "B");
|
---|
254 | m_l4.put("aric", "A");
|
---|
255 | m_l4.put("arly", "K");
|
---|
256 | m_l4.put("ated", "I");
|
---|
257 | m_l4.put("ates", "A");
|
---|
258 | m_l4.put("atic", "B");
|
---|
259 | m_l4.put("ator", "A");
|
---|
260 | m_l4.put("ealy", "Y");
|
---|
261 | m_l4.put("edly", "E");
|
---|
262 | m_l4.put("eful", "A");
|
---|
263 | m_l4.put("eity", "A");
|
---|
264 | m_l4.put("ence", "A");
|
---|
265 | m_l4.put("ency", "A");
|
---|
266 | m_l4.put("ened", "E");
|
---|
267 | m_l4.put("enly", "E");
|
---|
268 | m_l4.put("eous", "A");
|
---|
269 | m_l4.put("hood", "A");
|
---|
270 | m_l4.put("ials", "A");
|
---|
271 | m_l4.put("ians", "A");
|
---|
272 | m_l4.put("ible", "A");
|
---|
273 | m_l4.put("ibly", "A");
|
---|
274 | m_l4.put("ical", "A");
|
---|
275 | m_l4.put("ides", "L");
|
---|
276 | m_l4.put("iers", "A");
|
---|
277 | m_l4.put("iful", "A");
|
---|
278 | m_l4.put("ines", "M");
|
---|
279 | m_l4.put("ings", "N");
|
---|
280 | m_l4.put("ions", "B");
|
---|
281 | m_l4.put("ious", "A");
|
---|
282 | m_l4.put("isms", "B");
|
---|
283 | m_l4.put("ists", "A");
|
---|
284 | m_l4.put("itic", "H");
|
---|
285 | m_l4.put("ized", "F");
|
---|
286 | m_l4.put("izer", "F");
|
---|
287 | m_l4.put("less", "A");
|
---|
288 | m_l4.put("lily", "A");
|
---|
289 | m_l4.put("ness", "A");
|
---|
290 | m_l4.put("ogen", "A");
|
---|
291 | m_l4.put("ward", "A");
|
---|
292 | m_l4.put("wise", "A");
|
---|
293 | m_l4.put("ying", "B");
|
---|
294 | m_l4.put("yish", "A");
|
---|
295 | m_l3 = new HashMap();
|
---|
296 | m_l3.put("acy", "A");
|
---|
297 | m_l3.put("age", "B");
|
---|
298 | m_l3.put("aic", "A");
|
---|
299 | m_l3.put("als", "b");
|
---|
300 | m_l3.put("ant", "B");
|
---|
301 | m_l3.put("ars", "O");
|
---|
302 | m_l3.put("ary", "F");
|
---|
303 | m_l3.put("ata", "A");
|
---|
304 | m_l3.put("ate", "A");
|
---|
305 | m_l3.put("eal", "Y");
|
---|
306 | m_l3.put("ear", "Y");
|
---|
307 | m_l3.put("ely", "E");
|
---|
308 | m_l3.put("ene", "E");
|
---|
309 | m_l3.put("ent", "C");
|
---|
310 | m_l3.put("ery", "E");
|
---|
311 | m_l3.put("ese", "A");
|
---|
312 | m_l3.put("ful", "A");
|
---|
313 | m_l3.put("ial", "A");
|
---|
314 | m_l3.put("ian", "A");
|
---|
315 | m_l3.put("ics", "A");
|
---|
316 | m_l3.put("ide", "L");
|
---|
317 | m_l3.put("ied", "A");
|
---|
318 | m_l3.put("ier", "A");
|
---|
319 | m_l3.put("ies", "P");
|
---|
320 | m_l3.put("ily", "A");
|
---|
321 | m_l3.put("ine", "M");
|
---|
322 | m_l3.put("ing", "N");
|
---|
323 | m_l3.put("ion", "Q");
|
---|
324 | m_l3.put("ish", "C");
|
---|
325 | m_l3.put("ism", "B");
|
---|
326 | m_l3.put("ist", "A");
|
---|
327 | m_l3.put("ite", "a");
|
---|
328 | m_l3.put("ity", "A");
|
---|
329 | m_l3.put("ium", "A");
|
---|
330 | m_l3.put("ive", "A");
|
---|
331 | m_l3.put("ize", "F");
|
---|
332 | m_l3.put("oid", "A");
|
---|
333 | m_l3.put("one", "R");
|
---|
334 | m_l3.put("ous", "A");
|
---|
335 | m_l2 = new HashMap();
|
---|
336 | m_l2.put("ae", "A");
|
---|
337 | m_l2.put("al", "b");
|
---|
338 | m_l2.put("ar", "X");
|
---|
339 | m_l2.put("as", "B");
|
---|
340 | m_l2.put("ed", "E");
|
---|
341 | m_l2.put("en", "F");
|
---|
342 | m_l2.put("es", "E");
|
---|
343 | m_l2.put("ia", "A");
|
---|
344 | m_l2.put("ic", "A");
|
---|
345 | m_l2.put("is", "A");
|
---|
346 | m_l2.put("ly", "B");
|
---|
347 | m_l2.put("on", "S");
|
---|
348 | m_l2.put("or", "T");
|
---|
349 | m_l2.put("um", "U");
|
---|
350 | m_l2.put("us", "V");
|
---|
351 | m_l2.put("yl", "R");
|
---|
352 | m_l2.put("s\'", "A");
|
---|
353 | m_l2.put("\'s", "A");
|
---|
354 | m_l1 = new HashMap();
|
---|
355 | m_l1.put("a", "A");
|
---|
356 | m_l1.put("e", "A");
|
---|
357 | m_l1.put("i", "A");
|
---|
358 | m_l1.put("o", "A");
|
---|
359 | m_l1.put("s", "W");
|
---|
360 | m_l1.put("y", "B");
|
---|
361 | }
|
---|
362 |
|
---|
363 | /**
|
---|
364 | * Finds and removes ending from given word.
|
---|
365 | */
|
---|
366 | private String removeEnding(String word) {
|
---|
367 |
|
---|
368 | int length = word.length();
|
---|
369 | int el = 11;
|
---|
370 |
|
---|
371 | while (el > 0) {
|
---|
372 | if (length - el > 1) {
|
---|
373 | String ending = word.substring(length - el);
|
---|
374 | String conditionCode = null;
|
---|
375 | switch (el) {
|
---|
376 | case 11: conditionCode = (String)m_l11.get(ending);
|
---|
377 | break;
|
---|
378 | case 10: conditionCode = (String)m_l10.get(ending);
|
---|
379 | break;
|
---|
380 | case 9: conditionCode = (String)m_l9.get(ending);
|
---|
381 | break;
|
---|
382 | case 8: conditionCode = (String)m_l8.get(ending);
|
---|
383 | break;
|
---|
384 | case 7: conditionCode = (String)m_l7.get(ending);
|
---|
385 | break;
|
---|
386 | case 6: conditionCode = (String)m_l6.get(ending);
|
---|
387 | break;
|
---|
388 | case 5: conditionCode = (String)m_l5.get(ending);
|
---|
389 | break;
|
---|
390 | case 4: conditionCode = (String)m_l4.get(ending);
|
---|
391 | break;
|
---|
392 | case 3: conditionCode = (String)m_l3.get(ending);
|
---|
393 | break;
|
---|
394 | case 2: conditionCode = (String)m_l2.get(ending);
|
---|
395 | break;
|
---|
396 | case 1: conditionCode = (String)m_l1.get(ending);
|
---|
397 | break;
|
---|
398 | default:
|
---|
399 | }
|
---|
400 | if (conditionCode != null) {
|
---|
401 | switch (conditionCode.charAt(0)) {
|
---|
402 | case 'A':
|
---|
403 | return word.substring(0, length - el);
|
---|
404 | case 'B':
|
---|
405 | if (length - el > 2) {
|
---|
406 | return word.substring(0, length - el);
|
---|
407 | }
|
---|
408 | break;
|
---|
409 | case 'C':
|
---|
410 | if (length - el > 3) {
|
---|
411 | return word.substring(0, length - el);
|
---|
412 | }
|
---|
413 | break;
|
---|
414 | case 'D':
|
---|
415 | if (length - el > 4) {
|
---|
416 | return word.substring(0, length - el);
|
---|
417 | }
|
---|
418 | break;
|
---|
419 | case 'E':
|
---|
420 | if (word.charAt(length - el - 1) != 'e') {
|
---|
421 | return word.substring(0, length - el);
|
---|
422 | }
|
---|
423 | break;
|
---|
424 | case 'F':
|
---|
425 | if ((length - el > 2) &&
|
---|
426 | (word.charAt(length - el - 1) != 'e')) {
|
---|
427 | return word.substring(0, length - el);
|
---|
428 | }
|
---|
429 | break;
|
---|
430 | case 'G':
|
---|
431 | if ((length - el > 2) &&
|
---|
432 | (word.charAt(length - el - 1) == 'f')) {
|
---|
433 | return word.substring(0, length - el);
|
---|
434 | }
|
---|
435 | break;
|
---|
436 | case 'H':
|
---|
437 | if ((word.charAt(length - el - 1) == 't') ||
|
---|
438 | ((word.charAt(length - el - 1) == 'l') &&
|
---|
439 | (word.charAt(length - el - 2) == 'l'))) {
|
---|
440 | return word.substring(0, length - el);
|
---|
441 | }
|
---|
442 | break;
|
---|
443 | case 'I':
|
---|
444 | if ((word.charAt(length - el - 1) != 'o') &&
|
---|
445 | (word.charAt(length - el - 1) != 'e')) {
|
---|
446 | return word.substring(0, length - el);
|
---|
447 | }
|
---|
448 | break;
|
---|
449 | case 'J':
|
---|
450 | if ((word.charAt(length - el - 1) != 'a') &&
|
---|
451 | (word.charAt(length - el - 1) != 'e')) {
|
---|
452 | return word.substring(0, length - el);
|
---|
453 | }
|
---|
454 | break;
|
---|
455 | case 'K':
|
---|
456 | if ((length - el > 2) &&
|
---|
457 | ((word.charAt(length - el - 1) == 'l') ||
|
---|
458 | (word.charAt(length - el - 1) == 'i') ||
|
---|
459 | ((word.charAt(length - el - 1) == 'e') &&
|
---|
460 | (word.charAt(length - el - 3) == 'u')))) {
|
---|
461 | return word.substring(0, length - el);
|
---|
462 | }
|
---|
463 | break;
|
---|
464 | case 'L':
|
---|
465 | if ((word.charAt(length - el - 1) != 'u') &&
|
---|
466 | (word.charAt(length - el - 1) != 'x') &&
|
---|
467 | ((word.charAt(length - el - 1) != 's') ||
|
---|
468 | (word.charAt(length - el - 2) == 'o'))) {
|
---|
469 | return word.substring(0, length - el);
|
---|
470 | }
|
---|
471 | break;
|
---|
472 | case 'M':
|
---|
473 | if ((word.charAt(length - el - 1) != 'a') &&
|
---|
474 | (word.charAt(length - el - 1) != 'c') &&
|
---|
475 | (word.charAt(length - el - 1) != 'e') &&
|
---|
476 | (word.charAt(length - el - 1) != 'm')) {
|
---|
477 | return word.substring(0, length - el);
|
---|
478 | }
|
---|
479 | break;
|
---|
480 | case 'N':
|
---|
481 | if ((length - el > 3) ||
|
---|
482 | ((length - el == 3) &&
|
---|
483 | ((word.charAt(length - el - 3) != 's')))) {
|
---|
484 | return word.substring(0, length - el);
|
---|
485 | }
|
---|
486 | break;
|
---|
487 | case 'O':
|
---|
488 | if ((word.charAt(length - el - 1) == 'l') ||
|
---|
489 | (word.charAt(length - el - 1) == 'i')) {
|
---|
490 | return word.substring(0, length - el);
|
---|
491 | }
|
---|
492 | break;
|
---|
493 | case 'P':
|
---|
494 | if (word.charAt(length - el - 1) != 'c') {
|
---|
495 | return word.substring(0, length - el);
|
---|
496 | }
|
---|
497 | break;
|
---|
498 | case 'Q':
|
---|
499 | if ((length - el > 2) &&
|
---|
500 | (word.charAt(length - el - 1) != 'l') &&
|
---|
501 | (word.charAt(length - el - 1) != 'n')) {
|
---|
502 | return word.substring(0, length - el);
|
---|
503 | }
|
---|
504 | break;
|
---|
505 | case 'R':
|
---|
506 | if ((word.charAt(length - el - 1) == 'n') ||
|
---|
507 | (word.charAt(length - el - 1) == 'r')) {
|
---|
508 | return word.substring(0, length - el);
|
---|
509 | }
|
---|
510 | break;
|
---|
511 | case 'S':
|
---|
512 | if (((word.charAt(length - el - 1) == 'r') &&
|
---|
513 | (word.charAt(length - el - 2) == 'd')) ||
|
---|
514 | ((word.charAt(length - el - 1) == 't') &&
|
---|
515 | (word.charAt(length - el - 2) != 't'))) {
|
---|
516 | return word.substring(0, length - el);
|
---|
517 | }
|
---|
518 | break;
|
---|
519 | case 'T':
|
---|
520 | if ((word.charAt(length - el - 1) == 's') ||
|
---|
521 | ((word.charAt(length - el - 1) == 't') &&
|
---|
522 | (word.charAt(length - el - 2) != 'o'))) {
|
---|
523 | return word.substring(0, length - el);
|
---|
524 | }
|
---|
525 | break;
|
---|
526 | case 'U':
|
---|
527 | if ((word.charAt(length - el - 1) == 'l') ||
|
---|
528 | (word.charAt(length - el - 1) == 'm') ||
|
---|
529 | (word.charAt(length - el - 1) == 'n') ||
|
---|
530 | (word.charAt(length - el - 1) == 'r')) {
|
---|
531 | return word.substring(0, length - el);
|
---|
532 | }
|
---|
533 | break;
|
---|
534 | case 'V':
|
---|
535 | if (word.charAt(length - el - 1) == 'c') {
|
---|
536 | return word.substring(0, length - el);
|
---|
537 | }
|
---|
538 | break;
|
---|
539 | case 'W':
|
---|
540 | if ((word.charAt(length - el - 1) != 's') &&
|
---|
541 | (word.charAt(length - el - 1) != 'u')) {
|
---|
542 | return word.substring(0, length - el);
|
---|
543 | }
|
---|
544 | break;
|
---|
545 | case 'X':
|
---|
546 | if ((word.charAt(length - el - 1) == 'l') ||
|
---|
547 | (word.charAt(length - el - 1) == 'i') ||
|
---|
548 | ((length - el > 2) &&
|
---|
549 | (word.charAt(length - el - 1) == 'e') &&
|
---|
550 | (word.charAt(length - el - 3) == 'u'))) {
|
---|
551 | return word.substring(0, length - el);
|
---|
552 | }
|
---|
553 | break;
|
---|
554 | case 'Y':
|
---|
555 | if ((word.charAt(length - el - 1) == 'n') &&
|
---|
556 | (word.charAt(length - el - 2) == 'i')) {
|
---|
557 | return word.substring(0, length - el);
|
---|
558 | }
|
---|
559 | break;
|
---|
560 | case 'Z':
|
---|
561 | if (word.charAt(length - el - 1) != 'f') {
|
---|
562 | return word.substring(0, length - el);
|
---|
563 | }
|
---|
564 | break;
|
---|
565 | case 'a':
|
---|
566 | if ((word.charAt(length - el - 1) == 'd') ||
|
---|
567 | (word.charAt(length - el - 1) == 'f') ||
|
---|
568 | (((word.charAt(length - el - 1) == 'h') &&
|
---|
569 | (word.charAt(length - el - 2) == 'p'))) ||
|
---|
570 | (((word.charAt(length - el - 1) == 'h') &&
|
---|
571 | (word.charAt(length - el - 2) == 't'))) ||
|
---|
572 | (word.charAt(length - el - 1) == 'l') ||
|
---|
573 | (((word.charAt(length - el - 1) == 'r') &&
|
---|
574 | (word.charAt(length - el - 2) == 'e'))) ||
|
---|
575 | (((word.charAt(length - el - 1) == 'r') &&
|
---|
576 | (word.charAt(length - el - 2) == 'o'))) ||
|
---|
577 | (((word.charAt(length - el - 1) == 's') &&
|
---|
578 | (word.charAt(length - el - 2) == 'e'))) ||
|
---|
579 | (word.charAt(length - el - 1) == 't')) {
|
---|
580 | return word.substring(0, length - el);
|
---|
581 | }
|
---|
582 | break;
|
---|
583 | case 'b':
|
---|
584 | if (m_CompMode) {
|
---|
585 | if (((length - el == 3 ) &&
|
---|
586 | (!((word.charAt(length - el - 1) == 't') &&
|
---|
587 | (word.charAt(length - el - 2) == 'e') &&
|
---|
588 | (word.charAt(length - el - 3) == 'm')))) ||
|
---|
589 | ((length - el > 3) &&
|
---|
590 | (!((word.charAt(length - el - 1) == 't') &&
|
---|
591 | (word.charAt(length - el - 2) == 's') &&
|
---|
592 | (word.charAt(length - el - 3) == 'y') &&
|
---|
593 | (word.charAt(length - el - 4) == 'r'))))) {
|
---|
594 | return word.substring(0, length - el);
|
---|
595 | }
|
---|
596 | } else {
|
---|
597 | if ((length - el > 2) &&
|
---|
598 | (!((word.charAt(length - el - 1) == 't') &&
|
---|
599 | (word.charAt(length - el - 2) == 'e') &&
|
---|
600 | (word.charAt(length - el - 3) == 'm'))) &&
|
---|
601 | ((length - el < 4) ||
|
---|
602 | (!((word.charAt(length - el - 1) == 't') &&
|
---|
603 | (word.charAt(length - el - 2) == 's') &&
|
---|
604 | (word.charAt(length - el - 3) == 'y') &&
|
---|
605 | (word.charAt(length - el - 4) == 'r'))))) {
|
---|
606 | return word.substring(0, length - el);
|
---|
607 | }
|
---|
608 | }
|
---|
609 | break;
|
---|
610 | case 'c':
|
---|
611 | if (word.charAt(length - el - 1) == 'l') {
|
---|
612 | return word.substring(0, length - el);
|
---|
613 | }
|
---|
614 | break;
|
---|
615 | default:
|
---|
616 | throw new IllegalArgumentException("Fatal error.");
|
---|
617 | }
|
---|
618 | }
|
---|
619 | }
|
---|
620 | el--;
|
---|
621 | }
|
---|
622 | return word;
|
---|
623 | }
|
---|
624 |
|
---|
625 | /**
|
---|
626 | * Recodes ending of given word.
|
---|
627 | */
|
---|
628 | private String recodeEnding(String word) {
|
---|
629 |
|
---|
630 | int lastPos = word.length() - 1;
|
---|
631 |
|
---|
632 | // Rule 1
|
---|
633 | if (word.endsWith("bb") ||
|
---|
634 | word.endsWith("dd") ||
|
---|
635 | word.endsWith("gg") ||
|
---|
636 | word.endsWith("ll") ||
|
---|
637 | word.endsWith("mm") ||
|
---|
638 | word.endsWith("nn") ||
|
---|
639 | word.endsWith("pp") ||
|
---|
640 | word.endsWith("rr") ||
|
---|
641 | word.endsWith("ss") ||
|
---|
642 | word.endsWith("tt")) {
|
---|
643 | word = word.substring(0, lastPos);
|
---|
644 | lastPos--;
|
---|
645 | }
|
---|
646 |
|
---|
647 | // Rule 2
|
---|
648 | if (word.endsWith("iev")) {
|
---|
649 | word = word.substring(0, lastPos - 2).concat("ief");
|
---|
650 | }
|
---|
651 |
|
---|
652 | // Rule 3
|
---|
653 | if (word.endsWith("uct")) {
|
---|
654 | word = word.substring(0, lastPos - 2).concat("uc");
|
---|
655 | lastPos--;
|
---|
656 | }
|
---|
657 |
|
---|
658 | // Rule 4
|
---|
659 | if (word.endsWith("umpt")) {
|
---|
660 | word = word.substring(0, lastPos - 3).concat("um");
|
---|
661 | lastPos -= 2;
|
---|
662 | }
|
---|
663 |
|
---|
664 | // Rule 5
|
---|
665 | if (word.endsWith("rpt")) {
|
---|
666 | word = word.substring(0, lastPos - 2).concat("rb");
|
---|
667 | lastPos--;
|
---|
668 | }
|
---|
669 |
|
---|
670 | // Rule 6
|
---|
671 | if (word.endsWith("urs")) {
|
---|
672 | word = word.substring(0, lastPos - 2).concat("ur");
|
---|
673 | lastPos--;
|
---|
674 | }
|
---|
675 |
|
---|
676 | // Rule 7
|
---|
677 | if (word.endsWith("istr")) {
|
---|
678 | word = word.substring(0, lastPos - 3).concat("ister");
|
---|
679 | lastPos++;
|
---|
680 | }
|
---|
681 |
|
---|
682 | // Rule 7a
|
---|
683 | if (word.endsWith("metr")) {
|
---|
684 | word = word.substring(0, lastPos - 3).concat("meter");
|
---|
685 | lastPos++;
|
---|
686 | }
|
---|
687 |
|
---|
688 | // Rule 8
|
---|
689 | if (word.endsWith("olv")) {
|
---|
690 | word = word.substring(0, lastPos - 2).concat("olut");
|
---|
691 | lastPos++;
|
---|
692 | }
|
---|
693 |
|
---|
694 | // Rule 9
|
---|
695 | if (word.endsWith("ul")) {
|
---|
696 | if ((lastPos - 2 < 0) ||
|
---|
697 | ((word.charAt(lastPos - 2) != 'a') &&
|
---|
698 | (word.charAt(lastPos - 2) != 'i') &&
|
---|
699 | (word.charAt(lastPos - 2) != 'o'))) {
|
---|
700 | word = word.substring(0, lastPos - 1).concat("l");
|
---|
701 | lastPos--;
|
---|
702 | }
|
---|
703 | }
|
---|
704 |
|
---|
705 | // Rule 10
|
---|
706 | if (word.endsWith("bex")) {
|
---|
707 | word = word.substring(0, lastPos - 2).concat("bic");
|
---|
708 | }
|
---|
709 |
|
---|
710 | // Rule 11
|
---|
711 | if (word.endsWith("dex")) {
|
---|
712 | word = word.substring(0, lastPos - 2).concat("dic");
|
---|
713 | }
|
---|
714 |
|
---|
715 | // Rule 12
|
---|
716 | if (word.endsWith("pex")) {
|
---|
717 | word = word.substring(0, lastPos - 2).concat("pic");
|
---|
718 | }
|
---|
719 |
|
---|
720 | // Rule 13
|
---|
721 | if (word.endsWith("tex")) {
|
---|
722 | word = word.substring(0, lastPos - 2).concat("tic");
|
---|
723 | }
|
---|
724 |
|
---|
725 | // Rule 14
|
---|
726 | if (word.endsWith("ax")) {
|
---|
727 | word = word.substring(0, lastPos - 1).concat("ac");
|
---|
728 | }
|
---|
729 |
|
---|
730 | // Rule 15
|
---|
731 | if (word.endsWith("ex")) {
|
---|
732 | word = word.substring(0, lastPos - 1).concat("ec");
|
---|
733 | }
|
---|
734 |
|
---|
735 | // Rule 16
|
---|
736 | if (word.endsWith("ix")) {
|
---|
737 | word = word.substring(0, lastPos - 1).concat("ic");
|
---|
738 | }
|
---|
739 |
|
---|
740 | // Rule 17
|
---|
741 | if (word.endsWith("lux")) {
|
---|
742 | word = word.substring(0, lastPos - 2).concat("luc");
|
---|
743 | }
|
---|
744 |
|
---|
745 | // Rule 18
|
---|
746 | if (word.endsWith("uad")) {
|
---|
747 | word = word.substring(0, lastPos - 2).concat("uas");
|
---|
748 | }
|
---|
749 |
|
---|
750 | // Rule 19
|
---|
751 | if (word.endsWith("vad")) {
|
---|
752 | word = word.substring(0, lastPos - 2).concat("vas");
|
---|
753 | }
|
---|
754 |
|
---|
755 | // Rule 20
|
---|
756 | if (word.endsWith("cid")) {
|
---|
757 | word = word.substring(0, lastPos - 2).concat("cis");
|
---|
758 | }
|
---|
759 |
|
---|
760 | // Rule 21
|
---|
761 | if (word.endsWith("lid")) {
|
---|
762 | word = word.substring(0, lastPos - 2).concat("lis");
|
---|
763 | }
|
---|
764 |
|
---|
765 | // Rule 22
|
---|
766 | if (word.endsWith("erid")) {
|
---|
767 | word = word.substring(0, lastPos - 3).concat("eris");
|
---|
768 | }
|
---|
769 |
|
---|
770 | // Rule 23
|
---|
771 | if (word.endsWith("pand")) {
|
---|
772 | word = word.substring(0, lastPos - 3).concat("pans");
|
---|
773 | }
|
---|
774 |
|
---|
775 | // Rule 24
|
---|
776 | if (word.endsWith("end")) {
|
---|
777 | if ((lastPos - 3 < 0) ||
|
---|
778 | (word.charAt(lastPos - 3) != 's')) {
|
---|
779 | word = word.substring(0, lastPos - 2).concat("ens");
|
---|
780 | }
|
---|
781 | }
|
---|
782 |
|
---|
783 | // Rule 25
|
---|
784 | if (word.endsWith("ond")) {
|
---|
785 | word = word.substring(0, lastPos - 2).concat("ons");
|
---|
786 | }
|
---|
787 |
|
---|
788 | // Rule 26
|
---|
789 | if (word.endsWith("lud")) {
|
---|
790 | word = word.substring(0, lastPos - 2).concat("lus");
|
---|
791 | }
|
---|
792 |
|
---|
793 | // Rule 27
|
---|
794 | if (word.endsWith("rud")) {
|
---|
795 | word = word.substring(0, lastPos - 2).concat("rus");
|
---|
796 | }
|
---|
797 |
|
---|
798 | // Rule 28
|
---|
799 | if (word.endsWith("her")) {
|
---|
800 | if ((lastPos - 3 < 0) ||
|
---|
801 | ((word.charAt(lastPos - 3) != 'p') &&
|
---|
802 | (word.charAt(lastPos - 3) != 't'))) {
|
---|
803 | word = word.substring(0, lastPos - 2).concat("hes");
|
---|
804 | }
|
---|
805 | }
|
---|
806 |
|
---|
807 | // Rule 29
|
---|
808 | if (word.endsWith("mit")) {
|
---|
809 | word = word.substring(0, lastPos - 2).concat("mis");
|
---|
810 | }
|
---|
811 |
|
---|
812 | // Rule 30
|
---|
813 | if (word.endsWith("end")) {
|
---|
814 | if ((lastPos - 3 < 0) ||
|
---|
815 | (word.charAt(lastPos - 3) != 'm')) {
|
---|
816 | word = word.substring(0, lastPos - 2).concat("ens");
|
---|
817 | }
|
---|
818 | }
|
---|
819 |
|
---|
820 | // Rule 31
|
---|
821 | if (word.endsWith("ert")) {
|
---|
822 | word = word.substring(0, lastPos - 2).concat("ers");
|
---|
823 | }
|
---|
824 |
|
---|
825 | // Rule 32
|
---|
826 | if (word.endsWith("et")) {
|
---|
827 | if ((lastPos - 2 < 0) ||
|
---|
828 | (word.charAt(lastPos - 2) != 'n')) {
|
---|
829 | word = word.substring(0, lastPos - 1).concat("es");
|
---|
830 | }
|
---|
831 | }
|
---|
832 |
|
---|
833 | // Rule 33
|
---|
834 | if (word.endsWith("yt")) {
|
---|
835 | word = word.substring(0, lastPos - 1).concat("ys");
|
---|
836 | }
|
---|
837 |
|
---|
838 | // Rule 34
|
---|
839 | if (word.endsWith("yz")) {
|
---|
840 | word = word.substring(0, lastPos - 1).concat("ys");
|
---|
841 | }
|
---|
842 |
|
---|
843 | return word;
|
---|
844 | }
|
---|
845 |
|
---|
846 | /**
|
---|
847 | * Returns the stemmed version of the given word.
|
---|
848 | *
|
---|
849 | * @param word a string consisting of a single word
|
---|
850 | */
|
---|
851 | public String stem(String word) {
|
---|
852 |
|
---|
853 | if (word.length() > 2) {
|
---|
854 | return recodeEnding(removeEnding(word.toLowerCase()));
|
---|
855 | } else {
|
---|
856 | return word.toLowerCase();
|
---|
857 | }
|
---|
858 | }
|
---|
859 |
|
---|
860 | /**
|
---|
861 | * Stems text coming into stdin and writes it to stdout.
|
---|
862 | */
|
---|
863 | public static void main(String[] ops) {
|
---|
864 |
|
---|
865 | LovinsStemmer ls = new LovinsStemmer();
|
---|
866 |
|
---|
867 | try {
|
---|
868 | int num;
|
---|
869 | StringBuffer wordBuffer = new StringBuffer();
|
---|
870 | while ((num = System.in.read()) != -1) {
|
---|
871 | char c = (char)num;
|
---|
872 | if (((num >= (int)'A') && (num <= (int)'Z')) ||
|
---|
873 | ((num >= (int)'a') && (num <= (int)'z'))) {
|
---|
874 | wordBuffer.append(c);
|
---|
875 | } else {
|
---|
876 | if (wordBuffer.length() > 0) {
|
---|
877 | System.out.print(ls.stem(wordBuffer.toString().
|
---|
878 | toLowerCase()));
|
---|
879 | wordBuffer = new StringBuffer();
|
---|
880 | }
|
---|
881 | System.out.print(c);
|
---|
882 | }
|
---|
883 | }
|
---|
884 | } catch (Exception e) {
|
---|
885 | System.err.println(e.getMessage());
|
---|
886 | }
|
---|
887 | }
|
---|
888 | }
|
---|
889 |
|
---|
890 |
|
---|