source: trunk/gsdl/packages/kea/kea-3.0/LovinsStemmer.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1/*
2 * LovinsStemmer.java
3 * Copyright (C) 2001 Eibe Frank
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20import java.util.*;
21import java.io.*;
22
23/**
24 * Implements the Lovins stemmer.
25 *
26 * @author Eibe Frank ([email protected])
27 * @version 1.0
28 */
29public class LovinsStemmer extends Stemmer implements Serializable {
30
31 /** C version compatibility mode (emulates bugs
32 in original C implementation) */
33 private static boolean m_CompMode = true;
34
35 /** The hash tables containing the list of endings. */
36 private static HashMap m_l11 = null;
37 private static HashMap m_l10 = null;
38 private static HashMap m_l9 = null;
39 private static HashMap m_l8 = null;
40 private static HashMap m_l7 = null;
41 private static HashMap m_l6 = null;
42 private static HashMap m_l5 = null;
43 private static HashMap m_l4 = null;
44 private static HashMap m_l3 = null;
45 private static HashMap m_l2 = null;
46 private static HashMap m_l1 = null;
47
48 static {
49
50 m_l11 = new HashMap();
51 m_l11.put("alistically", "B");
52 m_l11.put("arizability", "A");
53 m_l11.put("izationally", "B");
54 m_l10 = new HashMap();
55 m_l10.put("antialness", "A");
56 m_l10.put("arisations", "A");
57 m_l10.put("arizations", "A");
58 m_l10.put("entialness", "A");
59 m_l9 = new HashMap();
60 m_l9.put("allically", "C");
61 m_l9.put("antaneous", "A");
62 m_l9.put("antiality", "A");
63 m_l9.put("arisation", "A");
64 m_l9.put("arization", "A");
65 m_l9.put("ationally", "B");
66 m_l9.put("ativeness", "A");
67 m_l9.put("eableness", "E");
68 m_l9.put("entations", "A");
69 m_l9.put("entiality", "A");
70 m_l9.put("entialize", "A");
71 m_l9.put("entiation", "A");
72 m_l9.put("ionalness", "A");
73 m_l9.put("istically", "A");
74 m_l9.put("itousness", "A");
75 m_l9.put("izability", "A");
76 m_l9.put("izational", "A");
77 m_l8 = new HashMap();
78 m_l8.put("ableness", "A");
79 m_l8.put("arizable", "A");
80 m_l8.put("entation", "A");
81 m_l8.put("entially", "A");
82 m_l8.put("eousness", "A");
83 m_l8.put("ibleness", "A");
84 m_l8.put("icalness", "A");
85 m_l8.put("ionalism", "A");
86 m_l8.put("ionality", "A");
87 m_l8.put("ionalize", "A");
88 m_l8.put("iousness", "A");
89 m_l8.put("izations", "A");
90 m_l8.put("lessness", "A");
91 m_l7 = new HashMap();
92 m_l7.put("ability", "A");
93 m_l7.put("aically", "A");
94 m_l7.put("alistic", "B");
95 m_l7.put("alities", "A");
96 m_l7.put("ariness", "E");
97 m_l7.put("aristic", "A");
98 m_l7.put("arizing", "A");
99 m_l7.put("ateness", "A");
100 m_l7.put("atingly", "A");
101 m_l7.put("ational", "B");
102 m_l7.put("atively", "A");
103 m_l7.put("ativism", "A");
104 m_l7.put("elihood", "E");
105 m_l7.put("encible", "A");
106 m_l7.put("entally", "A");
107 m_l7.put("entials", "A");
108 m_l7.put("entiate", "A");
109 m_l7.put("entness", "A");
110 m_l7.put("fulness", "A");
111 m_l7.put("ibility", "A");
112 m_l7.put("icalism", "A");
113 m_l7.put("icalist", "A");
114 m_l7.put("icality", "A");
115 m_l7.put("icalize", "A");
116 m_l7.put("ication", "G");
117 m_l7.put("icianry", "A");
118 m_l7.put("ination", "A");
119 m_l7.put("ingness", "A");
120 m_l7.put("ionally", "A");
121 m_l7.put("isation", "A");
122 m_l7.put("ishness", "A");
123 m_l7.put("istical", "A");
124 m_l7.put("iteness", "A");
125 m_l7.put("iveness", "A");
126 m_l7.put("ivistic", "A");
127 m_l7.put("ivities", "A");
128 m_l7.put("ization", "F");
129 m_l7.put("izement", "A");
130 m_l7.put("oidally", "A");
131 m_l7.put("ousness", "A");
132 m_l6 = new HashMap();
133 m_l6.put("aceous", "A");
134 m_l6.put("acious", "B");
135 m_l6.put("action", "G");
136 m_l6.put("alness", "A");
137 m_l6.put("ancial", "A");
138 m_l6.put("ancies", "A");
139 m_l6.put("ancing", "B");
140 m_l6.put("ariser", "A");
141 m_l6.put("arized", "A");
142 m_l6.put("arizer", "A");
143 m_l6.put("atable", "A");
144 m_l6.put("ations", "B");
145 m_l6.put("atives", "A");
146 m_l6.put("eature", "Z");
147 m_l6.put("efully", "A");
148 m_l6.put("encies", "A");
149 m_l6.put("encing", "A");
150 m_l6.put("ential", "A");
151 m_l6.put("enting", "C");
152 m_l6.put("entist", "A");
153 m_l6.put("eously", "A");
154 m_l6.put("ialist", "A");
155 m_l6.put("iality", "A");
156 m_l6.put("ialize", "A");
157 m_l6.put("ically", "A");
158 m_l6.put("icance", "A");
159 m_l6.put("icians", "A");
160 m_l6.put("icists", "A");
161 m_l6.put("ifully", "A");
162 m_l6.put("ionals", "A");
163 m_l6.put("ionate", "D");
164 m_l6.put("ioning", "A");
165 m_l6.put("ionist", "A");
166 m_l6.put("iously", "A");
167 m_l6.put("istics", "A");
168 m_l6.put("izable", "E");
169 m_l6.put("lessly", "A");
170 m_l6.put("nesses", "A");
171 m_l6.put("oidism", "A");
172 m_l5 = new HashMap();
173 m_l5.put("acies", "A");
174 m_l5.put("acity", "A");
175 m_l5.put("aging", "B");
176 m_l5.put("aical", "A");
177 if (!m_CompMode) {
178 m_l5.put("alist", "A");
179 }
180 m_l5.put("alism", "B");
181 m_l5.put("ality", "A");
182 m_l5.put("alize", "A");
183 m_l5.put("allic", "b");
184 m_l5.put("anced", "B");
185 m_l5.put("ances", "B");
186 m_l5.put("antic", "C");
187 m_l5.put("arial", "A");
188 m_l5.put("aries", "A");
189 m_l5.put("arily", "A");
190 m_l5.put("arity", "B");
191 m_l5.put("arize", "A");
192 m_l5.put("aroid", "A");
193 m_l5.put("ately", "A");
194 m_l5.put("ating", "I");
195 m_l5.put("ation", "B");
196 m_l5.put("ative", "A");
197 m_l5.put("ators", "A");
198 m_l5.put("atory", "A");
199 m_l5.put("ature", "E");
200 m_l5.put("early", "Y");
201 m_l5.put("ehood", "A");
202 m_l5.put("eless", "A");
203 if (!m_CompMode) {
204 m_l5.put("elily", "A");
205 } else {
206 m_l5.put("elity", "A");
207 }
208 m_l5.put("ement", "A");
209 m_l5.put("enced", "A");
210 m_l5.put("ences", "A");
211 m_l5.put("eness", "E");
212 m_l5.put("ening", "E");
213 m_l5.put("ental", "A");
214 m_l5.put("ented", "C");
215 m_l5.put("ently", "A");
216 m_l5.put("fully", "A");
217 m_l5.put("ially", "A");
218 m_l5.put("icant", "A");
219 m_l5.put("ician", "A");
220 m_l5.put("icide", "A");
221 m_l5.put("icism", "A");
222 m_l5.put("icist", "A");
223 m_l5.put("icity", "A");
224 m_l5.put("idine", "I");
225 m_l5.put("iedly", "A");
226 m_l5.put("ihood", "A");
227 m_l5.put("inate", "A");
228 m_l5.put("iness", "A");
229 m_l5.put("ingly", "B");
230 m_l5.put("inism", "J");
231 m_l5.put("inity", "c");
232 m_l5.put("ional", "A");
233 m_l5.put("ioned", "A");
234 m_l5.put("ished", "A");
235 m_l5.put("istic", "A");
236 m_l5.put("ities", "A");
237 m_l5.put("itous", "A");
238 m_l5.put("ively", "A");
239 m_l5.put("ivity", "A");
240 m_l5.put("izers", "F");
241 m_l5.put("izing", "F");
242 m_l5.put("oidal", "A");
243 m_l5.put("oides", "A");
244 m_l5.put("otide", "A");
245 m_l5.put("ously", "A");
246 m_l4 = new HashMap();
247 m_l4.put("able", "A");
248 m_l4.put("ably", "A");
249 m_l4.put("ages", "B");
250 m_l4.put("ally", "B");
251 m_l4.put("ance", "B");
252 m_l4.put("ancy", "B");
253 m_l4.put("ants", "B");
254 m_l4.put("aric", "A");
255 m_l4.put("arly", "K");
256 m_l4.put("ated", "I");
257 m_l4.put("ates", "A");
258 m_l4.put("atic", "B");
259 m_l4.put("ator", "A");
260 m_l4.put("ealy", "Y");
261 m_l4.put("edly", "E");
262 m_l4.put("eful", "A");
263 m_l4.put("eity", "A");
264 m_l4.put("ence", "A");
265 m_l4.put("ency", "A");
266 m_l4.put("ened", "E");
267 m_l4.put("enly", "E");
268 m_l4.put("eous", "A");
269 m_l4.put("hood", "A");
270 m_l4.put("ials", "A");
271 m_l4.put("ians", "A");
272 m_l4.put("ible", "A");
273 m_l4.put("ibly", "A");
274 m_l4.put("ical", "A");
275 m_l4.put("ides", "L");
276 m_l4.put("iers", "A");
277 m_l4.put("iful", "A");
278 m_l4.put("ines", "M");
279 m_l4.put("ings", "N");
280 m_l4.put("ions", "B");
281 m_l4.put("ious", "A");
282 m_l4.put("isms", "B");
283 m_l4.put("ists", "A");
284 m_l4.put("itic", "H");
285 m_l4.put("ized", "F");
286 m_l4.put("izer", "F");
287 m_l4.put("less", "A");
288 m_l4.put("lily", "A");
289 m_l4.put("ness", "A");
290 m_l4.put("ogen", "A");
291 m_l4.put("ward", "A");
292 m_l4.put("wise", "A");
293 m_l4.put("ying", "B");
294 m_l4.put("yish", "A");
295 m_l3 = new HashMap();
296 m_l3.put("acy", "A");
297 m_l3.put("age", "B");
298 m_l3.put("aic", "A");
299 m_l3.put("als", "b");
300 m_l3.put("ant", "B");
301 m_l3.put("ars", "O");
302 m_l3.put("ary", "F");
303 m_l3.put("ata", "A");
304 m_l3.put("ate", "A");
305 m_l3.put("eal", "Y");
306 m_l3.put("ear", "Y");
307 m_l3.put("ely", "E");
308 m_l3.put("ene", "E");
309 m_l3.put("ent", "C");
310 m_l3.put("ery", "E");
311 m_l3.put("ese", "A");
312 m_l3.put("ful", "A");
313 m_l3.put("ial", "A");
314 m_l3.put("ian", "A");
315 m_l3.put("ics", "A");
316 m_l3.put("ide", "L");
317 m_l3.put("ied", "A");
318 m_l3.put("ier", "A");
319 m_l3.put("ies", "P");
320 m_l3.put("ily", "A");
321 m_l3.put("ine", "M");
322 m_l3.put("ing", "N");
323 m_l3.put("ion", "Q");
324 m_l3.put("ish", "C");
325 m_l3.put("ism", "B");
326 m_l3.put("ist", "A");
327 m_l3.put("ite", "a");
328 m_l3.put("ity", "A");
329 m_l3.put("ium", "A");
330 m_l3.put("ive", "A");
331 m_l3.put("ize", "F");
332 m_l3.put("oid", "A");
333 m_l3.put("one", "R");
334 m_l3.put("ous", "A");
335 m_l2 = new HashMap();
336 m_l2.put("ae", "A");
337 m_l2.put("al", "b");
338 m_l2.put("ar", "X");
339 m_l2.put("as", "B");
340 m_l2.put("ed", "E");
341 m_l2.put("en", "F");
342 m_l2.put("es", "E");
343 m_l2.put("ia", "A");
344 m_l2.put("ic", "A");
345 m_l2.put("is", "A");
346 m_l2.put("ly", "B");
347 m_l2.put("on", "S");
348 m_l2.put("or", "T");
349 m_l2.put("um", "U");
350 m_l2.put("us", "V");
351 m_l2.put("yl", "R");
352 m_l2.put("s\'", "A");
353 m_l2.put("\'s", "A");
354 m_l1 = new HashMap();
355 m_l1.put("a", "A");
356 m_l1.put("e", "A");
357 m_l1.put("i", "A");
358 m_l1.put("o", "A");
359 m_l1.put("s", "W");
360 m_l1.put("y", "B");
361 }
362
363 /**
364 * Finds and removes ending from given word.
365 */
366 private String removeEnding(String word) {
367
368 int length = word.length();
369 int el = 11;
370
371 while (el > 0) {
372 if (length - el > 1) {
373 String ending = word.substring(length - el);
374 String conditionCode = null;
375 switch (el) {
376 case 11: conditionCode = (String)m_l11.get(ending);
377 break;
378 case 10: conditionCode = (String)m_l10.get(ending);
379 break;
380 case 9: conditionCode = (String)m_l9.get(ending);
381 break;
382 case 8: conditionCode = (String)m_l8.get(ending);
383 break;
384 case 7: conditionCode = (String)m_l7.get(ending);
385 break;
386 case 6: conditionCode = (String)m_l6.get(ending);
387 break;
388 case 5: conditionCode = (String)m_l5.get(ending);
389 break;
390 case 4: conditionCode = (String)m_l4.get(ending);
391 break;
392 case 3: conditionCode = (String)m_l3.get(ending);
393 break;
394 case 2: conditionCode = (String)m_l2.get(ending);
395 break;
396 case 1: conditionCode = (String)m_l1.get(ending);
397 break;
398 default:
399 }
400 if (conditionCode != null) {
401 switch (conditionCode.charAt(0)) {
402 case 'A':
403 return word.substring(0, length - el);
404 case 'B':
405 if (length - el > 2) {
406 return word.substring(0, length - el);
407 }
408 break;
409 case 'C':
410 if (length - el > 3) {
411 return word.substring(0, length - el);
412 }
413 break;
414 case 'D':
415 if (length - el > 4) {
416 return word.substring(0, length - el);
417 }
418 break;
419 case 'E':
420 if (word.charAt(length - el - 1) != 'e') {
421 return word.substring(0, length - el);
422 }
423 break;
424 case 'F':
425 if ((length - el > 2) &&
426 (word.charAt(length - el - 1) != 'e')) {
427 return word.substring(0, length - el);
428 }
429 break;
430 case 'G':
431 if ((length - el > 2) &&
432 (word.charAt(length - el - 1) == 'f')) {
433 return word.substring(0, length - el);
434 }
435 break;
436 case 'H':
437 if ((word.charAt(length - el - 1) == 't') ||
438 ((word.charAt(length - el - 1) == 'l') &&
439 (word.charAt(length - el - 2) == 'l'))) {
440 return word.substring(0, length - el);
441 }
442 break;
443 case 'I':
444 if ((word.charAt(length - el - 1) != 'o') &&
445 (word.charAt(length - el - 1) != 'e')) {
446 return word.substring(0, length - el);
447 }
448 break;
449 case 'J':
450 if ((word.charAt(length - el - 1) != 'a') &&
451 (word.charAt(length - el - 1) != 'e')) {
452 return word.substring(0, length - el);
453 }
454 break;
455 case 'K':
456 if ((length - el > 2) &&
457 ((word.charAt(length - el - 1) == 'l') ||
458 (word.charAt(length - el - 1) == 'i') ||
459 ((word.charAt(length - el - 1) == 'e') &&
460 (word.charAt(length - el - 3) == 'u')))) {
461 return word.substring(0, length - el);
462 }
463 break;
464 case 'L':
465 if ((word.charAt(length - el - 1) != 'u') &&
466 (word.charAt(length - el - 1) != 'x') &&
467 ((word.charAt(length - el - 1) != 's') ||
468 (word.charAt(length - el - 2) == 'o'))) {
469 return word.substring(0, length - el);
470 }
471 break;
472 case 'M':
473 if ((word.charAt(length - el - 1) != 'a') &&
474 (word.charAt(length - el - 1) != 'c') &&
475 (word.charAt(length - el - 1) != 'e') &&
476 (word.charAt(length - el - 1) != 'm')) {
477 return word.substring(0, length - el);
478 }
479 break;
480 case 'N':
481 if ((length - el > 3) ||
482 ((length - el == 3) &&
483 ((word.charAt(length - el - 3) != 's')))) {
484 return word.substring(0, length - el);
485 }
486 break;
487 case 'O':
488 if ((word.charAt(length - el - 1) == 'l') ||
489 (word.charAt(length - el - 1) == 'i')) {
490 return word.substring(0, length - el);
491 }
492 break;
493 case 'P':
494 if (word.charAt(length - el - 1) != 'c') {
495 return word.substring(0, length - el);
496 }
497 break;
498 case 'Q':
499 if ((length - el > 2) &&
500 (word.charAt(length - el - 1) != 'l') &&
501 (word.charAt(length - el - 1) != 'n')) {
502 return word.substring(0, length - el);
503 }
504 break;
505 case 'R':
506 if ((word.charAt(length - el - 1) == 'n') ||
507 (word.charAt(length - el - 1) == 'r')) {
508 return word.substring(0, length - el);
509 }
510 break;
511 case 'S':
512 if (((word.charAt(length - el - 1) == 'r') &&
513 (word.charAt(length - el - 2) == 'd')) ||
514 ((word.charAt(length - el - 1) == 't') &&
515 (word.charAt(length - el - 2) != 't'))) {
516 return word.substring(0, length - el);
517 }
518 break;
519 case 'T':
520 if ((word.charAt(length - el - 1) == 's') ||
521 ((word.charAt(length - el - 1) == 't') &&
522 (word.charAt(length - el - 2) != 'o'))) {
523 return word.substring(0, length - el);
524 }
525 break;
526 case 'U':
527 if ((word.charAt(length - el - 1) == 'l') ||
528 (word.charAt(length - el - 1) == 'm') ||
529 (word.charAt(length - el - 1) == 'n') ||
530 (word.charAt(length - el - 1) == 'r')) {
531 return word.substring(0, length - el);
532 }
533 break;
534 case 'V':
535 if (word.charAt(length - el - 1) == 'c') {
536 return word.substring(0, length - el);
537 }
538 break;
539 case 'W':
540 if ((word.charAt(length - el - 1) != 's') &&
541 (word.charAt(length - el - 1) != 'u')) {
542 return word.substring(0, length - el);
543 }
544 break;
545 case 'X':
546 if ((word.charAt(length - el - 1) == 'l') ||
547 (word.charAt(length - el - 1) == 'i') ||
548 ((length - el > 2) &&
549 (word.charAt(length - el - 1) == 'e') &&
550 (word.charAt(length - el - 3) == 'u'))) {
551 return word.substring(0, length - el);
552 }
553 break;
554 case 'Y':
555 if ((word.charAt(length - el - 1) == 'n') &&
556 (word.charAt(length - el - 2) == 'i')) {
557 return word.substring(0, length - el);
558 }
559 break;
560 case 'Z':
561 if (word.charAt(length - el - 1) != 'f') {
562 return word.substring(0, length - el);
563 }
564 break;
565 case 'a':
566 if ((word.charAt(length - el - 1) == 'd') ||
567 (word.charAt(length - el - 1) == 'f') ||
568 (((word.charAt(length - el - 1) == 'h') &&
569 (word.charAt(length - el - 2) == 'p'))) ||
570 (((word.charAt(length - el - 1) == 'h') &&
571 (word.charAt(length - el - 2) == 't'))) ||
572 (word.charAt(length - el - 1) == 'l') ||
573 (((word.charAt(length - el - 1) == 'r') &&
574 (word.charAt(length - el - 2) == 'e'))) ||
575 (((word.charAt(length - el - 1) == 'r') &&
576 (word.charAt(length - el - 2) == 'o'))) ||
577 (((word.charAt(length - el - 1) == 's') &&
578 (word.charAt(length - el - 2) == 'e'))) ||
579 (word.charAt(length - el - 1) == 't')) {
580 return word.substring(0, length - el);
581 }
582 break;
583 case 'b':
584 if (m_CompMode) {
585 if (((length - el == 3 ) &&
586 (!((word.charAt(length - el - 1) == 't') &&
587 (word.charAt(length - el - 2) == 'e') &&
588 (word.charAt(length - el - 3) == 'm')))) ||
589 ((length - el > 3) &&
590 (!((word.charAt(length - el - 1) == 't') &&
591 (word.charAt(length - el - 2) == 's') &&
592 (word.charAt(length - el - 3) == 'y') &&
593 (word.charAt(length - el - 4) == 'r'))))) {
594 return word.substring(0, length - el);
595 }
596 } else {
597 if ((length - el > 2) &&
598 (!((word.charAt(length - el - 1) == 't') &&
599 (word.charAt(length - el - 2) == 'e') &&
600 (word.charAt(length - el - 3) == 'm'))) &&
601 ((length - el < 4) ||
602 (!((word.charAt(length - el - 1) == 't') &&
603 (word.charAt(length - el - 2) == 's') &&
604 (word.charAt(length - el - 3) == 'y') &&
605 (word.charAt(length - el - 4) == 'r'))))) {
606 return word.substring(0, length - el);
607 }
608 }
609 break;
610 case 'c':
611 if (word.charAt(length - el - 1) == 'l') {
612 return word.substring(0, length - el);
613 }
614 break;
615 default:
616 throw new IllegalArgumentException("Fatal error.");
617 }
618 }
619 }
620 el--;
621 }
622 return word;
623 }
624
625 /**
626 * Recodes ending of given word.
627 */
628 private String recodeEnding(String word) {
629
630 int lastPos = word.length() - 1;
631
632 // Rule 1
633 if (word.endsWith("bb") ||
634 word.endsWith("dd") ||
635 word.endsWith("gg") ||
636 word.endsWith("ll") ||
637 word.endsWith("mm") ||
638 word.endsWith("nn") ||
639 word.endsWith("pp") ||
640 word.endsWith("rr") ||
641 word.endsWith("ss") ||
642 word.endsWith("tt")) {
643 word = word.substring(0, lastPos);
644 lastPos--;
645 }
646
647 // Rule 2
648 if (word.endsWith("iev")) {
649 word = word.substring(0, lastPos - 2).concat("ief");
650 }
651
652 // Rule 3
653 if (word.endsWith("uct")) {
654 word = word.substring(0, lastPos - 2).concat("uc");
655 lastPos--;
656 }
657
658 // Rule 4
659 if (word.endsWith("umpt")) {
660 word = word.substring(0, lastPos - 3).concat("um");
661 lastPos -= 2;
662 }
663
664 // Rule 5
665 if (word.endsWith("rpt")) {
666 word = word.substring(0, lastPos - 2).concat("rb");
667 lastPos--;
668 }
669
670 // Rule 6
671 if (word.endsWith("urs")) {
672 word = word.substring(0, lastPos - 2).concat("ur");
673 lastPos--;
674 }
675
676 // Rule 7
677 if (word.endsWith("istr")) {
678 word = word.substring(0, lastPos - 3).concat("ister");
679 lastPos++;
680 }
681
682 // Rule 7a
683 if (word.endsWith("metr")) {
684 word = word.substring(0, lastPos - 3).concat("meter");
685 lastPos++;
686 }
687
688 // Rule 8
689 if (word.endsWith("olv")) {
690 word = word.substring(0, lastPos - 2).concat("olut");
691 lastPos++;
692 }
693
694 // Rule 9
695 if (word.endsWith("ul")) {
696 if ((lastPos - 2 < 0) ||
697 ((word.charAt(lastPos - 2) != 'a') &&
698 (word.charAt(lastPos - 2) != 'i') &&
699 (word.charAt(lastPos - 2) != 'o'))) {
700 word = word.substring(0, lastPos - 1).concat("l");
701 lastPos--;
702 }
703 }
704
705 // Rule 10
706 if (word.endsWith("bex")) {
707 word = word.substring(0, lastPos - 2).concat("bic");
708 }
709
710 // Rule 11
711 if (word.endsWith("dex")) {
712 word = word.substring(0, lastPos - 2).concat("dic");
713 }
714
715 // Rule 12
716 if (word.endsWith("pex")) {
717 word = word.substring(0, lastPos - 2).concat("pic");
718 }
719
720 // Rule 13
721 if (word.endsWith("tex")) {
722 word = word.substring(0, lastPos - 2).concat("tic");
723 }
724
725 // Rule 14
726 if (word.endsWith("ax")) {
727 word = word.substring(0, lastPos - 1).concat("ac");
728 }
729
730 // Rule 15
731 if (word.endsWith("ex")) {
732 word = word.substring(0, lastPos - 1).concat("ec");
733 }
734
735 // Rule 16
736 if (word.endsWith("ix")) {
737 word = word.substring(0, lastPos - 1).concat("ic");
738 }
739
740 // Rule 17
741 if (word.endsWith("lux")) {
742 word = word.substring(0, lastPos - 2).concat("luc");
743 }
744
745 // Rule 18
746 if (word.endsWith("uad")) {
747 word = word.substring(0, lastPos - 2).concat("uas");
748 }
749
750 // Rule 19
751 if (word.endsWith("vad")) {
752 word = word.substring(0, lastPos - 2).concat("vas");
753 }
754
755 // Rule 20
756 if (word.endsWith("cid")) {
757 word = word.substring(0, lastPos - 2).concat("cis");
758 }
759
760 // Rule 21
761 if (word.endsWith("lid")) {
762 word = word.substring(0, lastPos - 2).concat("lis");
763 }
764
765 // Rule 22
766 if (word.endsWith("erid")) {
767 word = word.substring(0, lastPos - 3).concat("eris");
768 }
769
770 // Rule 23
771 if (word.endsWith("pand")) {
772 word = word.substring(0, lastPos - 3).concat("pans");
773 }
774
775 // Rule 24
776 if (word.endsWith("end")) {
777 if ((lastPos - 3 < 0) ||
778 (word.charAt(lastPos - 3) != 's')) {
779 word = word.substring(0, lastPos - 2).concat("ens");
780 }
781 }
782
783 // Rule 25
784 if (word.endsWith("ond")) {
785 word = word.substring(0, lastPos - 2).concat("ons");
786 }
787
788 // Rule 26
789 if (word.endsWith("lud")) {
790 word = word.substring(0, lastPos - 2).concat("lus");
791 }
792
793 // Rule 27
794 if (word.endsWith("rud")) {
795 word = word.substring(0, lastPos - 2).concat("rus");
796 }
797
798 // Rule 28
799 if (word.endsWith("her")) {
800 if ((lastPos - 3 < 0) ||
801 ((word.charAt(lastPos - 3) != 'p') &&
802 (word.charAt(lastPos - 3) != 't'))) {
803 word = word.substring(0, lastPos - 2).concat("hes");
804 }
805 }
806
807 // Rule 29
808 if (word.endsWith("mit")) {
809 word = word.substring(0, lastPos - 2).concat("mis");
810 }
811
812 // Rule 30
813 if (word.endsWith("end")) {
814 if ((lastPos - 3 < 0) ||
815 (word.charAt(lastPos - 3) != 'm')) {
816 word = word.substring(0, lastPos - 2).concat("ens");
817 }
818 }
819
820 // Rule 31
821 if (word.endsWith("ert")) {
822 word = word.substring(0, lastPos - 2).concat("ers");
823 }
824
825 // Rule 32
826 if (word.endsWith("et")) {
827 if ((lastPos - 2 < 0) ||
828 (word.charAt(lastPos - 2) != 'n')) {
829 word = word.substring(0, lastPos - 1).concat("es");
830 }
831 }
832
833 // Rule 33
834 if (word.endsWith("yt")) {
835 word = word.substring(0, lastPos - 1).concat("ys");
836 }
837
838 // Rule 34
839 if (word.endsWith("yz")) {
840 word = word.substring(0, lastPos - 1).concat("ys");
841 }
842
843 return word;
844 }
845
846 /**
847 * Returns the stemmed version of the given word.
848 *
849 * @param word a string consisting of a single word
850 */
851 public String stem(String word) {
852
853 if (word.length() > 2) {
854 return recodeEnding(removeEnding(word.toLowerCase()));
855 } else {
856 return word.toLowerCase();
857 }
858 }
859
860 /**
861 * Stems text coming into stdin and writes it to stdout.
862 */
863 public static void main(String[] ops) {
864
865 LovinsStemmer ls = new LovinsStemmer();
866
867 try {
868 int num;
869 StringBuffer wordBuffer = new StringBuffer();
870 while ((num = System.in.read()) != -1) {
871 char c = (char)num;
872 if (((num >= (int)'A') && (num <= (int)'Z')) ||
873 ((num >= (int)'a') && (num <= (int)'z'))) {
874 wordBuffer.append(c);
875 } else {
876 if (wordBuffer.length() > 0) {
877 System.out.print(ls.stem(wordBuffer.toString().
878 toLowerCase()));
879 wordBuffer = new StringBuffer();
880 }
881 System.out.print(c);
882 }
883 }
884 } catch (Exception e) {
885 System.err.println(e.getMessage());
886 }
887 }
888}
889
890
Note: See TracBrowser for help on using the repository browser.