source: trunk/indexers/mg/lib/lovinstem.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.3 KB
Line 
1/**************************************************************************
2 *
3 * lovinstem.c -- Stemming code
4 * Copyright (C) 1994 Linh Huynh ([email protected])
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: lovinstem.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24static char *RCSID = "$Id: lovinstem.c 3745 2003-02-20 21:20:24Z mdewsnip $";
25
26#include "sysfuncs.h"
27
28
29/*****************************************************************************
30 *
31 * Transformational Rules used in Recoding Stem Terminations
32 *
33 * Parameter in functions:
34 * ch : the second last character of the old ending
35 * (after removing one of double consonants)
36 *
37 ****************************************************************************/
38
39
40/************ Conditional rules associated with transformations **************/
41
42static int aio ();
43static int s ();
44static int pt ();
45static int m ();
46static int n ();
47
48static int
49aio (ch) /* Rule number 9 */
50 char ch;
51{
52 return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
53}
54
55static int
56s (ch) /* Rule Number 24 */
57 char ch;
58{
59 return (ch != 's');
60}
61
62static int
63pt (ch) /* Rule number 28 */
64 char ch;
65{
66 return ((ch != 'p') && (ch != 't'));
67}
68
69static int
70m (ch) /* Rule number 30 */
71 char ch;
72{
73 return (ch != 'm');
74}
75
76static int
77n (ch) /* Rule number 32 */
78 char ch;
79{
80 return (ch != 'n');
81}
82
83
84/**************************************************************************
85 *
86 * Context-sensitive rules associated with certain endings.
87 *
88 * The notations of the rules are the same as in Lovins' paper
89 * except that rule A is replaced by a NULL in the data structure.
90 *
91 * Function parameters:
92 * - stem_length: possible stem length
93 * - end : pointer points to the end of the possible stem.
94 *
95 *************************************************************************/
96
97
98/********* Context-sensitive rule function declarations ******************/
99
100static int B ();
101static int C ();
102static int D ();
103static int E ();
104static int F ();
105static int G ();
106static int H ();
107static int I ();
108static int J ();
109static int K ();
110static int L ();
111static int M ();
112static int N ();
113static int O ();
114static int P ();
115static int Q ();
116static int R ();
117static int S ();
118static int T ();
119static int U ();
120static int V ();
121static int W ();
122static int X ();
123static int Y ();
124static int Z ();
125static int AA ();
126static int BB ();
127static int CC ();
128
129static int
130B (stem_length, end)
131 int stem_length;
132 char *end;
133{
134 return (stem_length >= 3);
135}
136
137static int
138C (stem_length, end)
139 int stem_length;
140 char *end;
141{
142 return (stem_length >= 4);
143}
144
145static int
146D (stem_length, end)
147 int stem_length;
148 char *end;
149{
150 return (stem_length >= 5);
151}
152
153static int
154E (stem_length, end)
155 int stem_length;
156 char *end;
157{
158 return (*end != 'e');
159}
160
161static int
162F (stem_length, end)
163 int stem_length;
164 char *end;
165{
166 return ((stem_length >= 3) && (*end != 'e'));
167}
168
169static int
170G (stem_length, end)
171 int stem_length;
172 char *end;
173{
174 return ((stem_length >= 3) && (*end == 'f'));
175}
176
177static int
178H (stem_length, end)
179 int stem_length;
180 char *end;
181{
182 char p1, p2;
183 p1 = *end;
184 p2 = *(end - 1);
185 return ((p1 == 't') || ((p1 == 'l') && (p2 == 'l')));
186}
187
188static int
189I (stem_length, end)
190 int stem_length;
191 char *end;
192{
193 char p1;
194 p1 = *end;
195 return ((p1 != 'o') && (p1 != 'e'));
196}
197
198static int
199J (stem_length, end)
200 int stem_length;
201 char *end;
202{
203 char p1;
204 p1 = *end;
205 return ((p1 != 'a') && (p1 != 'e'));
206}
207
208static int
209K (stem_length, end)
210 int stem_length;
211 char *end;
212{
213 char p1;
214 p1 = *end;
215 return ((stem_length >= 3) &&
216 ((p1 == 'l') || (p1 == 'i') ||
217 ((p1 == 'e') && (*(end - 2) == 'u'))));
218}
219
220static int
221L (stem_length, end)
222 int stem_length;
223 char *end;
224{
225 char p1, p2;
226 p1 = *end;
227 p2 = *(end - 1);
228 if (p1 == 's')
229 return (p2 == 'o');
230 else
231 return ((p1 != 'u') && (p1 != 'x'));
232}
233
234static int
235M (stem_length, end)
236 int stem_length;
237 char *end;
238{
239 char p1;
240 p1 = *end;
241 return ((p1 != 'a') && (p1 != 'c') &&
242 (p1 != 'e') && (p1 != 'm'));
243}
244
245static int
246N (stem_length, end)
247 int stem_length;
248 char *end;
249{
250 if (stem_length >= 3)
251 {
252 if (*(end - 2) == 's')
253 return (stem_length >= 4);
254 else
255 return 1;
256 }
257 else
258 return 0;
259}
260
261static int
262O (stem_length, end)
263 int stem_length;
264 char *end;
265{
266 char p1;
267 p1 = *end;
268 return ((p1 == 'l') || (p1 == 'i'));
269}
270
271static int
272P (stem_length, end)
273 int stem_length;
274 char *end;
275{
276 return (*end != 'c');
277}
278
279static int
280Q (stem_length, end)
281 int stem_length;
282 char *end;
283{
284 char p1;
285 p1 = *end;
286
287 return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
288}
289
290static int
291R (stem_length, end)
292 int stem_length;
293 char *end;
294{
295 char p1;
296 p1 = *end;
297 return ((p1 == 'n') || (p1 == 'r'));
298}
299
300static int
301S (stem_length, end)
302 int stem_length;
303 char *end;
304{
305 char p1, p2;
306 p1 = *end;
307 p2 = *(end - 1);
308
309 if (p1 == 't')
310 return (p2 != 't');
311 else
312 return ((p1 == 'r') && (p2 == 'd'));
313}
314
315static int
316T (stem_length, end)
317 int stem_length;
318 char *end;
319{
320 char p1, p2;
321 p1 = *end;
322 p2 = *(end - 1);
323
324 if (p1 == 't')
325 return (p2 != 'o');
326 else
327 return (p1 == 's');
328}
329
330static int
331U (stem_length, end)
332 int stem_length;
333 char *end;
334{
335 char p1;
336 p1 = *end;
337
338 return ((p1 == 'l') || (p1 == 'm') ||
339 (p1 == 'n') || (p1 == 'r'));
340}
341
342static int
343V (stem_length, end)
344 int stem_length;
345 char *end;
346{
347 return (*end == 'c');
348}
349
350static int
351W (stem_length, end)
352 int stem_length;
353 char *end;
354{
355 char p1;
356 p1 = *end;
357
358 return ((p1 != 's') && (p1 != 'u'));
359}
360
361static int
362X (stem_length, end)
363 int stem_length;
364 char *end;
365{
366 char p1;
367 p1 = *end;
368
369 if (p1 == 'e')
370 return ((stem_length >= 3) && (*(end - 2) == 'u'));
371 else
372 return ((p1 == 'l') || (p1 == 'i'));
373}
374
375static int
376Y (stem_length, end)
377 int stem_length;
378 char *end;
379{
380 return ((*end == 'n') && (*(end - 1) == 'i'));
381}
382
383static int
384Z (stem_length, end)
385 int stem_length;
386 char *end;
387{
388 return (*end != 'f');
389}
390
391static int
392AA (stem_length, end)
393 int stem_length;
394 char *end;
395{
396 char p1, p2;
397 p1 = *end;
398 p2 = *(end - 1);
399
400 if (p1 == 'h')
401 return ((p2 == 'p') || (p2 == 't'));
402 else if (p1 == 'r')
403 return ((p2 == 'e') || (p2 == 'o'));
404 else if (p1 == 's')
405 return (p2 == 'e');
406 else
407 return ((p1 == 'd') || (p1 == 'f') ||
408 (p1 == 'l') || (p1 == 't'));
409}
410
411static int
412BB (stem_length, end)
413 int stem_length;
414 char *end;
415{
416 if (stem_length >= 3)
417 {
418 if (stem_length >= 4)
419 return (strncmp (end - 3, "ryst", 4) != 0);
420 else
421 return (strncmp (end - 2, "met", 3) != 0);
422 }
423 else
424 return 0;
425}
426
427static int
428CC (stem_length, end)
429 int stem_length;
430 char *end;
431{
432 return (*end == 'l');
433}
434
435/**************************************************************************/
436
437#define MIN_STEM_LENGTH 2
438#define MAX_ENDING_SIZE 11
439#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
440#define EOS '\0'
441/* [RPAP - Jan 97: Stem Index Change] */
442#define MAX_STEM_LENGTH 255
443
444static char *remove_ending (char *word, int w_len);
445static void recode_stem (char *stem_end);
446
447/* =========================================================================
448 * Function: stem
449 * Description:
450 * Implemetation of the Lovins' stemming algorithm described in
451 * J.B. Lovins, "Development of a Stemming Algorithm",
452 * Mechanical Translation and Computational Linguistics, Vol 11,1968.
453 * Input: a word string with the length in the first byte
454 * Output: the stemmed word
455 * ========================================================================= */
456
457void
458lovinstem (u_char * word)
459{
460 char *word_start; /* points to first letter of word */
461 char *stem_end; /* points to last character of stem portion */
462 int old_length; /* length of word */
463 int new_length; /* length after stemming */
464
465 /* [RPAP - Jan 97: Stem Index Change] */
466 u_char copy[MAX_STEM_LENGTH + 2];
467 bcopy ((char *) word, (char *) copy, *word + 1);
468
469 old_length = *copy; /* [RPAP - Jan 97: Stem Index Change] */
470
471 /* The word must be at least MIN_STEM_LENGTH characters long. */
472 if (old_length <= MIN_STEM_LENGTH)
473 return;
474
475 word_start = (char *) copy + 1; /* jump over length byte */ /* [RPAP - Jan 97: Stem Index Change] */
476 copy[old_length + 1] = '\0'; /* null terminate string */ /* [RPAP - Jan 97: Stem Index Change] */
477
478 stem_end = remove_ending (word_start, old_length);
479 recode_stem (stem_end);
480
481 /* fix up new length */
482 /* have to use strlen because have no other way of telling length */
483 new_length = strlen (word_start);
484 *copy = new_length; /* [RPAP - Jan 97: Stem Index Change] */
485
486 /* [RPAP - Jan 97: Stem Index Change] */
487 bcopy ((char *) copy, (char *) word, *copy + 1);
488
489} /*stem_complex */
490
491/* =========================================================================
492
493 * Data structure for recoding and the method for recoding the stem.
494 *
495 * ========================================================================= */
496
497typedef struct
498{
499 char *old_end; /* old ending */
500 char *new_end; /* new ending */
501 char old_offset; /* length of the old ending - 1 */
502 int (*cond) (); /* condition rule */
503 char end_group; /* signal the end of the group */
504}
505Recode_Rules;
506
507/*
508 Stem terminations are divided into groups having the same last
509 character
510 */
511
512static Recode_Rules Rules[] =
513{
514 "uad", "uas", 2, NULL, 0,
515 "vad", "vas", 2, NULL, 0,
516 "cid", "cis", 2, NULL, 0,
517 "lid", "lis", 2, NULL, 0,
518 "erid", "eris", 3, NULL, 0,
519 "pand", "pans", 3, NULL, 0,
520 "end", "ens", 2, s, 0,
521 "end", "ens", 2, m, 0,
522 "ond", "ons", 2, NULL, 0,
523 "lud", "lus", 2, NULL, 0,
524 "rud", "rus", 2, NULL, 1,
525
526 "ul", "l", 1, aio, 1,
527
528 "istr", "ister", 3, NULL, 0,
529 "metr", "meter", 3, NULL, 0,
530 "her", "hes", 2, pt, 1,
531
532 "urs", "ur", 2, NULL, 1,
533
534 "uct", "uc", 2, NULL, 0,
535 "umpt", "um", 3, NULL, 0,
536 "rpt", "rb", 2, NULL, 0,
537 "mit", "mis", 2, NULL, 0,
538 "ert", "ers", 2, NULL, 0,
539 "et", "es", 1, n, 0,
540 "yt", "ys", 1, NULL, 1,
541
542 "iev", "ief", 2, NULL, 0,
543 "olv", "olut", 2, NULL, 1,
544
545 "bex", "bic", 2, NULL, 0,
546 "dex", "dic", 2, NULL, 0,
547 "pex", "pic", 2, NULL, 0,
548 "tex", "tic", 2, NULL, 0,
549 "ax", "ac", 1, NULL, 0,
550 "ex", "ec", 1, NULL, 0,
551 "ix", "ic", 1, NULL, 0,
552 "lux", "luc", 2, NULL, 1,
553
554 "yz", "ys", 1, NULL, 1
555};
556
557typedef struct last
558 {
559 char c; /* The last character */
560 struct last *left; /* used in balanced */
561 struct last *right; /* binary tree */
562 Recode_Rules *pt; /* location of approriate group */
563 }
564Last_Char_Node;
565
566/*------------
567 s
568 / \
569 l x
570 / \ / \
571 d r t z
572 \
573 v
574---------------*/
575static Last_Char_Node pr[] =
576{
577 'd', NULL, NULL, Rules,
578 'l', pr, pr + 2, Rules + 11,
579 'r', NULL, NULL, Rules + 12,
580 's', pr + 1, pr + 6, Rules + 15,
581 't', NULL, pr + 5, Rules + 16,
582 'v', NULL, NULL, Rules + 23,
583 'x', pr + 4, pr + 7, Rules + 25,
584 'z', NULL, NULL, Rules + 33,
585};
586
587
588/*****************************************************************************
589 *
590 * Recode the stem according to the transformation rules.
591 *
592 *****************************************************************************/
593
594static void
595recode_stem (char *stem_end)
596{
597 static Last_Char_Node *root = pr + 3;
598 Last_Char_Node *p_last = root;
599 Recode_Rules *rule; /* points to the transformation list */
600 char *h, /* points to start of possible ending */
601 ch, /* last character of the old ending */
602 ch2;
603
604 h = stem_end - 1;
605 ch = *stem_end;
606 if (*h == ch) /* Check for double consonant */
607 {
608 if (strchr ("bdglmnprst", ch) != NULL)
609 {
610 *stem_end = EOS;
611 stem_end = h;
612 }
613 }
614
615 do /* Check for the last character */
616 {
617 ch2 = p_last->c;
618 if (ch == ch2)
619 break;
620 else if (ch > ch2)
621 p_last = p_last->right;
622 else
623 p_last = p_last->left;
624 }
625 while (p_last != NULL);
626
627 if (p_last != NULL) /* Check for the rest of suffix list */
628 {
629 rule = p_last->pt;
630 for (;;)
631 {
632 h = stem_end - rule->old_offset;
633 if (strcmp (h, rule->old_end) == 0)
634 {
635 if (!rule->cond || (*rule->cond) (*(h - 1)))
636 {
637 (void) strcpy (h, rule->new_end);
638 break;
639 }
640 }
641 if (rule->end_group)
642 break;
643 rule++;
644 }
645 }
646}
647
648/* ======================================================================
649
650 * List of endings and the method to remove the ending
651 *
652 * ======================================================================*/
653
654/************ Data structures for list of endings ********************/
655
656typedef struct
657{
658 char *ending; /* old ending */
659 int (*cond) (); /* conditional rule */
660 signed char left_offset; /* used to find the siblings */
661 signed char right_offset; /* in balanced binary tree */
662}
663Ending_List;
664
665/*
666 The original list of endings is re-organised into groups having
667 the same length and the same first character.
668 */
669
670static Ending_List List[] =
671{
672 "a", NULL, 0, 0,
673
674 "ae", NULL, 0, 0,
675 "al", BB, -1, 2,
676 "ar", X, 0, 0,
677 "as", B, -1, 0,
678
679 "acy", NULL, 0, 1,
680 "age", B, 0, 0,
681 "aic", NULL, -2, 1,
682 "als", BB, 0, 0,
683 "ant", B, -2, 2,
684 "ars", O, 0, 0,
685 "ary", F, -1, 2,
686 "ata", NULL, 0, 0,
687 "ate", NULL, -1, 0,
688
689 "able", NULL, 0, 1,
690 "ably", NULL, 0, 0,
691 "ages", B, -2, 2,
692 "ally", B, 0, 0,
693 "ance", B, -1, 1,
694 "ancy", B, 0, 0,
695 "ants", B, -4, 4,
696 "aric", NULL, 0, 0,
697 "arly", K, -1, 1,
698 "ated", I, 0, 0,
699 "ates", NULL, -2, 2,
700 "atic", B, 0, 0,
701 "ator", NULL, -1, 0,
702
703 "acies", NULL, 0, 0,
704 "acity", NULL, -1, 1,
705 "aging", B, 0, 0,
706 "aical", NULL, -2, 2,
707 "alist", NULL, 0, 0,
708 "alism", B, -1, 0,
709 "ality", NULL, -3, 3,
710 "alize", NULL, 0, 1,
711 "allic", BB, 0, 0,
712 "anced", B, -2, 2,
713 "ances", B, 0, 0,
714 "antic", C, -1, 0,
715 "arial", NULL, -6, 6,
716 "aries", NULL, 0, 1,
717 "arily", NULL, 0, 0,
718 "arity", B, -2, 2,
719 "arize", NULL, 0, 0,
720 "aroid", NULL, -1, 0,
721 "ately", NULL, -3, 3,
722 "ating", I, 0, 1,
723 "ation", B, 0, 0,
724 "ative", NULL, -2, 2,
725 "ators", NULL, 0, 0,
726 "atory", NULL, -1, 1,
727 "ature", E, 0, 0,
728
729 "aceous", NULL, 0, 1,
730 "acious", B, 0, 0,
731 "action", G, -2, 2,
732 "alness", NULL, 0, 0,
733 "ancial", NULL, -1, 1,
734 "ancies", NULL, 0, 0,
735 "ancing", B, -4, 4,
736 "ariser", NULL, 0, 0,
737 "arized", NULL, -1, 1,
738 "arizer", NULL, 0, 0,
739 "atable", NULL, -2, 2,
740 "ations", B, 0, 0,
741 "atives", NULL, -1, 0,
742
743 "ability", NULL, 0, 1,
744 "aically", NULL, 0, 0,
745 "alistic", B, -2, 2,
746 "alities", NULL, 0, 0,
747 "ariness", E, -1, 0,
748 "aristic", NULL, -3, 3,
749 "arizing", NULL, 0, 1,
750 "ateness", NULL, 0, 0,
751 "atingly", NULL, -2, 2,
752 "ational", B, 0, 0,
753 "atively", NULL, -1, 1,
754 "ativism", NULL, 0, 0,
755
756 "ableness", NULL, 0, 1,
757 "arizable", NULL, 0, 0,
758
759 "allically", C, 0, 0,
760 "antaneous", NULL, -1, 1,
761 "antiality", NULL, 0, 0,
762 "arisation", NULL, -2, 2,
763 "arization", NULL, 0, 0,
764 "ationally", B, -1, 1,
765 "ativeness", NULL, 0, 0,
766
767 "antialness", NULL, 0, 0,
768 "arisations", NULL, -1, 1,
769 "arizations", NULL, 0, 0,
770
771 "alistically", B, 0, 1,
772 "arizability", NULL, 0, 0,
773
774 "e", NULL, 0, 0,
775
776 "ed", E, 0, 0,
777 "en", F, -1, 1,
778 "es", E, 0, 0,
779
780 "eal", Y, 0, 0,
781 "ear", Y, -1, 1,
782 "ely", E, 0, 0,
783 "ene", E, -2, 2,
784 "ent", C, 0, 0,
785 "ery", E, -1, 1,
786 "ese", NULL, 0, 0,
787
788 "ealy", Y, 0, 1,
789 "edly", E, 0, 0,
790 "eful", NULL, -2, 1,
791 "eity", NULL, 0, 0,
792 "ence", NULL, -2, 2,
793 "ency", NULL, 0, 0,
794 "ened", E, -1, 2,
795 "enly", E, 0, 0,
796 "eous", NULL, -1, 0,
797
798 "early", Y, 0, 1,
799 "ehood", NULL, 0, 0,
800 "eless", NULL, -2, 2,
801 "elity", NULL, 0, 0,
802 "ement", NULL, -1, 0,
803 "enced", NULL, -3, 3,
804 "ences", NULL, 0, 1,
805 "eness", E, 0, 0,
806 "ening", E, -2, 2,
807 "ental", NULL, 0, 0,
808 "ented", C, -1, 1,
809 "ently", NULL, 0, 0,
810
811 "eature", Z, 0, 0,
812 "efully", NULL, -1, 1,
813 "encies", NULL, 0, 0,
814 "encing", NULL, -2, 2,
815 "ential", NULL, 0, 0,
816 "enting", C, -1, 1,
817 "entist", NULL, 0, 1,
818 "eously", NULL, 0, 0,
819
820 "elihood", E, 0, 1,
821 "encible", NULL, 0, 0,
822 "entally", NULL, -2, 2,
823 "entials", NULL, 0, 0,
824 "entiate", NULL, -1, 1,
825 "entness", NULL, 0, 0,
826
827 "entation", NULL, 0, 0,
828 "entially", NULL, -1, 1,
829 "eousness", NULL, 0, 0,
830
831 "eableness", E, 0, 1,
832 "entations", NULL, 0, 0,
833 "entiality", NULL, -2, 2,
834 "entialize", NULL, 0, 0,
835 "entiation", NULL, -1, 0,
836
837 "entialness", NULL, 0, 0,
838
839 "ful", NULL, 0, 0,
840
841 "fully", NULL, 0, 0,
842
843 "fulness", NULL, 0, 0,
844
845 "hood", NULL, 0, 0,
846
847 "i", NULL, 0, 0,
848
849 "ia", NULL, 0, 0,
850 "ic", NULL, -1, 1,
851 "is", NULL, 0, 0,
852
853 "ial", NULL, 0, 0,
854 "ian", NULL, -1, 1,
855 "ics", NULL, 0, 1,
856 "ide", L, 0, 0,
857 "ied", NULL, -3, 3,
858 "ier", NULL, 0, 0,
859 "ies", P, -1, 0,
860 "ily", NULL, -1, 1,
861 "ine", M, 0, 0,
862 "ing", N, -5, 5,
863 "ion", Q, 0, 0,
864 "ish", C, -1, 1,
865 "ism", B, 0, 1,
866 "ist", NULL, 0, 0,
867 "ite", AA, -3, 3,
868 "ity", NULL, 0, 0,
869 "ium", NULL, -1, 0,
870 "ive", NULL, -1, 1,
871 "ize", F, 0, 0,
872
873 "ials", NULL, 0, 0,
874 "ians", NULL, -1, 0,
875 "ible", NULL, -1, 1,
876 "ibly", NULL, 0, 0,
877 "ical", NULL, -2, 2,
878 "ides", L, 0, 0,
879 "iers", NULL, -1, 1,
880 "iful", NULL, 0, 0,
881 "ines", M, -4, 4,
882 "ings", N, 0, 0,
883 "ions", B, -1, 1,
884 "ious", NULL, 0, 0,
885 "isms", B, -2, 2,
886 "ists", NULL, 0, 0,
887 "itic", H, -1, 1,
888 "ized", F, 0, 1,
889 "izer", F, 0, 0,
890
891 "ially", NULL, 0, 0,
892 "icant", NULL, -1, 1,
893 "ician", NULL, 0, 0,
894 "icide", NULL, -2, 2,
895 "icism", NULL, 0, 0,
896 "icist", NULL, -1, 0,
897 "icity", NULL, -3, 3,
898 "idine", I, 0, 1,
899 "iedly", NULL, 0, 0,
900 "ihood", NULL, -2, 2,
901 "inate", NULL, 0, 0,
902 "iness", NULL, -1, 0,
903 "ingly", B, -6, 6,
904 "inism", J, 0, 1,
905 "inity", CC, 0, 0,
906 "ional", NULL, -2, 2,
907 "ioned", NULL, 0, 0,
908 "ished", NULL, -1, 0,
909 "istic", NULL, -3, 3,
910 "ities", NULL, 0, 1,
911 "itous", NULL, 0, 0,
912 "ively", NULL, -2, 2,
913 "ivity", NULL, 0, 0,
914 "izers", F, -1, 1,
915 "izing", F, 0, 0,
916
917 "ialist", NULL, 0, 0,
918 "iality", NULL, -1, 1,
919 "ialize", NULL, 0, 0,
920 "ically", NULL, -2, 2,
921 "icance", NULL, 0, 0,
922 "icians", NULL, -1, 1,
923 "icists", NULL, 0, 0,
924 "ifully", NULL, -4, 4,
925 "ionals", NULL, 0, 0,
926 "ionate", D, -1, 1,
927 "ioning", NULL, 0, 0,
928 "ionist", NULL, -2, 2,
929 "iously", NULL, 0, 0,
930 "istics", NULL, -1, 1,
931 "izable", E, 0, 0,
932
933 "ibility", NULL, 0, 0,
934 "icalism", NULL, -1, 1,
935 "icalist", NULL, 0, 1,
936 "icality", NULL, 0, 0,
937 "icalize", NULL, -3, 3,
938 "ication", G, 0, 0,
939 "icianry", NULL, -1, 0,
940 "ination", NULL, -1, 1,
941 "ingness", NULL, 0, 0,
942 "ionally", NULL, -5, 5,
943 "isation", NULL, 0, 0,
944 "ishness", NULL, -1, 1,
945 "istical", NULL, 0, 1,
946 "iteness", NULL, 0, 0,
947 "iveness", NULL, -3, 3,
948 "ivistic", NULL, 0, 0,
949 "ivities", NULL, -1, 0,
950 "ization", F, -1, 1,
951 "izement", NULL, 0, 0,
952
953 "ibleness", NULL, 0, 0,
954 "icalness", NULL, -1, 1,
955 "ionalism", NULL, 0, 0,
956 "ionality", NULL, -2, 2,
957 "ionalize", NULL, 0, 0,
958 "iousness", NULL, -1, 1,
959 "izations", NULL, 0, 0,
960
961 "ionalness", NULL, 0, 1,
962 "istically", NULL, 0, 0,
963 "itousness", NULL, -2, 2,
964 "izability", NULL, 0, 0,
965 "izational", NULL, -1, 0,
966
967 "izationally", B, 0, 0,
968
969 "ly", B, 0, 0,
970
971 "less", NULL, 0, 1,
972 "lily", NULL, 0, 0,
973
974 "lessly", NULL, 0, 0,
975
976 "lessness", NULL, 0, 0,
977
978 "ness", NULL, 0, 0,
979
980 "nesses", NULL, 0, 0,
981
982 "o", NULL, 0, 0,
983
984 "on", S, 0, 1,
985 "or", T, 0, 0,
986
987 "oid", NULL, 0, 0,
988 "one", R, -1, 1,
989 "ous", NULL, 0, 0,
990
991 "ogen", NULL, 0, 0,
992
993 "oidal", NULL, 0, 0,
994 "oides", NULL, -1, 2,
995 "otide", NULL, 0, 0,
996 "ously", NULL, -1, 0,
997
998 "oidism", NULL, 0, 0,
999
1000 "oidally", NULL, 0, 1,
1001 "ousness", NULL, 0, 0,
1002
1003 "s", W, 0, 0,
1004
1005 "s'", NULL, 0, 0,
1006
1007 "um", U, 0, 1,
1008 "us", V, 0, 0,
1009
1010 "ward", NULL, 0, 1,
1011 "wise", NULL, 0, 0,
1012
1013 "y", B, 0, 0,
1014
1015 "yl", R, 0, 0,
1016
1017 "ying", B, 0, 1,
1018 "yish", NULL, 0, 0,
1019
1020 "'s", NULL, 0, 0,
1021};
1022
1023typedef struct node
1024 {
1025 char c; /* First character */
1026 struct node *left; /* used in balanced */
1027 struct node *right; /* binary tree */
1028 Ending_List *ptr[11]; /* the approriate location */
1029 }
1030First_Char_Node;
1031
1032static First_Char_Node First[] =
1033{
1034 '\'', NULL, NULL,
1035 {NULL,
1036 List + 293, NULL, NULL, NULL, NULL,
1037 NULL, NULL, NULL, NULL, NULL},
1038
1039 'a', First, NULL,
1040 {List,
1041 List + 2, List + 9, List + 20, List + 39, List + 58,
1042 List + 70, List + 77, List + 82, List + 87, List + 89},
1043
1044 'e', First + 1, First + 4,
1045 {List + 91,
1046 List + 93, List + 98, List + 106, List + 116, List + 126,
1047 List + 133, List + 138, List + 142, List + 145, NULL},
1048
1049 'f', NULL, NULL,
1050 {NULL,
1051 NULL, List + 146, NULL, List + 147, NULL,
1052 List + 148, NULL, NULL, NULL, NULL},
1053
1054 'h', First + 3, First + 5,
1055 {NULL,
1056 NULL, NULL, List + 149, NULL, NULL,
1057 NULL, NULL, NULL, NULL, NULL},
1058
1059 'i', NULL, NULL,
1060 {List + 150,
1061 List + 152, List + 163, List + 181, List + 202, List + 222,
1062 List + 239, List + 252, List + 258, NULL, List + 261},
1063
1064 'l', First + 2, First + 10,
1065 {NULL,
1066 List + 262, NULL, List + 263, NULL, List + 265,
1067 NULL, List + 266, NULL, NULL, NULL},
1068
1069 'n', NULL, NULL,
1070 {NULL,
1071 NULL, NULL, List + 267, NULL, List + 268,
1072 NULL, NULL, NULL, NULL, NULL},
1073
1074 'o', First + 7, First + 9,
1075 {List + 269,
1076 List + 270, List + 273, List + 275, List + 277, List + 280,
1077 List + 281, NULL, NULL, NULL, NULL},
1078
1079 's', NULL, NULL,
1080 {List + 283,
1081 List + 284, NULL, NULL, NULL, NULL,
1082 NULL, NULL, NULL, NULL, NULL},
1083
1084 'u', First + 8, First + 12,
1085 {NULL,
1086 List + 285, NULL, NULL, NULL, NULL,
1087 NULL, NULL, NULL, NULL, NULL},
1088
1089 'w', NULL, NULL,
1090 {NULL,
1091 NULL, NULL, List + 287, NULL, NULL,
1092 NULL, NULL, NULL, NULL, NULL},
1093
1094 'y', First + 11, NULL,
1095 {List + 289,
1096 List + 290, NULL, List + 291, NULL, NULL,
1097 NULL, NULL, NULL, NULL, NULL},
1098};
1099
1100
1101/******************************************************************************
1102 *
1103 * Look for a match in the suffix list.
1104 * Return the pointer to the end of the new stem if there is a match.
1105 * Otherwise, return the pointer to the end of the original word.
1106 *
1107 * Method:
1108 * * Search for the first character in the suffix list.
1109 * * If there is match, search for the rest of the suffix list.
1110 *
1111 *****************************************************************************/
1112
1113static char *
1114remove_ending (char *word, int w_len)
1115{
1116 static First_Char_Node *root = First + 6; /* the root of binary tree is 'l' */
1117 First_Char_Node *p_first; /* Points to the first character */
1118 /* of the possible suffix */
1119 Ending_List *p_list; /* Points to the List of Endings */
1120 char *suffix_start, /* Points to start of possible suffix */
1121 *stem_end, /* Points to the end of possible stem */
1122 ch1, ch2;
1123 int s_len, /* Possible stem length */
1124 e_offset, /* Offset from the end to the start of */
1125 /* possible suffix */
1126 not_done = 1, cmp;
1127
1128 stem_end = word + w_len - 1;
1129 if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1130 suffix_start = word + w_len - MAX_ENDING_SIZE;
1131 else
1132 suffix_start = word + MIN_STEM_LENGTH;
1133
1134 ch1 = *suffix_start;
1135 do /* Search for the first character */
1136 {
1137 p_first = root;
1138 do
1139 {
1140 ch2 = p_first->c;
1141 if (ch1 == ch2)
1142 break;
1143 else if (ch1 > ch2)
1144 p_first = p_first->right;
1145 else
1146 p_first = p_first->left;
1147 }
1148 while (p_first != NULL);
1149
1150 if (p_first != NULL) /* Search for the rest */
1151 {
1152 e_offset = stem_end - suffix_start;
1153 if ((p_list = p_first->ptr[e_offset]) != NULL)
1154 {
1155 for (;;) /* no need to compare the first char */
1156 {
1157 cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1158 if (cmp > 0)
1159 if (p_list->right_offset)
1160 p_list += p_list->right_offset;
1161 else
1162 break;
1163 else if (cmp < 0)
1164 if (p_list->left_offset)
1165 p_list += p_list->left_offset;
1166 else
1167 break;
1168 else
1169 {
1170 s_len = suffix_start - word;
1171 if (!p_list->cond ||
1172 (*p_list->cond) (s_len, suffix_start - 1))
1173 {
1174 *suffix_start = EOS;
1175 stem_end = suffix_start - 1;
1176 not_done = 0;
1177 }
1178 break;
1179 }
1180 }
1181 }
1182 }
1183 suffix_start++;
1184 }
1185 while (not_done && ((ch1 = *suffix_start) != EOS));
1186 return (stem_end);
1187}
Note: See TracBrowser for help on using the repository browser.