source: trunk/indexers/mg/lib/lovinstem.c@ 13654

Last change on this file since 13654 was 13654, checked in by kjdon, 17 years ago

tidied up the top comments, removed Ids, and old log messages

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.3 KB
Line 
1/**************************************************************************
2 *
3 * lovinstem.c -- Stemming code
4 * Copyright (C) 1994 Linh Huynh ([email protected])
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24
25/*****************************************************************************
26 *
27 * Transformational Rules used in Recoding Stem Terminations
28 *
29 * Parameter in functions:
30 * ch : the second last character of the old ending
31 * (after removing one of double consonants)
32 *
33 ****************************************************************************/
34
35
36/************ Conditional rules associated with transformations **************/
37
38static int aio ();
39static int s ();
40static int pt ();
41static int m ();
42static int n ();
43
44static int
45aio (ch) /* Rule number 9 */
46 char ch;
47{
48 return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
49}
50
51static int
52s (ch) /* Rule Number 24 */
53 char ch;
54{
55 return (ch != 's');
56}
57
58static int
59pt (ch) /* Rule number 28 */
60 char ch;
61{
62 return ((ch != 'p') && (ch != 't'));
63}
64
65static int
66m (ch) /* Rule number 30 */
67 char ch;
68{
69 return (ch != 'm');
70}
71
72static int
73n (ch) /* Rule number 32 */
74 char ch;
75{
76 return (ch != 'n');
77}
78
79
80/**************************************************************************
81 *
82 * Context-sensitive rules associated with certain endings.
83 *
84 * The notations of the rules are the same as in Lovins' paper
85 * except that rule A is replaced by a NULL in the data structure.
86 *
87 * Function parameters:
88 * - stem_length: possible stem length
89 * - end : pointer points to the end of the possible stem.
90 *
91 *************************************************************************/
92
93
94/********* Context-sensitive rule function declarations ******************/
95
96static int B ();
97static int C ();
98static int D ();
99static int E ();
100static int F ();
101static int G ();
102static int H ();
103static int I ();
104static int J ();
105static int K ();
106static int L ();
107static int M ();
108static int N ();
109static int O ();
110static int P ();
111static int Q ();
112static int R ();
113static int S ();
114static int T ();
115static int U ();
116static int V ();
117static int W ();
118static int X ();
119static int Y ();
120static int Z ();
121static int AA ();
122static int BB ();
123static int CC ();
124
125static int
126B (stem_length, end)
127 int stem_length;
128 char *end;
129{
130 return (stem_length >= 3);
131}
132
133static int
134C (stem_length, end)
135 int stem_length;
136 char *end;
137{
138 return (stem_length >= 4);
139}
140
141static int
142D (stem_length, end)
143 int stem_length;
144 char *end;
145{
146 return (stem_length >= 5);
147}
148
149static int
150E (stem_length, end)
151 int stem_length;
152 char *end;
153{
154 return (*end != 'e');
155}
156
157static int
158F (stem_length, end)
159 int stem_length;
160 char *end;
161{
162 return ((stem_length >= 3) && (*end != 'e'));
163}
164
165static int
166G (stem_length, end)
167 int stem_length;
168 char *end;
169{
170 return ((stem_length >= 3) && (*end == 'f'));
171}
172
173static int
174H (stem_length, end)
175 int stem_length;
176 char *end;
177{
178 char p1, p2;
179 p1 = *end;
180 p2 = *(end - 1);
181 return ((p1 == 't') || ((p1 == 'l') && (p2 == 'l')));
182}
183
184static int
185I (stem_length, end)
186 int stem_length;
187 char *end;
188{
189 char p1;
190 p1 = *end;
191 return ((p1 != 'o') && (p1 != 'e'));
192}
193
194static int
195J (stem_length, end)
196 int stem_length;
197 char *end;
198{
199 char p1;
200 p1 = *end;
201 return ((p1 != 'a') && (p1 != 'e'));
202}
203
204static int
205K (stem_length, end)
206 int stem_length;
207 char *end;
208{
209 char p1;
210 p1 = *end;
211 return ((stem_length >= 3) &&
212 ((p1 == 'l') || (p1 == 'i') ||
213 ((p1 == 'e') && (*(end - 2) == 'u'))));
214}
215
216static int
217L (stem_length, end)
218 int stem_length;
219 char *end;
220{
221 char p1, p2;
222 p1 = *end;
223 p2 = *(end - 1);
224 if (p1 == 's')
225 return (p2 == 'o');
226 else
227 return ((p1 != 'u') && (p1 != 'x'));
228}
229
230static int
231M (stem_length, end)
232 int stem_length;
233 char *end;
234{
235 char p1;
236 p1 = *end;
237 return ((p1 != 'a') && (p1 != 'c') &&
238 (p1 != 'e') && (p1 != 'm'));
239}
240
241static int
242N (stem_length, end)
243 int stem_length;
244 char *end;
245{
246 if (stem_length >= 3)
247 {
248 if (*(end - 2) == 's')
249 return (stem_length >= 4);
250 else
251 return 1;
252 }
253 else
254 return 0;
255}
256
257static int
258O (stem_length, end)
259 int stem_length;
260 char *end;
261{
262 char p1;
263 p1 = *end;
264 return ((p1 == 'l') || (p1 == 'i'));
265}
266
267static int
268P (stem_length, end)
269 int stem_length;
270 char *end;
271{
272 return (*end != 'c');
273}
274
275static int
276Q (stem_length, end)
277 int stem_length;
278 char *end;
279{
280 char p1;
281 p1 = *end;
282
283 return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
284}
285
286static int
287R (stem_length, end)
288 int stem_length;
289 char *end;
290{
291 char p1;
292 p1 = *end;
293 return ((p1 == 'n') || (p1 == 'r'));
294}
295
296static int
297S (stem_length, end)
298 int stem_length;
299 char *end;
300{
301 char p1, p2;
302 p1 = *end;
303 p2 = *(end - 1);
304
305 if (p1 == 't')
306 return (p2 != 't');
307 else
308 return ((p1 == 'r') && (p2 == 'd'));
309}
310
311static int
312T (stem_length, end)
313 int stem_length;
314 char *end;
315{
316 char p1, p2;
317 p1 = *end;
318 p2 = *(end - 1);
319
320 if (p1 == 't')
321 return (p2 != 'o');
322 else
323 return (p1 == 's');
324}
325
326static int
327U (stem_length, end)
328 int stem_length;
329 char *end;
330{
331 char p1;
332 p1 = *end;
333
334 return ((p1 == 'l') || (p1 == 'm') ||
335 (p1 == 'n') || (p1 == 'r'));
336}
337
338static int
339V (stem_length, end)
340 int stem_length;
341 char *end;
342{
343 return (*end == 'c');
344}
345
346static int
347W (stem_length, end)
348 int stem_length;
349 char *end;
350{
351 char p1;
352 p1 = *end;
353
354 return ((p1 != 's') && (p1 != 'u'));
355}
356
357static int
358X (stem_length, end)
359 int stem_length;
360 char *end;
361{
362 char p1;
363 p1 = *end;
364
365 if (p1 == 'e')
366 return ((stem_length >= 3) && (*(end - 2) == 'u'));
367 else
368 return ((p1 == 'l') || (p1 == 'i'));
369}
370
371static int
372Y (stem_length, end)
373 int stem_length;
374 char *end;
375{
376 return ((*end == 'n') && (*(end - 1) == 'i'));
377}
378
379static int
380Z (stem_length, end)
381 int stem_length;
382 char *end;
383{
384 return (*end != 'f');
385}
386
387static int
388AA (stem_length, end)
389 int stem_length;
390 char *end;
391{
392 char p1, p2;
393 p1 = *end;
394 p2 = *(end - 1);
395
396 if (p1 == 'h')
397 return ((p2 == 'p') || (p2 == 't'));
398 else if (p1 == 'r')
399 return ((p2 == 'e') || (p2 == 'o'));
400 else if (p1 == 's')
401 return (p2 == 'e');
402 else
403 return ((p1 == 'd') || (p1 == 'f') ||
404 (p1 == 'l') || (p1 == 't'));
405}
406
407static int
408BB (stem_length, end)
409 int stem_length;
410 char *end;
411{
412 if (stem_length >= 3)
413 {
414 if (stem_length >= 4)
415 return (strncmp (end - 3, "ryst", 4) != 0);
416 else
417 return (strncmp (end - 2, "met", 3) != 0);
418 }
419 else
420 return 0;
421}
422
423static int
424CC (stem_length, end)
425 int stem_length;
426 char *end;
427{
428 return (*end == 'l');
429}
430
431/**************************************************************************/
432
433#define MIN_STEM_LENGTH 2
434#define MAX_ENDING_SIZE 11
435#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
436#define EOS '\0'
437/* [RPAP - Jan 97: Stem Index Change] */
438#define MAX_STEM_LENGTH 255
439
440static char *remove_ending (char *word, int w_len);
441static void recode_stem (char *stem_end);
442
443/* =========================================================================
444 * Function: stem
445 * Description:
446 * Implemetation of the Lovins' stemming algorithm described in
447 * J.B. Lovins, "Development of a Stemming Algorithm",
448 * Mechanical Translation and Computational Linguistics, Vol 11,1968.
449 * Input: a word string with the length in the first byte
450 * Output: the stemmed word
451 * ========================================================================= */
452
453void
454lovinstem (u_char * word)
455{
456 char *word_start; /* points to first letter of word */
457 char *stem_end; /* points to last character of stem portion */
458 int old_length; /* length of word */
459 int new_length; /* length after stemming */
460
461 /* [RPAP - Jan 97: Stem Index Change] */
462 u_char copy[MAX_STEM_LENGTH + 2];
463 bcopy ((char *) word, (char *) copy, *word + 1);
464
465 old_length = *copy; /* [RPAP - Jan 97: Stem Index Change] */
466
467 /* The word must be at least MIN_STEM_LENGTH characters long. */
468 if (old_length <= MIN_STEM_LENGTH)
469 return;
470
471 word_start = (char *) copy + 1; /* jump over length byte */ /* [RPAP - Jan 97: Stem Index Change] */
472 copy[old_length + 1] = '\0'; /* null terminate string */ /* [RPAP - Jan 97: Stem Index Change] */
473
474 stem_end = remove_ending (word_start, old_length);
475 recode_stem (stem_end);
476
477 /* fix up new length */
478 /* have to use strlen because have no other way of telling length */
479 new_length = strlen (word_start);
480 *copy = new_length; /* [RPAP - Jan 97: Stem Index Change] */
481
482 /* [RPAP - Jan 97: Stem Index Change] */
483 bcopy ((char *) copy, (char *) word, *copy + 1);
484
485} /*stem_complex */
486
487/* =========================================================================
488
489 * Data structure for recoding and the method for recoding the stem.
490 *
491 * ========================================================================= */
492
493typedef struct
494{
495 char *old_end; /* old ending */
496 char *new_end; /* new ending */
497 char old_offset; /* length of the old ending - 1 */
498 int (*cond) (); /* condition rule */
499 char end_group; /* signal the end of the group */
500}
501Recode_Rules;
502
503/*
504 Stem terminations are divided into groups having the same last
505 character
506 */
507
508static Recode_Rules Rules[] =
509{
510 "uad", "uas", 2, NULL, 0,
511 "vad", "vas", 2, NULL, 0,
512 "cid", "cis", 2, NULL, 0,
513 "lid", "lis", 2, NULL, 0,
514 "erid", "eris", 3, NULL, 0,
515 "pand", "pans", 3, NULL, 0,
516 "end", "ens", 2, s, 0,
517 "end", "ens", 2, m, 0,
518 "ond", "ons", 2, NULL, 0,
519 "lud", "lus", 2, NULL, 0,
520 "rud", "rus", 2, NULL, 1,
521
522 "ul", "l", 1, aio, 1,
523
524 "istr", "ister", 3, NULL, 0,
525 "metr", "meter", 3, NULL, 0,
526 "her", "hes", 2, pt, 1,
527
528 "urs", "ur", 2, NULL, 1,
529
530 "uct", "uc", 2, NULL, 0,
531 "umpt", "um", 3, NULL, 0,
532 "rpt", "rb", 2, NULL, 0,
533 "mit", "mis", 2, NULL, 0,
534 "ert", "ers", 2, NULL, 0,
535 "et", "es", 1, n, 0,
536 "yt", "ys", 1, NULL, 1,
537
538 "iev", "ief", 2, NULL, 0,
539 "olv", "olut", 2, NULL, 1,
540
541 "bex", "bic", 2, NULL, 0,
542 "dex", "dic", 2, NULL, 0,
543 "pex", "pic", 2, NULL, 0,
544 "tex", "tic", 2, NULL, 0,
545 "ax", "ac", 1, NULL, 0,
546 "ex", "ec", 1, NULL, 0,
547 "ix", "ic", 1, NULL, 0,
548 "lux", "luc", 2, NULL, 1,
549
550 "yz", "ys", 1, NULL, 1
551};
552
553typedef struct last
554 {
555 char c; /* The last character */
556 struct last *left; /* used in balanced */
557 struct last *right; /* binary tree */
558 Recode_Rules *pt; /* location of approriate group */
559 }
560Last_Char_Node;
561
562/*------------
563 s
564 / \
565 l x
566 / \ / \
567 d r t z
568 \
569 v
570---------------*/
571static Last_Char_Node pr[] =
572{
573 'd', NULL, NULL, Rules,
574 'l', pr, pr + 2, Rules + 11,
575 'r', NULL, NULL, Rules + 12,
576 's', pr + 1, pr + 6, Rules + 15,
577 't', NULL, pr + 5, Rules + 16,
578 'v', NULL, NULL, Rules + 23,
579 'x', pr + 4, pr + 7, Rules + 25,
580 'z', NULL, NULL, Rules + 33,
581};
582
583
584/*****************************************************************************
585 *
586 * Recode the stem according to the transformation rules.
587 *
588 *****************************************************************************/
589
590static void
591recode_stem (char *stem_end)
592{
593 static Last_Char_Node *root = pr + 3;
594 Last_Char_Node *p_last = root;
595 Recode_Rules *rule; /* points to the transformation list */
596 char *h, /* points to start of possible ending */
597 ch, /* last character of the old ending */
598 ch2;
599
600 h = stem_end - 1;
601 ch = *stem_end;
602 if (*h == ch) /* Check for double consonant */
603 {
604 if (strchr ("bdglmnprst", ch) != NULL)
605 {
606 *stem_end = EOS;
607 stem_end = h;
608 }
609 }
610
611 do /* Check for the last character */
612 {
613 ch2 = p_last->c;
614 if (ch == ch2)
615 break;
616 else if (ch > ch2)
617 p_last = p_last->right;
618 else
619 p_last = p_last->left;
620 }
621 while (p_last != NULL);
622
623 if (p_last != NULL) /* Check for the rest of suffix list */
624 {
625 rule = p_last->pt;
626 for (;;)
627 {
628 h = stem_end - rule->old_offset;
629 if (strcmp (h, rule->old_end) == 0)
630 {
631 if (!rule->cond || (*rule->cond) (*(h - 1)))
632 {
633 (void) strcpy (h, rule->new_end);
634 break;
635 }
636 }
637 if (rule->end_group)
638 break;
639 rule++;
640 }
641 }
642}
643
644/* ======================================================================
645
646 * List of endings and the method to remove the ending
647 *
648 * ======================================================================*/
649
650/************ Data structures for list of endings ********************/
651
652typedef struct
653{
654 char *ending; /* old ending */
655 int (*cond) (); /* conditional rule */
656 signed char left_offset; /* used to find the siblings */
657 signed char right_offset; /* in balanced binary tree */
658}
659Ending_List;
660
661/*
662 The original list of endings is re-organised into groups having
663 the same length and the same first character.
664 */
665
666static Ending_List List[] =
667{
668 "a", NULL, 0, 0,
669
670 "ae", NULL, 0, 0,
671 "al", BB, -1, 2,
672 "ar", X, 0, 0,
673 "as", B, -1, 0,
674
675 "acy", NULL, 0, 1,
676 "age", B, 0, 0,
677 "aic", NULL, -2, 1,
678 "als", BB, 0, 0,
679 "ant", B, -2, 2,
680 "ars", O, 0, 0,
681 "ary", F, -1, 2,
682 "ata", NULL, 0, 0,
683 "ate", NULL, -1, 0,
684
685 "able", NULL, 0, 1,
686 "ably", NULL, 0, 0,
687 "ages", B, -2, 2,
688 "ally", B, 0, 0,
689 "ance", B, -1, 1,
690 "ancy", B, 0, 0,
691 "ants", B, -4, 4,
692 "aric", NULL, 0, 0,
693 "arly", K, -1, 1,
694 "ated", I, 0, 0,
695 "ates", NULL, -2, 2,
696 "atic", B, 0, 0,
697 "ator", NULL, -1, 0,
698
699 "acies", NULL, 0, 0,
700 "acity", NULL, -1, 1,
701 "aging", B, 0, 0,
702 "aical", NULL, -2, 2,
703 "alist", NULL, 0, 0,
704 "alism", B, -1, 0,
705 "ality", NULL, -3, 3,
706 "alize", NULL, 0, 1,
707 "allic", BB, 0, 0,
708 "anced", B, -2, 2,
709 "ances", B, 0, 0,
710 "antic", C, -1, 0,
711 "arial", NULL, -6, 6,
712 "aries", NULL, 0, 1,
713 "arily", NULL, 0, 0,
714 "arity", B, -2, 2,
715 "arize", NULL, 0, 0,
716 "aroid", NULL, -1, 0,
717 "ately", NULL, -3, 3,
718 "ating", I, 0, 1,
719 "ation", B, 0, 0,
720 "ative", NULL, -2, 2,
721 "ators", NULL, 0, 0,
722 "atory", NULL, -1, 1,
723 "ature", E, 0, 0,
724
725 "aceous", NULL, 0, 1,
726 "acious", B, 0, 0,
727 "action", G, -2, 2,
728 "alness", NULL, 0, 0,
729 "ancial", NULL, -1, 1,
730 "ancies", NULL, 0, 0,
731 "ancing", B, -4, 4,
732 "ariser", NULL, 0, 0,
733 "arized", NULL, -1, 1,
734 "arizer", NULL, 0, 0,
735 "atable", NULL, -2, 2,
736 "ations", B, 0, 0,
737 "atives", NULL, -1, 0,
738
739 "ability", NULL, 0, 1,
740 "aically", NULL, 0, 0,
741 "alistic", B, -2, 2,
742 "alities", NULL, 0, 0,
743 "ariness", E, -1, 0,
744 "aristic", NULL, -3, 3,
745 "arizing", NULL, 0, 1,
746 "ateness", NULL, 0, 0,
747 "atingly", NULL, -2, 2,
748 "ational", B, 0, 0,
749 "atively", NULL, -1, 1,
750 "ativism", NULL, 0, 0,
751
752 "ableness", NULL, 0, 1,
753 "arizable", NULL, 0, 0,
754
755 "allically", C, 0, 0,
756 "antaneous", NULL, -1, 1,
757 "antiality", NULL, 0, 0,
758 "arisation", NULL, -2, 2,
759 "arization", NULL, 0, 0,
760 "ationally", B, -1, 1,
761 "ativeness", NULL, 0, 0,
762
763 "antialness", NULL, 0, 0,
764 "arisations", NULL, -1, 1,
765 "arizations", NULL, 0, 0,
766
767 "alistically", B, 0, 1,
768 "arizability", NULL, 0, 0,
769
770 "e", NULL, 0, 0,
771
772 "ed", E, 0, 0,
773 "en", F, -1, 1,
774 "es", E, 0, 0,
775
776 "eal", Y, 0, 0,
777 "ear", Y, -1, 1,
778 "ely", E, 0, 0,
779 "ene", E, -2, 2,
780 "ent", C, 0, 0,
781 "ery", E, -1, 1,
782 "ese", NULL, 0, 0,
783
784 "ealy", Y, 0, 1,
785 "edly", E, 0, 0,
786 "eful", NULL, -2, 1,
787 "eity", NULL, 0, 0,
788 "ence", NULL, -2, 2,
789 "ency", NULL, 0, 0,
790 "ened", E, -1, 2,
791 "enly", E, 0, 0,
792 "eous", NULL, -1, 0,
793
794 "early", Y, 0, 1,
795 "ehood", NULL, 0, 0,
796 "eless", NULL, -2, 2,
797 "elity", NULL, 0, 0,
798 "ement", NULL, -1, 0,
799 "enced", NULL, -3, 3,
800 "ences", NULL, 0, 1,
801 "eness", E, 0, 0,
802 "ening", E, -2, 2,
803 "ental", NULL, 0, 0,
804 "ented", C, -1, 1,
805 "ently", NULL, 0, 0,
806
807 "eature", Z, 0, 0,
808 "efully", NULL, -1, 1,
809 "encies", NULL, 0, 0,
810 "encing", NULL, -2, 2,
811 "ential", NULL, 0, 0,
812 "enting", C, -1, 1,
813 "entist", NULL, 0, 1,
814 "eously", NULL, 0, 0,
815
816 "elihood", E, 0, 1,
817 "encible", NULL, 0, 0,
818 "entally", NULL, -2, 2,
819 "entials", NULL, 0, 0,
820 "entiate", NULL, -1, 1,
821 "entness", NULL, 0, 0,
822
823 "entation", NULL, 0, 0,
824 "entially", NULL, -1, 1,
825 "eousness", NULL, 0, 0,
826
827 "eableness", E, 0, 1,
828 "entations", NULL, 0, 0,
829 "entiality", NULL, -2, 2,
830 "entialize", NULL, 0, 0,
831 "entiation", NULL, -1, 0,
832
833 "entialness", NULL, 0, 0,
834
835 "ful", NULL, 0, 0,
836
837 "fully", NULL, 0, 0,
838
839 "fulness", NULL, 0, 0,
840
841 "hood", NULL, 0, 0,
842
843 "i", NULL, 0, 0,
844
845 "ia", NULL, 0, 0,
846 "ic", NULL, -1, 1,
847 "is", NULL, 0, 0,
848
849 "ial", NULL, 0, 0,
850 "ian", NULL, -1, 1,
851 "ics", NULL, 0, 1,
852 "ide", L, 0, 0,
853 "ied", NULL, -3, 3,
854 "ier", NULL, 0, 0,
855 "ies", P, -1, 0,
856 "ily", NULL, -1, 1,
857 "ine", M, 0, 0,
858 "ing", N, -5, 5,
859 "ion", Q, 0, 0,
860 "ish", C, -1, 1,
861 "ism", B, 0, 1,
862 "ist", NULL, 0, 0,
863 "ite", AA, -3, 3,
864 "ity", NULL, 0, 0,
865 "ium", NULL, -1, 0,
866 "ive", NULL, -1, 1,
867 "ize", F, 0, 0,
868
869 "ials", NULL, 0, 0,
870 "ians", NULL, -1, 0,
871 "ible", NULL, -1, 1,
872 "ibly", NULL, 0, 0,
873 "ical", NULL, -2, 2,
874 "ides", L, 0, 0,
875 "iers", NULL, -1, 1,
876 "iful", NULL, 0, 0,
877 "ines", M, -4, 4,
878 "ings", N, 0, 0,
879 "ions", B, -1, 1,
880 "ious", NULL, 0, 0,
881 "isms", B, -2, 2,
882 "ists", NULL, 0, 0,
883 "itic", H, -1, 1,
884 "ized", F, 0, 1,
885 "izer", F, 0, 0,
886
887 "ially", NULL, 0, 0,
888 "icant", NULL, -1, 1,
889 "ician", NULL, 0, 0,
890 "icide", NULL, -2, 2,
891 "icism", NULL, 0, 0,
892 "icist", NULL, -1, 0,
893 "icity", NULL, -3, 3,
894 "idine", I, 0, 1,
895 "iedly", NULL, 0, 0,
896 "ihood", NULL, -2, 2,
897 "inate", NULL, 0, 0,
898 "iness", NULL, -1, 0,
899 "ingly", B, -6, 6,
900 "inism", J, 0, 1,
901 "inity", CC, 0, 0,
902 "ional", NULL, -2, 2,
903 "ioned", NULL, 0, 0,
904 "ished", NULL, -1, 0,
905 "istic", NULL, -3, 3,
906 "ities", NULL, 0, 1,
907 "itous", NULL, 0, 0,
908 "ively", NULL, -2, 2,
909 "ivity", NULL, 0, 0,
910 "izers", F, -1, 1,
911 "izing", F, 0, 0,
912
913 "ialist", NULL, 0, 0,
914 "iality", NULL, -1, 1,
915 "ialize", NULL, 0, 0,
916 "ically", NULL, -2, 2,
917 "icance", NULL, 0, 0,
918 "icians", NULL, -1, 1,
919 "icists", NULL, 0, 0,
920 "ifully", NULL, -4, 4,
921 "ionals", NULL, 0, 0,
922 "ionate", D, -1, 1,
923 "ioning", NULL, 0, 0,
924 "ionist", NULL, -2, 2,
925 "iously", NULL, 0, 0,
926 "istics", NULL, -1, 1,
927 "izable", E, 0, 0,
928
929 "ibility", NULL, 0, 0,
930 "icalism", NULL, -1, 1,
931 "icalist", NULL, 0, 1,
932 "icality", NULL, 0, 0,
933 "icalize", NULL, -3, 3,
934 "ication", G, 0, 0,
935 "icianry", NULL, -1, 0,
936 "ination", NULL, -1, 1,
937 "ingness", NULL, 0, 0,
938 "ionally", NULL, -5, 5,
939 "isation", NULL, 0, 0,
940 "ishness", NULL, -1, 1,
941 "istical", NULL, 0, 1,
942 "iteness", NULL, 0, 0,
943 "iveness", NULL, -3, 3,
944 "ivistic", NULL, 0, 0,
945 "ivities", NULL, -1, 0,
946 "ization", F, -1, 1,
947 "izement", NULL, 0, 0,
948
949 "ibleness", NULL, 0, 0,
950 "icalness", NULL, -1, 1,
951 "ionalism", NULL, 0, 0,
952 "ionality", NULL, -2, 2,
953 "ionalize", NULL, 0, 0,
954 "iousness", NULL, -1, 1,
955 "izations", NULL, 0, 0,
956
957 "ionalness", NULL, 0, 1,
958 "istically", NULL, 0, 0,
959 "itousness", NULL, -2, 2,
960 "izability", NULL, 0, 0,
961 "izational", NULL, -1, 0,
962
963 "izationally", B, 0, 0,
964
965 "ly", B, 0, 0,
966
967 "less", NULL, 0, 1,
968 "lily", NULL, 0, 0,
969
970 "lessly", NULL, 0, 0,
971
972 "lessness", NULL, 0, 0,
973
974 "ness", NULL, 0, 0,
975
976 "nesses", NULL, 0, 0,
977
978 "o", NULL, 0, 0,
979
980 "on", S, 0, 1,
981 "or", T, 0, 0,
982
983 "oid", NULL, 0, 0,
984 "one", R, -1, 1,
985 "ous", NULL, 0, 0,
986
987 "ogen", NULL, 0, 0,
988
989 "oidal", NULL, 0, 0,
990 "oides", NULL, -1, 2,
991 "otide", NULL, 0, 0,
992 "ously", NULL, -1, 0,
993
994 "oidism", NULL, 0, 0,
995
996 "oidally", NULL, 0, 1,
997 "ousness", NULL, 0, 0,
998
999 "s", W, 0, 0,
1000
1001 "s'", NULL, 0, 0,
1002
1003 "um", U, 0, 1,
1004 "us", V, 0, 0,
1005
1006 "ward", NULL, 0, 1,
1007 "wise", NULL, 0, 0,
1008
1009 "y", B, 0, 0,
1010
1011 "yl", R, 0, 0,
1012
1013 "ying", B, 0, 1,
1014 "yish", NULL, 0, 0,
1015
1016 "'s", NULL, 0, 0,
1017};
1018
1019typedef struct node
1020 {
1021 char c; /* First character */
1022 struct node *left; /* used in balanced */
1023 struct node *right; /* binary tree */
1024 Ending_List *ptr[11]; /* the approriate location */
1025 }
1026First_Char_Node;
1027
1028static First_Char_Node First[] =
1029{
1030 '\'', NULL, NULL,
1031 {NULL,
1032 List + 293, NULL, NULL, NULL, NULL,
1033 NULL, NULL, NULL, NULL, NULL},
1034
1035 'a', First, NULL,
1036 {List,
1037 List + 2, List + 9, List + 20, List + 39, List + 58,
1038 List + 70, List + 77, List + 82, List + 87, List + 89},
1039
1040 'e', First + 1, First + 4,
1041 {List + 91,
1042 List + 93, List + 98, List + 106, List + 116, List + 126,
1043 List + 133, List + 138, List + 142, List + 145, NULL},
1044
1045 'f', NULL, NULL,
1046 {NULL,
1047 NULL, List + 146, NULL, List + 147, NULL,
1048 List + 148, NULL, NULL, NULL, NULL},
1049
1050 'h', First + 3, First + 5,
1051 {NULL,
1052 NULL, NULL, List + 149, NULL, NULL,
1053 NULL, NULL, NULL, NULL, NULL},
1054
1055 'i', NULL, NULL,
1056 {List + 150,
1057 List + 152, List + 163, List + 181, List + 202, List + 222,
1058 List + 239, List + 252, List + 258, NULL, List + 261},
1059
1060 'l', First + 2, First + 10,
1061 {NULL,
1062 List + 262, NULL, List + 263, NULL, List + 265,
1063 NULL, List + 266, NULL, NULL, NULL},
1064
1065 'n', NULL, NULL,
1066 {NULL,
1067 NULL, NULL, List + 267, NULL, List + 268,
1068 NULL, NULL, NULL, NULL, NULL},
1069
1070 'o', First + 7, First + 9,
1071 {List + 269,
1072 List + 270, List + 273, List + 275, List + 277, List + 280,
1073 List + 281, NULL, NULL, NULL, NULL},
1074
1075 's', NULL, NULL,
1076 {List + 283,
1077 List + 284, NULL, NULL, NULL, NULL,
1078 NULL, NULL, NULL, NULL, NULL},
1079
1080 'u', First + 8, First + 12,
1081 {NULL,
1082 List + 285, NULL, NULL, NULL, NULL,
1083 NULL, NULL, NULL, NULL, NULL},
1084
1085 'w', NULL, NULL,
1086 {NULL,
1087 NULL, NULL, List + 287, NULL, NULL,
1088 NULL, NULL, NULL, NULL, NULL},
1089
1090 'y', First + 11, NULL,
1091 {List + 289,
1092 List + 290, NULL, List + 291, NULL, NULL,
1093 NULL, NULL, NULL, NULL, NULL},
1094};
1095
1096
1097/******************************************************************************
1098 *
1099 * Look for a match in the suffix list.
1100 * Return the pointer to the end of the new stem if there is a match.
1101 * Otherwise, return the pointer to the end of the original word.
1102 *
1103 * Method:
1104 * * Search for the first character in the suffix list.
1105 * * If there is match, search for the rest of the suffix list.
1106 *
1107 *****************************************************************************/
1108
1109static char *
1110remove_ending (char *word, int w_len)
1111{
1112 static First_Char_Node *root = First + 6; /* the root of binary tree is 'l' */
1113 First_Char_Node *p_first; /* Points to the first character */
1114 /* of the possible suffix */
1115 Ending_List *p_list; /* Points to the List of Endings */
1116 char *suffix_start, /* Points to start of possible suffix */
1117 *stem_end, /* Points to the end of possible stem */
1118 ch1, ch2;
1119 int s_len, /* Possible stem length */
1120 e_offset, /* Offset from the end to the start of */
1121 /* possible suffix */
1122 not_done = 1, cmp;
1123
1124 stem_end = word + w_len - 1;
1125 if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1126 suffix_start = word + w_len - MAX_ENDING_SIZE;
1127 else
1128 suffix_start = word + MIN_STEM_LENGTH;
1129
1130 ch1 = *suffix_start;
1131 do /* Search for the first character */
1132 {
1133 p_first = root;
1134 do
1135 {
1136 ch2 = p_first->c;
1137 if (ch1 == ch2)
1138 break;
1139 else if (ch1 > ch2)
1140 p_first = p_first->right;
1141 else
1142 p_first = p_first->left;
1143 }
1144 while (p_first != NULL);
1145
1146 if (p_first != NULL) /* Search for the rest */
1147 {
1148 e_offset = stem_end - suffix_start;
1149 if ((p_list = p_first->ptr[e_offset]) != NULL)
1150 {
1151 for (;;) /* no need to compare the first char */
1152 {
1153 cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1154 if (cmp > 0)
1155 if (p_list->right_offset)
1156 p_list += p_list->right_offset;
1157 else
1158 break;
1159 else if (cmp < 0)
1160 if (p_list->left_offset)
1161 p_list += p_list->left_offset;
1162 else
1163 break;
1164 else
1165 {
1166 s_len = suffix_start - word;
1167 if (!p_list->cond ||
1168 (*p_list->cond) (s_len, suffix_start - 1))
1169 {
1170 *suffix_start = EOS;
1171 stem_end = suffix_start - 1;
1172 not_done = 0;
1173 }
1174 break;
1175 }
1176 }
1177 }
1178 }
1179 suffix_start++;
1180 }
1181 while (not_done && ((ch1 = *suffix_start) != EOS));
1182 return (stem_end);
1183}
Note: See TracBrowser for help on using the repository browser.