source: trunk/indexers/mgpp/lib/lovinstem.cpp@ 9612

Last change on this file since 9612 was 9612, checked in by kjdon, 19 years ago

added in x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.7 KB
Line 
1/**************************************************************************
2 *
3 * lovinstem.c -- Stemming code
4 * Copyright (C) 1994 Linh Huynh ([email protected])
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24
25/*****************************************************************************
26 *
27 * Transformational Rules used in Recoding Stem Terminations
28 *
29 * Parameter in functions:
30 * ch : the second last character of the old ending
31 * (after removing one of double consonants)
32 *
33 ****************************************************************************/
34
35
36/************ Conditional rules associated with transformations **************/
37
38/*static int aio ();
39static int s ();
40static int pt ();
41static int m ();
42static int n ();*/
43
44static int
45aio (char ch) /* Rule number 9 */
46{
47 return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
48}
49
50static int
51s (char ch) /* Rule Number 24 */
52{
53 return (ch != 's');
54}
55
56static int
57pt (char ch) /* Rule number 28 */
58{
59 return ((ch != 'p') && (ch != 't'));
60}
61
62static int
63m (char ch) /* Rule number 30 */
64{
65 return (ch != 'm');
66}
67
68static int
69n (char ch) /* Rule number 32 */
70{
71 return (ch != 'n');
72}
73
74
75/**************************************************************************
76 *
77 * Context-sensitive rules associated with certain endings.
78 *
79 * The notations of the rules are the same as in Lovins' paper
80 * except that rule A is replaced by a NULL in the data structure.
81 *
82 * Function parameters:
83 * - stem_length: possible stem length
84 * - end : pointer points to the end of the possible stem.
85 *
86 *************************************************************************/
87
88
89/********* Context-sensitive rule function declarations ******************/
90
91/*static int B ();
92static int C ();
93static int D ();
94static int E ();
95static int F ();
96static int G ();
97static int H ();
98static int I ();
99static int J ();
100static int K ();
101static int L ();
102static int M ();
103static int N ();
104static int O ();
105static int P ();
106static int Q ();
107static int R ();
108static int S ();
109static int T ();
110static int U ();
111static int V ();
112static int W ();
113static int X ();
114static int Y ();
115static int Z ();
116static int AA ();
117static int BB ();
118static int CC ();
119*/
120static int
121B (int stem_length, char *end)
122{
123 return (stem_length >= 3);
124}
125
126static int
127C (int stem_length, char *end)
128{
129 return (stem_length >= 4);
130}
131
132static int
133D (int stem_length, char *end)
134{
135 return (stem_length >= 5);
136}
137
138static int
139E (int stem_length, char *end)
140
141
142{
143 return (*end != 'e');
144}
145
146static int
147F (int stem_length, char *end)
148
149
150{
151 return ((stem_length >= 3) && (*end != 'e'));
152}
153
154static int
155G (int stem_length, char *end)
156
157
158{
159 return ((stem_length >= 3) && (*end == 'f'));
160}
161
162static int
163H (int stem_length, char *end)
164
165
166{
167 char p1, p2;
168 p1 = *end;
169 p2 = *(end - 1);
170 return ((p1 == 't') || ((p1 == 'l') && (p2 == 'l')));
171}
172
173static int
174I (int stem_length, char *end)
175
176
177{
178 char p1;
179 p1 = *end;
180 return ((p1 != 'o') && (p1 != 'e'));
181}
182
183static int
184J (int stem_length, char *end)
185
186
187{
188 char p1;
189 p1 = *end;
190 return ((p1 != 'a') && (p1 != 'e'));
191}
192
193static int
194K (int stem_length, char *end)
195
196
197{
198 char p1;
199 p1 = *end;
200 return ((stem_length >= 3) &&
201 ((p1 == 'l') || (p1 == 'i') ||
202 ((p1 == 'e') && (*(end - 2) == 'u'))));
203}
204
205static int
206L (int stem_length, char *end)
207
208
209{
210 char p1, p2;
211 p1 = *end;
212 p2 = *(end - 1);
213 if (p1 == 's')
214 return (p2 == 'o');
215 else
216 return ((p1 != 'u') && (p1 != 'x'));
217}
218
219static int
220M (int stem_length, char *end)
221
222
223{
224 char p1;
225 p1 = *end;
226 return ((p1 != 'a') && (p1 != 'c') &&
227 (p1 != 'e') && (p1 != 'm'));
228}
229
230static int
231N (int stem_length, char *end)
232
233
234{
235 if (stem_length >= 3)
236 {
237 if (*(end - 2) == 's')
238 return (stem_length >= 4);
239 else
240 return 1;
241 }
242 else
243 return 0;
244}
245
246static int
247O (int stem_length, char *end)
248
249
250{
251 char p1;
252 p1 = *end;
253 return ((p1 == 'l') || (p1 == 'i'));
254}
255
256static int
257P (int stem_length, char *end)
258
259
260{
261 return (*end != 'c');
262}
263
264static int
265Q (int stem_length, char *end)
266
267
268{
269 char p1;
270 p1 = *end;
271
272 return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
273}
274
275static int
276R (int stem_length, char *end)
277
278
279{
280 char p1;
281 p1 = *end;
282 return ((p1 == 'n') || (p1 == 'r'));
283}
284
285static int
286S (int stem_length, char *end)
287
288
289{
290 char p1, p2;
291 p1 = *end;
292 p2 = *(end - 1);
293
294 if (p1 == 't')
295 return (p2 != 't');
296 else
297 return ((p1 == 'r') && (p2 == 'd'));
298}
299
300static int
301T (int stem_length, char *end)
302
303
304{
305 char p1, p2;
306 p1 = *end;
307 p2 = *(end - 1);
308
309 if (p1 == 't')
310 return (p2 != 'o');
311 else
312 return (p1 == 's');
313}
314
315static int
316U (int stem_length, char *end)
317
318
319{
320 char p1;
321 p1 = *end;
322
323 return ((p1 == 'l') || (p1 == 'm') ||
324 (p1 == 'n') || (p1 == 'r'));
325}
326
327static int
328V (int stem_length, char *end)
329
330
331{
332 return (*end == 'c');
333}
334
335static int
336W (int stem_length, char *end)
337
338
339{
340 char p1;
341 p1 = *end;
342
343 return ((p1 != 's') && (p1 != 'u'));
344}
345
346static int
347X (int stem_length, char *end)
348
349
350{
351 char p1;
352 p1 = *end;
353
354 if (p1 == 'e')
355 return ((stem_length >= 3) && (*(end - 2) == 'u'));
356 else
357 return ((p1 == 'l') || (p1 == 'i'));
358}
359
360static int
361Y (int stem_length, char *end)
362
363
364{
365 return ((*end == 'n') && (*(end - 1) == 'i'));
366}
367
368static int
369Z (int stem_length, char *end)
370
371
372{
373 return (*end != 'f');
374}
375
376static int
377AA (int stem_length, char *end)
378
379
380{
381 char p1, p2;
382 p1 = *end;
383 p2 = *(end - 1);
384
385 if (p1 == 'h')
386 return ((p2 == 'p') || (p2 == 't'));
387 else if (p1 == 'r')
388 return ((p2 == 'e') || (p2 == 'o'));
389 else if (p1 == 's')
390 return (p2 == 'e');
391 else
392 return ((p1 == 'd') || (p1 == 'f') ||
393 (p1 == 'l') || (p1 == 't'));
394}
395
396static int
397BB (int stem_length, char *end)
398
399
400{
401 if (stem_length >= 3)
402 {
403 if (stem_length >= 4)
404 return (strncmp (end - 3, "ryst", 4) != 0);
405 else
406 return (strncmp (end - 2, "met", 3) != 0);
407 }
408 else
409 return 0;
410}
411
412static int
413CC (int stem_length, char *end)
414
415
416{
417 return (*end == 'l');
418}
419
420/**************************************************************************/
421
422#define MIN_STEM_LENGTH 2
423#define MAX_ENDING_SIZE 11
424#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
425#define EOS '\0'
426/* [RPAP - Jan 97: Stem Index Change] */
427#define MAX_STEM_LENGTH 255
428
429static char *remove_ending (char *word, int w_len);
430static void recode_stem (char *stem_end);
431
432/* =========================================================================
433 * Function: stem
434 * Description:
435 * Implemetation of the Lovins' stemming algorithm described in
436 * J.B. Lovins, "Development of a Stemming Algorithm",
437 * Mechanical Translation and Computational Linguistics, Vol 11,1968.
438 * Input: a word string with the length in the first byte
439 * Output: the stemmed word
440 * ========================================================================= */
441
442void
443lovinstem (u_char * word)
444{
445 char *word_start; /* points to first letter of word */
446 char *stem_end; /* points to last character of stem portion */
447 int old_length; /* length of word */
448 int new_length; /* length after stemming */
449
450 /* [RPAP - Jan 97: Stem Index Change] */
451 u_char copy[MAX_STEM_LENGTH + 2];
452 memcpy (copy, word, *word + 1);
453
454 old_length = *copy; /* [RPAP - Jan 97: Stem Index Change] */
455
456 /* The word must be at least MIN_STEM_LENGTH characters long. */
457 if (old_length <= MIN_STEM_LENGTH)
458 return;
459
460 word_start = (char *) copy + 1; /* jump over length byte */ /* [RPAP - Jan 97: Stem Index Change] */
461 copy[old_length + 1] = '\0'; /* null terminate string */ /* [RPAP - Jan 97: Stem Index Change] */
462
463 stem_end = remove_ending (word_start, old_length);
464 recode_stem (stem_end);
465
466 /* fix up new length */
467 /* have to use strlen because have no other way of telling length */
468 new_length = strlen (word_start);
469 *copy = new_length; /* [RPAP - Jan 97: Stem Index Change] */
470
471 /* [RPAP - Jan 97: Stem Index Change] */
472 memcpy (word, copy, *copy + 1);
473
474} /*stem_complex */
475
476/* =========================================================================
477
478 * Data structure for recoding and the method for recoding the stem.
479 *
480 * ========================================================================= */
481
482typedef struct
483{
484 char *old_end; /* old ending */
485 char *new_end; /* new ending */
486 char old_offset; /* length of the old ending - 1 */
487 int (*cond) (char); /* condition rule */
488 char end_group; /* signal the end of the group */
489}
490Recode_Rules;
491
492/*
493 Stem terminations are divided into groups having the same last
494 character
495 */
496
497static Recode_Rules Rules[] =
498{
499 "uad", "uas", 2, NULL, 0,
500 "vad", "vas", 2, NULL, 0,
501 "cid", "cis", 2, NULL, 0,
502 "lid", "lis", 2, NULL, 0,
503 "erid", "eris", 3, NULL, 0,
504 "pand", "pans", 3, NULL, 0,
505 "end", "ens", 2, s, 0,
506 "end", "ens", 2, m, 0,
507 "ond", "ons", 2, NULL, 0,
508 "lud", "lus", 2, NULL, 0,
509 "rud", "rus", 2, NULL, 1,
510
511 "ul", "l", 1, aio, 1,
512
513 "istr", "ister", 3, NULL, 0,
514 "metr", "meter", 3, NULL, 0,
515 "her", "hes", 2, pt, 1,
516
517 "urs", "ur", 2, NULL, 1,
518
519 "uct", "uc", 2, NULL, 0,
520 "umpt", "um", 3, NULL, 0,
521 "rpt", "rb", 2, NULL, 0,
522 "mit", "mis", 2, NULL, 0,
523 "ert", "ers", 2, NULL, 0,
524 "et", "es", 1, n, 0,
525 "yt", "ys", 1, NULL, 1,
526
527 "iev", "ief", 2, NULL, 0,
528 "olv", "olut", 2, NULL, 1,
529
530 "bex", "bic", 2, NULL, 0,
531 "dex", "dic", 2, NULL, 0,
532 "pex", "pic", 2, NULL, 0,
533 "tex", "tic", 2, NULL, 0,
534 "ax", "ac", 1, NULL, 0,
535 "ex", "ec", 1, NULL, 0,
536 "ix", "ic", 1, NULL, 0,
537 "lux", "luc", 2, NULL, 1,
538
539 "yz", "ys", 1, NULL, 1
540};
541
542typedef struct last
543 {
544 char c; /* The last character */
545 struct last *left; /* used in balanced */
546 struct last *right; /* binary tree */
547 Recode_Rules *pt; /* location of approriate group */
548 }
549Last_Char_Node;
550
551/*------------
552 s
553 / \
554 l x
555 / \ / \
556 d r t z
557 \
558 v
559---------------*/
560static Last_Char_Node pr[] =
561{
562 'd', NULL, NULL, Rules,
563 'l', pr, pr + 2, Rules + 11,
564 'r', NULL, NULL, Rules + 12,
565 's', pr + 1, pr + 6, Rules + 15,
566 't', NULL, pr + 5, Rules + 16,
567 'v', NULL, NULL, Rules + 23,
568 'x', pr + 4, pr + 7, Rules + 25,
569 'z', NULL, NULL, Rules + 33,
570};
571
572
573/*****************************************************************************
574 *
575 * Recode the stem according to the transformation rules.
576 *
577 *****************************************************************************/
578
579static void
580recode_stem (char *stem_end)
581{
582 static Last_Char_Node *root = pr + 3;
583 Last_Char_Node *p_last = root;
584 Recode_Rules *rule; /* points to the transformation list */
585 char *h, /* points to start of possible ending */
586 ch, /* last character of the old ending */
587 ch2;
588
589 h = stem_end - 1;
590 ch = *stem_end;
591 if (*h == ch) /* Check for double consonant */
592 {
593 if (strchr ("bdglmnprst", ch) != NULL)
594 {
595 *stem_end = EOS;
596 stem_end = h;
597 }
598 }
599
600 do /* Check for the last character */
601 {
602 ch2 = p_last->c;
603 if (ch == ch2)
604 break;
605 else if (ch > ch2)
606 p_last = p_last->right;
607 else
608 p_last = p_last->left;
609 }
610 while (p_last != NULL);
611
612 if (p_last != NULL) /* Check for the rest of suffix list */
613 {
614 rule = p_last->pt;
615 for (;;)
616 {
617 h = stem_end - rule->old_offset;
618 if (strcmp (h, rule->old_end) == 0)
619 {
620 if (!rule->cond || (*rule->cond) (*(h - 1)))
621 {
622 (void) strcpy (h, rule->new_end);
623 break;
624 }
625 }
626 if (rule->end_group)
627 break;
628 ++rule;
629 }
630 }
631}
632
633/* ======================================================================
634
635 * List of endings and the method to remove the ending
636 *
637 * ======================================================================*/
638
639/************ Data structures for list of endings ********************/
640
641typedef struct
642{
643 char *ending; /* old ending */
644 int (*cond) (int length, char *end); /* conditional rule */
645 signed char left_offset; /* used to find the siblings */
646 signed char right_offset; /* in balanced binary tree */
647}
648Ending_List;
649
650/*
651 The original list of endings is re-organised into groups having
652 the same length and the same first character.
653 */
654
655static Ending_List List[] =
656{
657 "a", NULL, 0, 0,
658
659 "ae", NULL, 0, 0,
660 "al", BB, -1, 2,
661 "ar", X, 0, 0,
662 "as", B, -1, 0,
663
664 "acy", NULL, 0, 1,
665 "age", B, 0, 0,
666 "aic", NULL, -2, 1,
667 "als", BB, 0, 0,
668 "ant", B, -2, 2,
669 "ars", O, 0, 0,
670 "ary", F, -1, 2,
671 "ata", NULL, 0, 0,
672 "ate", NULL, -1, 0,
673
674 "able", NULL, 0, 1,
675 "ably", NULL, 0, 0,
676 "ages", B, -2, 2,
677 "ally", B, 0, 0,
678 "ance", B, -1, 1,
679 "ancy", B, 0, 0,
680 "ants", B, -4, 4,
681 "aric", NULL, 0, 0,
682 "arly", K, -1, 1,
683 "ated", I, 0, 0,
684 "ates", NULL, -2, 2,
685 "atic", B, 0, 0,
686 "ator", NULL, -1, 0,
687
688 "acies", NULL, 0, 0,
689 "acity", NULL, -1, 1,
690 "aging", B, 0, 0,
691 "aical", NULL, -2, 2,
692 "alist", NULL, 0, 0,
693 "alism", B, -1, 0,
694 "ality", NULL, -3, 3,
695 "alize", NULL, 0, 1,
696 "allic", BB, 0, 0,
697 "anced", B, -2, 2,
698 "ances", B, 0, 0,
699 "antic", C, -1, 0,
700 "arial", NULL, -6, 6,
701 "aries", NULL, 0, 1,
702 "arily", NULL, 0, 0,
703 "arity", B, -2, 2,
704 "arize", NULL, 0, 0,
705 "aroid", NULL, -1, 0,
706 "ately", NULL, -3, 3,
707 "ating", I, 0, 1,
708 "ation", B, 0, 0,
709 "ative", NULL, -2, 2,
710 "ators", NULL, 0, 0,
711 "atory", NULL, -1, 1,
712 "ature", E, 0, 0,
713
714 "aceous", NULL, 0, 1,
715 "acious", B, 0, 0,
716 "action", G, -2, 2,
717 "alness", NULL, 0, 0,
718 "ancial", NULL, -1, 1,
719 "ancies", NULL, 0, 0,
720 "ancing", B, -4, 4,
721 "ariser", NULL, 0, 0,
722 "arized", NULL, -1, 1,
723 "arizer", NULL, 0, 0,
724 "atable", NULL, -2, 2,
725 "ations", B, 0, 0,
726 "atives", NULL, -1, 0,
727
728 "ability", NULL, 0, 1,
729 "aically", NULL, 0, 0,
730 "alistic", B, -2, 2,
731 "alities", NULL, 0, 0,
732 "ariness", E, -1, 0,
733 "aristic", NULL, -3, 3,
734 "arizing", NULL, 0, 1,
735 "ateness", NULL, 0, 0,
736 "atingly", NULL, -2, 2,
737 "ational", B, 0, 0,
738 "atively", NULL, -1, 1,
739 "ativism", NULL, 0, 0,
740
741 "ableness", NULL, 0, 1,
742 "arizable", NULL, 0, 0,
743
744 "allically", C, 0, 0,
745 "antaneous", NULL, -1, 1,
746 "antiality", NULL, 0, 0,
747 "arisation", NULL, -2, 2,
748 "arization", NULL, 0, 0,
749 "ationally", B, -1, 1,
750 "ativeness", NULL, 0, 0,
751
752 "antialness", NULL, 0, 0,
753 "arisations", NULL, -1, 1,
754 "arizations", NULL, 0, 0,
755
756 "alistically", B, 0, 1,
757 "arizability", NULL, 0, 0,
758
759 "e", NULL, 0, 0,
760
761 "ed", E, 0, 0,
762 "en", F, -1, 1,
763 "es", E, 0, 0,
764
765 "eal", Y, 0, 0,
766 "ear", Y, -1, 1,
767 "ely", E, 0, 0,
768 "ene", E, -2, 2,
769 "ent", C, 0, 0,
770 "ery", E, -1, 1,
771 "ese", NULL, 0, 0,
772
773 "ealy", Y, 0, 1,
774 "edly", E, 0, 0,
775 "eful", NULL, -2, 1,
776 "eity", NULL, 0, 0,
777 "ence", NULL, -2, 2,
778 "ency", NULL, 0, 0,
779 "ened", E, -1, 2,
780 "enly", E, 0, 0,
781 "eous", NULL, -1, 0,
782
783 "early", Y, 0, 1,
784 "ehood", NULL, 0, 0,
785 "eless", NULL, -2, 2,
786 "elity", NULL, 0, 0,
787 "ement", NULL, -1, 0,
788 "enced", NULL, -3, 3,
789 "ences", NULL, 0, 1,
790 "eness", E, 0, 0,
791 "ening", E, -2, 2,
792 "ental", NULL, 0, 0,
793 "ented", C, -1, 1,
794 "ently", NULL, 0, 0,
795
796 "eature", Z, 0, 0,
797 "efully", NULL, -1, 1,
798 "encies", NULL, 0, 0,
799 "encing", NULL, -2, 2,
800 "ential", NULL, 0, 0,
801 "enting", C, -1, 1,
802 "entist", NULL, 0, 1,
803 "eously", NULL, 0, 0,
804
805 "elihood", E, 0, 1,
806 "encible", NULL, 0, 0,
807 "entally", NULL, -2, 2,
808 "entials", NULL, 0, 0,
809 "entiate", NULL, -1, 1,
810 "entness", NULL, 0, 0,
811
812 "entation", NULL, 0, 0,
813 "entially", NULL, -1, 1,
814 "eousness", NULL, 0, 0,
815
816 "eableness", E, 0, 1,
817 "entations", NULL, 0, 0,
818 "entiality", NULL, -2, 2,
819 "entialize", NULL, 0, 0,
820 "entiation", NULL, -1, 0,
821
822 "entialness", NULL, 0, 0,
823
824 "ful", NULL, 0, 0,
825
826 "fully", NULL, 0, 0,
827
828 "fulness", NULL, 0, 0,
829
830 "hood", NULL, 0, 0,
831
832 "i", NULL, 0, 0,
833
834 "ia", NULL, 0, 0,
835 "ic", NULL, -1, 1,
836 "is", NULL, 0, 0,
837
838 "ial", NULL, 0, 0,
839 "ian", NULL, -1, 1,
840 "ics", NULL, 0, 1,
841 "ide", L, 0, 0,
842 "ied", NULL, -3, 3,
843 "ier", NULL, 0, 0,
844 "ies", P, -1, 0,
845 "ily", NULL, -1, 1,
846 "ine", M, 0, 0,
847 "ing", N, -5, 5,
848 "ion", Q, 0, 0,
849 "ish", C, -1, 1,
850 "ism", B, 0, 1,
851 "ist", NULL, 0, 0,
852 "ite", AA, -3, 3,
853 "ity", NULL, 0, 0,
854 "ium", NULL, -1, 0,
855 "ive", NULL, -1, 1,
856 "ize", F, 0, 0,
857
858 "ials", NULL, 0, 0,
859 "ians", NULL, -1, 0,
860 "ible", NULL, -1, 1,
861 "ibly", NULL, 0, 0,
862 "ical", NULL, -2, 2,
863 "ides", L, 0, 0,
864 "iers", NULL, -1, 1,
865 "iful", NULL, 0, 0,
866 "ines", M, -4, 4,
867 "ings", N, 0, 0,
868 "ions", B, -1, 1,
869 "ious", NULL, 0, 0,
870 "isms", B, -2, 2,
871 "ists", NULL, 0, 0,
872 "itic", H, -1, 1,
873 "ized", F, 0, 1,
874 "izer", F, 0, 0,
875
876 "ially", NULL, 0, 0,
877 "icant", NULL, -1, 1,
878 "ician", NULL, 0, 0,
879 "icide", NULL, -2, 2,
880 "icism", NULL, 0, 0,
881 "icist", NULL, -1, 0,
882 "icity", NULL, -3, 3,
883 "idine", I, 0, 1,
884 "iedly", NULL, 0, 0,
885 "ihood", NULL, -2, 2,
886 "inate", NULL, 0, 0,
887 "iness", NULL, -1, 0,
888 "ingly", B, -6, 6,
889 "inism", J, 0, 1,
890 "inity", CC, 0, 0,
891 "ional", NULL, -2, 2,
892 "ioned", NULL, 0, 0,
893 "ished", NULL, -1, 0,
894 "istic", NULL, -3, 3,
895 "ities", NULL, 0, 1,
896 "itous", NULL, 0, 0,
897 "ively", NULL, -2, 2,
898 "ivity", NULL, 0, 0,
899 "izers", F, -1, 1,
900 "izing", F, 0, 0,
901
902 "ialist", NULL, 0, 0,
903 "iality", NULL, -1, 1,
904 "ialize", NULL, 0, 0,
905 "ically", NULL, -2, 2,
906 "icance", NULL, 0, 0,
907 "icians", NULL, -1, 1,
908 "icists", NULL, 0, 0,
909 "ifully", NULL, -4, 4,
910 "ionals", NULL, 0, 0,
911 "ionate", D, -1, 1,
912 "ioning", NULL, 0, 0,
913 "ionist", NULL, -2, 2,
914 "iously", NULL, 0, 0,
915 "istics", NULL, -1, 1,
916 "izable", E, 0, 0,
917
918 "ibility", NULL, 0, 0,
919 "icalism", NULL, -1, 1,
920 "icalist", NULL, 0, 1,
921 "icality", NULL, 0, 0,
922 "icalize", NULL, -3, 3,
923 "ication", G, 0, 0,
924 "icianry", NULL, -1, 0,
925 "ination", NULL, -1, 1,
926 "ingness", NULL, 0, 0,
927 "ionally", NULL, -5, 5,
928 "isation", NULL, 0, 0,
929 "ishness", NULL, -1, 1,
930 "istical", NULL, 0, 1,
931 "iteness", NULL, 0, 0,
932 "iveness", NULL, -3, 3,
933 "ivistic", NULL, 0, 0,
934 "ivities", NULL, -1, 0,
935 "ization", F, -1, 1,
936 "izement", NULL, 0, 0,
937
938 "ibleness", NULL, 0, 0,
939 "icalness", NULL, -1, 1,
940 "ionalism", NULL, 0, 0,
941 "ionality", NULL, -2, 2,
942 "ionalize", NULL, 0, 0,
943 "iousness", NULL, -1, 1,
944 "izations", NULL, 0, 0,
945
946 "ionalness", NULL, 0, 1,
947 "istically", NULL, 0, 0,
948 "itousness", NULL, -2, 2,
949 "izability", NULL, 0, 0,
950 "izational", NULL, -1, 0,
951
952 "izationally", B, 0, 0,
953
954 "ly", B, 0, 0,
955
956 "less", NULL, 0, 1,
957 "lily", NULL, 0, 0,
958
959 "lessly", NULL, 0, 0,
960
961 "lessness", NULL, 0, 0,
962
963 "ness", NULL, 0, 0,
964
965 "nesses", NULL, 0, 0,
966
967 "o", NULL, 0, 0,
968
969 "on", S, 0, 1,
970 "or", T, 0, 0,
971
972 "oid", NULL, 0, 0,
973 "one", R, -1, 1,
974 "ous", NULL, 0, 0,
975
976 "ogen", NULL, 0, 0,
977
978 "oidal", NULL, 0, 0,
979 "oides", NULL, -1, 2,
980 "otide", NULL, 0, 0,
981 "ously", NULL, -1, 0,
982
983 "oidism", NULL, 0, 0,
984
985 "oidally", NULL, 0, 1,
986 "ousness", NULL, 0, 0,
987
988 "s", W, 0, 0,
989
990 "s'", NULL, 0, 0,
991
992 "um", U, 0, 1,
993 "us", V, 0, 0,
994
995 "ward", NULL, 0, 1,
996 "wise", NULL, 0, 0,
997
998 "y", B, 0, 0,
999
1000 "yl", R, 0, 0,
1001
1002 "ying", B, 0, 1,
1003 "yish", NULL, 0, 0,
1004
1005 "'s", NULL, 0, 0,
1006};
1007
1008typedef struct node
1009 {
1010 char c; /* First character */
1011 struct node *left; /* used in balanced */
1012 struct node *right; /* binary tree */
1013 Ending_List *ptr[11]; /* the approriate location */
1014 }
1015First_Char_Node;
1016
1017static First_Char_Node First[] =
1018{
1019 '\'', NULL, NULL,
1020 {NULL,
1021 List + 293, NULL, NULL, NULL, NULL,
1022 NULL, NULL, NULL, NULL, NULL},
1023
1024 'a', First, NULL,
1025 {List,
1026 List + 2, List + 9, List + 20, List + 39, List + 58,
1027 List + 70, List + 77, List + 82, List + 87, List + 89},
1028
1029 'e', First + 1, First + 4,
1030 {List + 91,
1031 List + 93, List + 98, List + 106, List + 116, List + 126,
1032 List + 133, List + 138, List + 142, List + 145, NULL},
1033
1034 'f', NULL, NULL,
1035 {NULL,
1036 NULL, List + 146, NULL, List + 147, NULL,
1037 List + 148, NULL, NULL, NULL, NULL},
1038
1039 'h', First + 3, First + 5,
1040 {NULL,
1041 NULL, NULL, List + 149, NULL, NULL,
1042 NULL, NULL, NULL, NULL, NULL},
1043
1044 'i', NULL, NULL,
1045 {List + 150,
1046 List + 152, List + 163, List + 181, List + 202, List + 222,
1047 List + 239, List + 252, List + 258, NULL, List + 261},
1048
1049 'l', First + 2, First + 10,
1050 {NULL,
1051 List + 262, NULL, List + 263, NULL, List + 265,
1052 NULL, List + 266, NULL, NULL, NULL},
1053
1054 'n', NULL, NULL,
1055 {NULL,
1056 NULL, NULL, List + 267, NULL, List + 268,
1057 NULL, NULL, NULL, NULL, NULL},
1058
1059 'o', First + 7, First + 9,
1060 {List + 269,
1061 List + 270, List + 273, List + 275, List + 277, List + 280,
1062 List + 281, NULL, NULL, NULL, NULL},
1063
1064 's', NULL, NULL,
1065 {List + 283,
1066 List + 284, NULL, NULL, NULL, NULL,
1067 NULL, NULL, NULL, NULL, NULL},
1068
1069 'u', First + 8, First + 12,
1070 {NULL,
1071 List + 285, NULL, NULL, NULL, NULL,
1072 NULL, NULL, NULL, NULL, NULL},
1073
1074 'w', NULL, NULL,
1075 {NULL,
1076 NULL, NULL, List + 287, NULL, NULL,
1077 NULL, NULL, NULL, NULL, NULL},
1078
1079 'y', First + 11, NULL,
1080 {List + 289,
1081 List + 290, NULL, List + 291, NULL, NULL,
1082 NULL, NULL, NULL, NULL, NULL},
1083};
1084
1085
1086/******************************************************************************
1087 *
1088 * Look for a match in the suffix list.
1089 * Return the pointer to the end of the new stem if there is a match.
1090 * Otherwise, return the pointer to the end of the original word.
1091 *
1092 * Method:
1093 * * Search for the first character in the suffix list.
1094 * * If there is match, search for the rest of the suffix list.
1095 *
1096 *****************************************************************************/
1097
1098static char *
1099remove_ending (char *word, int w_len)
1100{
1101 static First_Char_Node *root = First + 6; /* the root of binary tree is 'l' */
1102 First_Char_Node *p_first; /* Points to the first character */
1103 /* of the possible suffix */
1104 Ending_List *p_list; /* Points to the List of Endings */
1105 char *suffix_start, /* Points to start of possible suffix */
1106 *stem_end, /* Points to the end of possible stem */
1107 ch1, ch2;
1108 int s_len, /* Possible stem length */
1109 e_offset, /* Offset from the end to the start of */
1110 /* possible suffix */
1111 not_done = 1, cmp;
1112
1113 stem_end = word + w_len - 1;
1114 if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1115 suffix_start = word + w_len - MAX_ENDING_SIZE;
1116 else
1117 suffix_start = word + MIN_STEM_LENGTH;
1118
1119 ch1 = *suffix_start;
1120 do /* Search for the first character */
1121 {
1122 p_first = root;
1123 do
1124 {
1125 ch2 = p_first->c;
1126 if (ch1 == ch2)
1127 break;
1128 else if (ch1 > ch2)
1129 p_first = p_first->right;
1130 else
1131 p_first = p_first->left;
1132 }
1133 while (p_first != NULL);
1134
1135 if (p_first != NULL) /* Search for the rest */
1136 {
1137 e_offset = stem_end - suffix_start;
1138 if ((p_list = p_first->ptr[e_offset]) != NULL)
1139 {
1140 for (;;) /* no need to compare the first char */
1141 {
1142 cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1143 if (cmp > 0)
1144 if (p_list->right_offset)
1145 p_list += p_list->right_offset;
1146 else
1147 break;
1148 else if (cmp < 0)
1149 if (p_list->left_offset)
1150 p_list += p_list->left_offset;
1151 else
1152 break;
1153 else
1154 {
1155 s_len = suffix_start - word;
1156 if (!p_list->cond ||
1157 (*p_list->cond) (s_len, suffix_start - 1))
1158 {
1159 *suffix_start = EOS;
1160 stem_end = suffix_start - 1;
1161 not_done = 0;
1162 }
1163 break;
1164 }
1165 }
1166 }
1167 }
1168 ++suffix_start;
1169 }
1170 while (not_done && ((ch1 = *suffix_start) != EOS));
1171 return (stem_end);
1172}
Note: See TracBrowser for help on using the repository browser.