source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/lib/lovinstem.cpp@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.9 KB
Line 
1/**************************************************************************
2 *
3 * lovinstem.c -- Stemming code
4 * Copyright (C) 1994 Linh Huynh ([email protected])
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24
25/*****************************************************************************
26 *
27 * Transformational Rules used in Recoding Stem Terminations
28 *
29 * Parameter in functions:
30 * ch : the second last character of the old ending
31 * (after removing one of double consonants)
32 *
33 ****************************************************************************/
34
35
36/************ Conditional rules associated with transformations **************/
37
38/*static int aio ();
39static int s ();
40static int pt ();
41static int m ();
42static int n ();*/
43
44static int
45aio (char ch) /* Rule number 9 */
46{
47 return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
48}
49
50static int
51s (char ch) /* Rule Number 24 */
52{
53 return (ch != 's');
54}
55
56static int
57pt (char ch) /* Rule number 28 */
58{
59 return ((ch != 'p') && (ch != 't'));
60}
61
62static int
63m (char ch) /* Rule number 30 */
64{
65 return (ch != 'm');
66}
67
68static int
69n (char ch) /* Rule number 32 */
70{
71 return (ch != 'n');
72}
73
74
75/**************************************************************************
76 *
77 * Context-sensitive rules associated with certain endings.
78 *
79 * The notations of the rules are the same as in Lovins' paper
80 * except that rule A is replaced by a NULL in the data structure.
81 *
82 * Function parameters:
83 * - stem_length: possible stem length
84 * - end : pointer points to the end of the possible stem.
85 *
86 *************************************************************************/
87
88
89/********* Context-sensitive rule function declarations ******************/
90
91/*static int B ();
92static int C ();
93static int D ();
94static int E ();
95static int F ();
96static int G ();
97static int H ();
98static int I ();
99static int J ();
100static int K ();
101static int L ();
102static int M ();
103static int N ();
104static int O ();
105static int P ();
106static int Q ();
107static int R ();
108static int S ();
109static int T ();
110static int U ();
111static int V ();
112static int W ();
113static int X ();
114static int Y ();
115static int Z ();
116static int AA ();
117static int BB ();
118static int CC ();
119*/
120static int
121B (int stem_length, char *end)
122{
123 return (stem_length >= 3);
124}
125
126static int
127C (int stem_length, char *end)
128{
129 return (stem_length >= 4);
130}
131
132static int
133D (int stem_length, char *end)
134{
135 return (stem_length >= 5);
136}
137
138static int
139E (int stem_length, char *end)
140
141
142{
143 return (*end != 'e');
144}
145
146static int
147F (int stem_length, char *end)
148
149
150{
151 return ((stem_length >= 3) && (*end != 'e'));
152}
153
154static int
155G (int stem_length, char *end)
156
157
158{
159 return ((stem_length >= 3) && (*end == 'f'));
160}
161
162static int
163H (int stem_length, char *end)
164
165
166{
167 char p1, p2;
168 p1 = *end;
169 p2 = *(end - 1);
170 return ((p1 == 't') || ((p1 == 'l') && (p2 == 'l')));
171}
172
173static int
174I (int stem_length, char *end)
175
176
177{
178 char p1;
179 p1 = *end;
180 return ((p1 != 'o') && (p1 != 'e'));
181}
182
183static int
184J (int stem_length, char *end)
185
186
187{
188 char p1;
189 p1 = *end;
190 return ((p1 != 'a') && (p1 != 'e'));
191}
192
193static int
194K (int stem_length, char *end)
195
196
197{
198 char p1;
199 p1 = *end;
200 return ((stem_length >= 3) &&
201 ((p1 == 'l') || (p1 == 'i') ||
202 ((p1 == 'e') && (*(end - 2) == 'u'))));
203}
204
205static int
206L (int stem_length, char *end)
207
208
209{
210 char p1, p2;
211 p1 = *end;
212 p2 = *(end - 1);
213 if (p1 == 's')
214 return (p2 == 'o');
215 else
216 return ((p1 != 'u') && (p1 != 'x'));
217}
218
219static int
220M (int stem_length, char *end)
221
222
223{
224 char p1;
225 p1 = *end;
226 return ((p1 != 'a') && (p1 != 'c') &&
227 (p1 != 'e') && (p1 != 'm'));
228}
229
230static int
231N (int stem_length, char *end)
232
233
234{
235 if (stem_length >= 3)
236 {
237 if (*(end - 2) == 's')
238 return (stem_length >= 4);
239 else
240 return 1;
241 }
242 else
243 return 0;
244}
245
246static int
247O (int stem_length, char *end)
248
249
250{
251 char p1;
252 p1 = *end;
253 return ((p1 == 'l') || (p1 == 'i'));
254}
255
256static int
257P (int stem_length, char *end)
258
259
260{
261 return (*end != 'c');
262}
263
264static int
265Q (int stem_length, char *end)
266
267
268{
269 char p1;
270 p1 = *end;
271
272 return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
273}
274
275static int
276R (int stem_length, char *end)
277
278
279{
280 char p1;
281 p1 = *end;
282 return ((p1 == 'n') || (p1 == 'r'));
283}
284
285static int
286S (int stem_length, char *end)
287
288
289{
290 char p1, p2;
291 p1 = *end;
292 p2 = *(end - 1);
293
294 if (p1 == 't')
295 return (p2 != 't');
296 else
297 return ((p1 == 'r') && (p2 == 'd'));
298}
299
300static int
301T (int stem_length, char *end)
302
303
304{
305 char p1, p2;
306 p1 = *end;
307 p2 = *(end - 1);
308
309 if (p1 == 't')
310 return (p2 != 'o');
311 else
312 return (p1 == 's');
313}
314
315static int
316U (int stem_length, char *end)
317
318
319{
320 char p1;
321 p1 = *end;
322
323 return ((p1 == 'l') || (p1 == 'm') ||
324 (p1 == 'n') || (p1 == 'r'));
325}
326
327static int
328V (int stem_length, char *end)
329
330
331{
332 return (*end == 'c');
333}
334
335static int
336W (int stem_length, char *end)
337
338
339{
340 char p1;
341 p1 = *end;
342
343 return ((p1 != 's') && (p1 != 'u'));
344}
345
346static int
347X (int stem_length, char *end)
348
349
350{
351 char p1;
352 p1 = *end;
353
354 if (p1 == 'e')
355 return ((stem_length >= 3) && (*(end - 2) == 'u'));
356 else
357 return ((p1 == 'l') || (p1 == 'i'));
358}
359
360static int
361Y (int stem_length, char *end)
362
363
364{
365 return ((*end == 'n') && (*(end - 1) == 'i'));
366}
367
368static int
369Z (int stem_length, char *end)
370
371
372{
373 return (*end != 'f');
374}
375
376static int
377AA (int stem_length, char *end)
378
379
380{
381 char p1, p2;
382 p1 = *end;
383 p2 = *(end - 1);
384
385 if (p1 == 'h')
386 return ((p2 == 'p') || (p2 == 't'));
387 else if (p1 == 'r')
388 return ((p2 == 'e') || (p2 == 'o'));
389 else if (p1 == 's')
390 return (p2 == 'e');
391 else
392 return ((p1 == 'd') || (p1 == 'f') ||
393 (p1 == 'l') || (p1 == 't'));
394}
395
396static int
397BB (int stem_length, char *end)
398
399
400{
401 if (stem_length >= 3)
402 {
403 if (stem_length >= 4)
404 return (strncmp (end - 3, "ryst", 4) != 0);
405 else
406 return (strncmp (end - 2, "met", 3) != 0);
407 }
408 else
409 return 0;
410}
411
412static int
413CC (int stem_length, char *end)
414
415
416{
417 return (*end == 'l');
418}
419
420/**************************************************************************/
421
422#define MIN_STEM_LENGTH 2
423#define MAX_ENDING_SIZE 11
424#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
425#define EOS '\0'
426/* [RPAP - Jan 97: Stem Index Change] */
427#define MAX_STEM_LENGTH 255
428
429static char *remove_ending (char *word, int w_len);
430static void recode_stem (char *stem_end);
431
432/* =========================================================================
433 * Function: stem
434 * Description:
435 * Implemetation of the Lovins' stemming algorithm described in
436 * J.B. Lovins, "Development of a Stemming Algorithm",
437 * Mechanical Translation and Computational Linguistics, Vol 11,1968.
438 * Input: a word string with the length in the first byte
439 * Output: the stemmed word
440 * ========================================================================= */
441
442void
443lovinstem (u_char * word)
444{
445 char *word_start; /* points to first letter of word */
446 char *stem_end; /* points to last character of stem portion */
447 int old_length; /* length of word */
448 int new_length; /* length after stemming */
449
450 /* [RPAP - Jan 97: Stem Index Change] */
451 u_char copy[MAX_STEM_LENGTH + 2];
452 memcpy (copy, word, *word + 1);
453
454 old_length = *copy; /* [RPAP - Jan 97: Stem Index Change] */
455
456 /* The word must be at least MIN_STEM_LENGTH characters long. */
457 if (old_length <= MIN_STEM_LENGTH)
458 return;
459
460 word_start = (char *) copy + 1; /* jump over length byte */ /* [RPAP - Jan 97: Stem Index Change] */
461 copy[old_length + 1] = '\0'; /* null terminate string */ /* [RPAP - Jan 97: Stem Index Change] */
462
463 stem_end = remove_ending (word_start, old_length);
464 recode_stem (stem_end);
465
466 /* fix up new length */
467 /* have to use strlen because have no other way of telling length */
468 new_length = strlen (word_start);
469 *copy = new_length; /* [RPAP - Jan 97: Stem Index Change] */
470
471 /* [RPAP - Jan 97: Stem Index Change] */
472 memcpy (word, copy, *copy + 1);
473
474} /*stem_complex */
475
476/* =========================================================================
477
478 * Data structure for recoding and the method for recoding the stem.
479 *
480 * ========================================================================= */
481
482typedef struct
483{
484 char *old_end; /* old ending */
485 char *new_end; /* new ending */
486 char old_offset; /* length of the old ending - 1 */
487 int (*cond) (char); /* condition rule */
488 char end_group; /* signal the end of the group */
489}
490Recode_Rules;
491
492/*
493 Stem terminations are divided into groups having the same last
494 character
495 */
496
497static Recode_Rules Rules[] =
498{
499 {(char*)"uad", (char*)"uas", 2, NULL, 0},
500 {(char*)"vad", (char*)"vas", 2, NULL, 0},
501 {(char*)"cid", (char*)"cis", 2, NULL, 0},
502 {(char*)"lid", (char*)"lis", 2, NULL, 0},
503 {(char*)"erid", (char*)"eris", 3, NULL, 0},
504 {(char*)"pand", (char*)"pans", 3, NULL, 0},
505 {(char*)"end", (char*)"ens", 2, s, 0},
506 {(char*)"end", (char*)"ens", 2, m, 0},
507 {(char*)"ond", (char*)"ons", 2, NULL, 0},
508 {(char*)"lud", (char*)"lus", 2, NULL, 0},
509 {(char*)"rud", (char*)"rus", 2, NULL, 1},
510
511 {(char*)"ul", (char*)"l", 1, aio, 1},
512
513 {(char*)"istr", (char*)"ister", 3, NULL, 0},
514 {(char*)"metr", (char*)"meter", 3, NULL, 0},
515 {(char*)"her", (char*)"hes", 2, pt, 1},
516
517 {(char*)"urs", (char*)"ur", 2, NULL, 1},
518
519 {(char*)"uct", (char*)"uc", 2, NULL, 0},
520 {(char*)"umpt", (char*)"um", 3, NULL, 0},
521 {(char*)"rpt", (char*)"rb", 2, NULL, 0},
522 {(char*)"mit", (char*)"mis", 2, NULL, 0},
523 {(char*)"ert", (char*)"ers", 2, NULL, 0},
524 {(char*)"et", (char*)"es", 1, n, 0},
525 {(char*)"yt", (char*)"ys", 1, NULL, 1},
526
527 {(char*)"iev", (char*)"ief", 2, NULL, 0},
528 {(char*)"olv", (char*)"olut", 2, NULL, 1},
529
530 {(char*)"bex", (char*)"bic", 2, NULL, 0},
531 {(char*)"dex", (char*)"dic", 2, NULL, 0},
532 {(char*)"pex", (char*)"pic", 2, NULL, 0},
533 {(char*)"tex", (char*)"tic", 2, NULL, 0},
534 {(char*)"ax", (char*)"ac", 1, NULL, 0},
535 {(char*)"ex", (char*)"ec", 1, NULL, 0},
536 {(char*)"ix", (char*)"ic", 1, NULL, 0},
537 {(char*)"lux", (char*)"luc", 2, NULL, 1},
538
539 {(char*)"yz", (char*)"ys", 1, NULL, 1}
540};
541
542typedef struct last
543 {
544 char c; /* The last character */
545 struct last *left; /* used in balanced */
546 struct last *right; /* binary tree */
547 Recode_Rules *pt; /* location of approriate group */
548 }
549Last_Char_Node;
550
551/*------------
552 s
553 / \
554 l x
555 / \ / \
556 d r t z
557 \
558 v
559---------------*/
560static Last_Char_Node pr[] =
561{
562 {'d', NULL, NULL, Rules},
563 {'l', pr, pr + 2, Rules + 11},
564 {'r', NULL, NULL, Rules + 12},
565 {'s', pr + 1, pr + 6, Rules + 15},
566 {'t', NULL, pr + 5, Rules + 16},
567 {'v', NULL, NULL, Rules + 23},
568 {'x', pr + 4, pr + 7, Rules + 25},
569 {'z', NULL, NULL, Rules + 33},
570};
571
572
573/*****************************************************************************
574 *
575 * Recode the stem according to the transformation rules.
576 *
577 *****************************************************************************/
578
579static void
580recode_stem (char *stem_end)
581{
582 static Last_Char_Node *root = pr + 3;
583 Last_Char_Node *p_last = root;
584 Recode_Rules *rule; /* points to the transformation list */
585 char *h, /* points to start of possible ending */
586 ch, /* last character of the old ending */
587 ch2;
588
589 h = stem_end - 1;
590 ch = *stem_end;
591 if (*h == ch) /* Check for double consonant */
592 {
593 if (strchr ("bdglmnprst", ch) != NULL)
594 {
595 *stem_end = EOS;
596 stem_end = h;
597 }
598 }
599
600 do /* Check for the last character */
601 {
602 ch2 = p_last->c;
603 if (ch == ch2)
604 break;
605 else if (ch > ch2)
606 p_last = p_last->right;
607 else
608 p_last = p_last->left;
609 }
610 while (p_last != NULL);
611
612 if (p_last != NULL) /* Check for the rest of suffix list */
613 {
614 rule = p_last->pt;
615 for (;;)
616 {
617 h = stem_end - rule->old_offset;
618 if (strcmp (h, rule->old_end) == 0)
619 {
620 if (!rule->cond || (*rule->cond) (*(h - 1)))
621 {
622 (void) strcpy (h, rule->new_end);
623 break;
624 }
625 }
626 if (rule->end_group)
627 break;
628 ++rule;
629 }
630 }
631}
632
633/* ======================================================================
634
635 * List of endings and the method to remove the ending
636 *
637 * ======================================================================*/
638
639/************ Data structures for list of endings ********************/
640
641typedef struct
642{
643 char *ending; /* old ending */
644 int (*cond) (int length, char *end); /* conditional rule */
645 signed char left_offset; /* used to find the siblings */
646 signed char right_offset; /* in balanced binary tree */
647}
648Ending_List;
649
650/*
651 The original list of endings is re-organised into groups having
652 the same length and the same first character.
653 */
654
655static Ending_List List[] =
656{
657 {(char*)"a", NULL, 0, 0},
658
659 {(char*)"ae", NULL, 0, 0},
660 {(char*)"al", BB, -1, 2},
661 {(char*)"ar", X, 0, 0},
662 {(char*)"as", B, -1, 0},
663
664 {(char*)"acy", NULL, 0, 1},
665 {(char*)"age", B, 0, 0},
666 {(char*)"aic", NULL, -2, 1},
667 {(char*)"als", BB, 0, 0},
668 {(char*)"ant", B, -2, 2},
669 {(char*)"ars", O, 0, 0},
670 {(char*)"ary", F, -1, 2},
671 {(char*)"ata", NULL, 0, 0},
672 {(char*)"ate", NULL, -1, 0},
673
674 {(char*)"able", NULL, 0, 1},
675 {(char*)"ably", NULL, 0, 0},
676 {(char*)"ages", B, -2, 2},
677 {(char*)"ally", B, 0, 0},
678 {(char*)"ance", B, -1, 1},
679 {(char*)"ancy", B, 0, 0},
680 {(char*)"ants", B, -4, 4},
681 {(char*)"aric", NULL, 0, 0},
682 {(char*)"arly", K, -1, 1},
683 {(char*)"ated", I, 0, 0},
684 {(char*)"ates", NULL, -2, 2},
685 {(char*)"atic", B, 0, 0},
686 {(char*)"ator", NULL, -1, 0},
687
688 {(char*)"acies", NULL, 0, 0},
689 {(char*)"acity", NULL, -1, 1},
690 {(char*)"aging", B, 0, 0},
691 {(char*)"aical", NULL, -2, 2},
692 {(char*)"alist", NULL, 0, 0},
693 {(char*)"alism", B, -1, 0},
694 {(char*)"ality", NULL, -3, 3},
695 {(char*)"alize", NULL, 0, 1},
696 {(char*)"allic", BB, 0, 0},
697 {(char*)"anced", B, -2, 2},
698 {(char*)"ances", B, 0, 0},
699 {(char*)"antic", C, -1, 0},
700 {(char*)"arial", NULL, -6, 6},
701 {(char*)"aries", NULL, 0, 1},
702 {(char*)"arily", NULL, 0, 0},
703 {(char*)"arity", B, -2, 2},
704 {(char*)"arize", NULL, 0, 0},
705 {(char*)"aroid", NULL, -1, 0},
706 {(char*)"ately", NULL, -3, 3},
707 {(char*)"ating", I, 0, 1},
708 {(char*)"ation", B, 0, 0},
709 {(char*)"ative", NULL, -2, 2},
710 {(char*)"ators", NULL, 0, 0},
711 {(char*)"atory", NULL, -1, 1},
712 {(char*)"ature", E, 0, 0},
713
714 {(char*)"aceous", NULL, 0, 1},
715 {(char*)"acious", B, 0, 0},
716 {(char*)"action", G, -2, 2},
717 {(char*)"alness", NULL, 0, 0},
718 {(char*)"ancial", NULL, -1, 1},
719 {(char*)"ancies", NULL, 0, 0},
720 {(char*)"ancing", B, -4, 4},
721 {(char*)"ariser", NULL, 0, 0},
722 {(char*)"arized", NULL, -1, 1},
723 {(char*)"arizer", NULL, 0, 0},
724 {(char*)"atable", NULL, -2, 2},
725 {(char*)"ations", B, 0, 0},
726 {(char*)"atives", NULL, -1, 0},
727
728 {(char*)"ability", NULL, 0, 1},
729 {(char*)"aically", NULL, 0, 0},
730 {(char*)"alistic", B, -2, 2},
731 {(char*)"alities", NULL, 0, 0},
732 {(char*)"ariness", E, -1, 0},
733 {(char*)"aristic", NULL, -3, 3},
734 {(char*)"arizing", NULL, 0, 1},
735 {(char*)"ateness", NULL, 0, 0},
736 {(char*)"atingly", NULL, -2, 2},
737 {(char*)"ational", B, 0, 0},
738 {(char*)"atively", NULL, -1, 1},
739 {(char*)"ativism", NULL, 0, 0},
740
741 {(char*)"ableness", NULL, 0, 1},
742 {(char*)"arizable", NULL, 0, 0},
743
744 {(char*)"allically", C, 0, 0},
745 {(char*)"antaneous", NULL, -1, 1},
746 {(char*)"antiality", NULL, 0, 0},
747 {(char*)"arisation", NULL, -2, 2},
748 {(char*)"arization", NULL, 0, 0},
749 {(char*)"ationally", B, -1, 1},
750 {(char*)"ativeness", NULL, 0, 0},
751
752 {(char*)"antialness", NULL, 0, 0},
753 {(char*)"arisations", NULL, -1, 1},
754 {(char*)"arizations", NULL, 0, 0},
755
756 {(char*)"alistically", B, 0, 1},
757 {(char*)"arizability", NULL, 0, 0},
758
759 {(char*)"e", NULL, 0, 0},
760
761 {(char*)"ed", E, 0, 0},
762 {(char*)"en", F, -1, 1},
763 {(char*)"es", E, 0, 0},
764
765 {(char*)"eal", Y, 0, 0},
766 {(char*)"ear", Y, -1, 1},
767 {(char*)"ely", E, 0, 0},
768 {(char*)"ene", E, -2, 2},
769 {(char*)"ent", C, 0, 0},
770 {(char*)"ery", E, -1, 1},
771 {(char*)"ese", NULL, 0, 0},
772
773 {(char*)"ealy", Y, 0, 1},
774 {(char*)"edly", E, 0, 0},
775 {(char*)"eful", NULL, -2, 1},
776 {(char*)"eity", NULL, 0, 0},
777 {(char*)"ence", NULL, -2, 2},
778 {(char*)"ency", NULL, 0, 0},
779 {(char*)"ened", E, -1, 2},
780 {(char*)"enly", E, 0, 0},
781 {(char*)"eous", NULL, -1, 0},
782
783 {(char*)"early", Y, 0, 1},
784 {(char*)"ehood", NULL, 0, 0},
785 {(char*)"eless", NULL, -2, 2},
786 {(char*)"elity", NULL, 0, 0},
787 {(char*)"ement", NULL, -1, 0},
788 {(char*)"enced", NULL, -3, 3},
789 {(char*)"ences", NULL, 0, 1},
790 {(char*)"eness", E, 0, 0},
791 {(char*)"ening", E, -2, 2},
792 {(char*)"ental", NULL, 0, 0},
793 {(char*)"ented", C, -1, 1},
794 {(char*)"ently", NULL, 0, 0},
795
796 {(char*)"eature", Z, 0, 0},
797 {(char*)"efully", NULL, -1, 1},
798 {(char*)"encies", NULL, 0, 0},
799 {(char*)"encing", NULL, -2, 2},
800 {(char*)"ential", NULL, 0, 0},
801 {(char*)"enting", C, -1, 1},
802 {(char*)"entist", NULL, 0, 1},
803 {(char*)"eously", NULL, 0, 0},
804
805 {(char*)"elihood", E, 0, 1},
806 {(char*)"encible", NULL, 0, 0},
807 {(char*)"entally", NULL, -2, 2},
808 {(char*)"entials", NULL, 0, 0},
809 {(char*)"entiate", NULL, -1, 1},
810 {(char*)"entness", NULL, 0, 0},
811
812 {(char*)"entation", NULL, 0, 0},
813 {(char*)"entially", NULL, -1, 1},
814 {(char*)"eousness", NULL, 0, 0},
815
816 {(char*)"eableness", E, 0, 1},
817 {(char*)"entations", NULL, 0, 0},
818 {(char*)"entiality", NULL, -2, 2},
819 {(char*)"entialize", NULL, 0, 0},
820 {(char*)"entiation", NULL, -1, 0},
821
822 {(char*)"entialness", NULL, 0, 0},
823
824 {(char*)"ful", NULL, 0, 0},
825
826 {(char*)"fully", NULL, 0, 0},
827
828 {(char*)"fulness", NULL, 0, 0},
829
830 {(char*)"hood", NULL, 0, 0},
831
832 {(char*)"i", NULL, 0, 0},
833
834 {(char*)"ia", NULL, 0, 0},
835 {(char*)"ic", NULL, -1, 1},
836 {(char*)"is", NULL, 0, 0},
837
838 {(char*)"ial", NULL, 0, 0},
839 {(char*)"ian", NULL, -1, 1},
840 {(char*)"ics", NULL, 0, 1},
841 {(char*)"ide", L, 0, 0},
842 {(char*)"ied", NULL, -3, 3},
843 {(char*)"ier", NULL, 0, 0},
844 {(char*)"ies", P, -1, 0},
845 {(char*)"ily", NULL, -1, 1},
846 {(char*)"ine", M, 0, 0},
847 {(char*)"ing", N, -5, 5},
848 {(char*)"ion", Q, 0, 0},
849 {(char*)"ish", C, -1, 1},
850 {(char*)"ism", B, 0, 1},
851 {(char*)"ist", NULL, 0, 0},
852 {(char*)"ite", AA, -3, 3},
853 {(char*)"ity", NULL, 0, 0},
854 {(char*)"ium", NULL, -1, 0},
855 {(char*)"ive", NULL, -1, 1},
856 {(char*)"ize", F, 0, 0},
857
858 {(char*)"ials", NULL, 0, 0},
859 {(char*)"ians", NULL, -1, 0},
860 {(char*)"ible", NULL, -1, 1},
861 {(char*)"ibly", NULL, 0, 0},
862 {(char*)"ical", NULL, -2, 2},
863 {(char*)"ides", L, 0, 0},
864 {(char*)"iers", NULL, -1, 1},
865 {(char*)"iful", NULL, 0, 0},
866 {(char*)"ines", M, -4, 4},
867 {(char*)"ings", N, 0, 0},
868 {(char*)"ions", B, -1, 1},
869 {(char*)"ious", NULL, 0, 0},
870 {(char*)"isms", B, -2, 2},
871 {(char*)"ists", NULL, 0, 0},
872 {(char*)"itic", H, -1, 1},
873 {(char*)"ized", F, 0, 1},
874 {(char*)"izer", F, 0, 0},
875
876 {(char*)"ially", NULL, 0, 0},
877 {(char*)"icant", NULL, -1, 1},
878 {(char*)"ician", NULL, 0, 0},
879 {(char*)"icide", NULL, -2, 2},
880 {(char*)"icism", NULL, 0, 0},
881 {(char*)"icist", NULL, -1, 0},
882 {(char*)"icity", NULL, -3, 3},
883 {(char*)"idine", I, 0, 1},
884 {(char*)"iedly", NULL, 0, 0},
885 {(char*)"ihood", NULL, -2, 2},
886 {(char*)"inate", NULL, 0, 0},
887 {(char*)"iness", NULL, -1, 0},
888 {(char*)"ingly", B, -6, 6},
889 {(char*)"inism", J, 0, 1},
890 {(char*)"inity", CC, 0, 0},
891 {(char*)"ional", NULL, -2, 2},
892 {(char*)"ioned", NULL, 0, 0},
893 {(char*)"ished", NULL, -1, 0},
894 {(char*)"istic", NULL, -3, 3},
895 {(char*)"ities", NULL, 0, 1},
896 {(char*)"itous", NULL, 0, 0},
897 {(char*)"ively", NULL, -2, 2},
898 {(char*)"ivity", NULL, 0, 0},
899 {(char*)"izers", F, -1, 1},
900 {(char*)"izing", F, 0, 0},
901
902 {(char*)"ialist", NULL, 0, 0},
903 {(char*)"iality", NULL, -1, 1},
904 {(char*)"ialize", NULL, 0, 0},
905 {(char*)"ically", NULL, -2, 2},
906 {(char*)"icance", NULL, 0, 0},
907 {(char*)"icians", NULL, -1, 1},
908 {(char*)"icists", NULL, 0, 0},
909 {(char*)"ifully", NULL, -4, 4},
910 {(char*)"ionals", NULL, 0, 0},
911 {(char*)"ionate", D, -1, 1},
912 {(char*)"ioning", NULL, 0, 0},
913 {(char*)"ionist", NULL, -2, 2},
914 {(char*)"iously", NULL, 0, 0},
915 {(char*)"istics", NULL, -1, 1},
916 {(char*)"izable", E, 0, 0},
917
918 {(char*)"ibility", NULL, 0, 0},
919 {(char*)"icalism", NULL, -1, 1},
920 {(char*)"icalist", NULL, 0, 1},
921 {(char*)"icality", NULL, 0, 0},
922 {(char*)"icalize", NULL, -3, 3},
923 {(char*)"ication", G, 0, 0},
924 {(char*)"icianry", NULL, -1, 0},
925 {(char*)"ination", NULL, -1, 1},
926 {(char*)"ingness", NULL, 0, 0},
927 {(char*)"ionally", NULL, -5, 5},
928 {(char*)"isation", NULL, 0, 0},
929 {(char*)"ishness", NULL, -1, 1},
930 {(char*)"istical", NULL, 0, 1},
931 {(char*)"iteness", NULL, 0, 0},
932 {(char*)"iveness", NULL, -3, 3},
933 {(char*)"ivistic", NULL, 0, 0},
934 {(char*)"ivities", NULL, -1, 0},
935 {(char*)"ization", F, -1, 1},
936 {(char*)"izement", NULL, 0, 0},
937
938 {(char*)"ibleness", NULL, 0, 0},
939 {(char*)"icalness", NULL, -1, 1},
940 {(char*)"ionalism", NULL, 0, 0},
941 {(char*)"ionality", NULL, -2, 2},
942 {(char*)"ionalize", NULL, 0, 0},
943 {(char*)"iousness", NULL, -1, 1},
944 {(char*)"izations", NULL, 0, 0},
945
946 {(char*)"ionalness", NULL, 0, 1},
947 {(char*)"istically", NULL, 0, 0},
948 {(char*)"itousness", NULL, -2, 2},
949 {(char*)"izability", NULL, 0, 0},
950 {(char*)"izational", NULL, -1, 0},
951
952 {(char*)"izationally", B, 0, 0},
953
954 {(char*)"ly", B, 0, 0},
955
956 {(char*)"less", NULL, 0, 1},
957 {(char*)"lily", NULL, 0, 0},
958
959 {(char*)"lessly", NULL, 0, 0},
960
961 {(char*)"lessness", NULL, 0, 0},
962
963 {(char*)"ness", NULL, 0, 0},
964
965 {(char*)"nesses", NULL, 0, 0},
966
967 {(char*)"o", NULL, 0, 0},
968
969 {(char*)"on", S, 0, 1},
970 {(char*)"or", T, 0, 0},
971
972 {(char*)"oid", NULL, 0, 0},
973 {(char*)"one", R, -1, 1},
974 {(char*)"ous", NULL, 0, 0},
975
976 {(char*)"ogen", NULL, 0, 0},
977
978 {(char*)"oidal", NULL, 0, 0},
979 {(char*)"oides", NULL, -1, 2},
980 {(char*)"otide", NULL, 0, 0},
981 {(char*)"ously", NULL, -1, 0},
982
983 {(char*)"oidism", NULL, 0, 0},
984
985 {(char*)"oidally", NULL, 0, 1},
986 {(char*)"ousness", NULL, 0, 0},
987
988 {(char*)"s", W, 0, 0},
989
990 {(char*)"s'", NULL, 0, 0},
991
992 {(char*)"um", U, 0, 1},
993 {(char*)"us", V, 0, 0},
994
995 {(char*)"ward", NULL, 0, 1},
996 {(char*)"wise", NULL, 0, 0},
997
998 {(char*)"y", B, 0, 0},
999
1000 {(char*)"yl", R, 0, 0},
1001
1002 {(char*)"ying", B, 0, 1},
1003 {(char*)"yish", NULL, 0, 0},
1004
1005 {(char*)"'s", NULL, 0, 0},
1006};
1007
1008typedef struct node
1009 {
1010 char c; /* First character */
1011 struct node *left; /* used in balanced */
1012 struct node *right; /* binary tree */
1013 Ending_List *ptr[11]; /* the approriate location */
1014 }
1015First_Char_Node;
1016
1017static First_Char_Node First[] =
1018{
1019 {'\'', NULL, NULL,
1020 {NULL,
1021 List + 293, NULL, NULL, NULL, NULL,
1022 NULL, NULL, NULL, NULL, NULL}},
1023
1024 {'a', First, NULL,
1025 {List,
1026 List + 2, List + 9, List + 20, List + 39, List + 58,
1027 List + 70, List + 77, List + 82, List + 87, List + 89}},
1028
1029 {'e', First + 1, First + 4,
1030 {List + 91,
1031 List + 93, List + 98, List + 106, List + 116, List + 126,
1032 List + 133, List + 138, List + 142, List + 145, NULL}},
1033
1034 {'f', NULL, NULL,
1035 {NULL,
1036 NULL, List + 146, NULL, List + 147, NULL,
1037 List + 148, NULL, NULL, NULL, NULL}},
1038
1039 {'h', First + 3, First + 5,
1040 {NULL,
1041 NULL, NULL, List + 149, NULL, NULL,
1042 NULL, NULL, NULL, NULL, NULL}},
1043
1044 {'i', NULL, NULL,
1045 {List + 150,
1046 List + 152, List + 163, List + 181, List + 202, List + 222,
1047 List + 239, List + 252, List + 258, NULL, List + 261}},
1048
1049 {'l', First + 2, First + 10,
1050 {NULL,
1051 List + 262, NULL, List + 263, NULL, List + 265,
1052 NULL, List + 266, NULL, NULL, NULL}},
1053
1054 {'n', NULL, NULL,
1055 {NULL,
1056 NULL, NULL, List + 267, NULL, List + 268,
1057 NULL, NULL, NULL, NULL, NULL}},
1058
1059 {'o', First + 7, First + 9,
1060 {List + 269,
1061 List + 270, List + 273, List + 275, List + 277, List + 280,
1062 List + 281, NULL, NULL, NULL, NULL}},
1063
1064 {'s', NULL, NULL,
1065 {List + 283,
1066 List + 284, NULL, NULL, NULL, NULL,
1067 NULL, NULL, NULL, NULL, NULL}},
1068
1069 {'u', First + 8, First + 12,
1070 {NULL,
1071 List + 285, NULL, NULL, NULL, NULL,
1072 NULL, NULL, NULL, NULL, NULL}},
1073
1074 {'w', NULL, NULL,
1075 {NULL,
1076 NULL, NULL, List + 287, NULL, NULL,
1077 NULL, NULL, NULL, NULL, NULL}},
1078
1079 {'y', First + 11, NULL,
1080 {List + 289,
1081 List + 290, NULL, List + 291, NULL, NULL,
1082 NULL, NULL, NULL, NULL, NULL}},
1083};
1084
1085
1086/******************************************************************************
1087 *
1088 * Look for a match in the suffix list.
1089 * Return the pointer to the end of the new stem if there is a match.
1090 * Otherwise, return the pointer to the end of the original word.
1091 *
1092 * Method:
1093 * * Search for the first character in the suffix list.
1094 * * If there is match, search for the rest of the suffix list.
1095 *
1096 *****************************************************************************/
1097
1098static char *
1099remove_ending (char *word, int w_len)
1100{
1101 static First_Char_Node *root = First + 6; /* the root of binary tree is 'l' */
1102 First_Char_Node *p_first; /* Points to the first character */
1103 /* of the possible suffix */
1104 Ending_List *p_list; /* Points to the List of Endings */
1105 char *suffix_start, /* Points to start of possible suffix */
1106 *stem_end, /* Points to the end of possible stem */
1107 ch1, ch2;
1108 int s_len, /* Possible stem length */
1109 e_offset, /* Offset from the end to the start of */
1110 /* possible suffix */
1111 not_done = 1, cmp;
1112
1113 stem_end = word + w_len - 1;
1114 if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1115 suffix_start = word + w_len - MAX_ENDING_SIZE;
1116 else
1117 suffix_start = word + MIN_STEM_LENGTH;
1118
1119 ch1 = *suffix_start;
1120 do /* Search for the first character */
1121 {
1122 p_first = root;
1123 do
1124 {
1125 ch2 = p_first->c;
1126 if (ch1 == ch2)
1127 break;
1128 else if (ch1 > ch2)
1129 p_first = p_first->right;
1130 else
1131 p_first = p_first->left;
1132 }
1133 while (p_first != NULL);
1134
1135 if (p_first != NULL) /* Search for the rest */
1136 {
1137 e_offset = stem_end - suffix_start;
1138 if ((p_list = p_first->ptr[e_offset]) != NULL)
1139 {
1140 for (;;) /* no need to compare the first char */
1141 {
1142 cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1143 if (cmp > 0)
1144 if (p_list->right_offset)
1145 p_list += p_list->right_offset;
1146 else
1147 break;
1148 else if (cmp < 0)
1149 if (p_list->left_offset)
1150 p_list += p_list->left_offset;
1151 else
1152 break;
1153 else
1154 {
1155 s_len = suffix_start - word;
1156 if (!p_list->cond ||
1157 (*p_list->cond) (s_len, suffix_start - 1))
1158 {
1159 *suffix_start = EOS;
1160 stem_end = suffix_start - 1;
1161 not_done = 0;
1162 }
1163 break;
1164 }
1165 }
1166 }
1167 }
1168 ++suffix_start;
1169 }
1170 while (not_done && ((ch1 = *suffix_start) != EOS));
1171 return (stem_end);
1172}
Note: See TracBrowser for help on using the repository browser.