Context Navigation

source: trunk/indexers/mgpp/lib/lovinstem.cpp@ 9612

Last change on this file since 9612 was 9612, checked in by kjdon, 19 years ago
added in x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.7 KB

Line
1	/**************************************************************************
2	*
3	* lovinstem.c -- Stemming code
4	* Copyright (C) 1994 Linh Huynh ([email protected])
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "sysfuncs.h"
23
24
25	/*****************************************************************************
26	*
27	* Transformational Rules used in Recoding Stem Terminations
28	*
29	* Parameter in functions:
30	* ch : the second last character of the old ending
31	* (after removing one of double consonants)
32	*
33	****************************************************************************/
34
35
36	/********** Conditional rules associated with transformations ************/
37
38	/*static int aio ();
39	static int s ();
40	static int pt ();
41	static int m ();
42	static int n ();*/
43
44	static int
45	aio (char ch) /* Rule number 9 */
46	{
47	return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
48	}
49
50	static int
51	s (char ch) /* Rule Number 24 */
52	{
53	return (ch != 's');
54	}
55
56	static int
57	pt (char ch) /* Rule number 28 */
58	{
59	return ((ch != 'p') && (ch != 't'));
60	}
61
62	static int
63	m (char ch) /* Rule number 30 */
64	{
65	return (ch != 'm');
66	}
67
68	static int
69	n (char ch) /* Rule number 32 */
70	{
71	return (ch != 'n');
72	}
73
74
75	/**************************************************************************
76	*
77	* Context-sensitive rules associated with certain endings.
78	*
79	* The notations of the rules are the same as in Lovins' paper
80	* except that rule A is replaced by a NULL in the data structure.
81	*
82	* Function parameters:
83	* - stem_length: possible stem length
84	* - end : pointer points to the end of the possible stem.
85	*
86	*************************************************************************/
87
88
89	/******* Context-sensitive rule function declarations ****************/
90
91	/*static int B ();
92	static int C ();
93	static int D ();
94	static int E ();
95	static int F ();
96	static int G ();
97	static int H ();
98	static int I ();
99	static int J ();
100	static int K ();
101	static int L ();
102	static int M ();
103	static int N ();
104	static int O ();
105	static int P ();
106	static int Q ();
107	static int R ();
108	static int S ();
109	static int T ();
110	static int U ();
111	static int V ();
112	static int W ();
113	static int X ();
114	static int Y ();
115	static int Z ();
116	static int AA ();
117	static int BB ();
118	static int CC ();
119	*/
120	static int
121	B (int stem_length, char *end)
122	{
123	return (stem_length >= 3);
124	}
125
126	static int
127	C (int stem_length, char *end)
128	{
129	return (stem_length >= 4);
130	}
131
132	static int
133	D (int stem_length, char *end)
134	{
135	return (stem_length >= 5);
136	}
137
138	static int
139	E (int stem_length, char *end)
140
141
142	{
143	return (*end != 'e');
144	}
145
146	static int
147	F (int stem_length, char *end)
148
149
150	{
151	return ((stem_length >= 3) && (*end != 'e'));
152	}
153
154	static int
155	G (int stem_length, char *end)
156
157
158	{
159	return ((stem_length >= 3) && (*end == 'f'));
160	}
161
162	static int
163	H (int stem_length, char *end)
164
165
166	{
167	char p1, p2;
168	p1 = *end;
169	p2 = *(end - 1);
170	return ((p1 == 't') \|\| ((p1 == 'l') && (p2 == 'l')));
171	}
172
173	static int
174	I (int stem_length, char *end)
175
176
177	{
178	char p1;
179	p1 = *end;
180	return ((p1 != 'o') && (p1 != 'e'));
181	}
182
183	static int
184	J (int stem_length, char *end)
185
186
187	{
188	char p1;
189	p1 = *end;
190	return ((p1 != 'a') && (p1 != 'e'));
191	}
192
193	static int
194	K (int stem_length, char *end)
195
196
197	{
198	char p1;
199	p1 = *end;
200	return ((stem_length >= 3) &&
201	((p1 == 'l') \|\| (p1 == 'i') \|\|
202	((p1 == 'e') && (*(end - 2) == 'u'))));
203	}
204
205	static int
206	L (int stem_length, char *end)
207
208
209	{
210	char p1, p2;
211	p1 = *end;
212	p2 = *(end - 1);
213	if (p1 == 's')
214	return (p2 == 'o');
215	else
216	return ((p1 != 'u') && (p1 != 'x'));
217	}
218
219	static int
220	M (int stem_length, char *end)
221
222
223	{
224	char p1;
225	p1 = *end;
226	return ((p1 != 'a') && (p1 != 'c') &&
227	(p1 != 'e') && (p1 != 'm'));
228	}
229
230	static int
231	N (int stem_length, char *end)
232
233
234	{
235	if (stem_length >= 3)
236	{
237	if (*(end - 2) == 's')
238	return (stem_length >= 4);
239	else
240	return 1;
241	}
242	else
243	return 0;
244	}
245
246	static int
247	O (int stem_length, char *end)
248
249
250	{
251	char p1;
252	p1 = *end;
253	return ((p1 == 'l') \|\| (p1 == 'i'));
254	}
255
256	static int
257	P (int stem_length, char *end)
258
259
260	{
261	return (*end != 'c');
262	}
263
264	static int
265	Q (int stem_length, char *end)
266
267
268	{
269	char p1;
270	p1 = *end;
271
272	return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
273	}
274
275	static int
276	R (int stem_length, char *end)
277
278
279	{
280	char p1;
281	p1 = *end;
282	return ((p1 == 'n') \|\| (p1 == 'r'));
283	}
284
285	static int
286	S (int stem_length, char *end)
287
288
289	{
290	char p1, p2;
291	p1 = *end;
292	p2 = *(end - 1);
293
294	if (p1 == 't')
295	return (p2 != 't');
296	else
297	return ((p1 == 'r') && (p2 == 'd'));
298	}
299
300	static int
301	T (int stem_length, char *end)
302
303
304	{
305	char p1, p2;
306	p1 = *end;
307	p2 = *(end - 1);
308
309	if (p1 == 't')
310	return (p2 != 'o');
311	else
312	return (p1 == 's');
313	}
314
315	static int
316	U (int stem_length, char *end)
317
318
319	{
320	char p1;
321	p1 = *end;
322
323	return ((p1 == 'l') \|\| (p1 == 'm') \|\|
324	(p1 == 'n') \|\| (p1 == 'r'));
325	}
326
327	static int
328	V (int stem_length, char *end)
329
330
331	{
332	return (*end == 'c');
333	}
334
335	static int
336	W (int stem_length, char *end)
337
338
339	{
340	char p1;
341	p1 = *end;
342
343	return ((p1 != 's') && (p1 != 'u'));
344	}
345
346	static int
347	X (int stem_length, char *end)
348
349
350	{
351	char p1;
352	p1 = *end;
353
354	if (p1 == 'e')
355	return ((stem_length >= 3) && (*(end - 2) == 'u'));
356	else
357	return ((p1 == 'l') \|\| (p1 == 'i'));
358	}
359
360	static int
361	Y (int stem_length, char *end)
362
363
364	{
365	return ((end == 'n') && ((end - 1) == 'i'));
366	}
367
368	static int
369	Z (int stem_length, char *end)
370
371
372	{
373	return (*end != 'f');
374	}
375
376	static int
377	AA (int stem_length, char *end)
378
379
380	{
381	char p1, p2;
382	p1 = *end;
383	p2 = *(end - 1);
384
385	if (p1 == 'h')
386	return ((p2 == 'p') \|\| (p2 == 't'));
387	else if (p1 == 'r')
388	return ((p2 == 'e') \|\| (p2 == 'o'));
389	else if (p1 == 's')
390	return (p2 == 'e');
391	else
392	return ((p1 == 'd') \|\| (p1 == 'f') \|\|
393	(p1 == 'l') \|\| (p1 == 't'));
394	}
395
396	static int
397	BB (int stem_length, char *end)
398
399
400	{
401	if (stem_length >= 3)
402	{
403	if (stem_length >= 4)
404	return (strncmp (end - 3, "ryst", 4) != 0);
405	else
406	return (strncmp (end - 2, "met", 3) != 0);
407	}
408	else
409	return 0;
410	}
411
412	static int
413	CC (int stem_length, char *end)
414
415
416	{
417	return (*end == 'l');
418	}
419
420	/**************************************************************************/
421
422	#define MIN_STEM_LENGTH 2
423	#define MAX_ENDING_SIZE 11
424	#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
425	#define EOS '\0'
426	/* [RPAP - Jan 97: Stem Index Change] */
427	#define MAX_STEM_LENGTH 255
428
429	static char remove_ending (char word, int w_len);
430	static void recode_stem (char *stem_end);
431
432	/* =========================================================================
433	* Function: stem
434	* Description:
435	* Implemetation of the Lovins' stemming algorithm described in
436	* J.B. Lovins, "Development of a Stemming Algorithm",
437	* Mechanical Translation and Computational Linguistics, Vol 11,1968.
438	* Input: a word string with the length in the first byte
439	* Output: the stemmed word
440	* ========================================================================= */
441
442	void
443	lovinstem (u_char * word)
444	{
445	char word_start; / points to first letter of word */
446	char stem_end; / points to last character of stem portion */
447	int old_length; /* length of word */
448	int new_length; /* length after stemming */
449
450	/* [RPAP - Jan 97: Stem Index Change] */
451	u_char copy[MAX_STEM_LENGTH + 2];
452	memcpy (copy, word, *word + 1);
453
454	old_length = copy; / [RPAP - Jan 97: Stem Index Change] */
455
456	/* The word must be at least MIN_STEM_LENGTH characters long. */
457	if (old_length <= MIN_STEM_LENGTH)
458	return;
459
460	word_start = (char ) copy + 1; / jump over length byte / / [RPAP - Jan 97: Stem Index Change] */
461	copy[old_length + 1] = '\0'; /* null terminate string / / [RPAP - Jan 97: Stem Index Change] */
462
463	stem_end = remove_ending (word_start, old_length);
464	recode_stem (stem_end);
465
466	/* fix up new length */
467	/* have to use strlen because have no other way of telling length */
468	new_length = strlen (word_start);
469	copy = new_length; / [RPAP - Jan 97: Stem Index Change] */
470
471	/* [RPAP - Jan 97: Stem Index Change] */
472	memcpy (word, copy, *copy + 1);
473
474	} /stem_complex /
475
476	/* =========================================================================
477
478	* Data structure for recoding and the method for recoding the stem.
479	*
480	* ========================================================================= */
481
482	typedef struct
483	{
484	char old_end; / old ending */
485	char new_end; / new ending */
486	char old_offset; /* length of the old ending - 1 */
487	int (cond) (char); / condition rule */
488	char end_group; /* signal the end of the group */
489	}
490	Recode_Rules;
491
492	/*
493	Stem terminations are divided into groups having the same last
494	character
495	*/
496
497	static Recode_Rules Rules[] =
498	{
499	"uad", "uas", 2, NULL, 0,
500	"vad", "vas", 2, NULL, 0,
501	"cid", "cis", 2, NULL, 0,
502	"lid", "lis", 2, NULL, 0,
503	"erid", "eris", 3, NULL, 0,
504	"pand", "pans", 3, NULL, 0,
505	"end", "ens", 2, s, 0,
506	"end", "ens", 2, m, 0,
507	"ond", "ons", 2, NULL, 0,
508	"lud", "lus", 2, NULL, 0,
509	"rud", "rus", 2, NULL, 1,
510
511	"ul", "l", 1, aio, 1,
512
513	"istr", "ister", 3, NULL, 0,
514	"metr", "meter", 3, NULL, 0,
515	"her", "hes", 2, pt, 1,
516
517	"urs", "ur", 2, NULL, 1,
518
519	"uct", "uc", 2, NULL, 0,
520	"umpt", "um", 3, NULL, 0,
521	"rpt", "rb", 2, NULL, 0,
522	"mit", "mis", 2, NULL, 0,
523	"ert", "ers", 2, NULL, 0,
524	"et", "es", 1, n, 0,
525	"yt", "ys", 1, NULL, 1,
526
527	"iev", "ief", 2, NULL, 0,
528	"olv", "olut", 2, NULL, 1,
529
530	"bex", "bic", 2, NULL, 0,
531	"dex", "dic", 2, NULL, 0,
532	"pex", "pic", 2, NULL, 0,
533	"tex", "tic", 2, NULL, 0,
534	"ax", "ac", 1, NULL, 0,
535	"ex", "ec", 1, NULL, 0,
536	"ix", "ic", 1, NULL, 0,
537	"lux", "luc", 2, NULL, 1,
538
539	"yz", "ys", 1, NULL, 1
540	};
541
542	typedef struct last
543	{
544	char c; /* The last character */
545	struct last left; / used in balanced */
546	struct last right; / binary tree */
547	Recode_Rules pt; / location of approriate group */
548	}
549	Last_Char_Node;
550
551	/*------------
552	s
553	/ \
554	l x
555	/ \ / \
556	d r t z
557	\
558	v
559	---------------*/
560	static Last_Char_Node pr[] =
561	{
562	'd', NULL, NULL, Rules,
563	'l', pr, pr + 2, Rules + 11,
564	'r', NULL, NULL, Rules + 12,
565	's', pr + 1, pr + 6, Rules + 15,
566	't', NULL, pr + 5, Rules + 16,
567	'v', NULL, NULL, Rules + 23,
568	'x', pr + 4, pr + 7, Rules + 25,
569	'z', NULL, NULL, Rules + 33,
570	};
571
572
573	/*****************************************************************************
574	*
575	* Recode the stem according to the transformation rules.
576	*
577	*****************************************************************************/
578
579	static void
580	recode_stem (char *stem_end)
581	{
582	static Last_Char_Node *root = pr + 3;
583	Last_Char_Node *p_last = root;
584	Recode_Rules rule; / points to the transformation list */
585	char h, / points to start of possible ending */
586	ch, /* last character of the old ending */
587	ch2;
588
589	h = stem_end - 1;
590	ch = *stem_end;
591	if (h == ch) / Check for double consonant */
592	{
593	if (strchr ("bdglmnprst", ch) != NULL)
594	{
595	*stem_end = EOS;
596	stem_end = h;
597	}
598	}
599
600	do /* Check for the last character */
601	{
602	ch2 = p_last->c;
603	if (ch == ch2)
604	break;
605	else if (ch > ch2)
606	p_last = p_last->right;
607	else
608	p_last = p_last->left;
609	}
610	while (p_last != NULL);
611
612	if (p_last != NULL) /* Check for the rest of suffix list */
613	{
614	rule = p_last->pt;
615	for (;;)
616	{
617	h = stem_end - rule->old_offset;
618	if (strcmp (h, rule->old_end) == 0)
619	{
620	if (!rule->cond \|\| (rule->cond) ((h - 1)))
621	{
622	(void) strcpy (h, rule->new_end);
623	break;
624	}
625	}
626	if (rule->end_group)
627	break;
628	++rule;
629	}
630	}
631	}
632
633	/* ======================================================================
634
635	* List of endings and the method to remove the ending
636	*
637	* ======================================================================*/
638
639	/********** Data structures for list of endings ******************/
640
641	typedef struct
642	{
643	char ending; / old ending */
644	int (cond) (int length, char end); /* conditional rule */
645	signed char left_offset; /* used to find the siblings */
646	signed char right_offset; /* in balanced binary tree */
647	}
648	Ending_List;
649
650	/*
651	The original list of endings is re-organised into groups having
652	the same length and the same first character.
653	*/
654
655	static Ending_List List[] =
656	{
657	"a", NULL, 0, 0,
658
659	"ae", NULL, 0, 0,
660	"al", BB, -1, 2,
661	"ar", X, 0, 0,
662	"as", B, -1, 0,
663
664	"acy", NULL, 0, 1,
665	"age", B, 0, 0,
666	"aic", NULL, -2, 1,
667	"als", BB, 0, 0,
668	"ant", B, -2, 2,
669	"ars", O, 0, 0,
670	"ary", F, -1, 2,
671	"ata", NULL, 0, 0,
672	"ate", NULL, -1, 0,
673
674	"able", NULL, 0, 1,
675	"ably", NULL, 0, 0,
676	"ages", B, -2, 2,
677	"ally", B, 0, 0,
678	"ance", B, -1, 1,
679	"ancy", B, 0, 0,
680	"ants", B, -4, 4,
681	"aric", NULL, 0, 0,
682	"arly", K, -1, 1,
683	"ated", I, 0, 0,
684	"ates", NULL, -2, 2,
685	"atic", B, 0, 0,
686	"ator", NULL, -1, 0,
687
688	"acies", NULL, 0, 0,
689	"acity", NULL, -1, 1,
690	"aging", B, 0, 0,
691	"aical", NULL, -2, 2,
692	"alist", NULL, 0, 0,
693	"alism", B, -1, 0,
694	"ality", NULL, -3, 3,
695	"alize", NULL, 0, 1,
696	"allic", BB, 0, 0,
697	"anced", B, -2, 2,
698	"ances", B, 0, 0,
699	"antic", C, -1, 0,
700	"arial", NULL, -6, 6,
701	"aries", NULL, 0, 1,
702	"arily", NULL, 0, 0,
703	"arity", B, -2, 2,
704	"arize", NULL, 0, 0,
705	"aroid", NULL, -1, 0,
706	"ately", NULL, -3, 3,
707	"ating", I, 0, 1,
708	"ation", B, 0, 0,
709	"ative", NULL, -2, 2,
710	"ators", NULL, 0, 0,
711	"atory", NULL, -1, 1,
712	"ature", E, 0, 0,
713
714	"aceous", NULL, 0, 1,
715	"acious", B, 0, 0,
716	"action", G, -2, 2,
717	"alness", NULL, 0, 0,
718	"ancial", NULL, -1, 1,
719	"ancies", NULL, 0, 0,
720	"ancing", B, -4, 4,
721	"ariser", NULL, 0, 0,
722	"arized", NULL, -1, 1,
723	"arizer", NULL, 0, 0,
724	"atable", NULL, -2, 2,
725	"ations", B, 0, 0,
726	"atives", NULL, -1, 0,
727
728	"ability", NULL, 0, 1,
729	"aically", NULL, 0, 0,
730	"alistic", B, -2, 2,
731	"alities", NULL, 0, 0,
732	"ariness", E, -1, 0,
733	"aristic", NULL, -3, 3,
734	"arizing", NULL, 0, 1,
735	"ateness", NULL, 0, 0,
736	"atingly", NULL, -2, 2,
737	"ational", B, 0, 0,
738	"atively", NULL, -1, 1,
739	"ativism", NULL, 0, 0,
740
741	"ableness", NULL, 0, 1,
742	"arizable", NULL, 0, 0,
743
744	"allically", C, 0, 0,
745	"antaneous", NULL, -1, 1,
746	"antiality", NULL, 0, 0,
747	"arisation", NULL, -2, 2,
748	"arization", NULL, 0, 0,
749	"ationally", B, -1, 1,
750	"ativeness", NULL, 0, 0,
751
752	"antialness", NULL, 0, 0,
753	"arisations", NULL, -1, 1,
754	"arizations", NULL, 0, 0,
755
756	"alistically", B, 0, 1,
757	"arizability", NULL, 0, 0,
758
759	"e", NULL, 0, 0,
760
761	"ed", E, 0, 0,
762	"en", F, -1, 1,
763	"es", E, 0, 0,
764
765	"eal", Y, 0, 0,
766	"ear", Y, -1, 1,
767	"ely", E, 0, 0,
768	"ene", E, -2, 2,
769	"ent", C, 0, 0,
770	"ery", E, -1, 1,
771	"ese", NULL, 0, 0,
772
773	"ealy", Y, 0, 1,
774	"edly", E, 0, 0,
775	"eful", NULL, -2, 1,
776	"eity", NULL, 0, 0,
777	"ence", NULL, -2, 2,
778	"ency", NULL, 0, 0,
779	"ened", E, -1, 2,
780	"enly", E, 0, 0,
781	"eous", NULL, -1, 0,
782
783	"early", Y, 0, 1,
784	"ehood", NULL, 0, 0,
785	"eless", NULL, -2, 2,
786	"elity", NULL, 0, 0,
787	"ement", NULL, -1, 0,
788	"enced", NULL, -3, 3,
789	"ences", NULL, 0, 1,
790	"eness", E, 0, 0,
791	"ening", E, -2, 2,
792	"ental", NULL, 0, 0,
793	"ented", C, -1, 1,
794	"ently", NULL, 0, 0,
795
796	"eature", Z, 0, 0,
797	"efully", NULL, -1, 1,
798	"encies", NULL, 0, 0,
799	"encing", NULL, -2, 2,
800	"ential", NULL, 0, 0,
801	"enting", C, -1, 1,
802	"entist", NULL, 0, 1,
803	"eously", NULL, 0, 0,
804
805	"elihood", E, 0, 1,
806	"encible", NULL, 0, 0,
807	"entally", NULL, -2, 2,
808	"entials", NULL, 0, 0,
809	"entiate", NULL, -1, 1,
810	"entness", NULL, 0, 0,
811
812	"entation", NULL, 0, 0,
813	"entially", NULL, -1, 1,
814	"eousness", NULL, 0, 0,
815
816	"eableness", E, 0, 1,
817	"entations", NULL, 0, 0,
818	"entiality", NULL, -2, 2,
819	"entialize", NULL, 0, 0,
820	"entiation", NULL, -1, 0,
821
822	"entialness", NULL, 0, 0,
823
824	"ful", NULL, 0, 0,
825
826	"fully", NULL, 0, 0,
827
828	"fulness", NULL, 0, 0,
829
830	"hood", NULL, 0, 0,
831
832	"i", NULL, 0, 0,
833
834	"ia", NULL, 0, 0,
835	"ic", NULL, -1, 1,
836	"is", NULL, 0, 0,
837
838	"ial", NULL, 0, 0,
839	"ian", NULL, -1, 1,
840	"ics", NULL, 0, 1,
841	"ide", L, 0, 0,
842	"ied", NULL, -3, 3,
843	"ier", NULL, 0, 0,
844	"ies", P, -1, 0,
845	"ily", NULL, -1, 1,
846	"ine", M, 0, 0,
847	"ing", N, -5, 5,
848	"ion", Q, 0, 0,
849	"ish", C, -1, 1,
850	"ism", B, 0, 1,
851	"ist", NULL, 0, 0,
852	"ite", AA, -3, 3,
853	"ity", NULL, 0, 0,
854	"ium", NULL, -1, 0,
855	"ive", NULL, -1, 1,
856	"ize", F, 0, 0,
857
858	"ials", NULL, 0, 0,
859	"ians", NULL, -1, 0,
860	"ible", NULL, -1, 1,
861	"ibly", NULL, 0, 0,
862	"ical", NULL, -2, 2,
863	"ides", L, 0, 0,
864	"iers", NULL, -1, 1,
865	"iful", NULL, 0, 0,
866	"ines", M, -4, 4,
867	"ings", N, 0, 0,
868	"ions", B, -1, 1,
869	"ious", NULL, 0, 0,
870	"isms", B, -2, 2,
871	"ists", NULL, 0, 0,
872	"itic", H, -1, 1,
873	"ized", F, 0, 1,
874	"izer", F, 0, 0,
875
876	"ially", NULL, 0, 0,
877	"icant", NULL, -1, 1,
878	"ician", NULL, 0, 0,
879	"icide", NULL, -2, 2,
880	"icism", NULL, 0, 0,
881	"icist", NULL, -1, 0,
882	"icity", NULL, -3, 3,
883	"idine", I, 0, 1,
884	"iedly", NULL, 0, 0,
885	"ihood", NULL, -2, 2,
886	"inate", NULL, 0, 0,
887	"iness", NULL, -1, 0,
888	"ingly", B, -6, 6,
889	"inism", J, 0, 1,
890	"inity", CC, 0, 0,
891	"ional", NULL, -2, 2,
892	"ioned", NULL, 0, 0,
893	"ished", NULL, -1, 0,
894	"istic", NULL, -3, 3,
895	"ities", NULL, 0, 1,
896	"itous", NULL, 0, 0,
897	"ively", NULL, -2, 2,
898	"ivity", NULL, 0, 0,
899	"izers", F, -1, 1,
900	"izing", F, 0, 0,
901
902	"ialist", NULL, 0, 0,
903	"iality", NULL, -1, 1,
904	"ialize", NULL, 0, 0,
905	"ically", NULL, -2, 2,
906	"icance", NULL, 0, 0,
907	"icians", NULL, -1, 1,
908	"icists", NULL, 0, 0,
909	"ifully", NULL, -4, 4,
910	"ionals", NULL, 0, 0,
911	"ionate", D, -1, 1,
912	"ioning", NULL, 0, 0,
913	"ionist", NULL, -2, 2,
914	"iously", NULL, 0, 0,
915	"istics", NULL, -1, 1,
916	"izable", E, 0, 0,
917
918	"ibility", NULL, 0, 0,
919	"icalism", NULL, -1, 1,
920	"icalist", NULL, 0, 1,
921	"icality", NULL, 0, 0,
922	"icalize", NULL, -3, 3,
923	"ication", G, 0, 0,
924	"icianry", NULL, -1, 0,
925	"ination", NULL, -1, 1,
926	"ingness", NULL, 0, 0,
927	"ionally", NULL, -5, 5,
928	"isation", NULL, 0, 0,
929	"ishness", NULL, -1, 1,
930	"istical", NULL, 0, 1,
931	"iteness", NULL, 0, 0,
932	"iveness", NULL, -3, 3,
933	"ivistic", NULL, 0, 0,
934	"ivities", NULL, -1, 0,
935	"ization", F, -1, 1,
936	"izement", NULL, 0, 0,
937
938	"ibleness", NULL, 0, 0,
939	"icalness", NULL, -1, 1,
940	"ionalism", NULL, 0, 0,
941	"ionality", NULL, -2, 2,
942	"ionalize", NULL, 0, 0,
943	"iousness", NULL, -1, 1,
944	"izations", NULL, 0, 0,
945
946	"ionalness", NULL, 0, 1,
947	"istically", NULL, 0, 0,
948	"itousness", NULL, -2, 2,
949	"izability", NULL, 0, 0,
950	"izational", NULL, -1, 0,
951
952	"izationally", B, 0, 0,
953
954	"ly", B, 0, 0,
955
956	"less", NULL, 0, 1,
957	"lily", NULL, 0, 0,
958
959	"lessly", NULL, 0, 0,
960
961	"lessness", NULL, 0, 0,
962
963	"ness", NULL, 0, 0,
964
965	"nesses", NULL, 0, 0,
966
967	"o", NULL, 0, 0,
968
969	"on", S, 0, 1,
970	"or", T, 0, 0,
971
972	"oid", NULL, 0, 0,
973	"one", R, -1, 1,
974	"ous", NULL, 0, 0,
975
976	"ogen", NULL, 0, 0,
977
978	"oidal", NULL, 0, 0,
979	"oides", NULL, -1, 2,
980	"otide", NULL, 0, 0,
981	"ously", NULL, -1, 0,
982
983	"oidism", NULL, 0, 0,
984
985	"oidally", NULL, 0, 1,
986	"ousness", NULL, 0, 0,
987
988	"s", W, 0, 0,
989
990	"s'", NULL, 0, 0,
991
992	"um", U, 0, 1,
993	"us", V, 0, 0,
994
995	"ward", NULL, 0, 1,
996	"wise", NULL, 0, 0,
997
998	"y", B, 0, 0,
999
1000	"yl", R, 0, 0,
1001
1002	"ying", B, 0, 1,
1003	"yish", NULL, 0, 0,
1004
1005	"'s", NULL, 0, 0,
1006	};
1007
1008	typedef struct node
1009	{
1010	char c; /* First character */
1011	struct node left; / used in balanced */
1012	struct node right; / binary tree */
1013	Ending_List ptr[11]; / the approriate location */
1014	}
1015	First_Char_Node;
1016
1017	static First_Char_Node First[] =
1018	{
1019	'\'', NULL, NULL,
1020	{NULL,
1021	List + 293, NULL, NULL, NULL, NULL,
1022	NULL, NULL, NULL, NULL, NULL},
1023
1024	'a', First, NULL,
1025	{List,
1026	List + 2, List + 9, List + 20, List + 39, List + 58,
1027	List + 70, List + 77, List + 82, List + 87, List + 89},
1028
1029	'e', First + 1, First + 4,
1030	{List + 91,
1031	List + 93, List + 98, List + 106, List + 116, List + 126,
1032	List + 133, List + 138, List + 142, List + 145, NULL},
1033
1034	'f', NULL, NULL,
1035	{NULL,
1036	NULL, List + 146, NULL, List + 147, NULL,
1037	List + 148, NULL, NULL, NULL, NULL},
1038
1039	'h', First + 3, First + 5,
1040	{NULL,
1041	NULL, NULL, List + 149, NULL, NULL,
1042	NULL, NULL, NULL, NULL, NULL},
1043
1044	'i', NULL, NULL,
1045	{List + 150,
1046	List + 152, List + 163, List + 181, List + 202, List + 222,
1047	List + 239, List + 252, List + 258, NULL, List + 261},
1048
1049	'l', First + 2, First + 10,
1050	{NULL,
1051	List + 262, NULL, List + 263, NULL, List + 265,
1052	NULL, List + 266, NULL, NULL, NULL},
1053
1054	'n', NULL, NULL,
1055	{NULL,
1056	NULL, NULL, List + 267, NULL, List + 268,
1057	NULL, NULL, NULL, NULL, NULL},
1058
1059	'o', First + 7, First + 9,
1060	{List + 269,
1061	List + 270, List + 273, List + 275, List + 277, List + 280,
1062	List + 281, NULL, NULL, NULL, NULL},
1063
1064	's', NULL, NULL,
1065	{List + 283,
1066	List + 284, NULL, NULL, NULL, NULL,
1067	NULL, NULL, NULL, NULL, NULL},
1068
1069	'u', First + 8, First + 12,
1070	{NULL,
1071	List + 285, NULL, NULL, NULL, NULL,
1072	NULL, NULL, NULL, NULL, NULL},
1073
1074	'w', NULL, NULL,
1075	{NULL,
1076	NULL, NULL, List + 287, NULL, NULL,
1077	NULL, NULL, NULL, NULL, NULL},
1078
1079	'y', First + 11, NULL,
1080	{List + 289,
1081	List + 290, NULL, List + 291, NULL, NULL,
1082	NULL, NULL, NULL, NULL, NULL},
1083	};
1084
1085
1086	/******************************************************************************
1087	*
1088	* Look for a match in the suffix list.
1089	* Return the pointer to the end of the new stem if there is a match.
1090	* Otherwise, return the pointer to the end of the original word.
1091	*
1092	* Method:
1093	* * Search for the first character in the suffix list.
1094	* * If there is match, search for the rest of the suffix list.
1095	*
1096	*****************************************************************************/
1097
1098	static char *
1099	remove_ending (char *word, int w_len)
1100	{
1101	static First_Char_Node root = First + 6; / the root of binary tree is 'l' */
1102	First_Char_Node p_first; / Points to the first character */
1103	/* of the possible suffix */
1104	Ending_List p_list; / Points to the List of Endings */
1105	char suffix_start, / Points to start of possible suffix */
1106	stem_end, / Points to the end of possible stem */
1107	ch1, ch2;
1108	int s_len, /* Possible stem length */
1109	e_offset, /* Offset from the end to the start of */
1110	/* possible suffix */
1111	not_done = 1, cmp;
1112
1113	stem_end = word + w_len - 1;
1114	if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1115	suffix_start = word + w_len - MAX_ENDING_SIZE;
1116	else
1117	suffix_start = word + MIN_STEM_LENGTH;
1118
1119	ch1 = *suffix_start;
1120	do /* Search for the first character */
1121	{
1122	p_first = root;
1123	do
1124	{
1125	ch2 = p_first->c;
1126	if (ch1 == ch2)
1127	break;
1128	else if (ch1 > ch2)
1129	p_first = p_first->right;
1130	else
1131	p_first = p_first->left;
1132	}
1133	while (p_first != NULL);
1134
1135	if (p_first != NULL) /* Search for the rest */
1136	{
1137	e_offset = stem_end - suffix_start;
1138	if ((p_list = p_first->ptr[e_offset]) != NULL)
1139	{
1140	for (;;) /* no need to compare the first char */
1141	{
1142	cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1143	if (cmp > 0)
1144	if (p_list->right_offset)
1145	p_list += p_list->right_offset;
1146	else
1147	break;
1148	else if (cmp < 0)
1149	if (p_list->left_offset)
1150	p_list += p_list->left_offset;
1151	else
1152	break;
1153	else
1154	{
1155	s_len = suffix_start - word;
1156	if (!p_list->cond \|\|
1157	(*p_list->cond) (s_len, suffix_start - 1))
1158	{
1159	*suffix_start = EOS;
1160	stem_end = suffix_start - 1;
1161	not_done = 0;
1162	}
1163	break;
1164	}
1165	}
1166	}
1167	}
1168	++suffix_start;
1169	}
1170	while (not_done && ((ch1 = *suffix_start) != EOS));
1171	return (stem_end);
1172	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: