Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/indexers/mg/lib/lovinstem.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago
Addition of MG package for search and retrieval
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 24.3 KB

Line
1	/**************************************************************************
2	*
3	* lovinstem.c -- Stemming code
4	* Copyright (C) 1994 Linh Huynh ([email protected])
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: lovinstem.c 3745 2003-02-20 21:20:24Z mdewsnip $
21	*
22	**************************************************************************/
23
24	static char *RCSID = "$Id: lovinstem.c 3745 2003-02-20 21:20:24Z mdewsnip $";
25
26	#include "sysfuncs.h"
27
28
29	/*****************************************************************************
30	*
31	* Transformational Rules used in Recoding Stem Terminations
32	*
33	* Parameter in functions:
34	* ch : the second last character of the old ending
35	* (after removing one of double consonants)
36	*
37	****************************************************************************/
38
39
40	/********** Conditional rules associated with transformations ************/
41
42	static int aio ();
43	static int s ();
44	static int pt ();
45	static int m ();
46	static int n ();
47
48	static int
49	aio (ch) /* Rule number 9 */
50	char ch;
51	{
52	return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
53	}
54
55	static int
56	s (ch) /* Rule Number 24 */
57	char ch;
58	{
59	return (ch != 's');
60	}
61
62	static int
63	pt (ch) /* Rule number 28 */
64	char ch;
65	{
66	return ((ch != 'p') && (ch != 't'));
67	}
68
69	static int
70	m (ch) /* Rule number 30 */
71	char ch;
72	{
73	return (ch != 'm');
74	}
75
76	static int
77	n (ch) /* Rule number 32 */
78	char ch;
79	{
80	return (ch != 'n');
81	}
82
83
84	/**************************************************************************
85	*
86	* Context-sensitive rules associated with certain endings.
87	*
88	* The notations of the rules are the same as in Lovins' paper
89	* except that rule A is replaced by a NULL in the data structure.
90	*
91	* Function parameters:
92	* - stem_length: possible stem length
93	* - end : pointer points to the end of the possible stem.
94	*
95	*************************************************************************/
96
97
98	/******* Context-sensitive rule function declarations ****************/
99
100	static int B ();
101	static int C ();
102	static int D ();
103	static int E ();
104	static int F ();
105	static int G ();
106	static int H ();
107	static int I ();
108	static int J ();
109	static int K ();
110	static int L ();
111	static int M ();
112	static int N ();
113	static int O ();
114	static int P ();
115	static int Q ();
116	static int R ();
117	static int S ();
118	static int T ();
119	static int U ();
120	static int V ();
121	static int W ();
122	static int X ();
123	static int Y ();
124	static int Z ();
125	static int AA ();
126	static int BB ();
127	static int CC ();
128
129	static int
130	B (stem_length, end)
131	int stem_length;
132	char *end;
133	{
134	return (stem_length >= 3);
135	}
136
137	static int
138	C (stem_length, end)
139	int stem_length;
140	char *end;
141	{
142	return (stem_length >= 4);
143	}
144
145	static int
146	D (stem_length, end)
147	int stem_length;
148	char *end;
149	{
150	return (stem_length >= 5);
151	}
152
153	static int
154	E (stem_length, end)
155	int stem_length;
156	char *end;
157	{
158	return (*end != 'e');
159	}
160
161	static int
162	F (stem_length, end)
163	int stem_length;
164	char *end;
165	{
166	return ((stem_length >= 3) && (*end != 'e'));
167	}
168
169	static int
170	G (stem_length, end)
171	int stem_length;
172	char *end;
173	{
174	return ((stem_length >= 3) && (*end == 'f'));
175	}
176
177	static int
178	H (stem_length, end)
179	int stem_length;
180	char *end;
181	{
182	char p1, p2;
183	p1 = *end;
184	p2 = *(end - 1);
185	return ((p1 == 't') \|\| ((p1 == 'l') && (p2 == 'l')));
186	}
187
188	static int
189	I (stem_length, end)
190	int stem_length;
191	char *end;
192	{
193	char p1;
194	p1 = *end;
195	return ((p1 != 'o') && (p1 != 'e'));
196	}
197
198	static int
199	J (stem_length, end)
200	int stem_length;
201	char *end;
202	{
203	char p1;
204	p1 = *end;
205	return ((p1 != 'a') && (p1 != 'e'));
206	}
207
208	static int
209	K (stem_length, end)
210	int stem_length;
211	char *end;
212	{
213	char p1;
214	p1 = *end;
215	return ((stem_length >= 3) &&
216	((p1 == 'l') \|\| (p1 == 'i') \|\|
217	((p1 == 'e') && (*(end - 2) == 'u'))));
218	}
219
220	static int
221	L (stem_length, end)
222	int stem_length;
223	char *end;
224	{
225	char p1, p2;
226	p1 = *end;
227	p2 = *(end - 1);
228	if (p1 == 's')
229	return (p2 == 'o');
230	else
231	return ((p1 != 'u') && (p1 != 'x'));
232	}
233
234	static int
235	M (stem_length, end)
236	int stem_length;
237	char *end;
238	{
239	char p1;
240	p1 = *end;
241	return ((p1 != 'a') && (p1 != 'c') &&
242	(p1 != 'e') && (p1 != 'm'));
243	}
244
245	static int
246	N (stem_length, end)
247	int stem_length;
248	char *end;
249	{
250	if (stem_length >= 3)
251	{
252	if (*(end - 2) == 's')
253	return (stem_length >= 4);
254	else
255	return 1;
256	}
257	else
258	return 0;
259	}
260
261	static int
262	O (stem_length, end)
263	int stem_length;
264	char *end;
265	{
266	char p1;
267	p1 = *end;
268	return ((p1 == 'l') \|\| (p1 == 'i'));
269	}
270
271	static int
272	P (stem_length, end)
273	int stem_length;
274	char *end;
275	{
276	return (*end != 'c');
277	}
278
279	static int
280	Q (stem_length, end)
281	int stem_length;
282	char *end;
283	{
284	char p1;
285	p1 = *end;
286
287	return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
288	}
289
290	static int
291	R (stem_length, end)
292	int stem_length;
293	char *end;
294	{
295	char p1;
296	p1 = *end;
297	return ((p1 == 'n') \|\| (p1 == 'r'));
298	}
299
300	static int
301	S (stem_length, end)
302	int stem_length;
303	char *end;
304	{
305	char p1, p2;
306	p1 = *end;
307	p2 = *(end - 1);
308
309	if (p1 == 't')
310	return (p2 != 't');
311	else
312	return ((p1 == 'r') && (p2 == 'd'));
313	}
314
315	static int
316	T (stem_length, end)
317	int stem_length;
318	char *end;
319	{
320	char p1, p2;
321	p1 = *end;
322	p2 = *(end - 1);
323
324	if (p1 == 't')
325	return (p2 != 'o');
326	else
327	return (p1 == 's');
328	}
329
330	static int
331	U (stem_length, end)
332	int stem_length;
333	char *end;
334	{
335	char p1;
336	p1 = *end;
337
338	return ((p1 == 'l') \|\| (p1 == 'm') \|\|
339	(p1 == 'n') \|\| (p1 == 'r'));
340	}
341
342	static int
343	V (stem_length, end)
344	int stem_length;
345	char *end;
346	{
347	return (*end == 'c');
348	}
349
350	static int
351	W (stem_length, end)
352	int stem_length;
353	char *end;
354	{
355	char p1;
356	p1 = *end;
357
358	return ((p1 != 's') && (p1 != 'u'));
359	}
360
361	static int
362	X (stem_length, end)
363	int stem_length;
364	char *end;
365	{
366	char p1;
367	p1 = *end;
368
369	if (p1 == 'e')
370	return ((stem_length >= 3) && (*(end - 2) == 'u'));
371	else
372	return ((p1 == 'l') \|\| (p1 == 'i'));
373	}
374
375	static int
376	Y (stem_length, end)
377	int stem_length;
378	char *end;
379	{
380	return ((end == 'n') && ((end - 1) == 'i'));
381	}
382
383	static int
384	Z (stem_length, end)
385	int stem_length;
386	char *end;
387	{
388	return (*end != 'f');
389	}
390
391	static int
392	AA (stem_length, end)
393	int stem_length;
394	char *end;
395	{
396	char p1, p2;
397	p1 = *end;
398	p2 = *(end - 1);
399
400	if (p1 == 'h')
401	return ((p2 == 'p') \|\| (p2 == 't'));
402	else if (p1 == 'r')
403	return ((p2 == 'e') \|\| (p2 == 'o'));
404	else if (p1 == 's')
405	return (p2 == 'e');
406	else
407	return ((p1 == 'd') \|\| (p1 == 'f') \|\|
408	(p1 == 'l') \|\| (p1 == 't'));
409	}
410
411	static int
412	BB (stem_length, end)
413	int stem_length;
414	char *end;
415	{
416	if (stem_length >= 3)
417	{
418	if (stem_length >= 4)
419	return (strncmp (end - 3, "ryst", 4) != 0);
420	else
421	return (strncmp (end - 2, "met", 3) != 0);
422	}
423	else
424	return 0;
425	}
426
427	static int
428	CC (stem_length, end)
429	int stem_length;
430	char *end;
431	{
432	return (*end == 'l');
433	}
434
435	/**************************************************************************/
436
437	#define MIN_STEM_LENGTH 2
438	#define MAX_ENDING_SIZE 11
439	#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
440	#define EOS '\0'
441	/* [RPAP - Jan 97: Stem Index Change] */
442	#define MAX_STEM_LENGTH 255
443
444	static char remove_ending (char word, int w_len);
445	static void recode_stem (char *stem_end);
446
447	/* =========================================================================
448	* Function: stem
449	* Description:
450	* Implemetation of the Lovins' stemming algorithm described in
451	* J.B. Lovins, "Development of a Stemming Algorithm",
452	* Mechanical Translation and Computational Linguistics, Vol 11,1968.
453	* Input: a word string with the length in the first byte
454	* Output: the stemmed word
455	* ========================================================================= */
456
457	void
458	lovinstem (u_char * word)
459	{
460	char word_start; / points to first letter of word */
461	char stem_end; / points to last character of stem portion */
462	int old_length; /* length of word */
463	int new_length; /* length after stemming */
464
465	/* [RPAP - Jan 97: Stem Index Change] */
466	u_char copy[MAX_STEM_LENGTH + 2];
467	bcopy ((char ) word, (char ) copy, *word + 1);
468
469	old_length = copy; / [RPAP - Jan 97: Stem Index Change] */
470
471	/* The word must be at least MIN_STEM_LENGTH characters long. */
472	if (old_length <= MIN_STEM_LENGTH)
473	return;
474
475	word_start = (char ) copy + 1; / jump over length byte / / [RPAP - Jan 97: Stem Index Change] */
476	copy[old_length + 1] = '\0'; /* null terminate string / / [RPAP - Jan 97: Stem Index Change] */
477
478	stem_end = remove_ending (word_start, old_length);
479	recode_stem (stem_end);
480
481	/* fix up new length */
482	/* have to use strlen because have no other way of telling length */
483	new_length = strlen (word_start);
484	copy = new_length; / [RPAP - Jan 97: Stem Index Change] */
485
486	/* [RPAP - Jan 97: Stem Index Change] */
487	bcopy ((char ) copy, (char ) word, *copy + 1);
488
489	} /stem_complex /
490
491	/* =========================================================================
492
493	* Data structure for recoding and the method for recoding the stem.
494	*
495	* ========================================================================= */
496
497	typedef struct
498	{
499	char old_end; / old ending */
500	char new_end; / new ending */
501	char old_offset; /* length of the old ending - 1 */
502	int (cond) (); / condition rule */
503	char end_group; /* signal the end of the group */
504	}
505	Recode_Rules;
506
507	/*
508	Stem terminations are divided into groups having the same last
509	character
510	*/
511
512	static Recode_Rules Rules[] =
513	{
514	"uad", "uas", 2, NULL, 0,
515	"vad", "vas", 2, NULL, 0,
516	"cid", "cis", 2, NULL, 0,
517	"lid", "lis", 2, NULL, 0,
518	"erid", "eris", 3, NULL, 0,
519	"pand", "pans", 3, NULL, 0,
520	"end", "ens", 2, s, 0,
521	"end", "ens", 2, m, 0,
522	"ond", "ons", 2, NULL, 0,
523	"lud", "lus", 2, NULL, 0,
524	"rud", "rus", 2, NULL, 1,
525
526	"ul", "l", 1, aio, 1,
527
528	"istr", "ister", 3, NULL, 0,
529	"metr", "meter", 3, NULL, 0,
530	"her", "hes", 2, pt, 1,
531
532	"urs", "ur", 2, NULL, 1,
533
534	"uct", "uc", 2, NULL, 0,
535	"umpt", "um", 3, NULL, 0,
536	"rpt", "rb", 2, NULL, 0,
537	"mit", "mis", 2, NULL, 0,
538	"ert", "ers", 2, NULL, 0,
539	"et", "es", 1, n, 0,
540	"yt", "ys", 1, NULL, 1,
541
542	"iev", "ief", 2, NULL, 0,
543	"olv", "olut", 2, NULL, 1,
544
545	"bex", "bic", 2, NULL, 0,
546	"dex", "dic", 2, NULL, 0,
547	"pex", "pic", 2, NULL, 0,
548	"tex", "tic", 2, NULL, 0,
549	"ax", "ac", 1, NULL, 0,
550	"ex", "ec", 1, NULL, 0,
551	"ix", "ic", 1, NULL, 0,
552	"lux", "luc", 2, NULL, 1,
553
554	"yz", "ys", 1, NULL, 1
555	};
556
557	typedef struct last
558	{
559	char c; /* The last character */
560	struct last left; / used in balanced */
561	struct last right; / binary tree */
562	Recode_Rules pt; / location of approriate group */
563	}
564	Last_Char_Node;
565
566	/*------------
567	s
568	/ \
569	l x
570	/ \ / \
571	d r t z
572	\
573	v
574	---------------*/
575	static Last_Char_Node pr[] =
576	{
577	'd', NULL, NULL, Rules,
578	'l', pr, pr + 2, Rules + 11,
579	'r', NULL, NULL, Rules + 12,
580	's', pr + 1, pr + 6, Rules + 15,
581	't', NULL, pr + 5, Rules + 16,
582	'v', NULL, NULL, Rules + 23,
583	'x', pr + 4, pr + 7, Rules + 25,
584	'z', NULL, NULL, Rules + 33,
585	};
586
587
588	/*****************************************************************************
589	*
590	* Recode the stem according to the transformation rules.
591	*
592	*****************************************************************************/
593
594	static void
595	recode_stem (char *stem_end)
596	{
597	static Last_Char_Node *root = pr + 3;
598	Last_Char_Node *p_last = root;
599	Recode_Rules rule; / points to the transformation list */
600	char h, / points to start of possible ending */
601	ch, /* last character of the old ending */
602	ch2;
603
604	h = stem_end - 1;
605	ch = *stem_end;
606	if (h == ch) / Check for double consonant */
607	{
608	if (strchr ("bdglmnprst", ch) != NULL)
609	{
610	*stem_end = EOS;
611	stem_end = h;
612	}
613	}
614
615	do /* Check for the last character */
616	{
617	ch2 = p_last->c;
618	if (ch == ch2)
619	break;
620	else if (ch > ch2)
621	p_last = p_last->right;
622	else
623	p_last = p_last->left;
624	}
625	while (p_last != NULL);
626
627	if (p_last != NULL) /* Check for the rest of suffix list */
628	{
629	rule = p_last->pt;
630	for (;;)
631	{
632	h = stem_end - rule->old_offset;
633	if (strcmp (h, rule->old_end) == 0)
634	{
635	if (!rule->cond \|\| (rule->cond) ((h - 1)))
636	{
637	(void) strcpy (h, rule->new_end);
638	break;
639	}
640	}
641	if (rule->end_group)
642	break;
643	rule++;
644	}
645	}
646	}
647
648	/* ======================================================================
649
650	* List of endings and the method to remove the ending
651	*
652	* ======================================================================*/
653
654	/********** Data structures for list of endings ******************/
655
656	typedef struct
657	{
658	char ending; / old ending */
659	int (cond) (); / conditional rule */
660	signed char left_offset; /* used to find the siblings */
661	signed char right_offset; /* in balanced binary tree */
662	}
663	Ending_List;
664
665	/*
666	The original list of endings is re-organised into groups having
667	the same length and the same first character.
668	*/
669
670	static Ending_List List[] =
671	{
672	"a", NULL, 0, 0,
673
674	"ae", NULL, 0, 0,
675	"al", BB, -1, 2,
676	"ar", X, 0, 0,
677	"as", B, -1, 0,
678
679	"acy", NULL, 0, 1,
680	"age", B, 0, 0,
681	"aic", NULL, -2, 1,
682	"als", BB, 0, 0,
683	"ant", B, -2, 2,
684	"ars", O, 0, 0,
685	"ary", F, -1, 2,
686	"ata", NULL, 0, 0,
687	"ate", NULL, -1, 0,
688
689	"able", NULL, 0, 1,
690	"ably", NULL, 0, 0,
691	"ages", B, -2, 2,
692	"ally", B, 0, 0,
693	"ance", B, -1, 1,
694	"ancy", B, 0, 0,
695	"ants", B, -4, 4,
696	"aric", NULL, 0, 0,
697	"arly", K, -1, 1,
698	"ated", I, 0, 0,
699	"ates", NULL, -2, 2,
700	"atic", B, 0, 0,
701	"ator", NULL, -1, 0,
702
703	"acies", NULL, 0, 0,
704	"acity", NULL, -1, 1,
705	"aging", B, 0, 0,
706	"aical", NULL, -2, 2,
707	"alist", NULL, 0, 0,
708	"alism", B, -1, 0,
709	"ality", NULL, -3, 3,
710	"alize", NULL, 0, 1,
711	"allic", BB, 0, 0,
712	"anced", B, -2, 2,
713	"ances", B, 0, 0,
714	"antic", C, -1, 0,
715	"arial", NULL, -6, 6,
716	"aries", NULL, 0, 1,
717	"arily", NULL, 0, 0,
718	"arity", B, -2, 2,
719	"arize", NULL, 0, 0,
720	"aroid", NULL, -1, 0,
721	"ately", NULL, -3, 3,
722	"ating", I, 0, 1,
723	"ation", B, 0, 0,
724	"ative", NULL, -2, 2,
725	"ators", NULL, 0, 0,
726	"atory", NULL, -1, 1,
727	"ature", E, 0, 0,
728
729	"aceous", NULL, 0, 1,
730	"acious", B, 0, 0,
731	"action", G, -2, 2,
732	"alness", NULL, 0, 0,
733	"ancial", NULL, -1, 1,
734	"ancies", NULL, 0, 0,
735	"ancing", B, -4, 4,
736	"ariser", NULL, 0, 0,
737	"arized", NULL, -1, 1,
738	"arizer", NULL, 0, 0,
739	"atable", NULL, -2, 2,
740	"ations", B, 0, 0,
741	"atives", NULL, -1, 0,
742
743	"ability", NULL, 0, 1,
744	"aically", NULL, 0, 0,
745	"alistic", B, -2, 2,
746	"alities", NULL, 0, 0,
747	"ariness", E, -1, 0,
748	"aristic", NULL, -3, 3,
749	"arizing", NULL, 0, 1,
750	"ateness", NULL, 0, 0,
751	"atingly", NULL, -2, 2,
752	"ational", B, 0, 0,
753	"atively", NULL, -1, 1,
754	"ativism", NULL, 0, 0,
755
756	"ableness", NULL, 0, 1,
757	"arizable", NULL, 0, 0,
758
759	"allically", C, 0, 0,
760	"antaneous", NULL, -1, 1,
761	"antiality", NULL, 0, 0,
762	"arisation", NULL, -2, 2,
763	"arization", NULL, 0, 0,
764	"ationally", B, -1, 1,
765	"ativeness", NULL, 0, 0,
766
767	"antialness", NULL, 0, 0,
768	"arisations", NULL, -1, 1,
769	"arizations", NULL, 0, 0,
770
771	"alistically", B, 0, 1,
772	"arizability", NULL, 0, 0,
773
774	"e", NULL, 0, 0,
775
776	"ed", E, 0, 0,
777	"en", F, -1, 1,
778	"es", E, 0, 0,
779
780	"eal", Y, 0, 0,
781	"ear", Y, -1, 1,
782	"ely", E, 0, 0,
783	"ene", E, -2, 2,
784	"ent", C, 0, 0,
785	"ery", E, -1, 1,
786	"ese", NULL, 0, 0,
787
788	"ealy", Y, 0, 1,
789	"edly", E, 0, 0,
790	"eful", NULL, -2, 1,
791	"eity", NULL, 0, 0,
792	"ence", NULL, -2, 2,
793	"ency", NULL, 0, 0,
794	"ened", E, -1, 2,
795	"enly", E, 0, 0,
796	"eous", NULL, -1, 0,
797
798	"early", Y, 0, 1,
799	"ehood", NULL, 0, 0,
800	"eless", NULL, -2, 2,
801	"elity", NULL, 0, 0,
802	"ement", NULL, -1, 0,
803	"enced", NULL, -3, 3,
804	"ences", NULL, 0, 1,
805	"eness", E, 0, 0,
806	"ening", E, -2, 2,
807	"ental", NULL, 0, 0,
808	"ented", C, -1, 1,
809	"ently", NULL, 0, 0,
810
811	"eature", Z, 0, 0,
812	"efully", NULL, -1, 1,
813	"encies", NULL, 0, 0,
814	"encing", NULL, -2, 2,
815	"ential", NULL, 0, 0,
816	"enting", C, -1, 1,
817	"entist", NULL, 0, 1,
818	"eously", NULL, 0, 0,
819
820	"elihood", E, 0, 1,
821	"encible", NULL, 0, 0,
822	"entally", NULL, -2, 2,
823	"entials", NULL, 0, 0,
824	"entiate", NULL, -1, 1,
825	"entness", NULL, 0, 0,
826
827	"entation", NULL, 0, 0,
828	"entially", NULL, -1, 1,
829	"eousness", NULL, 0, 0,
830
831	"eableness", E, 0, 1,
832	"entations", NULL, 0, 0,
833	"entiality", NULL, -2, 2,
834	"entialize", NULL, 0, 0,
835	"entiation", NULL, -1, 0,
836
837	"entialness", NULL, 0, 0,
838
839	"ful", NULL, 0, 0,
840
841	"fully", NULL, 0, 0,
842
843	"fulness", NULL, 0, 0,
844
845	"hood", NULL, 0, 0,
846
847	"i", NULL, 0, 0,
848
849	"ia", NULL, 0, 0,
850	"ic", NULL, -1, 1,
851	"is", NULL, 0, 0,
852
853	"ial", NULL, 0, 0,
854	"ian", NULL, -1, 1,
855	"ics", NULL, 0, 1,
856	"ide", L, 0, 0,
857	"ied", NULL, -3, 3,
858	"ier", NULL, 0, 0,
859	"ies", P, -1, 0,
860	"ily", NULL, -1, 1,
861	"ine", M, 0, 0,
862	"ing", N, -5, 5,
863	"ion", Q, 0, 0,
864	"ish", C, -1, 1,
865	"ism", B, 0, 1,
866	"ist", NULL, 0, 0,
867	"ite", AA, -3, 3,
868	"ity", NULL, 0, 0,
869	"ium", NULL, -1, 0,
870	"ive", NULL, -1, 1,
871	"ize", F, 0, 0,
872
873	"ials", NULL, 0, 0,
874	"ians", NULL, -1, 0,
875	"ible", NULL, -1, 1,
876	"ibly", NULL, 0, 0,
877	"ical", NULL, -2, 2,
878	"ides", L, 0, 0,
879	"iers", NULL, -1, 1,
880	"iful", NULL, 0, 0,
881	"ines", M, -4, 4,
882	"ings", N, 0, 0,
883	"ions", B, -1, 1,
884	"ious", NULL, 0, 0,
885	"isms", B, -2, 2,
886	"ists", NULL, 0, 0,
887	"itic", H, -1, 1,
888	"ized", F, 0, 1,
889	"izer", F, 0, 0,
890
891	"ially", NULL, 0, 0,
892	"icant", NULL, -1, 1,
893	"ician", NULL, 0, 0,
894	"icide", NULL, -2, 2,
895	"icism", NULL, 0, 0,
896	"icist", NULL, -1, 0,
897	"icity", NULL, -3, 3,
898	"idine", I, 0, 1,
899	"iedly", NULL, 0, 0,
900	"ihood", NULL, -2, 2,
901	"inate", NULL, 0, 0,
902	"iness", NULL, -1, 0,
903	"ingly", B, -6, 6,
904	"inism", J, 0, 1,
905	"inity", CC, 0, 0,
906	"ional", NULL, -2, 2,
907	"ioned", NULL, 0, 0,
908	"ished", NULL, -1, 0,
909	"istic", NULL, -3, 3,
910	"ities", NULL, 0, 1,
911	"itous", NULL, 0, 0,
912	"ively", NULL, -2, 2,
913	"ivity", NULL, 0, 0,
914	"izers", F, -1, 1,
915	"izing", F, 0, 0,
916
917	"ialist", NULL, 0, 0,
918	"iality", NULL, -1, 1,
919	"ialize", NULL, 0, 0,
920	"ically", NULL, -2, 2,
921	"icance", NULL, 0, 0,
922	"icians", NULL, -1, 1,
923	"icists", NULL, 0, 0,
924	"ifully", NULL, -4, 4,
925	"ionals", NULL, 0, 0,
926	"ionate", D, -1, 1,
927	"ioning", NULL, 0, 0,
928	"ionist", NULL, -2, 2,
929	"iously", NULL, 0, 0,
930	"istics", NULL, -1, 1,
931	"izable", E, 0, 0,
932
933	"ibility", NULL, 0, 0,
934	"icalism", NULL, -1, 1,
935	"icalist", NULL, 0, 1,
936	"icality", NULL, 0, 0,
937	"icalize", NULL, -3, 3,
938	"ication", G, 0, 0,
939	"icianry", NULL, -1, 0,
940	"ination", NULL, -1, 1,
941	"ingness", NULL, 0, 0,
942	"ionally", NULL, -5, 5,
943	"isation", NULL, 0, 0,
944	"ishness", NULL, -1, 1,
945	"istical", NULL, 0, 1,
946	"iteness", NULL, 0, 0,
947	"iveness", NULL, -3, 3,
948	"ivistic", NULL, 0, 0,
949	"ivities", NULL, -1, 0,
950	"ization", F, -1, 1,
951	"izement", NULL, 0, 0,
952
953	"ibleness", NULL, 0, 0,
954	"icalness", NULL, -1, 1,
955	"ionalism", NULL, 0, 0,
956	"ionality", NULL, -2, 2,
957	"ionalize", NULL, 0, 0,
958	"iousness", NULL, -1, 1,
959	"izations", NULL, 0, 0,
960
961	"ionalness", NULL, 0, 1,
962	"istically", NULL, 0, 0,
963	"itousness", NULL, -2, 2,
964	"izability", NULL, 0, 0,
965	"izational", NULL, -1, 0,
966
967	"izationally", B, 0, 0,
968
969	"ly", B, 0, 0,
970
971	"less", NULL, 0, 1,
972	"lily", NULL, 0, 0,
973
974	"lessly", NULL, 0, 0,
975
976	"lessness", NULL, 0, 0,
977
978	"ness", NULL, 0, 0,
979
980	"nesses", NULL, 0, 0,
981
982	"o", NULL, 0, 0,
983
984	"on", S, 0, 1,
985	"or", T, 0, 0,
986
987	"oid", NULL, 0, 0,
988	"one", R, -1, 1,
989	"ous", NULL, 0, 0,
990
991	"ogen", NULL, 0, 0,
992
993	"oidal", NULL, 0, 0,
994	"oides", NULL, -1, 2,
995	"otide", NULL, 0, 0,
996	"ously", NULL, -1, 0,
997
998	"oidism", NULL, 0, 0,
999
1000	"oidally", NULL, 0, 1,
1001	"ousness", NULL, 0, 0,
1002
1003	"s", W, 0, 0,
1004
1005	"s'", NULL, 0, 0,
1006
1007	"um", U, 0, 1,
1008	"us", V, 0, 0,
1009
1010	"ward", NULL, 0, 1,
1011	"wise", NULL, 0, 0,
1012
1013	"y", B, 0, 0,
1014
1015	"yl", R, 0, 0,
1016
1017	"ying", B, 0, 1,
1018	"yish", NULL, 0, 0,
1019
1020	"'s", NULL, 0, 0,
1021	};
1022
1023	typedef struct node
1024	{
1025	char c; /* First character */
1026	struct node left; / used in balanced */
1027	struct node right; / binary tree */
1028	Ending_List ptr[11]; / the approriate location */
1029	}
1030	First_Char_Node;
1031
1032	static First_Char_Node First[] =
1033	{
1034	'\'', NULL, NULL,
1035	{NULL,
1036	List + 293, NULL, NULL, NULL, NULL,
1037	NULL, NULL, NULL, NULL, NULL},
1038
1039	'a', First, NULL,
1040	{List,
1041	List + 2, List + 9, List + 20, List + 39, List + 58,
1042	List + 70, List + 77, List + 82, List + 87, List + 89},
1043
1044	'e', First + 1, First + 4,
1045	{List + 91,
1046	List + 93, List + 98, List + 106, List + 116, List + 126,
1047	List + 133, List + 138, List + 142, List + 145, NULL},
1048
1049	'f', NULL, NULL,
1050	{NULL,
1051	NULL, List + 146, NULL, List + 147, NULL,
1052	List + 148, NULL, NULL, NULL, NULL},
1053
1054	'h', First + 3, First + 5,
1055	{NULL,
1056	NULL, NULL, List + 149, NULL, NULL,
1057	NULL, NULL, NULL, NULL, NULL},
1058
1059	'i', NULL, NULL,
1060	{List + 150,
1061	List + 152, List + 163, List + 181, List + 202, List + 222,
1062	List + 239, List + 252, List + 258, NULL, List + 261},
1063
1064	'l', First + 2, First + 10,
1065	{NULL,
1066	List + 262, NULL, List + 263, NULL, List + 265,
1067	NULL, List + 266, NULL, NULL, NULL},
1068
1069	'n', NULL, NULL,
1070	{NULL,
1071	NULL, NULL, List + 267, NULL, List + 268,
1072	NULL, NULL, NULL, NULL, NULL},
1073
1074	'o', First + 7, First + 9,
1075	{List + 269,
1076	List + 270, List + 273, List + 275, List + 277, List + 280,
1077	List + 281, NULL, NULL, NULL, NULL},
1078
1079	's', NULL, NULL,
1080	{List + 283,
1081	List + 284, NULL, NULL, NULL, NULL,
1082	NULL, NULL, NULL, NULL, NULL},
1083
1084	'u', First + 8, First + 12,
1085	{NULL,
1086	List + 285, NULL, NULL, NULL, NULL,
1087	NULL, NULL, NULL, NULL, NULL},
1088
1089	'w', NULL, NULL,
1090	{NULL,
1091	NULL, NULL, List + 287, NULL, NULL,
1092	NULL, NULL, NULL, NULL, NULL},
1093
1094	'y', First + 11, NULL,
1095	{List + 289,
1096	List + 290, NULL, List + 291, NULL, NULL,
1097	NULL, NULL, NULL, NULL, NULL},
1098	};
1099
1100
1101	/******************************************************************************
1102	*
1103	* Look for a match in the suffix list.
1104	* Return the pointer to the end of the new stem if there is a match.
1105	* Otherwise, return the pointer to the end of the original word.
1106	*
1107	* Method:
1108	* * Search for the first character in the suffix list.
1109	* * If there is match, search for the rest of the suffix list.
1110	*
1111	*****************************************************************************/
1112
1113	static char *
1114	remove_ending (char *word, int w_len)
1115	{
1116	static First_Char_Node root = First + 6; / the root of binary tree is 'l' */
1117	First_Char_Node p_first; / Points to the first character */
1118	/* of the possible suffix */
1119	Ending_List p_list; / Points to the List of Endings */
1120	char suffix_start, / Points to start of possible suffix */
1121	stem_end, / Points to the end of possible stem */
1122	ch1, ch2;
1123	int s_len, /* Possible stem length */
1124	e_offset, /* Offset from the end to the start of */
1125	/* possible suffix */
1126	not_done = 1, cmp;
1127
1128	stem_end = word + w_len - 1;
1129	if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1130	suffix_start = word + w_len - MAX_ENDING_SIZE;
1131	else
1132	suffix_start = word + MIN_STEM_LENGTH;
1133
1134	ch1 = *suffix_start;
1135	do /* Search for the first character */
1136	{
1137	p_first = root;
1138	do
1139	{
1140	ch2 = p_first->c;
1141	if (ch1 == ch2)
1142	break;
1143	else if (ch1 > ch2)
1144	p_first = p_first->right;
1145	else
1146	p_first = p_first->left;
1147	}
1148	while (p_first != NULL);
1149
1150	if (p_first != NULL) /* Search for the rest */
1151	{
1152	e_offset = stem_end - suffix_start;
1153	if ((p_list = p_first->ptr[e_offset]) != NULL)
1154	{
1155	for (;;) /* no need to compare the first char */
1156	{
1157	cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1158	if (cmp > 0)
1159	if (p_list->right_offset)
1160	p_list += p_list->right_offset;
1161	else
1162	break;
1163	else if (cmp < 0)
1164	if (p_list->left_offset)
1165	p_list += p_list->left_offset;
1166	else
1167	break;
1168	else
1169	{
1170	s_len = suffix_start - word;
1171	if (!p_list->cond \|\|
1172	(*p_list->cond) (s_len, suffix_start - 1))
1173	{
1174	*suffix_start = EOS;
1175	stem_end = suffix_start - 1;
1176	not_done = 0;
1177	}
1178	break;
1179	}
1180	}
1181	}
1182	}
1183	suffix_start++;
1184	}
1185	while (not_done && ((ch1 = *suffix_start) != EOS));
1186	return (stem_end);
1187	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: