Context Navigation

lovinstem.cpp@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago
Committing 64 bit changes into the branch
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 26.9 KB

Line
1	/**************************************************************************
2	*
3	* lovinstem.c -- Stemming code
4	* Copyright (C) 1994 Linh Huynh ([email protected])
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "sysfuncs.h"
23
24
25	/*****************************************************************************
26	*
27	* Transformational Rules used in Recoding Stem Terminations
28	*
29	* Parameter in functions:
30	* ch : the second last character of the old ending
31	* (after removing one of double consonants)
32	*
33	****************************************************************************/
34
35
36	/********** Conditional rules associated with transformations ************/
37
38	/*static int aio ();
39	static int s ();
40	static int pt ();
41	static int m ();
42	static int n ();*/
43
44	static int
45	aio (char ch) /* Rule number 9 */
46	{
47	return ((ch != 'a') && (ch != 'i') && (ch != 'o'));
48	}
49
50	static int
51	s (char ch) /* Rule Number 24 */
52	{
53	return (ch != 's');
54	}
55
56	static int
57	pt (char ch) /* Rule number 28 */
58	{
59	return ((ch != 'p') && (ch != 't'));
60	}
61
62	static int
63	m (char ch) /* Rule number 30 */
64	{
65	return (ch != 'm');
66	}
67
68	static int
69	n (char ch) /* Rule number 32 */
70	{
71	return (ch != 'n');
72	}
73
74
75	/**************************************************************************
76	*
77	* Context-sensitive rules associated with certain endings.
78	*
79	* The notations of the rules are the same as in Lovins' paper
80	* except that rule A is replaced by a NULL in the data structure.
81	*
82	* Function parameters:
83	* - stem_length: possible stem length
84	* - end : pointer points to the end of the possible stem.
85	*
86	*************************************************************************/
87
88
89	/******* Context-sensitive rule function declarations ****************/
90
91	/*static int B ();
92	static int C ();
93	static int D ();
94	static int E ();
95	static int F ();
96	static int G ();
97	static int H ();
98	static int I ();
99	static int J ();
100	static int K ();
101	static int L ();
102	static int M ();
103	static int N ();
104	static int O ();
105	static int P ();
106	static int Q ();
107	static int R ();
108	static int S ();
109	static int T ();
110	static int U ();
111	static int V ();
112	static int W ();
113	static int X ();
114	static int Y ();
115	static int Z ();
116	static int AA ();
117	static int BB ();
118	static int CC ();
119	*/
120	static int
121	B (int stem_length, char *end)
122	{
123	return (stem_length >= 3);
124	}
125
126	static int
127	C (int stem_length, char *end)
128	{
129	return (stem_length >= 4);
130	}
131
132	static int
133	D (int stem_length, char *end)
134	{
135	return (stem_length >= 5);
136	}
137
138	static int
139	E (int stem_length, char *end)
140
141
142	{
143	return (*end != 'e');
144	}
145
146	static int
147	F (int stem_length, char *end)
148
149
150	{
151	return ((stem_length >= 3) && (*end != 'e'));
152	}
153
154	static int
155	G (int stem_length, char *end)
156
157
158	{
159	return ((stem_length >= 3) && (*end == 'f'));
160	}
161
162	static int
163	H (int stem_length, char *end)
164
165
166	{
167	char p1, p2;
168	p1 = *end;
169	p2 = *(end - 1);
170	return ((p1 == 't') \|\| ((p1 == 'l') && (p2 == 'l')));
171	}
172
173	static int
174	I (int stem_length, char *end)
175
176
177	{
178	char p1;
179	p1 = *end;
180	return ((p1 != 'o') && (p1 != 'e'));
181	}
182
183	static int
184	J (int stem_length, char *end)
185
186
187	{
188	char p1;
189	p1 = *end;
190	return ((p1 != 'a') && (p1 != 'e'));
191	}
192
193	static int
194	K (int stem_length, char *end)
195
196
197	{
198	char p1;
199	p1 = *end;
200	return ((stem_length >= 3) &&
201	((p1 == 'l') \|\| (p1 == 'i') \|\|
202	((p1 == 'e') && (*(end - 2) == 'u'))));
203	}
204
205	static int
206	L (int stem_length, char *end)
207
208
209	{
210	char p1, p2;
211	p1 = *end;
212	p2 = *(end - 1);
213	if (p1 == 's')
214	return (p2 == 'o');
215	else
216	return ((p1 != 'u') && (p1 != 'x'));
217	}
218
219	static int
220	M (int stem_length, char *end)
221
222
223	{
224	char p1;
225	p1 = *end;
226	return ((p1 != 'a') && (p1 != 'c') &&
227	(p1 != 'e') && (p1 != 'm'));
228	}
229
230	static int
231	N (int stem_length, char *end)
232
233
234	{
235	if (stem_length >= 3)
236	{
237	if (*(end - 2) == 's')
238	return (stem_length >= 4);
239	else
240	return 1;
241	}
242	else
243	return 0;
244	}
245
246	static int
247	O (int stem_length, char *end)
248
249
250	{
251	char p1;
252	p1 = *end;
253	return ((p1 == 'l') \|\| (p1 == 'i'));
254	}
255
256	static int
257	P (int stem_length, char *end)
258
259
260	{
261	return (*end != 'c');
262	}
263
264	static int
265	Q (int stem_length, char *end)
266
267
268	{
269	char p1;
270	p1 = *end;
271
272	return ((stem_length >= 3) && (p1 != 'l') && (p1 != 'n'));
273	}
274
275	static int
276	R (int stem_length, char *end)
277
278
279	{
280	char p1;
281	p1 = *end;
282	return ((p1 == 'n') \|\| (p1 == 'r'));
283	}
284
285	static int
286	S (int stem_length, char *end)
287
288
289	{
290	char p1, p2;
291	p1 = *end;
292	p2 = *(end - 1);
293
294	if (p1 == 't')
295	return (p2 != 't');
296	else
297	return ((p1 == 'r') && (p2 == 'd'));
298	}
299
300	static int
301	T (int stem_length, char *end)
302
303
304	{
305	char p1, p2;
306	p1 = *end;
307	p2 = *(end - 1);
308
309	if (p1 == 't')
310	return (p2 != 'o');
311	else
312	return (p1 == 's');
313	}
314
315	static int
316	U (int stem_length, char *end)
317
318
319	{
320	char p1;
321	p1 = *end;
322
323	return ((p1 == 'l') \|\| (p1 == 'm') \|\|
324	(p1 == 'n') \|\| (p1 == 'r'));
325	}
326
327	static int
328	V (int stem_length, char *end)
329
330
331	{
332	return (*end == 'c');
333	}
334
335	static int
336	W (int stem_length, char *end)
337
338
339	{
340	char p1;
341	p1 = *end;
342
343	return ((p1 != 's') && (p1 != 'u'));
344	}
345
346	static int
347	X (int stem_length, char *end)
348
349
350	{
351	char p1;
352	p1 = *end;
353
354	if (p1 == 'e')
355	return ((stem_length >= 3) && (*(end - 2) == 'u'));
356	else
357	return ((p1 == 'l') \|\| (p1 == 'i'));
358	}
359
360	static int
361	Y (int stem_length, char *end)
362
363
364	{
365	return ((end == 'n') && ((end - 1) == 'i'));
366	}
367
368	static int
369	Z (int stem_length, char *end)
370
371
372	{
373	return (*end != 'f');
374	}
375
376	static int
377	AA (int stem_length, char *end)
378
379
380	{
381	char p1, p2;
382	p1 = *end;
383	p2 = *(end - 1);
384
385	if (p1 == 'h')
386	return ((p2 == 'p') \|\| (p2 == 't'));
387	else if (p1 == 'r')
388	return ((p2 == 'e') \|\| (p2 == 'o'));
389	else if (p1 == 's')
390	return (p2 == 'e');
391	else
392	return ((p1 == 'd') \|\| (p1 == 'f') \|\|
393	(p1 == 'l') \|\| (p1 == 't'));
394	}
395
396	static int
397	BB (int stem_length, char *end)
398
399
400	{
401	if (stem_length >= 3)
402	{
403	if (stem_length >= 4)
404	return (strncmp (end - 3, "ryst", 4) != 0);
405	else
406	return (strncmp (end - 2, "met", 3) != 0);
407	}
408	else
409	return 0;
410	}
411
412	static int
413	CC (int stem_length, char *end)
414
415
416	{
417	return (*end == 'l');
418	}
419
420	/**************************************************************************/
421
422	#define MIN_STEM_LENGTH 2
423	#define MAX_ENDING_SIZE 11
424	#define PREDEFINED_SIZE (MAX_ENDING_SIZE + MIN_STEM_LENGTH)
425	#define EOS '\0'
426	/* [RPAP - Jan 97: Stem Index Change] */
427	#define MAX_STEM_LENGTH 255
428
429	static char remove_ending (char word, int w_len);
430	static void recode_stem (char *stem_end);
431
432	/* =========================================================================
433	* Function: stem
434	* Description:
435	* Implemetation of the Lovins' stemming algorithm described in
436	* J.B. Lovins, "Development of a Stemming Algorithm",
437	* Mechanical Translation and Computational Linguistics, Vol 11,1968.
438	* Input: a word string with the length in the first byte
439	* Output: the stemmed word
440	* ========================================================================= */
441
442	void
443	lovinstem (u_char * word)
444	{
445	char word_start; / points to first letter of word */
446	char stem_end; / points to last character of stem portion */
447	int old_length; /* length of word */
448	int new_length; /* length after stemming */
449
450	/* [RPAP - Jan 97: Stem Index Change] */
451	u_char copy[MAX_STEM_LENGTH + 2];
452	memcpy (copy, word, *word + 1);
453
454	old_length = copy; / [RPAP - Jan 97: Stem Index Change] */
455
456	/* The word must be at least MIN_STEM_LENGTH characters long. */
457	if (old_length <= MIN_STEM_LENGTH)
458	return;
459
460	word_start = (char ) copy + 1; / jump over length byte / / [RPAP - Jan 97: Stem Index Change] */
461	copy[old_length + 1] = '\0'; /* null terminate string / / [RPAP - Jan 97: Stem Index Change] */
462
463	stem_end = remove_ending (word_start, old_length);
464	recode_stem (stem_end);
465
466	/* fix up new length */
467	/* have to use strlen because have no other way of telling length */
468	new_length = strlen (word_start);
469	copy = new_length; / [RPAP - Jan 97: Stem Index Change] */
470
471	/* [RPAP - Jan 97: Stem Index Change] */
472	memcpy (word, copy, *copy + 1);
473
474	} /stem_complex /
475
476	/* =========================================================================
477
478	* Data structure for recoding and the method for recoding the stem.
479	*
480	* ========================================================================= */
481
482	typedef struct
483	{
484	char old_end; / old ending */
485	char new_end; / new ending */
486	char old_offset; /* length of the old ending - 1 */
487	int (cond) (char); / condition rule */
488	char end_group; /* signal the end of the group */
489	}
490	Recode_Rules;
491
492	/*
493	Stem terminations are divided into groups having the same last
494	character
495	*/
496
497	static Recode_Rules Rules[] =
498	{
499	{(char)"uad", (char)"uas", 2, NULL, 0},
500	{(char)"vad", (char)"vas", 2, NULL, 0},
501	{(char)"cid", (char)"cis", 2, NULL, 0},
502	{(char)"lid", (char)"lis", 2, NULL, 0},
503	{(char)"erid", (char)"eris", 3, NULL, 0},
504	{(char)"pand", (char)"pans", 3, NULL, 0},
505	{(char)"end", (char)"ens", 2, s, 0},
506	{(char)"end", (char)"ens", 2, m, 0},
507	{(char)"ond", (char)"ons", 2, NULL, 0},
508	{(char)"lud", (char)"lus", 2, NULL, 0},
509	{(char)"rud", (char)"rus", 2, NULL, 1},
510
511	{(char)"ul", (char)"l", 1, aio, 1},
512
513	{(char)"istr", (char)"ister", 3, NULL, 0},
514	{(char)"metr", (char)"meter", 3, NULL, 0},
515	{(char)"her", (char)"hes", 2, pt, 1},
516
517	{(char)"urs", (char)"ur", 2, NULL, 1},
518
519	{(char)"uct", (char)"uc", 2, NULL, 0},
520	{(char)"umpt", (char)"um", 3, NULL, 0},
521	{(char)"rpt", (char)"rb", 2, NULL, 0},
522	{(char)"mit", (char)"mis", 2, NULL, 0},
523	{(char)"ert", (char)"ers", 2, NULL, 0},
524	{(char)"et", (char)"es", 1, n, 0},
525	{(char)"yt", (char)"ys", 1, NULL, 1},
526
527	{(char)"iev", (char)"ief", 2, NULL, 0},
528	{(char)"olv", (char)"olut", 2, NULL, 1},
529
530	{(char)"bex", (char)"bic", 2, NULL, 0},
531	{(char)"dex", (char)"dic", 2, NULL, 0},
532	{(char)"pex", (char)"pic", 2, NULL, 0},
533	{(char)"tex", (char)"tic", 2, NULL, 0},
534	{(char)"ax", (char)"ac", 1, NULL, 0},
535	{(char)"ex", (char)"ec", 1, NULL, 0},
536	{(char)"ix", (char)"ic", 1, NULL, 0},
537	{(char)"lux", (char)"luc", 2, NULL, 1},
538
539	{(char)"yz", (char)"ys", 1, NULL, 1}
540	};
541
542	typedef struct last
543	{
544	char c; /* The last character */
545	struct last left; / used in balanced */
546	struct last right; / binary tree */
547	Recode_Rules pt; / location of approriate group */
548	}
549	Last_Char_Node;
550
551	/*------------
552	s
553	/ \
554	l x
555	/ \ / \
556	d r t z
557	\
558	v
559	---------------*/
560	static Last_Char_Node pr[] =
561	{
562	{'d', NULL, NULL, Rules},
563	{'l', pr, pr + 2, Rules + 11},
564	{'r', NULL, NULL, Rules + 12},
565	{'s', pr + 1, pr + 6, Rules + 15},
566	{'t', NULL, pr + 5, Rules + 16},
567	{'v', NULL, NULL, Rules + 23},
568	{'x', pr + 4, pr + 7, Rules + 25},
569	{'z', NULL, NULL, Rules + 33},
570	};
571
572
573	/*****************************************************************************
574	*
575	* Recode the stem according to the transformation rules.
576	*
577	*****************************************************************************/
578
579	static void
580	recode_stem (char *stem_end)
581	{
582	static Last_Char_Node *root = pr + 3;
583	Last_Char_Node *p_last = root;
584	Recode_Rules rule; / points to the transformation list */
585	char h, / points to start of possible ending */
586	ch, /* last character of the old ending */
587	ch2;
588
589	h = stem_end - 1;
590	ch = *stem_end;
591	if (h == ch) / Check for double consonant */
592	{
593	if (strchr ("bdglmnprst", ch) != NULL)
594	{
595	*stem_end = EOS;
596	stem_end = h;
597	}
598	}
599
600	do /* Check for the last character */
601	{
602	ch2 = p_last->c;
603	if (ch == ch2)
604	break;
605	else if (ch > ch2)
606	p_last = p_last->right;
607	else
608	p_last = p_last->left;
609	}
610	while (p_last != NULL);
611
612	if (p_last != NULL) /* Check for the rest of suffix list */
613	{
614	rule = p_last->pt;
615	for (;;)
616	{
617	h = stem_end - rule->old_offset;
618	if (strcmp (h, rule->old_end) == 0)
619	{
620	if (!rule->cond \|\| (rule->cond) ((h - 1)))
621	{
622	(void) strcpy (h, rule->new_end);
623	break;
624	}
625	}
626	if (rule->end_group)
627	break;
628	++rule;
629	}
630	}
631	}
632
633	/* ======================================================================
634
635	* List of endings and the method to remove the ending
636	*
637	* ======================================================================*/
638
639	/********** Data structures for list of endings ******************/
640
641	typedef struct
642	{
643	char ending; / old ending */
644	int (cond) (int length, char end); /* conditional rule */
645	signed char left_offset; /* used to find the siblings */
646	signed char right_offset; /* in balanced binary tree */
647	}
648	Ending_List;
649
650	/*
651	The original list of endings is re-organised into groups having
652	the same length and the same first character.
653	*/
654
655	static Ending_List List[] =
656	{
657	{(char*)"a", NULL, 0, 0},
658
659	{(char*)"ae", NULL, 0, 0},
660	{(char*)"al", BB, -1, 2},
661	{(char*)"ar", X, 0, 0},
662	{(char*)"as", B, -1, 0},
663
664	{(char*)"acy", NULL, 0, 1},
665	{(char*)"age", B, 0, 0},
666	{(char*)"aic", NULL, -2, 1},
667	{(char*)"als", BB, 0, 0},
668	{(char*)"ant", B, -2, 2},
669	{(char*)"ars", O, 0, 0},
670	{(char*)"ary", F, -1, 2},
671	{(char*)"ata", NULL, 0, 0},
672	{(char*)"ate", NULL, -1, 0},
673
674	{(char*)"able", NULL, 0, 1},
675	{(char*)"ably", NULL, 0, 0},
676	{(char*)"ages", B, -2, 2},
677	{(char*)"ally", B, 0, 0},
678	{(char*)"ance", B, -1, 1},
679	{(char*)"ancy", B, 0, 0},
680	{(char*)"ants", B, -4, 4},
681	{(char*)"aric", NULL, 0, 0},
682	{(char*)"arly", K, -1, 1},
683	{(char*)"ated", I, 0, 0},
684	{(char*)"ates", NULL, -2, 2},
685	{(char*)"atic", B, 0, 0},
686	{(char*)"ator", NULL, -1, 0},
687
688	{(char*)"acies", NULL, 0, 0},
689	{(char*)"acity", NULL, -1, 1},
690	{(char*)"aging", B, 0, 0},
691	{(char*)"aical", NULL, -2, 2},
692	{(char*)"alist", NULL, 0, 0},
693	{(char*)"alism", B, -1, 0},
694	{(char*)"ality", NULL, -3, 3},
695	{(char*)"alize", NULL, 0, 1},
696	{(char*)"allic", BB, 0, 0},
697	{(char*)"anced", B, -2, 2},
698	{(char*)"ances", B, 0, 0},
699	{(char*)"antic", C, -1, 0},
700	{(char*)"arial", NULL, -6, 6},
701	{(char*)"aries", NULL, 0, 1},
702	{(char*)"arily", NULL, 0, 0},
703	{(char*)"arity", B, -2, 2},
704	{(char*)"arize", NULL, 0, 0},
705	{(char*)"aroid", NULL, -1, 0},
706	{(char*)"ately", NULL, -3, 3},
707	{(char*)"ating", I, 0, 1},
708	{(char*)"ation", B, 0, 0},
709	{(char*)"ative", NULL, -2, 2},
710	{(char*)"ators", NULL, 0, 0},
711	{(char*)"atory", NULL, -1, 1},
712	{(char*)"ature", E, 0, 0},
713
714	{(char*)"aceous", NULL, 0, 1},
715	{(char*)"acious", B, 0, 0},
716	{(char*)"action", G, -2, 2},
717	{(char*)"alness", NULL, 0, 0},
718	{(char*)"ancial", NULL, -1, 1},
719	{(char*)"ancies", NULL, 0, 0},
720	{(char*)"ancing", B, -4, 4},
721	{(char*)"ariser", NULL, 0, 0},
722	{(char*)"arized", NULL, -1, 1},
723	{(char*)"arizer", NULL, 0, 0},
724	{(char*)"atable", NULL, -2, 2},
725	{(char*)"ations", B, 0, 0},
726	{(char*)"atives", NULL, -1, 0},
727
728	{(char*)"ability", NULL, 0, 1},
729	{(char*)"aically", NULL, 0, 0},
730	{(char*)"alistic", B, -2, 2},
731	{(char*)"alities", NULL, 0, 0},
732	{(char*)"ariness", E, -1, 0},
733	{(char*)"aristic", NULL, -3, 3},
734	{(char*)"arizing", NULL, 0, 1},
735	{(char*)"ateness", NULL, 0, 0},
736	{(char*)"atingly", NULL, -2, 2},
737	{(char*)"ational", B, 0, 0},
738	{(char*)"atively", NULL, -1, 1},
739	{(char*)"ativism", NULL, 0, 0},
740
741	{(char*)"ableness", NULL, 0, 1},
742	{(char*)"arizable", NULL, 0, 0},
743
744	{(char*)"allically", C, 0, 0},
745	{(char*)"antaneous", NULL, -1, 1},
746	{(char*)"antiality", NULL, 0, 0},
747	{(char*)"arisation", NULL, -2, 2},
748	{(char*)"arization", NULL, 0, 0},
749	{(char*)"ationally", B, -1, 1},
750	{(char*)"ativeness", NULL, 0, 0},
751
752	{(char*)"antialness", NULL, 0, 0},
753	{(char*)"arisations", NULL, -1, 1},
754	{(char*)"arizations", NULL, 0, 0},
755
756	{(char*)"alistically", B, 0, 1},
757	{(char*)"arizability", NULL, 0, 0},
758
759	{(char*)"e", NULL, 0, 0},
760
761	{(char*)"ed", E, 0, 0},
762	{(char*)"en", F, -1, 1},
763	{(char*)"es", E, 0, 0},
764
765	{(char*)"eal", Y, 0, 0},
766	{(char*)"ear", Y, -1, 1},
767	{(char*)"ely", E, 0, 0},
768	{(char*)"ene", E, -2, 2},
769	{(char*)"ent", C, 0, 0},
770	{(char*)"ery", E, -1, 1},
771	{(char*)"ese", NULL, 0, 0},
772
773	{(char*)"ealy", Y, 0, 1},
774	{(char*)"edly", E, 0, 0},
775	{(char*)"eful", NULL, -2, 1},
776	{(char*)"eity", NULL, 0, 0},
777	{(char*)"ence", NULL, -2, 2},
778	{(char*)"ency", NULL, 0, 0},
779	{(char*)"ened", E, -1, 2},
780	{(char*)"enly", E, 0, 0},
781	{(char*)"eous", NULL, -1, 0},
782
783	{(char*)"early", Y, 0, 1},
784	{(char*)"ehood", NULL, 0, 0},
785	{(char*)"eless", NULL, -2, 2},
786	{(char*)"elity", NULL, 0, 0},
787	{(char*)"ement", NULL, -1, 0},
788	{(char*)"enced", NULL, -3, 3},
789	{(char*)"ences", NULL, 0, 1},
790	{(char*)"eness", E, 0, 0},
791	{(char*)"ening", E, -2, 2},
792	{(char*)"ental", NULL, 0, 0},
793	{(char*)"ented", C, -1, 1},
794	{(char*)"ently", NULL, 0, 0},
795
796	{(char*)"eature", Z, 0, 0},
797	{(char*)"efully", NULL, -1, 1},
798	{(char*)"encies", NULL, 0, 0},
799	{(char*)"encing", NULL, -2, 2},
800	{(char*)"ential", NULL, 0, 0},
801	{(char*)"enting", C, -1, 1},
802	{(char*)"entist", NULL, 0, 1},
803	{(char*)"eously", NULL, 0, 0},
804
805	{(char*)"elihood", E, 0, 1},
806	{(char*)"encible", NULL, 0, 0},
807	{(char*)"entally", NULL, -2, 2},
808	{(char*)"entials", NULL, 0, 0},
809	{(char*)"entiate", NULL, -1, 1},
810	{(char*)"entness", NULL, 0, 0},
811
812	{(char*)"entation", NULL, 0, 0},
813	{(char*)"entially", NULL, -1, 1},
814	{(char*)"eousness", NULL, 0, 0},
815
816	{(char*)"eableness", E, 0, 1},
817	{(char*)"entations", NULL, 0, 0},
818	{(char*)"entiality", NULL, -2, 2},
819	{(char*)"entialize", NULL, 0, 0},
820	{(char*)"entiation", NULL, -1, 0},
821
822	{(char*)"entialness", NULL, 0, 0},
823
824	{(char*)"ful", NULL, 0, 0},
825
826	{(char*)"fully", NULL, 0, 0},
827
828	{(char*)"fulness", NULL, 0, 0},
829
830	{(char*)"hood", NULL, 0, 0},
831
832	{(char*)"i", NULL, 0, 0},
833
834	{(char*)"ia", NULL, 0, 0},
835	{(char*)"ic", NULL, -1, 1},
836	{(char*)"is", NULL, 0, 0},
837
838	{(char*)"ial", NULL, 0, 0},
839	{(char*)"ian", NULL, -1, 1},
840	{(char*)"ics", NULL, 0, 1},
841	{(char*)"ide", L, 0, 0},
842	{(char*)"ied", NULL, -3, 3},
843	{(char*)"ier", NULL, 0, 0},
844	{(char*)"ies", P, -1, 0},
845	{(char*)"ily", NULL, -1, 1},
846	{(char*)"ine", M, 0, 0},
847	{(char*)"ing", N, -5, 5},
848	{(char*)"ion", Q, 0, 0},
849	{(char*)"ish", C, -1, 1},
850	{(char*)"ism", B, 0, 1},
851	{(char*)"ist", NULL, 0, 0},
852	{(char*)"ite", AA, -3, 3},
853	{(char*)"ity", NULL, 0, 0},
854	{(char*)"ium", NULL, -1, 0},
855	{(char*)"ive", NULL, -1, 1},
856	{(char*)"ize", F, 0, 0},
857
858	{(char*)"ials", NULL, 0, 0},
859	{(char*)"ians", NULL, -1, 0},
860	{(char*)"ible", NULL, -1, 1},
861	{(char*)"ibly", NULL, 0, 0},
862	{(char*)"ical", NULL, -2, 2},
863	{(char*)"ides", L, 0, 0},
864	{(char*)"iers", NULL, -1, 1},
865	{(char*)"iful", NULL, 0, 0},
866	{(char*)"ines", M, -4, 4},
867	{(char*)"ings", N, 0, 0},
868	{(char*)"ions", B, -1, 1},
869	{(char*)"ious", NULL, 0, 0},
870	{(char*)"isms", B, -2, 2},
871	{(char*)"ists", NULL, 0, 0},
872	{(char*)"itic", H, -1, 1},
873	{(char*)"ized", F, 0, 1},
874	{(char*)"izer", F, 0, 0},
875
876	{(char*)"ially", NULL, 0, 0},
877	{(char*)"icant", NULL, -1, 1},
878	{(char*)"ician", NULL, 0, 0},
879	{(char*)"icide", NULL, -2, 2},
880	{(char*)"icism", NULL, 0, 0},
881	{(char*)"icist", NULL, -1, 0},
882	{(char*)"icity", NULL, -3, 3},
883	{(char*)"idine", I, 0, 1},
884	{(char*)"iedly", NULL, 0, 0},
885	{(char*)"ihood", NULL, -2, 2},
886	{(char*)"inate", NULL, 0, 0},
887	{(char*)"iness", NULL, -1, 0},
888	{(char*)"ingly", B, -6, 6},
889	{(char*)"inism", J, 0, 1},
890	{(char*)"inity", CC, 0, 0},
891	{(char*)"ional", NULL, -2, 2},
892	{(char*)"ioned", NULL, 0, 0},
893	{(char*)"ished", NULL, -1, 0},
894	{(char*)"istic", NULL, -3, 3},
895	{(char*)"ities", NULL, 0, 1},
896	{(char*)"itous", NULL, 0, 0},
897	{(char*)"ively", NULL, -2, 2},
898	{(char*)"ivity", NULL, 0, 0},
899	{(char*)"izers", F, -1, 1},
900	{(char*)"izing", F, 0, 0},
901
902	{(char*)"ialist", NULL, 0, 0},
903	{(char*)"iality", NULL, -1, 1},
904	{(char*)"ialize", NULL, 0, 0},
905	{(char*)"ically", NULL, -2, 2},
906	{(char*)"icance", NULL, 0, 0},
907	{(char*)"icians", NULL, -1, 1},
908	{(char*)"icists", NULL, 0, 0},
909	{(char*)"ifully", NULL, -4, 4},
910	{(char*)"ionals", NULL, 0, 0},
911	{(char*)"ionate", D, -1, 1},
912	{(char*)"ioning", NULL, 0, 0},
913	{(char*)"ionist", NULL, -2, 2},
914	{(char*)"iously", NULL, 0, 0},
915	{(char*)"istics", NULL, -1, 1},
916	{(char*)"izable", E, 0, 0},
917
918	{(char*)"ibility", NULL, 0, 0},
919	{(char*)"icalism", NULL, -1, 1},
920	{(char*)"icalist", NULL, 0, 1},
921	{(char*)"icality", NULL, 0, 0},
922	{(char*)"icalize", NULL, -3, 3},
923	{(char*)"ication", G, 0, 0},
924	{(char*)"icianry", NULL, -1, 0},
925	{(char*)"ination", NULL, -1, 1},
926	{(char*)"ingness", NULL, 0, 0},
927	{(char*)"ionally", NULL, -5, 5},
928	{(char*)"isation", NULL, 0, 0},
929	{(char*)"ishness", NULL, -1, 1},
930	{(char*)"istical", NULL, 0, 1},
931	{(char*)"iteness", NULL, 0, 0},
932	{(char*)"iveness", NULL, -3, 3},
933	{(char*)"ivistic", NULL, 0, 0},
934	{(char*)"ivities", NULL, -1, 0},
935	{(char*)"ization", F, -1, 1},
936	{(char*)"izement", NULL, 0, 0},
937
938	{(char*)"ibleness", NULL, 0, 0},
939	{(char*)"icalness", NULL, -1, 1},
940	{(char*)"ionalism", NULL, 0, 0},
941	{(char*)"ionality", NULL, -2, 2},
942	{(char*)"ionalize", NULL, 0, 0},
943	{(char*)"iousness", NULL, -1, 1},
944	{(char*)"izations", NULL, 0, 0},
945
946	{(char*)"ionalness", NULL, 0, 1},
947	{(char*)"istically", NULL, 0, 0},
948	{(char*)"itousness", NULL, -2, 2},
949	{(char*)"izability", NULL, 0, 0},
950	{(char*)"izational", NULL, -1, 0},
951
952	{(char*)"izationally", B, 0, 0},
953
954	{(char*)"ly", B, 0, 0},
955
956	{(char*)"less", NULL, 0, 1},
957	{(char*)"lily", NULL, 0, 0},
958
959	{(char*)"lessly", NULL, 0, 0},
960
961	{(char*)"lessness", NULL, 0, 0},
962
963	{(char*)"ness", NULL, 0, 0},
964
965	{(char*)"nesses", NULL, 0, 0},
966
967	{(char*)"o", NULL, 0, 0},
968
969	{(char*)"on", S, 0, 1},
970	{(char*)"or", T, 0, 0},
971
972	{(char*)"oid", NULL, 0, 0},
973	{(char*)"one", R, -1, 1},
974	{(char*)"ous", NULL, 0, 0},
975
976	{(char*)"ogen", NULL, 0, 0},
977
978	{(char*)"oidal", NULL, 0, 0},
979	{(char*)"oides", NULL, -1, 2},
980	{(char*)"otide", NULL, 0, 0},
981	{(char*)"ously", NULL, -1, 0},
982
983	{(char*)"oidism", NULL, 0, 0},
984
985	{(char*)"oidally", NULL, 0, 1},
986	{(char*)"ousness", NULL, 0, 0},
987
988	{(char*)"s", W, 0, 0},
989
990	{(char*)"s'", NULL, 0, 0},
991
992	{(char*)"um", U, 0, 1},
993	{(char*)"us", V, 0, 0},
994
995	{(char*)"ward", NULL, 0, 1},
996	{(char*)"wise", NULL, 0, 0},
997
998	{(char*)"y", B, 0, 0},
999
1000	{(char*)"yl", R, 0, 0},
1001
1002	{(char*)"ying", B, 0, 1},
1003	{(char*)"yish", NULL, 0, 0},
1004
1005	{(char*)"'s", NULL, 0, 0},
1006	};
1007
1008	typedef struct node
1009	{
1010	char c; /* First character */
1011	struct node left; / used in balanced */
1012	struct node right; / binary tree */
1013	Ending_List ptr[11]; / the approriate location */
1014	}
1015	First_Char_Node;
1016
1017	static First_Char_Node First[] =
1018	{
1019	{'\'', NULL, NULL,
1020	{NULL,
1021	List + 293, NULL, NULL, NULL, NULL,
1022	NULL, NULL, NULL, NULL, NULL}},
1023
1024	{'a', First, NULL,
1025	{List,
1026	List + 2, List + 9, List + 20, List + 39, List + 58,
1027	List + 70, List + 77, List + 82, List + 87, List + 89}},
1028
1029	{'e', First + 1, First + 4,
1030	{List + 91,
1031	List + 93, List + 98, List + 106, List + 116, List + 126,
1032	List + 133, List + 138, List + 142, List + 145, NULL}},
1033
1034	{'f', NULL, NULL,
1035	{NULL,
1036	NULL, List + 146, NULL, List + 147, NULL,
1037	List + 148, NULL, NULL, NULL, NULL}},
1038
1039	{'h', First + 3, First + 5,
1040	{NULL,
1041	NULL, NULL, List + 149, NULL, NULL,
1042	NULL, NULL, NULL, NULL, NULL}},
1043
1044	{'i', NULL, NULL,
1045	{List + 150,
1046	List + 152, List + 163, List + 181, List + 202, List + 222,
1047	List + 239, List + 252, List + 258, NULL, List + 261}},
1048
1049	{'l', First + 2, First + 10,
1050	{NULL,
1051	List + 262, NULL, List + 263, NULL, List + 265,
1052	NULL, List + 266, NULL, NULL, NULL}},
1053
1054	{'n', NULL, NULL,
1055	{NULL,
1056	NULL, NULL, List + 267, NULL, List + 268,
1057	NULL, NULL, NULL, NULL, NULL}},
1058
1059	{'o', First + 7, First + 9,
1060	{List + 269,
1061	List + 270, List + 273, List + 275, List + 277, List + 280,
1062	List + 281, NULL, NULL, NULL, NULL}},
1063
1064	{'s', NULL, NULL,
1065	{List + 283,
1066	List + 284, NULL, NULL, NULL, NULL,
1067	NULL, NULL, NULL, NULL, NULL}},
1068
1069	{'u', First + 8, First + 12,
1070	{NULL,
1071	List + 285, NULL, NULL, NULL, NULL,
1072	NULL, NULL, NULL, NULL, NULL}},
1073
1074	{'w', NULL, NULL,
1075	{NULL,
1076	NULL, NULL, List + 287, NULL, NULL,
1077	NULL, NULL, NULL, NULL, NULL}},
1078
1079	{'y', First + 11, NULL,
1080	{List + 289,
1081	List + 290, NULL, List + 291, NULL, NULL,
1082	NULL, NULL, NULL, NULL, NULL}},
1083	};
1084
1085
1086	/******************************************************************************
1087	*
1088	* Look for a match in the suffix list.
1089	* Return the pointer to the end of the new stem if there is a match.
1090	* Otherwise, return the pointer to the end of the original word.
1091	*
1092	* Method:
1093	* * Search for the first character in the suffix list.
1094	* * If there is match, search for the rest of the suffix list.
1095	*
1096	*****************************************************************************/
1097
1098	static char *
1099	remove_ending (char *word, int w_len)
1100	{
1101	static First_Char_Node root = First + 6; / the root of binary tree is 'l' */
1102	First_Char_Node p_first; / Points to the first character */
1103	/* of the possible suffix */
1104	Ending_List p_list; / Points to the List of Endings */
1105	char suffix_start, / Points to start of possible suffix */
1106	stem_end, / Points to the end of possible stem */
1107	ch1, ch2;
1108	int s_len, /* Possible stem length */
1109	e_offset, /* Offset from the end to the start of */
1110	/* possible suffix */
1111	not_done = 1, cmp;
1112
1113	stem_end = word + w_len - 1;
1114	if (w_len > PREDEFINED_SIZE) /* find the start of suffix */
1115	suffix_start = word + w_len - MAX_ENDING_SIZE;
1116	else
1117	suffix_start = word + MIN_STEM_LENGTH;
1118
1119	ch1 = *suffix_start;
1120	do /* Search for the first character */
1121	{
1122	p_first = root;
1123	do
1124	{
1125	ch2 = p_first->c;
1126	if (ch1 == ch2)
1127	break;
1128	else if (ch1 > ch2)
1129	p_first = p_first->right;
1130	else
1131	p_first = p_first->left;
1132	}
1133	while (p_first != NULL);
1134
1135	if (p_first != NULL) /* Search for the rest */
1136	{
1137	e_offset = stem_end - suffix_start;
1138	if ((p_list = p_first->ptr[e_offset]) != NULL)
1139	{
1140	for (;;) /* no need to compare the first char */
1141	{
1142	cmp = strcmp (suffix_start + 1, p_list->ending + 1);
1143	if (cmp > 0)
1144	if (p_list->right_offset)
1145	p_list += p_list->right_offset;
1146	else
1147	break;
1148	else if (cmp < 0)
1149	if (p_list->left_offset)
1150	p_list += p_list->left_offset;
1151	else
1152	break;
1153	else
1154	{
1155	s_len = suffix_start - word;
1156	if (!p_list->cond \|\|
1157	(*p_list->cond) (s_len, suffix_start - 1))
1158	{
1159	*suffix_start = EOS;
1160	stem_end = suffix_start - 1;
1161	not_done = 0;
1162	}
1163	break;
1164	}
1165	}
1166	}
1167	}
1168	++suffix_start;
1169	}
1170	while (not_done && ((ch1 = *suffix_start) != EOS));
1171	return (stem_end);
1172	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/lib/lovinstem.cpp@ 23508

Download in other formats: