Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: indexers/stable/mgpp/text/stemmer.cpp@ 20497

Last change on this file since 20497 was 16583, checked in by davidb, 16 years ago
Undoing change committed in r16582
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 3.9 KB

Line
1	/**************************************************************************
2	*
3	* stemmer.cpp -- The stemmer/case folder
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "sysfuncs.h"
23	#include "stemmer.h"
24	#include "lovinstem.h"
25	#include "simplefrenchstem.h"
26	#include "unitool.h"
27
28	#ifdef ENABLE_ACCENTFOLD
29	/* [JFG - Mar 06: Accent folding patch] */
30	#include "unac.h"
31	#endif
32
33	#define LOVINSTEMMER 0
34	#define SIMPLEFRENCHSTEMMER 1
35
36
37	/* decode the utf-8 encoded unicode, casefold and then recode
38	* making sure the final length doesn't exceed the original
39	* length */
40	static void mgpp_unicode_casefold (u_char *word) {
41	unsigned short out[256]; /* temp space */
42	int i;
43	int len;
44
45	/* decode */
46	utf8_word_to_unicode (word, out, 255);
47	len = out[0];
48
49	/* casefold and simplify-fold */
50	for (i=0; i<len; ++i) {
51	out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
52	}
53
54	/* re-code */
55	unicode_to_utf8_word (out, word, word[0]+1);
56	}
57
58	#ifdef ENABLE_ACCENTFOLD
59	/* [JFG - Mar 06: Accent folding patch] */
60	/* =========================================================================
61	* Function: unicode_accentfold
62
63	* Description: remove accents from characters
64	* Input: a word string with the length in the first byte
65	* Output: the unaccented word
66	* ========================================================================= */
67	void mgpp_unicode_accentfold (unsigned char *word) {
68	size_t unac_size = 0;
69	char *unac = NULL;
70
71
72	unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
73	strncpy((char*)word+1, unac, word[0]+1);
74	word[0] = unac_size;
75
76	free(unac);
77	return;
78	}
79	#endif
80
81	int mgpp_stemmernumber (u_char *stemmerdescription) {
82	u_char descript[MAX_STEM_DESCRIPTION_LEN];
83	int i;
84
85	/* copy and case-fold the description */
86	for (i=0; (stemmerdescription[i] != '\0') &&
87	(i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
88	descript[i] = tolower (stemmerdescription[i]);
89	descript[i] = '\0';
90
91	/* map the description to its number */
92
93	if ((strcmp ((char *) descript, "0") == 0) \|\|
94	(strcmp ((char *) descript, "english") == 0) \|\|
95	(strcmp ((char *) descript, "lovin") == 0))
96	return LOVINSTEMMER;
97
98	if ((strcmp ((char *) descript, "1") == 1) \|\|
99	(strcmp ((char *) descript, "french") == 0) \|\|
100	(strcmp ((char *) descript, "simplefrench") == 0))
101	return SIMPLEFRENCHSTEMMER;
102
103	return -1;
104	}
105
106
107
108	/*
109	* Method 0 - Do not stem or case fold.
110	* Method 1 - Case fold.
111	* Method 2 - Stem.
112	* Method 3 - Case fold and stem.
113	* Method 4 - Accent fold
114	* Method 5 - Accent fold and case fold
115	* Method 6 - Accent fold and stem
116	* Method 7 - Accent fold, stem and case fold
117
118	* The stemmer number should be obtained using
119	* the stemmernumber function above.
120	*/
121	void
122	mgpp_stemmer (int method, int stemmer, u_char *word) {
123	if (method & STEM_CaseFolding) {
124	mgpp_unicode_casefold (word);
125	}
126
127	#ifdef ENABLE_ACCENTFOLD
128	if (method & STEM_AccentFolding) {
129	mgpp_unicode_accentfold (word);
130	}
131	#endif
132
133	if (method & STEM_Stemming) {
134	switch (stemmer) {
135	case LOVINSTEMMER: lovinstem (word);
136	break;
137	case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
138	break;
139	}
140	}
141	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: