Context Navigation

simplefrenchstem.cpp@ 26162

Last change on this file since 26162 was 26162, checked in by kjdon, 12 years ago
In unicode_to_utf8_word, max_output_length is the maximum length of the string itself, not counting the leading length byte, so one should not be added to it.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 2.5 KB

Line
1	/**************************************************************************
2	*
3	* simplefrenchstem.c -- a simple french stemmer
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*
19	**************************************************************************/
20
21	#include "simplefrenchstem.h"
22	#include "unitool.h"
23
24
25	/* =========================================================================
26	* Function: simplefrenchstem
27	* Description: a simple french stemmer
28	* Input: a word string with the length in the first byte
29	* Output: the stemmed word
30	* ========================================================================= */
31
32	void simplefrenchstem (unsigned char *word) {
33	unsigned short out[256]; /* temp space */
34	unsigned short wordstart; / points to first letter of word */
35	int last; /* last points to the last character */
36
37	/* decode */
38	utf8_word_to_unicode (word, out, 255);
39	wordstart = out + 1;
40	last = out[0]-1;
41
42
43	if (last > 4) {
44	if (wordstart[last]=='x') {
45	if (wordstart[last-1]=='u' && wordstart[last-2]=='a') {
46	wordstart[last-1]='l';
47	}
48	--last;
49
50	} else {
51	if (last>=0 && wordstart[last]=='s') --last;
52	if (last>=0 && wordstart[last]=='r') --last;
53	if (last>=0 && wordstart[last]=='e') --last;
54
55	/* letter with accent e + ' -- there are two possible encodings */
56	if (last>=0 && wordstart[last]==0xe9) {
57	--last;
58	} else if (last>=1 && wordstart[last-1]=='e' && wordstart[last]==0x301) {
59	last -= 2;
60	}
61
62	if (last >= 1 && wordstart[last]==wordstart[last-1]) --last;
63	} /* end else */
64
65	out[0] = (unsigned char)(last+1);
66	} /* end if (len > 4) */
67
68	/* re-code, make sure the result is not longer than the input */
69	//unicode_to_utf8_word (out, word, word[0]+1);
70	// The max_output_length (3rd param) for unicode_to_utf8 is the max length of
71	// the string, not including the length
72	unicode_to_utf8_word (out, word, word[0]);
73	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/lib/simplefrenchstem.cpp@ 26162

Download in other formats: