Context Navigation

source: tags/greenstone-3_01-distribution/mgpp/text/stemmer.cpp@ 10896

Last change on this file since 10896 was 9613, checked in by kjdon, 19 years ago
added in x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 2.8 KB

Line
1	/**************************************************************************
2	*
3	* stemmer.cpp -- The stemmer/case folder
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "sysfuncs.h"
23	#include "stemmer.h"
24
25	#include "lovinstem.h"
26	#include "simplefrenchstem.h"
27	#include "unitool.h"
28
29	#define LOVINSTEMMER 0
30	#define SIMPLEFRENCHSTEMMER 1
31
32
33	/* decode the utf-8 encoded unicode, casefold and then recode
34	* making sure the final length doesn't exceed the original
35	* length */
36	static void unicode_casefold (u_char *word) {
37	unsigned short out[256]; /* temp space */
38	int i;
39	int len;
40
41	/* decode */
42	utf8_word_to_unicode (word, out, 255);
43	len = out[0];
44
45	/* casefold and simplify-fold */
46	for (i=0; i<len; ++i) {
47	out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
48	}
49
50	/* re-code */
51	unicode_to_utf8_word (out, word, word[0]+1);
52	}
53
54
55	int stemmernumber (u_char *stemmerdescription) {
56	u_char descript[MAX_STEM_DESCRIPTION_LEN];
57	int i;
58
59	/* copy and case-fold the description */
60	for (i=0; (stemmerdescription[i] != '\0') &&
61	(i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
62	descript[i] = tolower (stemmerdescription[i]);
63	descript[i] = '\0';
64
65	/* map the description to its number */
66
67	if ((strcmp ((char *) descript, "0") == 0) \|\|
68	(strcmp ((char *) descript, "english") == 0) \|\|
69	(strcmp ((char *) descript, "lovin") == 0))
70	return LOVINSTEMMER;
71
72	if ((strcmp ((char *) descript, "1") == 1) \|\|
73	(strcmp ((char *) descript, "french") == 0) \|\|
74	(strcmp ((char *) descript, "simplefrench") == 0))
75	return SIMPLEFRENCHSTEMMER;
76
77	return -1;
78	}
79
80
81
82	/*
83	* Method 0 - Do not stem or case fold.
84	* Method 1 - Case fold.
85	* Method 2 - Stem.
86	* Method 3 - Case fold and stem.
87	*
88	* The stemmer number should be obtained using
89	* the stemmernumber function above.
90	*/
91	void
92	stemmer (int method, int stemmer, u_char *word) {
93	if (method & 1) {
94	unicode_casefold (word);
95	}
96
97	if (method & 2) {
98	switch (stemmer) {
99	case LOVINSTEMMER: lovinstem (word);
100	break;
101	case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
102	break;
103	}
104	}
105	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: