Context Navigation

source: branches/ant-install-branch/indexers/mg/src/text/stemmer.c@ 9804

Last change on this file since 9804 was 7627, checked in by kjdon, 20 years ago
make stemmerdescription a const
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 4.0 KB

Line
1	/**************************************************************************
2	*
3	* stemmer.c -- The stemmer/case folder
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25	#include "stemmer.h"
26
27	#include "lovinstem.h"
28	#include "simplefrenchstem.h"
29	#include "unitool.h"
30
31	/*
32	$Log$
33	Revision 1.2 2004/06/22 04:17:06 kjdon
34	make stemmerdescription a const
35
36	Revision 1.1 2003/02/20 21:18:24 mdewsnip
37	Addition of MG package for search and retrieval
38
39	Revision 1.1 1999/08/10 21:18:23 sjboddie
40	renamed mg-1.3d directory mg
41
42	Revision 1.3 1998/12/17 09:12:54 rjmcnab
43
44	Altered mg to process utf-8 encoded Unicode. The main changes
45	are in the parsing of the input, the casefolding, and the stemming.
46
47	Revision 1.2 1998/11/25 07:55:51 rjmcnab
48
49	Modified mg so that you can specify the stemmer you want
50	to use via a command line option. You specify it to
51	mg_passes during the build process. The number of the
52	stemmer that you used is stored within the inverted
53	dictionary header and the stemmed dictionary header so
54	the correct stemmer is used in later stages of building
55	and querying.
56
57	Revision 1.1 1998/11/17 09:35:42 rjmcnab
58	* empty log message *
59
60	* Revision 1.3 1994/10/20 03:57:05 tes
61	* I have rewritten the boolean query optimiser and abstracted out the
62	* components of the boolean query.
63	*
64	* Revision 1.2 1994/09/20 04:42:10 tes
65	* For version 1.1
66	*
67	*/
68
/* Revision-control identification string for this file. */
static char *RCSID =
  "$Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $";


/* Stemmer numbers: returned by stemmernumber() and dispatched on
 * by stemmer(). */
#define LOVINSTEMMER         0   /* selects lovinstem() */
#define SIMPLEFRENCHSTEMMER  1   /* selects simplefrenchstem() */
74
75
76	/* decode the utf-8 encoded unicode, casefold and then recode
77	* making sure the final length doesn't exceed the original
78	* length */
79	static void unicode_casefold (u_char *word) {
80	unsigned short out[256]; /* temp space */
81	int i;
82	int len;
83
84	/* decode */
85	utf8_word_to_unicode (word, out, 255);
86	len = out[0];
87
88	/* casefold and simplify-fold */
89	for (i=0; i<len; i++) {
90	out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
91	}
92
93	/* re-code */
94	unicode_to_utf8_word (out, word, word[0]+1);
95	}
96
97
98	int stemmernumber (const u_char *stemmerdescription) {
99	u_char descript[MAX_STEM_DESCRIPTION_LEN];
100	int i;
101
102	/* copy and case-fold the description */
103	for (i=0; (stemmerdescription[i] != '\0') &&
104	(i < MAX_STEM_DESCRIPTION_LEN-1); i++)
105	descript[i] = tolower (stemmerdescription[i]);
106	descript[i] = '\0';
107
108	/* map the description to its number */
109
110	if ((strcmp (descript, "0") == 0) \|\|
111	(strcmp (descript, "english") == 0) \|\|
112	(strcmp (descript, "lovin") == 0))
113	return LOVINSTEMMER;
114
115	if ((strcmp (descript, "1") == 1) \|\|
116	(strcmp (descript, "french") == 0) \|\|
117	(strcmp (descript, "simplefrench") == 0))
118	return SIMPLEFRENCHSTEMMER;
119
120	return -1;
121	}
122
123
124
125	/*
126	* Method 0 - Do not stem or case fold.
127	* Method 1 - Case fold.
128	* Method 2 - Stem.
129	* Method 3 - Case fold and stem.
130	*
131	* The stemmer number should be obtained using
132	* the stemmernumber function above.
133	*/
134	void
135	stemmer (int method, int stemmer, u_char *word) {
136	if (method & 1) {
137	unicode_casefold (word);
138	}
139
140	if (method & 2) {
141	switch (stemmer) {
142	case LOVINSTEMMER: lovinstem (word);
143	break;
144	case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
145	break;
146	}
147	}
148	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: