Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/kea/kea-3.0/GermanStemmer.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago
Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.4 KB

Line
1
2	/* ====================================================================
3	* The Apache Software License, Version 1.1
4	*
5	* Copyright (c) 2001 The Apache Software Foundation. All rights
6	* reserved.
7	*
8	* Redistribution and use in source and binary forms, with or without
9	* modification, are permitted provided that the following conditions
10	* are met:
11	*
12	* 1. Redistributions of source code must retain the above copyright
13	* notice, this list of conditions and the following disclaimer.
14	*
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in
17	* the documentation and/or other materials provided with the
18	* distribution.
19	*
20	* 3. The end-user documentation included with the redistribution,
21	* if any, must include the following acknowledgment:
22	* "This product includes software developed by the
23	* Apache Software Foundation (http://www.apache.org/)."
24	* Alternately, this acknowledgment may appear in the software itself,
25	* if and wherever such third-party acknowledgments normally appear.
26	*
27	* 4. The names "Apache" and "Apache Software Foundation" and
28	* "Apache Lucene" must not be used to endorse or promote products
29	* derived from this software without prior written permission. For
30	* written permission, please contact apache@apache.org.
31	*
32	* 5. Products derived from this software may not be called "Apache",
33	* "Apache Lucene", nor may "Apache" appear in their name, without
34	* prior written permission of the Apache Software Foundation.
35	*
36	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47	* SUCH DAMAGE.
48	* ====================================================================
49	*
50	* This software consists of voluntary contributions made by many
51	* individuals on behalf of the Apache Software Foundation. For more
52	* information on the Apache Software Foundation, please see
53	* <http://www.apache.org/>.
54	*/
55
56	/**
57	* A stemmer for German words. The algorithm is based on the report
58	* "A Fast and Simple Stemming Algorithm for German Words" by Jörg
59	* Caumanns (joerg.caumanns@isst.fhg.de).
60	*
61	* Changed stem() from protected to public.
62	* Changed coding for umlaute to unicode.
63	*
64	* @author Gerhard Schwarz
65	* @version $Id: GermanStemmer.java 8815 2004-12-15 01:13:55Z mdewsnip $
66	*/
67	public class GermanStemmer extends Stemmer
68	{
69	/**
70	* Buffer for the terms while stemming them.
71	*/
72	private StringBuffer sb = new StringBuffer();
73
74	/**
75	* Indicates if a term is handled as a noun.
76	*/
77	private boolean uppercase = false;
78
79	/**
80	* Amount of characters that are removed with <tt>substitute()</tt> while stemming.
81	*/
82	private int substCount = 0;
83
84	/**
85	* Stemms the given term to an unique <tt>discriminator</tt>.
86	*
87	* @param term The term that should be stemmed.
88	* @return Discriminator for <tt>term</tt>
89	*/
90	public String stem( String term )
91	{
92	// Mark a possible noun.
93	uppercase = Character.isUpperCase( term.charAt( 0 ) );
94	// Use lowercase for medium stemming.
95	term = term.toLowerCase();
96	if ( !isStemmable( term ) )
97	return term;
98	// Reset the StringBuffer.
99	sb.delete( 0, sb.length() );
100	sb.insert( 0, term );
101	// Stemming starts here...
102	substitute( sb );
103	strip( sb );
104	optimize( sb );
105	resubstitute( sb );
106	removeParticleDenotion( sb );
107	return sb.toString();
108	}
109
110	/**
111	* Checks if a term could be stemmed.
112	*
113	* @return true if, and only if, the given term consists in letters.
114	*/
115	private boolean isStemmable( String term )
116	{
117	for ( int c = 0; c < term.length(); c++ ) {
118	if ( !Character.isLetter( term.charAt( c ) ) ) return false;
119	}
120	return true;
121	}
122
123	/**
124	* suffix stripping (stemming) on the current term. The stripping is reduced
125	* to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
126	* from which all regular suffixes are build of. The simplification causes
127	* some overstemming, and way more irregular stems, but still provides unique.
128	* discriminators in the most of those cases.
129	* The algorithm is context free, except of the length restrictions.
130	*/
131	private void strip( StringBuffer buffer )
132	{
133	boolean doMore = true;
134	while ( doMore && buffer.length() > 3 ) {
135	if ( ( buffer.length() + substCount > 5 ) &&
136	buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
137	{
138	buffer.delete( buffer.length() - 2, buffer.length() );
139	}
140	else if ( ( buffer.length() + substCount > 4 ) &&
141	buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
142	buffer.delete( buffer.length() - 2, buffer.length() );
143	}
144	else if ( ( buffer.length() + substCount > 4 ) &&
145	buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
146	buffer.delete( buffer.length() - 2, buffer.length() );
147	}
148	else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
149	buffer.deleteCharAt( buffer.length() - 1 );
150	}
151	else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
152	buffer.deleteCharAt( buffer.length() - 1 );
153	}
154	else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
155	buffer.deleteCharAt( buffer.length() - 1 );
156	}
157	// "t" occurs only as suffix of verbs.
158	else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
159	buffer.deleteCharAt( buffer.length() - 1 );
160	}
161	else {
162	doMore = false;
163	}
164	}
165	}
166
167	/**
168	* Does some optimizations on the term. This optimisations are
169	* contextual.
170	*/
171	private void optimize( StringBuffer buffer )
172	{
173	// Additional step for female plurals of professions and inhabitants.
174	if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
175	buffer.deleteCharAt( buffer.length() -1 );
176	strip( buffer );
177	}
178	// Additional step for irregular plural nouns like "Matrizen -> Matrix".
179	if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
180	buffer.setCharAt( buffer.length() - 1, 'x' );
181	}
182	}
183
184	/**
185	* Removes a particle denotion ("ge") from a term.
186	*/
187	private void removeParticleDenotion( StringBuffer buffer )
188	{
189	if ( buffer.length() > 4 ) {
190	for ( int c = 0; c < buffer.length() - 3; c++ ) {
191	if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
192	buffer.delete( c, c + 2 );
193	return;
194	}
195	}
196	}
197	}
198
199	/**
200	* Do some substitutions for the term to reduce overstemming:
201	*
202	* - Substitute Umlauts with their corresponding vowel: äöü -> aou,
203	* "ß" is substituted by "ss"
204	* - Substitute a second char of a pair of equal characters with
205	* an asterisk: ?? -> ?*
206	* - Substitute some common character combinations with a token:
207	* sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
208	*/
209	private void substitute( StringBuffer buffer )
210	{
211	substCount = 0;
212	for ( int c = 0; c < buffer.length(); c++ ) {
213	// Replace the second char of a pair of the equal characters with an asterisk
214	if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
215	buffer.setCharAt( c, '*' );
216	}
217	// Substitute Umlauts.
218	else if ( buffer.charAt( c ) == '\u00E4' ) {
219	buffer.setCharAt( c, 'a' );
220	}
221	else if ( buffer.charAt( c ) == '\u00F6' ) {
222	buffer.setCharAt( c, 'o' );
223	}
224	else if ( buffer.charAt( c ) == '\u00FC' ) {
225	buffer.setCharAt( c, 'u' );
226	}
227	// Take care that at least one character is left left side from the current one
228	if ( c < buffer.length() - 1 ) {
229	if ( buffer.charAt( c ) == '\u00DF' ) {
230	buffer.setCharAt( c, 's' );
231	buffer.insert( c + 1, 's' );
232	substCount++;
233	}
234	// Masking several common character combinations with an token
235	else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
236	buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
237	{
238	buffer.setCharAt( c, '$' );
239	buffer.delete( c + 1, c + 3 );
240	substCount =+ 2;
241	}
242	else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
243	buffer.setCharAt( c, '§' );
244	buffer.deleteCharAt( c + 1 );
245	substCount++;
246	}
247	else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
248	buffer.setCharAt( c, '%' );
249	buffer.deleteCharAt( c + 1 );
250	substCount++;
251	}
252	else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
253	buffer.setCharAt( c, '&' );
254	buffer.deleteCharAt( c + 1 );
255	substCount++;
256	}
257	else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
258	buffer.setCharAt( c, '#' );
259	buffer.deleteCharAt( c + 1 );
260	substCount++;
261	}
262	else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
263	buffer.setCharAt( c, '!' );
264	buffer.deleteCharAt( c + 1 );
265	substCount++;
266	}
267	}
268	}
269	}
270
271	/**
272	* Undoes the changes made by substitute(). That are character pairs and
273	* character combinations. Umlauts will remain as their corresponding vowel,
274	* as "ß" remains as "ss".
275	*/
276	private void resubstitute( StringBuffer buffer )
277	{
278	for ( int c = 0; c < buffer.length(); c++ ) {
279	if ( buffer.charAt( c ) == '*' ) {
280	char x = buffer.charAt( c - 1 );
281	buffer.setCharAt( c, x );
282	}
283	else if ( buffer.charAt( c ) == '$' ) {
284	buffer.setCharAt( c, 's' );
285	buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
286	}
287	else if ( buffer.charAt( c ) == '§' ) {
288	buffer.setCharAt( c, 'c' );
289	buffer.insert( c + 1, 'h' );
290	}
291	else if ( buffer.charAt( c ) == '%' ) {
292	buffer.setCharAt( c, 'e' );
293	buffer.insert( c + 1, 'i' );
294	}
295	else if ( buffer.charAt( c ) == '&' ) {
296	buffer.setCharAt( c, 'i' );
297	buffer.insert( c + 1, 'e' );
298	}
299	else if ( buffer.charAt( c ) == '#' ) {
300	buffer.setCharAt( c, 'i' );
301	buffer.insert( c + 1, 'g' );
302	}
303	else if ( buffer.charAt( c ) == '!' ) {
304	buffer.setCharAt( c, 's' );
305	buffer.insert( c + 1, 't' );
306	}
307	}
308	}
309	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: