source: trunk/gsdl/packages/kea/kea-3.0/GermanStemmer.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1
2/* ====================================================================
3 * The Apache Software License, Version 1.1
4 *
5 * Copyright (c) 2001 The Apache Software Foundation. All rights
6 * reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Apache Software Foundation (http://www.apache.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Apache" and "Apache Software Foundation" and
28 * "Apache Lucene" must not be used to endorse or promote products
29 * derived from this software without prior written permission. For
30 * written permission, please contact apache@apache.org.
31 *
32 * 5. Products derived from this software may not be called "Apache",
33 * "Apache Lucene", nor may "Apache" appear in their name, without
34 * prior written permission of the Apache Software Foundation.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This software consists of voluntary contributions made by many
51 * individuals on behalf of the Apache Software Foundation. For more
52 * information on the Apache Software Foundation, please see
53 * <http://www.apache.org/>.
54 */
55
56/**
57 * A stemmer for German words. The algorithm is based on the report
58 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
59 * Caumanns ([email protected]).
60 *
61 * Changed stem() from protected to public.
62 * Changed coding for umlaute to unicode.
63 *
64 * @author Gerhard Schwarz
65 * @version $Id: GermanStemmer.java 8815 2004-12-15 01:13:55Z mdewsnip $
66 */
67public class GermanStemmer extends Stemmer
68{
69 /**
70 * Buffer for the terms while stemming them.
71 */
72 private StringBuffer sb = new StringBuffer();
73
74 /**
75 * Indicates if a term is handled as a noun.
76 */
77 private boolean uppercase = false;
78
79 /**
80 * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
81 */
82 private int substCount = 0;
83
84 /**
85 * Stemms the given term to an unique <tt>discriminator</tt>.
86 *
87 * @param term The term that should be stemmed.
88 * @return Discriminator for <tt>term</tt>
89 */
90 public String stem( String term )
91 {
92 // Mark a possible noun.
93 uppercase = Character.isUpperCase( term.charAt( 0 ) );
94 // Use lowercase for medium stemming.
95 term = term.toLowerCase();
96 if ( !isStemmable( term ) )
97 return term;
98 // Reset the StringBuffer.
99 sb.delete( 0, sb.length() );
100 sb.insert( 0, term );
101 // Stemming starts here...
102 substitute( sb );
103 strip( sb );
104 optimize( sb );
105 resubstitute( sb );
106 removeParticleDenotion( sb );
107 return sb.toString();
108 }
109
110 /**
111 * Checks if a term could be stemmed.
112 *
113 * @return true if, and only if, the given term consists in letters.
114 */
115 private boolean isStemmable( String term )
116 {
117 for ( int c = 0; c < term.length(); c++ ) {
118 if ( !Character.isLetter( term.charAt( c ) ) ) return false;
119 }
120 return true;
121 }
122
123 /**
124 * suffix stripping (stemming) on the current term. The stripping is reduced
125 * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
126 * from which all regular suffixes are build of. The simplification causes
127 * some overstemming, and way more irregular stems, but still provides unique.
128 * discriminators in the most of those cases.
129 * The algorithm is context free, except of the length restrictions.
130 */
131 private void strip( StringBuffer buffer )
132 {
133 boolean doMore = true;
134 while ( doMore && buffer.length() > 3 ) {
135 if ( ( buffer.length() + substCount > 5 ) &&
136 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
137 {
138 buffer.delete( buffer.length() - 2, buffer.length() );
139 }
140 else if ( ( buffer.length() + substCount > 4 ) &&
141 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
142 buffer.delete( buffer.length() - 2, buffer.length() );
143 }
144 else if ( ( buffer.length() + substCount > 4 ) &&
145 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
146 buffer.delete( buffer.length() - 2, buffer.length() );
147 }
148 else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
149 buffer.deleteCharAt( buffer.length() - 1 );
150 }
151 else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
152 buffer.deleteCharAt( buffer.length() - 1 );
153 }
154 else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
155 buffer.deleteCharAt( buffer.length() - 1 );
156 }
157 // "t" occurs only as suffix of verbs.
158 else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
159 buffer.deleteCharAt( buffer.length() - 1 );
160 }
161 else {
162 doMore = false;
163 }
164 }
165 }
166
167 /**
168 * Does some optimizations on the term. This optimisations are
169 * contextual.
170 */
171 private void optimize( StringBuffer buffer )
172 {
173 // Additional step for female plurals of professions and inhabitants.
174 if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
175 buffer.deleteCharAt( buffer.length() -1 );
176 strip( buffer );
177 }
178 // Additional step for irregular plural nouns like "Matrizen -> Matrix".
179 if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
180 buffer.setCharAt( buffer.length() - 1, 'x' );
181 }
182 }
183
184 /**
185 * Removes a particle denotion ("ge") from a term.
186 */
187 private void removeParticleDenotion( StringBuffer buffer )
188 {
189 if ( buffer.length() > 4 ) {
190 for ( int c = 0; c < buffer.length() - 3; c++ ) {
191 if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
192 buffer.delete( c, c + 2 );
193 return;
194 }
195 }
196 }
197 }
198
199 /**
200 * Do some substitutions for the term to reduce overstemming:
201 *
202 * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
203 * "ß" is substituted by "ss"
204 * - Substitute a second char of a pair of equal characters with
205 * an asterisk: ?? -> ?*
206 * - Substitute some common character combinations with a token:
207 * sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
208 */
209 private void substitute( StringBuffer buffer )
210 {
211 substCount = 0;
212 for ( int c = 0; c < buffer.length(); c++ ) {
213 // Replace the second char of a pair of the equal characters with an asterisk
214 if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
215 buffer.setCharAt( c, '*' );
216 }
217 // Substitute Umlauts.
218 else if ( buffer.charAt( c ) == '\u00E4' ) {
219 buffer.setCharAt( c, 'a' );
220 }
221 else if ( buffer.charAt( c ) == '\u00F6' ) {
222 buffer.setCharAt( c, 'o' );
223 }
224 else if ( buffer.charAt( c ) == '\u00FC' ) {
225 buffer.setCharAt( c, 'u' );
226 }
227 // Take care that at least one character is left left side from the current one
228 if ( c < buffer.length() - 1 ) {
229 if ( buffer.charAt( c ) == '\u00DF' ) {
230 buffer.setCharAt( c, 's' );
231 buffer.insert( c + 1, 's' );
232 substCount++;
233 }
234 // Masking several common character combinations with an token
235 else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
236 buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
237 {
238 buffer.setCharAt( c, '$' );
239 buffer.delete( c + 1, c + 3 );
240 substCount =+ 2;
241 }
242 else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
243 buffer.setCharAt( c, '§' );
244 buffer.deleteCharAt( c + 1 );
245 substCount++;
246 }
247 else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
248 buffer.setCharAt( c, '%' );
249 buffer.deleteCharAt( c + 1 );
250 substCount++;
251 }
252 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
253 buffer.setCharAt( c, '&' );
254 buffer.deleteCharAt( c + 1 );
255 substCount++;
256 }
257 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
258 buffer.setCharAt( c, '#' );
259 buffer.deleteCharAt( c + 1 );
260 substCount++;
261 }
262 else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
263 buffer.setCharAt( c, '!' );
264 buffer.deleteCharAt( c + 1 );
265 substCount++;
266 }
267 }
268 }
269 }
270
271 /**
272 * Undoes the changes made by substitute(). That are character pairs and
273 * character combinations. Umlauts will remain as their corresponding vowel,
274 * as "ß" remains as "ss".
275 */
276 private void resubstitute( StringBuffer buffer )
277 {
278 for ( int c = 0; c < buffer.length(); c++ ) {
279 if ( buffer.charAt( c ) == '*' ) {
280 char x = buffer.charAt( c - 1 );
281 buffer.setCharAt( c, x );
282 }
283 else if ( buffer.charAt( c ) == '$' ) {
284 buffer.setCharAt( c, 's' );
285 buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
286 }
287 else if ( buffer.charAt( c ) == '§' ) {
288 buffer.setCharAt( c, 'c' );
289 buffer.insert( c + 1, 'h' );
290 }
291 else if ( buffer.charAt( c ) == '%' ) {
292 buffer.setCharAt( c, 'e' );
293 buffer.insert( c + 1, 'i' );
294 }
295 else if ( buffer.charAt( c ) == '&' ) {
296 buffer.setCharAt( c, 'i' );
297 buffer.insert( c + 1, 'e' );
298 }
299 else if ( buffer.charAt( c ) == '#' ) {
300 buffer.setCharAt( c, 'i' );
301 buffer.insert( c + 1, 'g' );
302 }
303 else if ( buffer.charAt( c ) == '!' ) {
304 buffer.setCharAt( c, 's' );
305 buffer.insert( c + 1, 't' );
306 }
307 }
308 }
309}
Note: See TracBrowser for help on using the repository browser.