1 |
|
---|
2 | /* ====================================================================
|
---|
3 | * The Apache Software License, Version 1.1
|
---|
4 | *
|
---|
5 | * Copyright (c) 2001 The Apache Software Foundation. All rights
|
---|
6 | * reserved.
|
---|
7 | *
|
---|
8 | * Redistribution and use in source and binary forms, with or without
|
---|
9 | * modification, are permitted provided that the following conditions
|
---|
10 | * are met:
|
---|
11 | *
|
---|
12 | * 1. Redistributions of source code must retain the above copyright
|
---|
13 | * notice, this list of conditions and the following disclaimer.
|
---|
14 | *
|
---|
15 | * 2. Redistributions in binary form must reproduce the above copyright
|
---|
16 | * notice, this list of conditions and the following disclaimer in
|
---|
17 | * the documentation and/or other materials provided with the
|
---|
18 | * distribution.
|
---|
19 | *
|
---|
20 | * 3. The end-user documentation included with the redistribution,
|
---|
21 | * if any, must include the following acknowledgment:
|
---|
22 | * "This product includes software developed by the
|
---|
23 | * Apache Software Foundation (http://www.apache.org/)."
|
---|
24 | * Alternately, this acknowledgment may appear in the software itself,
|
---|
25 | * if and wherever such third-party acknowledgments normally appear.
|
---|
26 | *
|
---|
27 | * 4. The names "Apache" and "Apache Software Foundation" and
|
---|
28 | * "Apache Lucene" must not be used to endorse or promote products
|
---|
29 | * derived from this software without prior written permission. For
|
---|
30 | * written permission, please contact apache@apache.org.
|
---|
31 | *
|
---|
32 | * 5. Products derived from this software may not be called "Apache",
|
---|
33 | * "Apache Lucene", nor may "Apache" appear in their name, without
|
---|
34 | * prior written permission of the Apache Software Foundation.
|
---|
35 | *
|
---|
36 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
---|
37 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
---|
38 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
---|
39 | * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
---|
40 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
---|
41 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
---|
42 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
---|
43 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
---|
44 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
---|
45 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
---|
46 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
---|
47 | * SUCH DAMAGE.
|
---|
48 | * ====================================================================
|
---|
49 | *
|
---|
50 | * This software consists of voluntary contributions made by many
|
---|
51 | * individuals on behalf of the Apache Software Foundation. For more
|
---|
52 | * information on the Apache Software Foundation, please see
|
---|
53 | * <http://www.apache.org/>.
|
---|
54 | */
|
---|
55 |
|
---|
56 | /**
|
---|
57 | * A stemmer for German words. The algorithm is based on the report
|
---|
58 | * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
|
---|
59 | * Caumanns (joerg.caumanns@isst.fhg.de).
|
---|
60 | *
|
---|
61 | * Changed stem() from protected to public.
|
---|
62 | * Changed the encoding of umlauts to Unicode escapes.
|
---|
63 | *
|
---|
64 | * @author Gerhard Schwarz
|
---|
65 | * @version $Id: GermanStemmer.java 8815 2004-12-15 01:13:55Z mdewsnip $
|
---|
66 | */
|
---|
67 | public class GermanStemmer extends Stemmer
|
---|
68 | {
|
---|
69 | /**
|
---|
70 | * Buffer for the terms while stemming them.
|
---|
71 | */
|
---|
72 | private StringBuffer sb = new StringBuffer();
|
---|
73 |
|
---|
74 | /**
|
---|
75 | * Indicates if a term is handled as a noun.
|
---|
76 | */
|
---|
77 | private boolean uppercase = false;
|
---|
78 |
|
---|
79 | /**
|
---|
80 | * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
|
---|
81 | */
|
---|
82 | private int substCount = 0;
|
---|
83 |
|
---|
84 | /**
|
---|
85 | * Stemms the given term to an unique <tt>discriminator</tt>.
|
---|
86 | *
|
---|
87 | * @param term The term that should be stemmed.
|
---|
88 | * @return Discriminator for <tt>term</tt>
|
---|
89 | */
|
---|
90 | public String stem( String term )
|
---|
91 | {
|
---|
92 | // Mark a possible noun.
|
---|
93 | uppercase = Character.isUpperCase( term.charAt( 0 ) );
|
---|
94 | // Use lowercase for medium stemming.
|
---|
95 | term = term.toLowerCase();
|
---|
96 | if ( !isStemmable( term ) )
|
---|
97 | return term;
|
---|
98 | // Reset the StringBuffer.
|
---|
99 | sb.delete( 0, sb.length() );
|
---|
100 | sb.insert( 0, term );
|
---|
101 | // Stemming starts here...
|
---|
102 | substitute( sb );
|
---|
103 | strip( sb );
|
---|
104 | optimize( sb );
|
---|
105 | resubstitute( sb );
|
---|
106 | removeParticleDenotion( sb );
|
---|
107 | return sb.toString();
|
---|
108 | }
|
---|
109 |
|
---|
110 | /**
|
---|
111 | * Checks if a term could be stemmed.
|
---|
112 | *
|
---|
113 | * @return true if, and only if, the given term consists in letters.
|
---|
114 | */
|
---|
115 | private boolean isStemmable( String term )
|
---|
116 | {
|
---|
117 | for ( int c = 0; c < term.length(); c++ ) {
|
---|
118 | if ( !Character.isLetter( term.charAt( c ) ) ) return false;
|
---|
119 | }
|
---|
120 | return true;
|
---|
121 | }
|
---|
122 |
|
---|
123 | /**
|
---|
124 | * suffix stripping (stemming) on the current term. The stripping is reduced
|
---|
125 | * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
|
---|
126 | * from which all regular suffixes are build of. The simplification causes
|
---|
127 | * some overstemming, and way more irregular stems, but still provides unique.
|
---|
128 | * discriminators in the most of those cases.
|
---|
129 | * The algorithm is context free, except of the length restrictions.
|
---|
130 | */
|
---|
131 | private void strip( StringBuffer buffer )
|
---|
132 | {
|
---|
133 | boolean doMore = true;
|
---|
134 | while ( doMore && buffer.length() > 3 ) {
|
---|
135 | if ( ( buffer.length() + substCount > 5 ) &&
|
---|
136 | buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
|
---|
137 | {
|
---|
138 | buffer.delete( buffer.length() - 2, buffer.length() );
|
---|
139 | }
|
---|
140 | else if ( ( buffer.length() + substCount > 4 ) &&
|
---|
141 | buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
|
---|
142 | buffer.delete( buffer.length() - 2, buffer.length() );
|
---|
143 | }
|
---|
144 | else if ( ( buffer.length() + substCount > 4 ) &&
|
---|
145 | buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
|
---|
146 | buffer.delete( buffer.length() - 2, buffer.length() );
|
---|
147 | }
|
---|
148 | else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
|
---|
149 | buffer.deleteCharAt( buffer.length() - 1 );
|
---|
150 | }
|
---|
151 | else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
|
---|
152 | buffer.deleteCharAt( buffer.length() - 1 );
|
---|
153 | }
|
---|
154 | else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
|
---|
155 | buffer.deleteCharAt( buffer.length() - 1 );
|
---|
156 | }
|
---|
157 | // "t" occurs only as suffix of verbs.
|
---|
158 | else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
|
---|
159 | buffer.deleteCharAt( buffer.length() - 1 );
|
---|
160 | }
|
---|
161 | else {
|
---|
162 | doMore = false;
|
---|
163 | }
|
---|
164 | }
|
---|
165 | }
|
---|
166 |
|
---|
167 | /**
|
---|
168 | * Does some optimizations on the term. This optimisations are
|
---|
169 | * contextual.
|
---|
170 | */
|
---|
171 | private void optimize( StringBuffer buffer )
|
---|
172 | {
|
---|
173 | // Additional step for female plurals of professions and inhabitants.
|
---|
174 | if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
|
---|
175 | buffer.deleteCharAt( buffer.length() -1 );
|
---|
176 | strip( buffer );
|
---|
177 | }
|
---|
178 | // Additional step for irregular plural nouns like "Matrizen -> Matrix".
|
---|
179 | if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
|
---|
180 | buffer.setCharAt( buffer.length() - 1, 'x' );
|
---|
181 | }
|
---|
182 | }
|
---|
183 |
|
---|
184 | /**
|
---|
185 | * Removes a particle denotion ("ge") from a term.
|
---|
186 | */
|
---|
187 | private void removeParticleDenotion( StringBuffer buffer )
|
---|
188 | {
|
---|
189 | if ( buffer.length() > 4 ) {
|
---|
190 | for ( int c = 0; c < buffer.length() - 3; c++ ) {
|
---|
191 | if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
|
---|
192 | buffer.delete( c, c + 2 );
|
---|
193 | return;
|
---|
194 | }
|
---|
195 | }
|
---|
196 | }
|
---|
197 | }
|
---|
198 |
|
---|
199 | /**
|
---|
200 | * Do some substitutions for the term to reduce overstemming:
|
---|
201 | *
|
---|
202 | * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
|
---|
203 | * "ß" is substituted by "ss"
|
---|
204 | * - Substitute a second char of a pair of equal characters with
|
---|
205 | * an asterisk: ?? -> ?*
|
---|
206 | * - Substitute some common character combinations with a token:
|
---|
207 | * sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
|
---|
208 | */
|
---|
209 | private void substitute( StringBuffer buffer )
|
---|
210 | {
|
---|
211 | substCount = 0;
|
---|
212 | for ( int c = 0; c < buffer.length(); c++ ) {
|
---|
213 | // Replace the second char of a pair of the equal characters with an asterisk
|
---|
214 | if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
|
---|
215 | buffer.setCharAt( c, '*' );
|
---|
216 | }
|
---|
217 | // Substitute Umlauts.
|
---|
218 | else if ( buffer.charAt( c ) == '\u00E4' ) {
|
---|
219 | buffer.setCharAt( c, 'a' );
|
---|
220 | }
|
---|
221 | else if ( buffer.charAt( c ) == '\u00F6' ) {
|
---|
222 | buffer.setCharAt( c, 'o' );
|
---|
223 | }
|
---|
224 | else if ( buffer.charAt( c ) == '\u00FC' ) {
|
---|
225 | buffer.setCharAt( c, 'u' );
|
---|
226 | }
|
---|
227 | // Take care that at least one character is left left side from the current one
|
---|
228 | if ( c < buffer.length() - 1 ) {
|
---|
229 | if ( buffer.charAt( c ) == '\u00DF' ) {
|
---|
230 | buffer.setCharAt( c, 's' );
|
---|
231 | buffer.insert( c + 1, 's' );
|
---|
232 | substCount++;
|
---|
233 | }
|
---|
234 | // Masking several common character combinations with an token
|
---|
235 | else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
|
---|
236 | buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
|
---|
237 | {
|
---|
238 | buffer.setCharAt( c, '$' );
|
---|
239 | buffer.delete( c + 1, c + 3 );
|
---|
240 | substCount =+ 2;
|
---|
241 | }
|
---|
242 | else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
|
---|
243 | buffer.setCharAt( c, '§' );
|
---|
244 | buffer.deleteCharAt( c + 1 );
|
---|
245 | substCount++;
|
---|
246 | }
|
---|
247 | else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
|
---|
248 | buffer.setCharAt( c, '%' );
|
---|
249 | buffer.deleteCharAt( c + 1 );
|
---|
250 | substCount++;
|
---|
251 | }
|
---|
252 | else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
|
---|
253 | buffer.setCharAt( c, '&' );
|
---|
254 | buffer.deleteCharAt( c + 1 );
|
---|
255 | substCount++;
|
---|
256 | }
|
---|
257 | else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
|
---|
258 | buffer.setCharAt( c, '#' );
|
---|
259 | buffer.deleteCharAt( c + 1 );
|
---|
260 | substCount++;
|
---|
261 | }
|
---|
262 | else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
|
---|
263 | buffer.setCharAt( c, '!' );
|
---|
264 | buffer.deleteCharAt( c + 1 );
|
---|
265 | substCount++;
|
---|
266 | }
|
---|
267 | }
|
---|
268 | }
|
---|
269 | }
|
---|
270 |
|
---|
271 | /**
|
---|
272 | * Undoes the changes made by substitute(). That are character pairs and
|
---|
273 | * character combinations. Umlauts will remain as their corresponding vowel,
|
---|
274 | * as "ß" remains as "ss".
|
---|
275 | */
|
---|
276 | private void resubstitute( StringBuffer buffer )
|
---|
277 | {
|
---|
278 | for ( int c = 0; c < buffer.length(); c++ ) {
|
---|
279 | if ( buffer.charAt( c ) == '*' ) {
|
---|
280 | char x = buffer.charAt( c - 1 );
|
---|
281 | buffer.setCharAt( c, x );
|
---|
282 | }
|
---|
283 | else if ( buffer.charAt( c ) == '$' ) {
|
---|
284 | buffer.setCharAt( c, 's' );
|
---|
285 | buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
|
---|
286 | }
|
---|
287 | else if ( buffer.charAt( c ) == '§' ) {
|
---|
288 | buffer.setCharAt( c, 'c' );
|
---|
289 | buffer.insert( c + 1, 'h' );
|
---|
290 | }
|
---|
291 | else if ( buffer.charAt( c ) == '%' ) {
|
---|
292 | buffer.setCharAt( c, 'e' );
|
---|
293 | buffer.insert( c + 1, 'i' );
|
---|
294 | }
|
---|
295 | else if ( buffer.charAt( c ) == '&' ) {
|
---|
296 | buffer.setCharAt( c, 'i' );
|
---|
297 | buffer.insert( c + 1, 'e' );
|
---|
298 | }
|
---|
299 | else if ( buffer.charAt( c ) == '#' ) {
|
---|
300 | buffer.setCharAt( c, 'i' );
|
---|
301 | buffer.insert( c + 1, 'g' );
|
---|
302 | }
|
---|
303 | else if ( buffer.charAt( c ) == '!' ) {
|
---|
304 | buffer.setCharAt( c, 's' );
|
---|
305 | buffer.insert( c + 1, 't' );
|
---|
306 | }
|
---|
307 | }
|
---|
308 | }
|
---|
309 | }
|
---|