source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2StandardAnalyzer.java@ 24726

Last change on this file since 24726 was 24726, checked in by davidb, 13 years ago

Repackaging to LuceneWrapper3

  • Property svn:executable set to *
File size: 5.4 KB
package org.greenstone.LuceneWrapper3;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified so the class is *not* final
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter, as the latter is deprecated and the former is
// a superset.
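//
// As an illustration only (not how GS2Analyzer is currently written), a
// minimal sketch of what that rewrite could look like against the Lucene 3.x
// API, keeping the same filter chain as createComponents() below but with
// ASCIIFoldingFilter in place of ISOLatin1AccentFilter:
//
//   @Override
//   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
//     StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
//     TokenStream tok = new StandardFilter(matchVersion, src);
//     tok = new ASCIIFoldingFilter(tok);            // fold accented characters to their ASCII equivalents
//     tok = new LowerCaseFilter(matchVersion, tok);
//     tok = new StopFilter(matchVersion, tok, stopwords);
//     return new TokenStreamComponents(src, tok);
//   }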

import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import org.apache.lucene.analysis.standard.*;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
 *        and StopFilter correctly handles Unicode 4.0 supplementary characters
 *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
 *        are the pre-3.1 implementations of StandardTokenizer and
 *        StandardAnalyzer.
 *   <li> As of 2.9, StopFilter preserves position increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
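 *
 * <p>A minimal usage sketch (illustrative only; it assumes the Lucene 3.x
 * IndexWriter API, and the directory and field names are just examples):
 * <pre>
 *   Analyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_30);
 *   IndexWriter writer = new IndexWriter(new RAMDirectory(),
 *       new IndexWriterConfig(Version.LUCENE_30, analyzer));
 *   Document doc = new Document();
 *   doc.add(new Field("text", "some example text", Field.Store.YES, Field.Index.ANALYZED));
 *   writer.addDocument(doc);
 *   writer.close();
 * </pre>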
 */
public class GS2StandardAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
   */
  protected final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
      useful for searching. */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopWords stop words */
  public GS2StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  }

  /** Builds an analyzer with the default stop words ({@link
   * #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   */
  public GS2StandardAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(File)
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopwords File to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader)
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopwords Reader to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream or
   * reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }
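
  // Illustrative only (not part of the original file): because the limit is
  // read when the token stream components are (re)created, a new value only
  // affects token streams obtained after the call, e.g.
  //
  //   GS2StandardAnalyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_30);
  //   analyzer.setMaxTokenLength(4);   // subsequent token streams discard tokens longer than 4 chars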

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    src.setReplaceInvalidAcronym(replaceInvalidAcronym);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}