package org.greenstone.LuceneWrapper;

//package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified so the class is *not* final
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter as the latter is deprecated and the former is
// a superset

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

42 | /**
|
---|
43 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
---|
44 | * LowerCaseFilter} and {@link StopFilter}, using a list of
|
---|
45 | * English stop words.
|
---|
46 | *
|
---|
47 | * <a name="version"/>
|
---|
48 | * <p>You must specify the required {@link Version}
|
---|
49 | * compatibility when creating StandardAnalyzer:
|
---|
50 | * <ul>
|
---|
51 | * <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
|
---|
52 | * and StopFilter correctly handles Unicode 4.0 supplementary characters
|
---|
53 | * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
|
---|
54 | * are the pre-3.1 implementations of StandardTokenizer and
|
---|
55 | * StandardAnalyzer.
|
---|
56 | * <li> As of 2.9, StopFilter preserves position increments
|
---|
57 | * <li> As of 2.4, Tokens incorrectly identified as acronyms
|
---|
58 | * are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
|
---|
59 | * </ul>
|
---|
60 | */
|
---|
61 | public class GS2StandardAnalyzer extends StopwordAnalyzerBase {
|
---|
62 |
|
---|
63 | /** Default maximum allowed token length */
|
---|
64 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
|
---|
65 |
|
---|
66 | protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
|
---|
67 |
|
---|
68 | /**
|
---|
69 | * Specifies whether deprecated acronyms should be replaced with HOST type.
|
---|
70 | * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
|
---|
71 | */
|
---|
72 | protected final boolean replaceInvalidAcronym;
|
---|
73 |
|
---|
74 | /** An unmodifiable set containing some common English words that are usually not
|
---|
75 | useful for searching. */
|
---|
76 | public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
---|
77 |
|
---|
78 | /** Builds an analyzer with the given stop words.
|
---|
79 | * @param matchVersion Lucene version to match See {@link
|
---|
80 | * <a href="#version">above</a>}
|
---|
81 | * @param stopWords stop words */
|
---|
82 | public GS2StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
|
---|
83 | super(matchVersion, stopWords);
|
---|
84 | replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
|
---|
85 | }
|
---|
86 |
|
---|
87 | /** Builds an analyzer with the default stop words ({@link
|
---|
88 | * #STOP_WORDS_SET}).
|
---|
89 | * @param matchVersion Lucene version to match See {@link
|
---|
90 | * <a href="#version">above</a>}
|
---|
91 | */
|
---|
92 | public GS2StandardAnalyzer(Version matchVersion) {
|
---|
93 | this(matchVersion, STOP_WORDS_SET);
|
---|
94 | }
|
---|
95 |
|
---|
96 | /** Builds an analyzer with the stop words from the given file.
|
---|
97 | * @see WordlistLoader#getWordSet(File)
|
---|
98 | * @param matchVersion Lucene version to match See {@link
|
---|
99 | * <a href="#version">above</a>}
|
---|
100 | * @param stopwords File to read stop words from */
|
---|
101 | public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
---|
102 | this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
---|
103 | }
|
---|
104 |
|
---|
105 | /** Builds an analyzer with the stop words from the given reader.
|
---|
106 | * @see WordlistLoader#getWordSet(Reader)
|
---|
107 | * @param matchVersion Lucene version to match See {@link
|
---|
108 | * <a href="#version">above</a>}
|
---|
109 | * @param stopwords Reader to read stop words from */
|
---|
110 | public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
|
---|
111 | this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
---|
112 | }
|
---|
113 |
|
---|
114 | /**
|
---|
115 | * Set maximum allowed token length. If a token is seen
|
---|
116 | * that exceeds this length then it is discarded. This
|
---|
117 | * setting only takes effect the next time tokenStream or
|
---|
118 | * reusableTokenStream is called.
|
---|
119 | */
|
---|
120 | public void setMaxTokenLength(int length) {
|
---|
121 | maxTokenLength = length;
|
---|
122 | }
|
---|
123 |
|
---|
124 | /**
|
---|
125 | * @see #setMaxTokenLength
|
---|
126 | */
|
---|
127 | public int getMaxTokenLength() {
|
---|
128 | return maxTokenLength;
|
---|
129 | }
|
---|
130 |
|
---|
131 | @Override
|
---|
132 | protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
|
---|
133 | final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
|
---|
134 | src.setMaxTokenLength(maxTokenLength);
|
---|
135 | src.setReplaceInvalidAcronym(replaceInvalidAcronym);
|
---|
136 | TokenStream tok = new StandardFilter(matchVersion, src);
|
---|
137 | tok = new LowerCaseFilter(matchVersion, tok);
|
---|
138 | tok = new StopFilter(matchVersion, tok, stopwords);
|
---|
139 | return new TokenStreamComponents(src, tok) {
|
---|
140 | @Override
|
---|
141 | protected boolean reset(final Reader reader) throws IOException {
|
---|
142 | src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
|
---|
143 | return super.reset(reader);
|
---|
144 | }
|
---|
145 | };
|
---|
146 | }
|
---|
147 | }
|
---|