source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/LuceneWrapper/GS2StandardAnalyzer.java@ 24641

Last change on this file since 24641 was 24641, checked in by davidb, 13 years ago

Initial cut at Greenstone3 runtime code to support Solr. The Solr code is based on version 3.3, so this also includes an upgraded version of the LuceneWrapper code (gs2build/common-src/indexers/lucene-gs) that works with this version of the support jar files.

  • Property svn:executable set to *
File size: 5.4 KB
Line 
1package org.greenstone.LuceneWrapper;
2
3//package org.apache.lucene.analysis.standard;
4
5/**
6 * Licensed to the Apache Software Foundation (ASF) under one or more
7 * contributor license agreements. See the NOTICE file distributed with
8 * this work for additional information regarding copyright ownership.
9 * The ASF licenses this file to You under the Apache License, Version 2.0
10 * (the "License"); you may not use this file except in compliance with
11 * the License. You may obtain a copy of the License at
12 *
13 * http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 */
21
// Modified so that the class is *not* final.
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so that it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter, as the latter is deprecated and the former
// is a superset of it.
30
31
32import org.apache.lucene.analysis.*;
33import org.apache.lucene.util.Version;
34
35import org.apache.lucene.analysis.standard.*;
36
37import java.io.File;
38import java.io.IOException;
39import java.io.Reader;
40import java.util.Set;
41
42/**
43 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
44 * LowerCaseFilter} and {@link StopFilter}, using a list of
45 * English stop words.
46 *
47 * <a name="version"/>
48 * <p>You must specify the required {@link Version}
49 * compatibility when creating StandardAnalyzer:
50 * <ul>
51 * <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
52 * and StopFilter correctly handles Unicode 4.0 supplementary characters
53 * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
54 * are the pre-3.1 implementations of StandardTokenizer and
55 * StandardAnalyzer.
56 * <li> As of 2.9, StopFilter preserves position increments
57 * <li> As of 2.4, Tokens incorrectly identified as acronyms
58 * are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
59 * </ul>
60 */
61public class GS2StandardAnalyzer extends StopwordAnalyzerBase {
62
63 /** Default maximum allowed token length */
64 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
65
66 protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
67
68 /**
69 * Specifies whether deprecated acronyms should be replaced with HOST type.
70 * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
71 */
72 protected final boolean replaceInvalidAcronym;
73
74 /** An unmodifiable set containing some common English words that are usually not
75 useful for searching. */
76 public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
77
78 /** Builds an analyzer with the given stop words.
79 * @param matchVersion Lucene version to match See {@link
80 * <a href="#version">above</a>}
81 * @param stopWords stop words */
82 public GS2StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
83 super(matchVersion, stopWords);
84 replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
85 }
86
87 /** Builds an analyzer with the default stop words ({@link
88 * #STOP_WORDS_SET}).
89 * @param matchVersion Lucene version to match See {@link
90 * <a href="#version">above</a>}
91 */
92 public GS2StandardAnalyzer(Version matchVersion) {
93 this(matchVersion, STOP_WORDS_SET);
94 }
95
96 /** Builds an analyzer with the stop words from the given file.
97 * @see WordlistLoader#getWordSet(File)
98 * @param matchVersion Lucene version to match See {@link
99 * <a href="#version">above</a>}
100 * @param stopwords File to read stop words from */
101 public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
102 this(matchVersion, WordlistLoader.getWordSet(stopwords));
103 }
104
105 /** Builds an analyzer with the stop words from the given reader.
106 * @see WordlistLoader#getWordSet(Reader)
107 * @param matchVersion Lucene version to match See {@link
108 * <a href="#version">above</a>}
109 * @param stopwords Reader to read stop words from */
110 public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
111 this(matchVersion, WordlistLoader.getWordSet(stopwords));
112 }
113
114 /**
115 * Set maximum allowed token length. If a token is seen
116 * that exceeds this length then it is discarded. This
117 * setting only takes effect the next time tokenStream or
118 * reusableTokenStream is called.
119 */
120 public void setMaxTokenLength(int length) {
121 maxTokenLength = length;
122 }
123
124 /**
125 * @see #setMaxTokenLength
126 */
127 public int getMaxTokenLength() {
128 return maxTokenLength;
129 }
130
131 @Override
132 protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
133 final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
134 src.setMaxTokenLength(maxTokenLength);
135 src.setReplaceInvalidAcronym(replaceInvalidAcronym);
136 TokenStream tok = new StandardFilter(matchVersion, src);
137 tok = new LowerCaseFilter(matchVersion, tok);
138 tok = new StopFilter(matchVersion, tok, stopwords);
139 return new TokenStreamComponents(src, tok) {
140 @Override
141 protected boolean reset(final Reader reader) throws IOException {
142 src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
143 return super.reset(reader);
144 }
145 };
146 }
147}
Note: See TracBrowser for help on using the repository browser.