source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2StandardAnalyzer.java@ 24726

Last change on this file since 24726 was 24726, checked in by davidb, 13 years ago

Repackaging to LuceneWrapper3

  • Property svn:executable set to *
File size: 5.4 KB
package org.greenstone.LuceneWrapper3;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified so the class is *not* final
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter, as the latter is deprecated and the former is
// a superset.
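//
// As an illustration only (not how GS2Analyzer is currently written), a
// minimal sketch of what that rewrite could look like against the Lucene 3.x
// API, keeping the same filter chain as createComponents() below but with
// ASCIIFoldingFilter in place of ISOLatin1AccentFilter:
//
//   @Override
//   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
//     StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
//     TokenStream tok = new StandardFilter(matchVersion, src);
//     tok = new ASCIIFoldingFilter(tok);            // fold accented characters to their ASCII equivalents
//     tok = new LowerCaseFilter(matchVersion, tok);
//     tok = new StopFilter(matchVersion, tok, stopwords);
//     return new TokenStreamComponents(src, tok);
//   }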

import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import org.apache.lucene.analysis.standard.*;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
 *        and StopFilter correctly handles Unicode 4.0 supplementary characters
 *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
 *        are the pre-3.1 implementations of StandardTokenizer and
 *        StandardAnalyzer.
 *   <li> As of 2.9, StopFilter preserves position increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
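 *
 * <p>A minimal usage sketch (illustrative only; it assumes the Lucene 3.x
 * IndexWriter API, and the directory and field names are just examples):
 * <pre>
 *   Analyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_30);
 *   IndexWriter writer = new IndexWriter(new RAMDirectory(),
 *       new IndexWriterConfig(Version.LUCENE_30, analyzer));
 *   Document doc = new Document();
 *   doc.add(new Field("text", "some example text", Field.Store.YES, Field.Index.ANALYZED));
 *   writer.addDocument(doc);
 *   writer.close();
 * </pre>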
 */
public class GS2StandardAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
   */
  protected final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
      useful for searching. */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopWords stop words */
  public GS2StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  }

  /** Builds an analyzer with the default stop words ({@link
   * #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   */
  public GS2StandardAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(File)
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopwords File to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader)
   * @param matchVersion Lucene version to match. See {@link
   * <a href="#version">above</a>}
   * @param stopwords Reader to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords));
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream or
   * reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }
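
  // Illustrative only (not part of the original file): because the limit is
  // read when the token stream components are (re)created, a new value only
  // affects token streams obtained after the call, e.g.
  //
  //   GS2StandardAnalyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_30);
  //   analyzer.setMaxTokenLength(4);   // subsequent token streams discard tokens longer than 4 chars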

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    src.setReplaceInvalidAcronym(replaceInvalidAcronym);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}