source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2StandardAnalyzer.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 6.4 KB
package org.greenstone.LuceneWrapper4;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified so the class is *not* final
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter, as the latter is deprecated and the former is
// a superset of it.
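//
// A rough, illustrative sketch (not compiled here, and not part of the
// original file) of what that rewritten createComponents() chain might look
// like, assuming ASCIIFoldingFilter from
// org.apache.lucene.analysis.miscellaneous:
//
//   Tokenizer src = new StandardTokenizer(matchVersion, reader);
//   TokenStream tok = new StandardFilter(matchVersion, src);
//   tok = new LowerCaseFilter(matchVersion, tok);
//   tok = new StopFilter(matchVersion, tok, stopwords);
//   tok = new ASCIIFoldingFilter(tok); // fold accented characters to their ASCII equivalents
//   return new TokenStreamComponents(src, tok);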


import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.*; // StopFilter, LowerCaseFilter
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.*;


import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
 *        and StopFilter correctly handles Unicode 4.0 supplementary characters
 *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
 *        are the pre-3.1 implementations of StandardTokenizer and
 *        StandardAnalyzer.
 *   <li> As of 2.9, StopFilter preserves position increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public class GS2StandardAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
   */
  protected final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
      useful for searching. */
  public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopWords stop words */
  public GS2StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
    // Note: StopwordAnalyzerBase also provides
    //   protected static CharArraySet loadStopwordSet(File stopwords, Version matchVersion)
    // for creating a CharArraySet from a file.

    super(matchVersion, stopWords);
    replaceInvalidAcronym = true; // was: matchVersion.onOrAfter(Version.LUCENE_24)
  }

  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   */
  public GS2StandardAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords File to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(new FileReader(stopwords), matchVersion));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    //src.setReplaceInvalidAcronym(replaceInvalidAcronym); // now true by default, see http://lucene.apache.org/core/3_0_3/api/all/org/apache/lucene/analysis/standard/StandardTokenizer.html
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        // Previously called reset(Reader), now called setReader(Reader), but with mostly the same method description:
        // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/analysis/ReusableAnalyzerBase.TokenStreamComponents.html
        // http://lucene.apache.org/core/4_8_1/core/org/apache/lucene/analysis/Analyzer.TokenStreamComponents.html
        // The new method should throw an exception rather than return false if it is unable to reset.

        src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
        super.setReader(reader);
      }
    };
  }
}
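
For context, a minimal usage sketch follows (not part of the checked-in file): it drives GS2StandardAnalyzer through the standard Lucene 4.7 TokenStream API. The demo class name, the field name "TX", and the sample text are illustrative assumptions, not anything defined by this source.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.greenstone.LuceneWrapper4.GS2StandardAnalyzer;

public class GS2StandardAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // Build the analyzer with the default English stop word set.
    GS2StandardAnalyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_47);

    // Tokenize a sample string; the field name "TX" is only illustrative.
    TokenStream ts = analyzer.tokenStream("TX", new StringReader("The Greenstone Digital Library Software"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

    ts.reset(); // must be called before incrementToken()
    while (ts.incrementToken()) {
      // Prints: greenstone, digital, library, software ("The" is removed as a stop word)
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}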