source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2StandardAnalyzer.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 6.4 KB
package org.greenstone.LuceneWrapper4;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified so the class is *not* final
//
// This is an interim measure.
//
// We should rewrite GS2Analyzer so it inherits directly from
// StopwordAnalyzerBase and uses ASCIIFoldingFilter rather than
// ISOLatin1AccentFilter, as the latter is deprecated and the former is
// a superset of it.
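//
// A rough, illustrative sketch (not compiled here, and not part of the
// original file) of what that rewritten createComponents() chain might look
// like, assuming ASCIIFoldingFilter from
// org.apache.lucene.analysis.miscellaneous:
//
//   Tokenizer src = new StandardTokenizer(matchVersion, reader);
//   TokenStream tok = new StandardFilter(matchVersion, src);
//   tok = new LowerCaseFilter(matchVersion, tok);
//   tok = new StopFilter(matchVersion, tok, stopwords);
//   tok = new ASCIIFoldingFilter(tok); // fold accented characters to their ASCII equivalents
//   return new TokenStreamComponents(src, tok);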


import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.*; // StopFilter, LowerCaseFilter
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.*;


import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
 * LowerCaseFilter} and {@link StopFilter}, using a list of
 * English stop words.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
 *        and StopFilter correctly handles Unicode 4.0 supplementary characters
 *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
 *        are the pre-3.1 implementations of StandardTokenizer and
 *        StandardAnalyzer.
 *   <li> As of 2.9, StopFilter preserves position increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms
 *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public class GS2StandardAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
   */
  protected final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
      useful for searching. */
  public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Builds an analyzer with the given stop words.
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopWords stop words */
  public GS2StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
    // Note: StopwordAnalyzerBase also provides
    //   protected static CharArraySet loadStopwordSet(File stopwords, Version matchVersion)
    // for creating a CharArraySet from a file.

    super(matchVersion, stopWords);
    replaceInvalidAcronym = true; // was: matchVersion.onOrAfter(Version.LUCENE_24)
  }

  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   */
  public GS2StandardAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords File to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(new FileReader(stopwords), matchVersion));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from */
  public GS2StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    //src.setReplaceInvalidAcronym(replaceInvalidAcronym); // now true by default, see http://lucene.apache.org/core/3_0_3/api/all/org/apache/lucene/analysis/standard/StandardTokenizer.html
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        // Previously called reset(Reader), now called setReader(Reader), but with mostly the same method description:
        // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/analysis/ReusableAnalyzerBase.TokenStreamComponents.html
        // http://lucene.apache.org/core/4_8_1/core/org/apache/lucene/analysis/Analyzer.TokenStreamComponents.html
        // The new method should throw an exception rather than return false if it is unable to reset.

        src.setMaxTokenLength(GS2StandardAnalyzer.this.maxTokenLength);
        super.setReader(reader);
      }
    };
  }
}
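
For context, a minimal usage sketch follows (not part of the checked-in file): it drives GS2StandardAnalyzer through the standard Lucene 4.7 TokenStream API. The demo class name, the field name "TX", and the sample text are illustrative assumptions, not anything defined by this source.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.greenstone.LuceneWrapper4.GS2StandardAnalyzer;

public class GS2StandardAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // Build the analyzer with the default English stop word set.
    GS2StandardAnalyzer analyzer = new GS2StandardAnalyzer(Version.LUCENE_47);

    // Tokenize a sample string; the field name "TX" is only illustrative.
    TokenStream ts = analyzer.tokenStream("TX", new StringReader("The Greenstone Digital Library Software"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

    ts.reset(); // must be called before incrementToken()
    while (ts.incrementToken()) {
      // Prints: greenstone, digital, library, software ("The" is removed as a stop word)
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}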