source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2Analyzer.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 3.2 KB
Line 
1/**********************************************************************
2 *
3 * GS2Analyzer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.Set;
31
32import org.apache.lucene.analysis.*;
33import org.apache.lucene.analysis.core.*; // StopFilter, LowerCaseFilter, (StandardFilter)
34import org.apache.lucene.analysis.standard.StandardFilter;
35import org.apache.lucene.analysis.standard.StandardTokenizer;
36import org.apache.lucene.analysis.util.*;
37
38import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
39
40import org.apache.lucene.util.Version;
41
42
43public class GS2Analyzer extends GS2StandardAnalyzer
44{
45 public GS2Analyzer()
46 {
47 super(GSLuceneConstants.MATCH_VERSION);
48 }
49
50 public GS2Analyzer(String [] stopwords)
51 {
52 super(GSLuceneConstants.MATCH_VERSION,StopFilter.makeStopSet(GSLuceneConstants.MATCH_VERSION, stopwords));
53 }
54
55 @Override
56 protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
57 final StandardTokenizer src = new StandardTokenizer(GSLuceneConstants.MATCH_VERSION, reader);
58 src.setMaxTokenLength(maxTokenLength);
59 //src.setReplaceInvalidAcronym(replaceInvalidAcronym); // now true by default, see http://lucene.apache.org/core/3_0_3/api/all/org/apache/lucene/analysis/standard/StandardTokenizer.html
60 TokenStream tok = new StandardFilter(GSLuceneConstants.MATCH_VERSION, src);
61 tok = new LowerCaseFilter(GSLuceneConstants.MATCH_VERSION, tok);
62 tok = new StopFilter(GSLuceneConstants.MATCH_VERSION, tok, stopwords);
63
64 // top it up with accent folding
65 tok = new ASCIIFoldingFilter(tok);
66
67 return new TokenStreamComponents(src, tok) {
68 @Override
69 protected void setReader(final Reader reader) throws IOException {
70
71 // Previously called reset(Reader), now called setReader(Reader), but with mostly the same method description:
72 // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/analysis/ReusableAnalyzerBase.TokenStreamComponents.html
73 // http://lucene.apache.org/core/4_8_1/core/org/apache/lucene/analysis/Analyzer.TokenStreamComponents.html
74 // New method should throw an exception, not return false if unable to reset
75
76 src.setMaxTokenLength(GS2Analyzer.this.maxTokenLength);
77 super.setReader(reader);
78 }
79 };
80 }
81
82}
83
84
Note: See TracBrowser for help on using the repository browser.