Changeset 31254 for other-projects/hathitrust
- Timestamp:
- 2016-12-20T15:29:56+13:00 (7 years ago)
- Files:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31252 r31254 12 12 import java.util.ArrayList; 13 13 import java.util.Iterator; 14 import java.util.Set;15 16 14 import org.apache.commons.compress.compressors.CompressorException; 17 15 import org.json.JSONObject; … … 20 18 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; 21 19 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 22 import org.apache.lucene.analysis. tokenattributes.OffsetAttribute;20 import org.apache.lucene.analysis.core.LowerCaseFilter; 23 21 24 22 public class SolrDocJSON { … … 27 25 boolean icu_tokenize) 28 26 { 27 boolean lowercase_filter = true; 29 28 30 29 ArrayList<String> words = new ArrayList<String>(); … … 39 38 Reader reader = new StringReader(token); 40 39 41 Tokenizertokenizer = new ICUTokenizer();42 tokenizer.setReader(reader);40 ICUTokenizer icu_tokenizer = new ICUTokenizer(); 41 icu_tokenizer.setReader(reader); 43 42 44 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 45 43 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class); 44 45 TokenStream token_stream = null; 46 47 if (lowercase_filter) { 48 token_stream = new LowerCaseFilter(icu_tokenizer); 49 } 50 else { 51 token_stream = icu_tokenizer; 52 } 53 46 54 try { 47 token izer.reset();55 token_stream.reset(); 48 56 49 while (token izer.incrementToken()) {57 while (token_stream.incrementToken()) { 50 58 String term = charTermAttribute.toString(); 51 59 words.add(term); 52 60 } 53 61 54 token izer.end();55 token izer.close();62 token_stream.end(); 63 token_stream.close(); 56 64 } 57 65 catch (IOException e) {
Note:
See TracChangeset
for help on using the changeset viewer.