Changeset 31243 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main
- Timestamp: 2016-12-17T17:25:08+13:00 (7 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31242 r31243 6 6 import java.io.InputStreamReader; 7 7 import java.io.OutputStream; 8 import java.io.Reader; 9 import java.io.StringReader; 8 10 import java.net.HttpURLConnection; 9 11 import java.net.URL; 10 12 import java.util.ArrayList; 11 13 import java.util.Iterator; 14 import java.util.Set; 12 15 13 16 import org.apache.commons.compress.compressors.CompressorException; 14 17 import org.json.JSONObject; 18 import org.apache.lucene.analysis.Tokenizer; 19 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; 20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 21 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 15 22 16 23 public class SolrDocJSON { … … 18 25 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter) 19 26 { 20 StringBuilder sb = new StringBuilder(); 21 27 boolean solr_icu_tokenize = true; 28 29 ArrayList<String> tokens = new ArrayList<String>(); 30 22 31 Iterator<String> token_iter = ef_token_pos_count.keys(); 23 32 33 while (token_iter.hasNext()) { 34 String token = token_iter.next(); 35 36 if (solr_icu_tokenize == true) { 37 Reader reader = new StringReader(token); 38 39 Tokenizer tokenizer = new ICUTokenizer(); 40 tokenizer.setReader(reader); 41 42 //TokenStream tokenStream = analyzer.tokenStream(fieldName, reader); 43 //OffsetAttribute offsetAttribute = tokenizer.addAttribute(OffsetAttribute.class); 44 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 45 46 try { 47 tokenizer.reset(); 48 49 while (tokenizer.incrementToken()) { 50 //int startOffset = offsetAttribute.startOffset(); 51 //int endOffset = offsetAttribute.endOffset(); 52 String term = charTermAttribute.toString(); 53 tokens.add(term); 54 } 55 56 tokenizer.close(); 57 } 58 catch (IOException e) { 59 e.printStackTrace(); 60 } 61 } 62 else { 63 tokens.add(token); 64 } 65 } 66 67 StringBuilder sb = new StringBuilder(); 68 24 69 if 
(whitelist_bloomfilter == null) { 25 70 26 while (token_iter.hasNext()) { 27 String token = token_iter.next(); 71 boolean first_append = true; 72 73 for (int i=0; i<tokens.size(); i++) { 74 String token = tokens.get(i); 75 76 if (!first_append) { 77 sb.append(" "); 78 } 79 else { 80 first_append = false; 81 } 28 82 sb.append(token); 29 if (token_iter.hasNext()) {30 sb.append(" ");31 }32 83 } 33 84 } 34 85 else { 35 while (token_iter.hasNext()) { 36 String token = token_iter.next(); 86 boolean first_append = true; 87 88 for (int i=0; i<tokens.size(); i++) { 89 String token = tokens.get(i); 90 37 91 if (whitelist_bloomfilter.contains(token)) { 38 sb.append(token); 39 if (token_iter.hasNext()) { 92 if (!first_append) { 40 93 sb.append(" "); 41 94 } 42 } 43 } 44 95 else { 96 first_append = false; 97 } 98 sb.append(token); 99 } 100 } 101 45 102 } 46 103 /* … … 50 107 } 51 108 */ 109 110 52 111 53 112 return sb.toString();
Note:
See TracChangeset
for help on using the changeset viewer.