source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31215

Last change on this file since 31215 was 31215, checked in by davidb, 7 years ago

Changed back to Guava 20 API, now mvn shading allows me to have this in the JAR file as well as the Guava 14 that Spark/Apache uses

  • Property svn:executable set to *
File size: 4.5 KB
RevLine 
[31201]1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileNotFoundException;
8import java.io.FileOutputStream;
9import java.io.IOException;
10import java.io.InputStream;
[31202]11import java.io.Serializable;
12import java.nio.charset.Charset;
[31201]13import java.nio.charset.StandardCharsets;
14import java.nio.file.Files;
15import java.nio.file.Paths;
16import java.util.stream.Stream;
17
[31202]18import javax.annotation.Nullable;
19
20import com.google.common.base.Preconditions;
[31211]21import com.google.common.hash.BloomFilter;
[31201]22import com.google.common.hash.Funnel;
23import com.google.common.hash.Funnels;
[31202]24import com.google.common.hash.PrimitiveSink;
[31201]25
26public class WhitelistBloomFilter {
27
[31211]28 protected BloomFilter<CharSequence> _bloomFilter;
[31201]29 protected static final String SERIALIZED_SUFFIX = "-serialized";
30 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
31
32
33 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
34 public static int countLines(String filename) throws IOException
35 {
36 InputStream is = new BufferedInputStream(new FileInputStream(filename));
37
38 try {
39 byte[] c = new byte[1024];
40 int count = 0;
41 int readChars = 0;
42 boolean empty = true;
43 while ((readChars = is.read(c)) != -1) {
44 empty = false;
45 for (int i = 0; i < readChars; ++i) {
46 if (c[i] == '\n') {
47 ++count;
48 }
49 }
50 }
51 return (count == 0 && !empty) ? 1 : count;
52 } finally {
53 is.close();
54 }
55 }
56
57
58 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
59 System.out.println("Constructing: WhitelistBloomFilter");
60
61 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
62
63 if (ser_dictionary_file.exists()) {
64 System.out.println("Loading Serialized Bloom filter ...");
65 _bloomFilter = serializeIn(ser_dictionary_file);
66 System.out.println("... done");
67 }
68 else {
69 // Need to generate the Bloom filter from the given raw text file
70
71 System.out.println("Counting lines in: " + dictionary_filename);
72 int num_lines = -1;
73 try {
74 num_lines = countLines(dictionary_filename);
[31202]75
[31215]76 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
[31211]77 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
[31201]78 }
79 catch (IOException e) {
80 e.printStackTrace();
81 }
82 System.out.println("Number of lines: " + num_lines);
83
84 storeEntries(dictionary_filename,serialize);
85 }
86
87 }
88
89 protected void storeEntries(String filename, boolean serialize)
90 {
91 System.out.println("Building Bloom filter ...");
92
93 //read file into stream, try-with-resources
94 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
95 stream.forEach(word -> {_bloomFilter.put(word);});
96 } catch (IOException e) {
97 e.printStackTrace();
98 }
99
100 System.out.println("... done");
101
102 if (serialize) {
103 System.out.println("Serializing Bloom filter ...");
104
105 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX);
106 serializeOut(ser_dictionary);
107
108 System.out.println("... done");
109 }
110
111 }
112
113 public boolean contains(String key)
114 {
115 return _bloomFilter.mightContain(key);
116 }
117
118 protected void serializeOut(File ser_file)
119 {
120 try {
121 FileOutputStream fos = new FileOutputStream(ser_file);
122
123 BufferedOutputStream bfos = new BufferedOutputStream(fos);
124
125 _bloomFilter.writeTo(bfos);
126
127 bfos.close();
128 }
129 catch (FileNotFoundException e) {
130 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
131 e.printStackTrace();
132 } catch (IOException e) {
133 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
134 e.printStackTrace();
135 }
136 }
137
[31211]138 protected static BloomFilter<CharSequence> serializeIn(File ser_file)
[31201]139 {
[31211]140 BloomFilter<CharSequence> bloomFilter = null;
[31201]141
142 try {
143 FileInputStream fis = new FileInputStream(ser_file);
144 BufferedInputStream bfis = new BufferedInputStream(fis);
145
[31215]146 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
[31211]147 bloomFilter = BloomFilter.readFrom(bfis,string_funnel);
[31201]148
149 bfis.close();
150 }
151 catch (FileNotFoundException e) {
152 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
153 e.printStackTrace();
154 } catch (IOException e) {
155 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
156 e.printStackTrace();
157 }
158 return bloomFilter;
159 }
160
161
[31202]162
163
164
[31201]165}
Note: See TracBrowser for help on using the repository browser.