source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31204

Last change on this file since 31204 was 31204, checked in by davidb, 7 years ago

Splicing in Guava version 20 of BloomFilter into the code as its own class (now BloomFilterAdvanced). This is because Spark runs with an older version of Guava (14.0), while the written code makes use of newer features.

  • Property svn:executable set to *
File size: 6.8 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.DataInputStream;
6import java.io.DataOutputStream;
7import java.io.File;
8import java.io.FileInputStream;
9import java.io.FileNotFoundException;
10import java.io.FileOutputStream;
11import java.io.IOException;
12import java.io.InputStream;
13import java.io.OutputStream;
14import java.io.Serializable;
15import java.nio.charset.Charset;
16import java.nio.charset.StandardCharsets;
17import java.nio.file.Files;
18import java.nio.file.Paths;
19import java.util.stream.Stream;
20
21import javax.annotation.Nullable;
22
23import static com.google.common.base.Preconditions.checkNotNull;
24
25import com.google.common.base.Preconditions;
26//import com.google.common.hash.BloomFilter;
27import com.google.common.hash.BloomFilterAdvanced;
28//import com.google.common.hash.BloomFilterStrategies.BitArray;
29import com.google.common.hash.Funnel;
30import com.google.common.hash.Funnels;
31import com.google.common.hash.PrimitiveSink;
32import com.google.common.primitives.SignedBytes;
33import com.google.common.primitives.UnsignedBytes;
34
35public class WhitelistBloomFilter {
36
37
38
39 protected BloomFilterAdvanced<CharSequence> _bloomFilter;
40 protected static final String SERIALIZED_SUFFIX = "-serialized";
41 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
42
43
44 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
45 public static int countLines(String filename) throws IOException
46 {
47 InputStream is = new BufferedInputStream(new FileInputStream(filename));
48
49 try {
50 byte[] c = new byte[1024];
51 int count = 0;
52 int readChars = 0;
53 boolean empty = true;
54 while ((readChars = is.read(c)) != -1) {
55 empty = false;
56 for (int i = 0; i < readChars; ++i) {
57 if (c[i] == '\n') {
58 ++count;
59 }
60 }
61 }
62 return (count == 0 && !empty) ? 1 : count;
63 } finally {
64 is.close();
65 }
66 }
67
68
69 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
70 System.out.println("Constructing: WhitelistBloomFilter");
71
72 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
73
74 if (ser_dictionary_file.exists()) {
75 System.out.println("Loading Serialized Bloom filter ...");
76 _bloomFilter = serializeIn(ser_dictionary_file);
77 System.out.println("... done");
78 }
79 else {
80 // Need to generate the Bloom filter from the given raw text file
81
82 System.out.println("Counting lines in: " + dictionary_filename);
83 int num_lines = -1;
84 try {
85 num_lines = countLines(dictionary_filename);
86
87 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
88 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
89 _bloomFilter = BloomFilterAdvanced.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
90 }
91 catch (IOException e) {
92 e.printStackTrace();
93 }
94 System.out.println("Number of lines: " + num_lines);
95
96 storeEntries(dictionary_filename,serialize);
97 }
98
99 }
100
101 protected void storeEntries(String filename, boolean serialize)
102 {
103 System.out.println("Building Bloom filter ...");
104
105 //read file into stream, try-with-resources
106 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
107 stream.forEach(word -> {_bloomFilter.put(word);});
108 } catch (IOException e) {
109 e.printStackTrace();
110 }
111
112 System.out.println("... done");
113
114 if (serialize) {
115 System.out.println("Serializing Bloom filter ...");
116
117 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX);
118 serializeOut(ser_dictionary);
119
120 System.out.println("... done");
121 }
122
123 }
124
125 public boolean contains(String key)
126 {
127 return _bloomFilter.mightContain(key);
128 }
129
130 protected void serializeOut(File ser_file)
131 {
132 try {
133 FileOutputStream fos = new FileOutputStream(ser_file);
134
135 BufferedOutputStream bfos = new BufferedOutputStream(fos);
136
137 _bloomFilter.writeTo(bfos);
138
139 bfos.close();
140 }
141 catch (FileNotFoundException e) {
142 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
143 e.printStackTrace();
144 } catch (IOException e) {
145 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
146 e.printStackTrace();
147 }
148 }
149
150 protected static BloomFilterAdvanced<CharSequence> serializeIn(File ser_file)
151 {
152 BloomFilterAdvanced<CharSequence> bloomFilter = null;
153
154 try {
155 FileInputStream fis = new FileInputStream(ser_file);
156 BufferedInputStream bfis = new BufferedInputStream(fis);
157
158 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
159 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
160 bloomFilter = BloomFilterAdvanced.readFrom(bfis,string_funnel);
161
162 bfis.close();
163 }
164 catch (FileNotFoundException e) {
165 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
166 e.printStackTrace();
167 } catch (IOException e) {
168 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
169 e.printStackTrace();
170 }
171 return bloomFilter;
172 }
173
174
175 // Spark uses Guava 14.0, the following is future-ported from Guava 20.0
176 // Added in here, rather then Funnel, and StringCharsetFunnel -> MyStringCharsetFunnel
177
178 public static Funnel<CharSequence> stringFunnel(Charset charset) {
179
180 return new MyStringCharsetFunnel(charset);
181
182 }
183
184 private static class MyStringCharsetFunnel implements Funnel<CharSequence>, Serializable {
185
186 private static final long serialVersionUID = 1L;
187
188 private final Charset charset;
189
190 MyStringCharsetFunnel(Charset charset) {
191 this.charset = Preconditions.checkNotNull(charset);
192 }
193
194 public void funnel(CharSequence from, PrimitiveSink into) {
195 into.putString(from, charset);
196 }
197
198 @Override
199 public String toString() {
200 return "Funnels.stringFunnel(" + charset.name() + ")";
201 }
202
203 @Override
204 public boolean equals(@Nullable Object o) {
205
206 if (o instanceof MyStringCharsetFunnel) {
207 MyStringCharsetFunnel funnel = (MyStringCharsetFunnel) o;
208 return this.charset.equals(funnel.charset);
209 }
210 return false;
211 }
212
213 @Override
214 public int hashCode() {
215 return MyStringCharsetFunnel.class.hashCode() ^ charset.hashCode();
216 }
217
218 Object writeReplace() {
219 return new SerializedForm(charset);
220 }
221
222 private static class SerializedForm implements Serializable {
223 private final String charsetCanonicalName;
224
225 SerializedForm(Charset charset) {
226 this.charsetCanonicalName = charset.name();
227 }
228
229 private Object readResolve() {
230 return stringFunnel(Charset.forName(charsetCanonicalName));
231 }
232
233 private static final long serialVersionUID = 0;
234
235 }
236 }
237
238
239}
Note: See TracBrowser for help on using the repository browser.