source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31211

Last change on this file since 31211 was 31211, checked in by davidb, 7 years ago

Changing back to regular Guava classes. Looking to use maven shading to remap Guava 20.0 into different namespace

  • Property svn:executable set to *
File size: 6.8 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.DataInputStream;
6import java.io.DataOutputStream;
7import java.io.File;
8import java.io.FileInputStream;
9import java.io.FileNotFoundException;
10import java.io.FileOutputStream;
11import java.io.IOException;
12import java.io.InputStream;
13import java.io.OutputStream;
14import java.io.Serializable;
15import java.nio.charset.Charset;
16import java.nio.charset.StandardCharsets;
17import java.nio.file.Files;
18import java.nio.file.Paths;
19import java.util.stream.Stream;
20
21import javax.annotation.Nullable;
22
23import static com.google.common.base.Preconditions.checkNotNull;
24
25import com.google.common.base.Preconditions;
26import com.google.common.hash.BloomFilter;
27//import com.google.common.hash.BloomFilterAdvanced;
28//import com.google.common.hash.BloomFilterStrategies.BitArray;
29import com.google.common.hash.Funnel;
30import com.google.common.hash.Funnels;
31import com.google.common.hash.PrimitiveSink;
32import com.google.common.primitives.SignedBytes;
33import com.google.common.primitives.UnsignedBytes;
34
35public class WhitelistBloomFilter {
36
37 protected BloomFilter<CharSequence> _bloomFilter;
38 protected static final String SERIALIZED_SUFFIX = "-serialized";
39 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
40
41
42 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
43 public static int countLines(String filename) throws IOException
44 {
45 InputStream is = new BufferedInputStream(new FileInputStream(filename));
46
47 try {
48 byte[] c = new byte[1024];
49 int count = 0;
50 int readChars = 0;
51 boolean empty = true;
52 while ((readChars = is.read(c)) != -1) {
53 empty = false;
54 for (int i = 0; i < readChars; ++i) {
55 if (c[i] == '\n') {
56 ++count;
57 }
58 }
59 }
60 return (count == 0 && !empty) ? 1 : count;
61 } finally {
62 is.close();
63 }
64 }
65
66
67 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
68 System.out.println("Constructing: WhitelistBloomFilter");
69
70 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
71
72 if (ser_dictionary_file.exists()) {
73 System.out.println("Loading Serialized Bloom filter ...");
74 _bloomFilter = serializeIn(ser_dictionary_file);
75 System.out.println("... done");
76 }
77 else {
78 // Need to generate the Bloom filter from the given raw text file
79
80 System.out.println("Counting lines in: " + dictionary_filename);
81 int num_lines = -1;
82 try {
83 num_lines = countLines(dictionary_filename);
84
85 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
86 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
87 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
88 }
89 catch (IOException e) {
90 e.printStackTrace();
91 }
92 System.out.println("Number of lines: " + num_lines);
93
94 storeEntries(dictionary_filename,serialize);
95 }
96
97 }
98
99 protected void storeEntries(String filename, boolean serialize)
100 {
101 System.out.println("Building Bloom filter ...");
102
103 //read file into stream, try-with-resources
104 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
105 stream.forEach(word -> {_bloomFilter.put(word);});
106 } catch (IOException e) {
107 e.printStackTrace();
108 }
109
110 System.out.println("... done");
111
112 if (serialize) {
113 System.out.println("Serializing Bloom filter ...");
114
115 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX);
116 serializeOut(ser_dictionary);
117
118 System.out.println("... done");
119 }
120
121 }
122
123 public boolean contains(String key)
124 {
125 return _bloomFilter.mightContain(key);
126 }
127
128 protected void serializeOut(File ser_file)
129 {
130 try {
131 FileOutputStream fos = new FileOutputStream(ser_file);
132
133 BufferedOutputStream bfos = new BufferedOutputStream(fos);
134
135 _bloomFilter.writeTo(bfos);
136
137 bfos.close();
138 }
139 catch (FileNotFoundException e) {
140 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
141 e.printStackTrace();
142 } catch (IOException e) {
143 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
144 e.printStackTrace();
145 }
146 }
147
148 protected static BloomFilter<CharSequence> serializeIn(File ser_file)
149 {
150 BloomFilter<CharSequence> bloomFilter = null;
151
152 try {
153 FileInputStream fis = new FileInputStream(ser_file);
154 BufferedInputStream bfis = new BufferedInputStream(fis);
155
156 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
157 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
158 bloomFilter = BloomFilter.readFrom(bfis,string_funnel);
159
160 bfis.close();
161 }
162 catch (FileNotFoundException e) {
163 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
164 e.printStackTrace();
165 } catch (IOException e) {
166 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
167 e.printStackTrace();
168 }
169 return bloomFilter;
170 }
171
172
173 // Spark uses Guava 14.0, the following is future-ported from Guava 20.0
174 // Added in here, rather then Funnel, and StringCharsetFunnel -> MyStringCharsetFunnel
175
176 public static Funnel<CharSequence> stringFunnel(Charset charset) {
177
178 return new MyStringCharsetFunnel(charset);
179
180 }
181
182 private static class MyStringCharsetFunnel implements Funnel<CharSequence>, Serializable {
183
184 private static final long serialVersionUID = 1L;
185
186 private final Charset charset;
187
188 MyStringCharsetFunnel(Charset charset) {
189 this.charset = Preconditions.checkNotNull(charset);
190 }
191
192 public void funnel(CharSequence from, PrimitiveSink into) {
193 into.putString(from, charset);
194 }
195
196 @Override
197 public String toString() {
198 return "Funnels.stringFunnel(" + charset.name() + ")";
199 }
200
201 @Override
202 public boolean equals(@Nullable Object o) {
203
204 if (o instanceof MyStringCharsetFunnel) {
205 MyStringCharsetFunnel funnel = (MyStringCharsetFunnel) o;
206 return this.charset.equals(funnel.charset);
207 }
208 return false;
209 }
210
211 @Override
212 public int hashCode() {
213 return MyStringCharsetFunnel.class.hashCode() ^ charset.hashCode();
214 }
215
216 Object writeReplace() {
217 return new SerializedForm(charset);
218 }
219
220 private static class SerializedForm implements Serializable {
221 private final String charsetCanonicalName;
222
223 SerializedForm(Charset charset) {
224 this.charsetCanonicalName = charset.name();
225 }
226
227 private Object readResolve() {
228 return stringFunnel(Charset.forName(charsetCanonicalName));
229 }
230
231 private static final long serialVersionUID = 0;
232
233 }
234 }
235
236
237}
Note: See TracBrowser for help on using the repository browser.