source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31203

Last change on this file since 31203 was 31203, checked in by davidb, 7 years ago

Use class provided stringFunnel

  • Property svn:executable set to *
File size: 6.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileNotFoundException;
8import java.io.FileOutputStream;
9import java.io.IOException;
10import java.io.InputStream;
11import java.io.Serializable;
12import java.nio.charset.Charset;
13import java.nio.charset.StandardCharsets;
14import java.nio.file.Files;
15import java.nio.file.Paths;
16import java.util.stream.Stream;
17
18import javax.annotation.Nullable;
19
20import com.google.common.base.Preconditions;
21import com.google.common.hash.BloomFilter;
22import com.google.common.hash.Funnel;
23import com.google.common.hash.Funnels;
24import com.google.common.hash.PrimitiveSink;
25
26public class WhitelistBloomFilter {
27
28
29
// Bloom filter holding the whitelist entries. Assigned by the constructor, either
// read back via serializeIn() or created with BloomFilter.create() and then
// populated by storeEntries().
30 protected BloomFilter<CharSequence> _bloomFilter;
// Appended to the dictionary filename to form the path of the serialized filter.
31 protected static final String SERIALIZED_SUFFIX = "-serialized";
// Passed as the third argument of BloomFilter.create().
// NOTE(review): despite the name, Guava presumably treats this argument as a
// probability (0.01 == 1%) rather than a percentage -- confirm against the Guava docs.
32 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
33
34
35 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
36 public static int countLines(String filename) throws IOException
37 {
38 InputStream is = new BufferedInputStream(new FileInputStream(filename));
39
40 try {
41 byte[] c = new byte[1024];
42 int count = 0;
43 int readChars = 0;
44 boolean empty = true;
45 while ((readChars = is.read(c)) != -1) {
46 empty = false;
47 for (int i = 0; i < readChars; ++i) {
48 if (c[i] == '\n') {
49 ++count;
50 }
51 }
52 }
53 return (count == 0 && !empty) ? 1 : count;
54 } finally {
55 is.close();
56 }
57 }
58
59
60 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
61 System.out.println("Constructing: WhitelistBloomFilter");
62
63 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
64
65 if (ser_dictionary_file.exists()) {
66 System.out.println("Loading Serialized Bloom filter ...");
67 _bloomFilter = serializeIn(ser_dictionary_file);
68 System.out.println("... done");
69 }
70 else {
71 // Need to generate the Bloom filter from the given raw text file
72
73 System.out.println("Counting lines in: " + dictionary_filename);
74 int num_lines = -1;
75 try {
76 num_lines = countLines(dictionary_filename);
77
78 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
79 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
80 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
81 }
82 catch (IOException e) {
83 e.printStackTrace();
84 }
85 System.out.println("Number of lines: " + num_lines);
86
87 storeEntries(dictionary_filename,serialize);
88 }
89
90 }
91
92 protected void storeEntries(String filename, boolean serialize)
93 {
94 System.out.println("Building Bloom filter ...");
95
96 //read file into stream, try-with-resources
97 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
98 stream.forEach(word -> {_bloomFilter.put(word);});
99 } catch (IOException e) {
100 e.printStackTrace();
101 }
102
103 System.out.println("... done");
104
105 if (serialize) {
106 System.out.println("Serializing Bloom filter ...");
107
108 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX);
109 serializeOut(ser_dictionary);
110
111 System.out.println("... done");
112 }
113
114 }
115
116 public boolean contains(String key)
117 {
118 return _bloomFilter.mightContain(key);
119 }
120
121 protected void serializeOut(File ser_file)
122 {
123 try {
124 FileOutputStream fos = new FileOutputStream(ser_file);
125
126 BufferedOutputStream bfos = new BufferedOutputStream(fos);
127
128 _bloomFilter.writeTo(bfos);
129
130 bfos.close();
131 }
132 catch (FileNotFoundException e) {
133 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
134 e.printStackTrace();
135 } catch (IOException e) {
136 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
137 e.printStackTrace();
138 }
139 }
140
141 protected static BloomFilter<CharSequence> serializeIn(File ser_file)
142 {
143 BloomFilter<CharSequence> bloomFilter = null;
144
145 try {
146 FileInputStream fis = new FileInputStream(ser_file);
147 BufferedInputStream bfis = new BufferedInputStream(fis);
148
149 //Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
150 Funnel<CharSequence> string_funnel = stringFunnel(StandardCharsets.UTF_8);
151 bloomFilter = BloomFilter.readFrom(bfis,string_funnel);
152
153 bfis.close();
154 }
155 catch (FileNotFoundException e) {
156 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
157 e.printStackTrace();
158 } catch (IOException e) {
159 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
160 e.printStackTrace();
161 }
162 return bloomFilter;
163 }
164
165
// Spark uses Guava 14.0; the following is backported from Guava 20.0.
// It is added here, rather than in Funnels, with StringCharsetFunnel renamed to MyStringCharsetFunnel.
168
169 public static Funnel<CharSequence> stringFunnel(Charset charset) {
170
171 return new MyStringCharsetFunnel(charset);
172
173 }
174
175 private static class MyStringCharsetFunnel implements Funnel<CharSequence>, Serializable {
176
177 private static final long serialVersionUID = 1L;
178
179 private final Charset charset;
180
181 MyStringCharsetFunnel(Charset charset) {
182 this.charset = Preconditions.checkNotNull(charset);
183 }
184
185 public void funnel(CharSequence from, PrimitiveSink into) {
186 into.putString(from, charset);
187 }
188
189 @Override
190 public String toString() {
191 return "Funnels.stringFunnel(" + charset.name() + ")";
192 }
193
194 @Override
195 public boolean equals(@Nullable Object o) {
196
197 if (o instanceof MyStringCharsetFunnel) {
198 MyStringCharsetFunnel funnel = (MyStringCharsetFunnel) o;
199 return this.charset.equals(funnel.charset);
200 }
201 return false;
202 }
203
204 @Override
205 public int hashCode() {
206 return MyStringCharsetFunnel.class.hashCode() ^ charset.hashCode();
207 }
208
209 Object writeReplace() {
210 return new SerializedForm(charset);
211 }
212
213 private static class SerializedForm implements Serializable {
214 private final String charsetCanonicalName;
215
216 SerializedForm(Charset charset) {
217 this.charsetCanonicalName = charset.name();
218 }
219
220 private Object readResolve() {
221 return stringFunnel(Charset.forName(charsetCanonicalName));
222 }
223
224 private static final long serialVersionUID = 0;
225
226 }
227
228 }
229
230
231}
Note: See TracBrowser for help on using the repository browser.