source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31224

Last change on this file since 31224 was 31224, checked in by davidb, 7 years ago

Debug added

  • Property svn:executable set to *
File size: 6.0 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileNotFoundException;
8import java.io.FileOutputStream;
9import java.io.IOException;
10import java.io.InputStream;
11import java.net.URI;
12import java.nio.charset.StandardCharsets;
13import java.nio.file.Files;
14import java.nio.file.Paths;
15import java.util.stream.Stream;
16
17import org.apache.hadoop.fs.FSDataInputStream;
18import org.apache.hadoop.fs.FSDataOutputStream;
19import org.apache.hadoop.fs.FileSystem;
20import org.apache.hadoop.fs.Path;
21
22import com.google.common.hash.BloomFilter;
23import com.google.common.hash.Funnel;
24import com.google.common.hash.Funnels;
25
26public class WhitelistBloomFilter {
27
28 protected BloomFilter<CharSequence> _bloomFilter;
29 protected static final String SERIALIZED_SUFFIX = "-serialized";
30 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
31
32 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
33 protected static int countLines(String filename) throws IOException
34 {
35 InputStream is = new BufferedInputStream(new FileInputStream(filename));
36
37 try {
38 byte[] c = new byte[1024];
39 int count = 0;
40 int readChars = 0;
41 boolean empty = true;
42 while ((readChars = is.read(c)) != -1) {
43 empty = false;
44 for (int i = 0; i < readChars; ++i) {
45 if (c[i] == '\n') {
46 ++count;
47 }
48 }
49 }
50 return (count == 0 && !empty) ? 1 : count;
51 } finally {
52 is.close();
53 }
54 }
55
56
57 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
58 System.out.println("Constructing: WhitelistBloomFilter");
59
60
61 String ser_dictionary_filename = dictionary_filename + SERIALIZED_SUFFIX;
62
63
64 //File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
65
66 //if (ser_dictionary_file.exists()) {
67 System.err.println("***** checking serialized dictionary:" + ser_dictionary_filename);
68 if (ClusterFileIO.exists(ser_dictionary_filename)) {
69 System.out.println("Loading Serialized Bloom filter ...");
70 _bloomFilter = serializeIn(ser_dictionary_filename);
71 System.out.println("... done");
72 }
73 else {
74 // Need to generate the Bloom filter from the given raw text file
75
76 System.out.println("Counting lines in: " + dictionary_filename);
77 int num_lines = -1;
78 try {
79 num_lines = countLines(dictionary_filename);
80
81 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
82 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
83 }
84 catch (IOException e) {
85 e.printStackTrace();
86 }
87 System.out.println("Number of lines: " + num_lines);
88
89 storeEntries(dictionary_filename,serialize);
90 }
91
92 }
93
94 protected void storeEntries(String filename, boolean serialize)
95 {
96 System.out.println("Building Bloom filter ...");
97
98 //read file into stream, try-with-resources
99 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
100 stream.forEach(word -> {_bloomFilter.put(word);});
101 } catch (IOException e) {
102 e.printStackTrace();
103 }
104
105 System.out.println("... done");
106
107 if (serialize) {
108 System.out.println("Serializing Bloom filter ...");
109
110 String ser_filename = filename + SERIALIZED_SUFFIX;
111
112 //File ser_dictionary = new File(ser_filename);
113 serializeOut(ser_filename);
114
115 System.out.println("... done");
116 }
117
118 }
119
120 public boolean contains(String key)
121 {
122 return _bloomFilter.mightContain(key);
123 }
124
125 //protected void serializeOut(File ser_file)
126 protected void serializeOut(String ser_filename)
127 {
128 //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename);
129
130 try {
131 //URI ser_uri = URI.create (ser_filename);
132 //Path ser_path = new Path(ser_uri);
133
134 BufferedOutputStream bos = ClusterFileIO.getBufferedOutputStream(ser_filename);
135
136 //FileOutputStream fos = new FileOutputStream(ser_file);
137
138 //BufferedOutputStream bfos = new BufferedOutputStream(fos);
139
140 _bloomFilter.writeTo(bos);
141
142 bos.close();
143 }
144 catch (FileNotFoundException e) {
145 System.err.println("Unable to open Bloom file:" + ser_filename);
146 //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
147
148 e.printStackTrace();
149 } catch (IOException e) {
150 System.err.println("Error reading in Bloom file:" + ser_filename);
151 //System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
152 e.printStackTrace();
153 }
154 }
155
156 //protected static BloomFilter<CharSequence> serializeIn(File ser_file)
157 protected static BloomFilter<CharSequence> serializeIn(String ser_filename)
158 {
159 BloomFilter<CharSequence> bloomFilter = null;
160
161 //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename);
162
163 try {
164 //URI ser_uri = URI.create (ser_filename);
165 //Path ser_path = new Path(ser_uri);
166
167 //FSDataInputStream fsdis = fs.open(ser_path);
168 //BufferedInputStream bis = new BufferedInputStream(fsdis);
169
170 BufferedInputStream bis = ClusterFileIO.getBufferedInputStream(ser_filename);
171
172 //FileInputStream fis = new FileInputStream(ser_file);
173 //BufferedInputStream bfis = new BufferedInputStream(fis);
174
175 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
176 bloomFilter = BloomFilter.readFrom(bis,string_funnel);
177
178 bis.close();
179 }
180 catch (FileNotFoundException e) {
181 //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
182 //System.err.println("Unable to open Bloom file:" + ser_path.getName());
183 System.err.println("Unable to open Bloom file:" + ser_filename);
184
185 e.printStackTrace();
186 } catch (IOException e) {
187 //System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
188 //System.err.println("Error writing out Bloom file:" + ser_path.getName());
189 System.err.println("Error writing out Bloom file:" + ser_filename);
190 e.printStackTrace();
191 }
192 return bloomFilter;
193 }
194
195
196
197
198
199}
Note: See TracBrowser for help on using the repository browser.