source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31294

Last change on this file since 31294 was 31227, checked in by davidb, 7 years ago

Code tidy up

  • Property svn:executable set to *
File size: 4.2 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.FileInputStream;
6import java.io.FileNotFoundException;
7import java.io.IOException;
8import java.io.InputStream;
9import java.nio.charset.StandardCharsets;
10import java.nio.file.Files;
11import java.nio.file.Paths;
12import java.util.stream.Stream;
13
14import com.google.common.hash.BloomFilter;
15import com.google.common.hash.Funnel;
16import com.google.common.hash.Funnels;
17
18public class WhitelistBloomFilter {
19
20 protected BloomFilter<CharSequence> _bloomFilter;
21 protected static final String SERIALIZED_SUFFIX = "-serialized";
22 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
23
24 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
25 protected static int countLines(String filename) throws IOException
26 {
27 InputStream is = new BufferedInputStream(new FileInputStream(filename));
28
29 try {
30 byte[] c = new byte[1024];
31 int count = 0;
32 int readChars = 0;
33 boolean empty = true;
34 while ((readChars = is.read(c)) != -1) {
35 empty = false;
36 for (int i = 0; i < readChars; ++i) {
37 if (c[i] == '\n') {
38 ++count;
39 }
40 }
41 }
42 return (count == 0 && !empty) ? 1 : count;
43 } finally {
44 is.close();
45 }
46 }
47
48
49 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
50 System.out.println("Constructing: WhitelistBloomFilter");
51
52 String ser_dictionary_filename = dictionary_filename + SERIALIZED_SUFFIX;
53
54 if (ClusterFileIO.exists(ser_dictionary_filename)) {
55 System.out.println("Loading Serialized Bloom filter ...");
56 _bloomFilter = serializeIn(ser_dictionary_filename);
57 System.out.println("... done");
58 }
59 else {
60 // Need to generate the Bloom filter from the given raw text file
61
62 System.out.println("Counting lines in: " + dictionary_filename);
63 int num_lines = -1;
64 try {
65 num_lines = countLines(dictionary_filename);
66
67 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
68 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
69 }
70 catch (IOException e) {
71 e.printStackTrace();
72 }
73 System.out.println("Number of lines: " + num_lines);
74
75 storeEntries(dictionary_filename,serialize);
76 }
77
78 }
79
80 protected void storeEntries(String filename, boolean serialize)
81 {
82 System.out.println("Building Bloom filter ...");
83
84 //read file into stream, try-with-resources
85 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
86 stream.forEach(word -> {_bloomFilter.put(word);});
87 } catch (IOException e) {
88 e.printStackTrace();
89 }
90
91 System.out.println("... done");
92
93 if (serialize) {
94 System.out.println("Serializing Bloom filter ...");
95
96 String ser_filename = filename + SERIALIZED_SUFFIX;
97 serializeOut(ser_filename);
98
99 System.out.println("... done");
100 }
101
102 }
103
104 public boolean contains(String key)
105 {
106 return _bloomFilter.mightContain(key);
107 }
108
109 protected void serializeOut(String ser_filename)
110 {
111 try {
112 BufferedOutputStream bos = ClusterFileIO.getBufferedOutputStream(ser_filename);
113 _bloomFilter.writeTo(bos);
114 bos.close();
115 }
116 catch (FileNotFoundException e) {
117 System.err.println("Unable to open Bloom file:" + ser_filename);
118 //e.printStackTrace();
119 } catch (IOException e) {
120 System.err.println("Error reading in Bloom file:" + ser_filename);
121 //e.printStackTrace();
122 }
123 }
124
125 protected static BloomFilter<CharSequence> serializeIn(String ser_filename)
126 {
127 BloomFilter<CharSequence> bloomFilter = null;
128
129 try {
130 BufferedInputStream bis = ClusterFileIO.getBufferedInputStream(ser_filename);
131
132 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
133 bloomFilter = BloomFilter.readFrom(bis,string_funnel);
134
135 bis.close();
136 }
137 catch (FileNotFoundException e) {
138 System.err.println("Unable to open Bloom file:" + ser_filename);
139 //e.printStackTrace();
140 } catch (IOException e) {
141 System.err.println("Error writing out Bloom file:" + ser_filename);
142 //e.printStackTrace();
143 }
144 return bloomFilter;
145 }
146}
Note: See TracBrowser for help on using the repository browser.