source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31201

Last change on this file since 31201 was 31201, checked in by davidb, 7 years ago

Trigger serialization of whitelist in main program

  • Property svn:executable set to *
File size: 4.3 KB
Line 
1package org.hathitrust.extractedfeatures;
2
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
import java.util.stream.Stream;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnel;
import com.google.common.hash.Funnels;
19
20public class WhitelistBloomFilter {
21
22 protected BloomFilter<CharSequence> _bloomFilter;
23 protected static final String SERIALIZED_SUFFIX = "-serialized";
24 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01;
25
26
27 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
28 public static int countLines(String filename) throws IOException
29 {
30 InputStream is = new BufferedInputStream(new FileInputStream(filename));
31
32 try {
33 byte[] c = new byte[1024];
34 int count = 0;
35 int readChars = 0;
36 boolean empty = true;
37 while ((readChars = is.read(c)) != -1) {
38 empty = false;
39 for (int i = 0; i < readChars; ++i) {
40 if (c[i] == '\n') {
41 ++count;
42 }
43 }
44 }
45 return (count == 0 && !empty) ? 1 : count;
46 } finally {
47 is.close();
48 }
49 }
50
51
52 public WhitelistBloomFilter(String dictionary_filename, boolean serialize) {
53 System.out.println("Constructing: WhitelistBloomFilter");
54
55 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX);
56
57 if (ser_dictionary_file.exists()) {
58 System.out.println("Loading Serialized Bloom filter ...");
59 _bloomFilter = serializeIn(ser_dictionary_file);
60 System.out.println("... done");
61 }
62 else {
63 // Need to generate the Bloom filter from the given raw text file
64
65 System.out.println("Counting lines in: " + dictionary_filename);
66 int num_lines = -1;
67 try {
68 num_lines = countLines(dictionary_filename);
69
70 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
71 _bloomFilter = BloomFilter.create(string_funnel, num_lines,FALSE_POSITIVE_PERCENTAGE);
72 }
73 catch (IOException e) {
74 e.printStackTrace();
75 }
76 System.out.println("Number of lines: " + num_lines);
77
78 storeEntries(dictionary_filename,serialize);
79 }
80
81 }
82
83 protected void storeEntries(String filename, boolean serialize)
84 {
85 System.out.println("Building Bloom filter ...");
86
87 //read file into stream, try-with-resources
88 try (Stream<String> stream = Files.lines(Paths.get(filename))) {
89 stream.forEach(word -> {_bloomFilter.put(word);});
90 } catch (IOException e) {
91 e.printStackTrace();
92 }
93
94 System.out.println("... done");
95
96 if (serialize) {
97 System.out.println("Serializing Bloom filter ...");
98
99 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX);
100 serializeOut(ser_dictionary);
101
102 System.out.println("... done");
103 }
104
105 }
106
107 public boolean contains(String key)
108 {
109 return _bloomFilter.mightContain(key);
110 }
111
112 protected void serializeOut(File ser_file)
113 {
114 try {
115 FileOutputStream fos = new FileOutputStream(ser_file);
116
117 BufferedOutputStream bfos = new BufferedOutputStream(fos);
118
119 _bloomFilter.writeTo(bfos);
120
121 bfos.close();
122 }
123 catch (FileNotFoundException e) {
124 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
125 e.printStackTrace();
126 } catch (IOException e) {
127 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath());
128 e.printStackTrace();
129 }
130 }
131
132 protected static BloomFilter<CharSequence> serializeIn(File ser_file)
133 {
134 BloomFilter<CharSequence> bloomFilter = null;
135
136 try {
137 FileInputStream fis = new FileInputStream(ser_file);
138 BufferedInputStream bfis = new BufferedInputStream(fis);
139
140 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
141 bloomFilter = BloomFilter.readFrom(bfis,string_funnel);
142
143 bfis.close();
144 }
145 catch (FileNotFoundException e) {
146 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath());
147 e.printStackTrace();
148 } catch (IOException e) {
149 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath());
150 e.printStackTrace();
151 }
152 return bloomFilter;
153 }
154
155
156}
Note: See TracBrowser for help on using the repository browser.