source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/TestWhitelistBloomFilter.java@ 31199

Last change on this file since 31199 was 31199, checked in by davidb, 7 years ago

Renaming of classname to reflect filename rename

  • Property svn:executable set to *
File size: 3.7 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedOutputStream;
5import java.io.FileInputStream;
6import java.io.FileNotFoundException;
7import java.io.FileOutputStream;
8import java.io.IOException;
9import java.io.InputStream;
10import java.io.OutputStream;
11import java.nio.charset.StandardCharsets;
12import java.nio.file.Files;
13import java.nio.file.Paths;
14import java.util.stream.Stream;
15
16import com.google.common.hash.BloomFilter;
17import com.google.common.hash.Funnel;
18import com.google.common.hash.Funnels;
19
20public class TestWhitelistBloomFilter {
21
22 protected String _dictionary_filename;
23 protected BloomFilter<CharSequence> _bloomFilter;
24
25 protected int FILL_FACTOR = 26;
26
27 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
28 public static int countLines(String filename) throws IOException
29 {
30 InputStream is = new BufferedInputStream(new FileInputStream(filename));
31
32 try {
33 byte[] c = new byte[1024];
34 int count = 0;
35 int readChars = 0;
36 boolean empty = true;
37 while ((readChars = is.read(c)) != -1) {
38 empty = false;
39 for (int i = 0; i < readChars; ++i) {
40 if (c[i] == '\n') {
41 ++count;
42 }
43 }
44 }
45 return (count == 0 && !empty) ? 1 : count;
46 } finally {
47 is.close();
48 }
49 }
50
51
52 public TestWhitelistBloomFilter(String dictionary_filename) {
53 System.out.println("Constructing: WhitelistBloomFilter");
54
55 _dictionary_filename = dictionary_filename;
56
57 System.out.println("Counting lines in: " + dictionary_filename);
58 int num_lines;
59 try {
60 num_lines = countLines(dictionary_filename);
61
62 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
63 _bloomFilter = BloomFilter.create(string_funnel, FILL_FACTOR * num_lines,0.01);
64
65 } catch (IOException e) {
66
67 e.printStackTrace();
68 }
69
70 }
71
72 public void storeEntries()
73 {
74 System.out.println("Build Bloom filter ...");
75
76 for (int i=0; i<FILL_FACTOR; i++) {
77
78 char prefix = (char) ('a' + i);
79
80 //read file into stream, try-with-resources
81 try (Stream<String> stream = Files.lines(Paths.get(_dictionary_filename))) {
82
83 stream.forEach(word -> {_bloomFilter.put(prefix+word);});
84 } catch (IOException e) {
85 e.printStackTrace();
86 }
87 }
88 System.out.println("... done");
89
90 }
91
92 public boolean contains(String key)
93 {
94 return _bloomFilter.mightContain(key);
95 }
96
97 public void serializeOut(String filename)
98 {
99 try {
100 FileOutputStream fos = new FileOutputStream(filename);
101
102 BufferedOutputStream bfos = new BufferedOutputStream(fos);
103
104 _bloomFilter.writeTo(bfos);
105
106 bfos.close();
107 }
108 catch (FileNotFoundException e) {
109 System.err.println("Unable to open Bloom file:" + filename);
110 e.printStackTrace();
111 } catch (IOException e) {
112 System.err.println("Error reading in Bloom file:" + filename);
113 e.printStackTrace();
114 }
115 }
116
117 public void serializeIn(String filename)
118 {
119 try {
120 FileInputStream fis = new FileInputStream(filename);
121
122 BufferedInputStream bfis = new BufferedInputStream(fis);
123
124 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
125 _bloomFilter = BloomFilter.readFrom(bfis,string_funnel);
126
127 bfis.close();
128 }
129 catch (FileNotFoundException e) {
130 System.err.println("Unable to open Bloom file:" + filename);
131 e.printStackTrace();
132 } catch (IOException e) {
133 System.err.println("Error writing out Bloom file:" + filename);
134 e.printStackTrace();
135 }
136 }
137
138
139}
Note: See TracBrowser for help on using the repository browser.