source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java@ 31175

Last change on this file since 31175 was 31175, checked in by davidb, 7 years ago

Trial to find the memory difference between HashMap and Bloom filters

  • Property svn:executable set to *
File size: 2.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.FileInputStream;
5import java.io.IOException;
6import java.io.InputStream;
7import java.nio.charset.StandardCharsets;
8import java.nio.file.Files;
9import java.nio.file.Paths;
10import java.util.stream.Stream;
11
12import com.google.common.hash.BloomFilter;
13import com.google.common.hash.Funnel;
14import com.google.common.hash.Funnels;
15
16public class WhitelistBloomFilter {
17
18 protected String _dictionary_filename;
19 protected BloomFilter<CharSequence> _bloomFilter;
20
21 protected int FILL_FACTOR = 26;
22
23 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
24 public static int countLines(String filename) throws IOException
25 {
26 InputStream is = new BufferedInputStream(new FileInputStream(filename));
27
28 try {
29 byte[] c = new byte[1024];
30 int count = 0;
31 int readChars = 0;
32 boolean empty = true;
33 while ((readChars = is.read(c)) != -1) {
34 empty = false;
35 for (int i = 0; i < readChars; ++i) {
36 if (c[i] == '\n') {
37 ++count;
38 }
39 }
40 }
41 return (count == 0 && !empty) ? 1 : count;
42 } finally {
43 is.close();
44 }
45 }
46
47
48 public WhitelistBloomFilter(String dictionary_filename) {
49 System.out.println("Constructing: WhitelistBloomFilter");
50
51 _dictionary_filename = dictionary_filename;
52
53 System.out.println("Counting lines in: " + dictionary_filename);
54 int num_lines;
55 try {
56 num_lines = countLines(dictionary_filename);
57
58 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8);
59 _bloomFilter = BloomFilter.create(string_funnel, FILL_FACTOR * num_lines,0.01);
60
61 } catch (IOException e) {
62
63 e.printStackTrace();
64 }
65
66 }
67
68 public void storeEntries()
69 {
70 System.out.println("Build Bloom filter ...");
71
72 for (int i=0; i<FILL_FACTOR; i++) {
73
74 char prefix = (char) ('a' + i);
75
76 //read file into stream, try-with-resources
77 try (Stream<String> stream = Files.lines(Paths.get(_dictionary_filename))) {
78
79 stream.forEach(word -> {_bloomFilter.put(prefix+word);});
80 } catch (IOException e) {
81 e.printStackTrace();
82 }
83 }
84 System.out.println("... done");
85
86 }
87
88 public boolean contains(String key)
89 {
90 return _bloomFilter.mightContain(key);
91 }
92}
Note: See TracBrowser for help on using the repository browser.