source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/com/google/common/hash/BloomFilterAdvanced.java@ 31204

Last change on this file since 31204 was 31204, checked in by davidb, 7 years ago

Splicing in Guava verion 20 of BloomFilter into code as own class (now BloomFilterAdvanced). This is because Spark runs with older version of Guava (14.0). Written code makes use of newer features.

  • Property svn:executable set to *
File size: 21.7 KB
Line 
1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15package com.google.common.hash;
16
17import static com.google.common.base.Preconditions.checkArgument;
18import static com.google.common.base.Preconditions.checkNotNull;
19
20import com.google.common.annotations.Beta;
21import com.google.common.annotations.VisibleForTesting;
22import com.google.common.base.Objects;
23import com.google.common.base.Predicate;
24import com.google.common.hash.BloomFilterStrategies.BitArray;
25import com.google.common.primitives.SignedBytes;
26import com.google.common.primitives.UnsignedBytes;
27//import com.google.errorprone.annotations.CanIgnoreReturnValue;
28import java.io.DataInputStream;
29import java.io.DataOutputStream;
30import java.io.IOException;
31import java.io.InputStream;
32import java.io.OutputStream;
33import java.io.Serializable;
34import javax.annotation.Nullable;
35
36import com.google.common.hash.BloomFilter;
37
38
39/**
40 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
41 * with one-sided error: if it claims that an element is contained in it, this might be in error,
42 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
43 *
44 * <p>If you are unfamiliar with Bloom filters, this nice
45 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
46 * how they work.
47 *
48 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability
49 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that
50 * has not actually been put in the {@code BloomFilter}.
51 *
52 * <p>Bloom filters are serializable. They also support a more compact serial representation via the
53 * {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be
54 * supported by future versions of this library. However, serial forms generated by newer versions
55 * of the code may not be readable by older versions of the code (e.g., a serialized bloom filter
56 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago).
57 *
58 * @param <T> the type of instances that the {@code BloomFilter} accepts
59 * @author Dimitris Andreou
60 * @author Kevin Bourrillion
61 * @since 11.0
62 */
63@Beta
64public final class BloomFilterAdvanced<T> implements Predicate<T>, Serializable {
65 /**
66 * A strategy to translate T instances, to {@code numHashFunctions} bit indexes.
67 *
68 * <p>Implementations should be collections of pure functions (i.e. stateless).
69 */
70 interface xxStrategy extends java.io.Serializable {
71
72 /**
73 * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element.
74 *
75 * <p>Returns whether any bits changed as a result of this operation.
76 */
77 <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
78
79 /**
80 * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element;
81 * returns {@code true} if and only if all selected bits are set.
82 */
83 <T> boolean mightContain(
84 T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
85
86 /**
87 * Identifier used to encode this strategy, when marshalled as part of a BloomFilter. Only
88 * values in the [-128, 127] range are valid for the compact serial form. Non-negative values
89 * are reserved for enums defined in BloomFilterStrategies; negative values are reserved for any
90 * custom, stateful strategy we may define (e.g. any kind of strategy that would depend on user
91 * input).
92 */
93 int ordinal();
94 }
95
96 /** The bit set of the BloomFilter (not necessarily power of 2!) */
97 private final BitArray bits;
98
99 /** Number of hashes per element */
100 private final int numHashFunctions;
101
102 /** The funnel to translate Ts to bytes */
103 private final Funnel<? super T> funnel;
104
105 /**
106 * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes.
107 */
108 private final BloomFilter.Strategy strategy;
109
110 /**
111 * Creates a BloomFilter.
112 */
113 private BloomFilterAdvanced(
114 BitArray bits, int numHashFunctions, Funnel<? super T> funnel, BloomFilter.Strategy strategy) {
115 checkArgument(numHashFunctions > 0, "numHashFunctions (%s) must be > 0", numHashFunctions);
116 checkArgument(
117 numHashFunctions <= 255, "numHashFunctions (%s) must be <= 255", numHashFunctions);
118 this.bits = checkNotNull(bits);
119 this.numHashFunctions = numHashFunctions;
120 this.funnel = checkNotNull(funnel);
121 this.strategy = checkNotNull(strategy);
122 }
123
124 /**
125 * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to
126 * this instance but shares no mutable state.
127 *
128 * @since 12.0
129 */
130 public BloomFilterAdvanced<T> copy() {
131 return new BloomFilterAdvanced<T>(bits.copy(), numHashFunctions, funnel, strategy);
132 }
133
134 /**
135 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
136 * {@code false} if this is <i>definitely</i> not the case.
137 */
138 public boolean mightContain(T object) {
139 return strategy.mightContain(object, funnel, numHashFunctions, bits);
140 }
141
142 /**
143 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain}
144 * instead.
145 */
146 @Deprecated
147 @Override
148 public boolean apply(T input) {
149 return mightContain(input);
150 }
151
152 /**
153 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
154 * {@link #mightContain(Object)} with the same element will always return {@code true}.
155 *
156 * @return true if the bloom filter's bits changed as a result of this operation. If the bits
157 * changed, this is <i>definitely</i> the first time {@code object} has been added to the
158 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} has
159 * been added to the filter. Note that {@code put(t)} always returns the <i>opposite</i>
160 * result to what {@code mightContain(t)} would have returned at the time it is called."
161 * @since 12.0 (present in 11.0 with {@code void} return type})
162 */
163 //@CanIgnoreReturnValue
164 public boolean put(T object) {
165 return strategy.put(object, funnel, numHashFunctions, bits);
166 }
167
168 /**
169 * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return
170 * {@code true} for an object that has not actually been put in the {@code BloomFilter}.
171 *
172 * <p>Ideally, this number should be close to the {@code fpp} parameter passed in
173 * {@linkplain #create(Funnel, int, double)}, or smaller. If it is significantly higher, it is
174 * usually the case that too many elements (more than expected) have been put in the
175 * {@code BloomFilter}, degenerating it.
176 *
177 * @since 14.0 (since 11.0 as expectedFalsePositiveProbability())
178 */
179 public double expectedFpp() {
180 // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!)
181 return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions);
182 }
183
184 /**
185 * Returns the number of bits in the underlying bit array.
186 */
187 @VisibleForTesting
188 long bitSize() {
189 return bits.bitSize();
190 }
191
192 /**
193 * Determines whether a given bloom filter is compatible with this bloom filter. For two bloom
194 * filters to be compatible, they must:
195 *
196 * <ul>
197 * <li>not be the same instance
198 * <li>have the same number of hash functions
199 * <li>have the same bit size
200 * <li>have the same strategy
201 * <li>have equal funnels
202 * <ul>
203 *
204 * @param that The bloom filter to check for compatibility.
205 * @since 15.0
206 */
207 public boolean isCompatible(BloomFilterAdvanced<T> that) {
208 checkNotNull(that);
209 return (this != that)
210 && (this.numHashFunctions == that.numHashFunctions)
211 && (this.bitSize() == that.bitSize())
212 && (this.strategy.equals(that.strategy))
213 && (this.funnel.equals(that.funnel));
214 }
215
216 /**
217 * Combines this bloom filter with another bloom filter by performing a bitwise OR of the
218 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the bloom
219 * filters are appropriately sized to avoid saturating them.
220 *
221 * @param that The bloom filter to combine this bloom filter with. It is not mutated.
222 * @throws IllegalArgumentException if {@code isCompatible(that) == false}
223 *
224 * @since 15.0
225 */
226 public void putAll(BloomFilterAdvanced<T> that) {
227 checkNotNull(that);
228 checkArgument(this != that, "Cannot combine a BloomFilter with itself.");
229 checkArgument(
230 this.numHashFunctions == that.numHashFunctions,
231 "BloomFilters must have the same number of hash functions (%s != %s)",
232 this.numHashFunctions,
233 that.numHashFunctions);
234 checkArgument(
235 this.bitSize() == that.bitSize(),
236 "BloomFilters must have the same size underlying bit arrays (%s != %s)",
237 this.bitSize(),
238 that.bitSize());
239 checkArgument(
240 this.strategy.equals(that.strategy),
241 "BloomFilters must have equal strategies (%s != %s)",
242 this.strategy,
243 that.strategy);
244 checkArgument(
245 this.funnel.equals(that.funnel),
246 "BloomFilters must have equal funnels (%s != %s)",
247 this.funnel,
248 that.funnel);
249 this.bits.putAll(that.bits);
250 }
251
252 @Override
253 public boolean equals(@Nullable Object object) {
254 if (object == this) {
255 return true;
256 }
257 if (object instanceof BloomFilterAdvanced) {
258 BloomFilterAdvanced<?> that = (BloomFilterAdvanced<?>) object;
259 return this.numHashFunctions == that.numHashFunctions
260 && this.funnel.equals(that.funnel)
261 && this.bits.equals(that.bits)
262 && this.strategy.equals(that.strategy);
263 }
264 return false;
265 }
266
267 @Override
268 public int hashCode() {
269 return Objects.hashCode(numHashFunctions, funnel, strategy, bits);
270 }
271
272 /**
273 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and
274 * expected false positive probability.
275 *
276 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
277 * will result in its saturation, and a sharp deterioration of its false positive probability.
278 *
279 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
280 * {@code Funnel<T>} is.
281 *
282 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
283 * ensuring proper serialization and deserialization, which is important since {@link #equals}
284 * also relies on object identity of funnels.
285 *
286 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
287 * @param expectedInsertions the number of expected insertions to the constructed
288 * {@code BloomFilter<T>}; must be positive
289 * @param fpp the desired false positive probability (must be positive and less than 1.0)
290 * @return a {@code BloomFilter}
291 */
292 public static <T> BloomFilterAdvanced<T> create(
293 Funnel<? super T> funnel, int expectedInsertions, double fpp) {
294 return create(funnel, (long) expectedInsertions, fpp);
295 }
296
297 /**
298 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and
299 * expected false positive probability.
300 *
301 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
302 * will result in its saturation, and a sharp deterioration of its false positive probability.
303 *
304 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
305 * {@code Funnel<T>} is.
306 *
307 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
308 * ensuring proper serialization and deserialization, which is important since {@link #equals}
309 * also relies on object identity of funnels.
310 *
311 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
312 * @param expectedInsertions the number of expected insertions to the constructed
313 * {@code BloomFilter<T>}; must be positive
314 * @param fpp the desired false positive probability (must be positive and less than 1.0)
315 * @return a {@code BloomFilter}
316 * @since 19.0
317 */
318 public static <T> BloomFilterAdvanced<T> create(
319 Funnel<? super T> funnel, long expectedInsertions, double fpp) {
320 return create(funnel, expectedInsertions, fpp, BloomFilterStrategies.MURMUR128_MITZ_64);
321 }
322
323 @VisibleForTesting
324 static <T> BloomFilterAdvanced<T> create(
325 Funnel<? super T> funnel, long expectedInsertions, double fpp, BloomFilter.Strategy strategy) {
326 checkNotNull(funnel);
327 checkArgument(
328 expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", expectedInsertions);
329 checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp);
330 checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp);
331 checkNotNull(strategy);
332
333 if (expectedInsertions == 0) {
334 expectedInsertions = 1;
335 }
336 /*
337 * TODO(user): Put a warning in the javadoc about tiny fpp values, since the resulting size
338 * is proportional to -log(p), but there is not much of a point after all, e.g.
339 * optimalM(1000, 0.0000000000000001) = 76680 which is less than 10kb. Who cares!
340 */
341 long numBits = optimalNumOfBits(expectedInsertions, fpp);
342 int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
343 try {
344 return new BloomFilterAdvanced<T>(new BitArray(numBits), numHashFunctions, funnel, strategy);
345 } catch (IllegalArgumentException e) {
346 throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
347 }
348 }
349
350 /**
351 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and a
352 * default expected false positive probability of 3%.
353 *
354 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
355 * will result in its saturation, and a sharp deterioration of its false positive probability.
356 *
357 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
358 * {@code Funnel<T>} is.
359 *
360 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
361 * ensuring proper serialization and deserialization, which is important since {@link #equals}
362 * also relies on object identity of funnels.
363 *
364 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
365 * @param expectedInsertions the number of expected insertions to the constructed
366 * {@code BloomFilter<T>}; must be positive
367 * @return a {@code BloomFilter}
368 */
369 public static <T> BloomFilterAdvanced<T> create(Funnel<? super T> funnel, int expectedInsertions) {
370 return create(funnel, (long) expectedInsertions);
371 }
372
373 /**
374 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and a
375 * default expected false positive probability of 3%.
376 *
377 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
378 * will result in its saturation, and a sharp deterioration of its false positive probability.
379 *
380 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
381 * {@code Funnel<T>} is.
382 *
383 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
384 * ensuring proper serialization and deserialization, which is important since {@link #equals}
385 * also relies on object identity of funnels.
386 *
387 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
388 * @param expectedInsertions the number of expected insertions to the constructed
389 * {@code BloomFilter<T>}; must be positive
390 * @return a {@code BloomFilter}
391 * @since 19.0
392 */
393 public static <T> BloomFilterAdvanced<T> create(Funnel<? super T> funnel, long expectedInsertions) {
394 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
395 }
396
397 // Cheat sheet:
398 //
399 // m: total bits
400 // n: expected insertions
401 // b: m/n, bits per insertion
402 // p: expected false positive probability
403 //
404 // 1) Optimal k = b * ln2
405 // 2) p = (1 - e ^ (-kn/m))^k
406 // 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
407 // 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
408
409 /**
410 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
411 * expected insertions and total number of bits in the Bloom filter.
412 *
413 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
414 *
415 * @param n expected insertions (must be positive)
416 * @param m total number of bits in Bloom filter (must be positive)
417 */
418 @VisibleForTesting
419 static int optimalNumOfHashFunctions(long n, long m) {
420 // (m / n) * log(2), but avoid truncation due to division!
421 return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
422 }
423
424 /**
425 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
426 * expected insertions, the required false positive probability.
427 *
428 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
429 *
430 * @param n expected insertions (must be positive)
431 * @param p false positive rate (must be 0 < p < 1)
432 */
433 @VisibleForTesting
434 static long optimalNumOfBits(long n, double p) {
435 if (p == 0) {
436 p = Double.MIN_VALUE;
437 }
438 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
439 }
440
441 private Object writeReplace() {
442 return new SerialForm<T>(this);
443 }
444
445 private static class SerialForm<T> implements Serializable {
446 final long[] data;
447 final int numHashFunctions;
448 final Funnel<? super T> funnel;
449 final BloomFilter.Strategy strategy;
450
451 SerialForm(BloomFilterAdvanced<T> bf) {
452 this.data = bf.bits.data;
453 this.numHashFunctions = bf.numHashFunctions;
454 this.funnel = bf.funnel;
455 this.strategy = bf.strategy;
456 }
457
458 Object readResolve() {
459 return new BloomFilterAdvanced<T>(new BitArray(data), numHashFunctions, funnel, strategy);
460 }
461
462 private static final long serialVersionUID = 1;
463 }
464
465 /**
466 * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java
467 * serialization). This has been measured to save at least 400 bytes compared to regular
468 * serialization.
469 *
470 * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter.
471 */
472 public void writeTo(OutputStream out) throws IOException {
473 // Serial form:
474 // 1 signed byte for the strategy
475 // 1 unsigned byte for the number of hash functions
476 // 1 big endian int, the number of longs in our bitset
477 // N big endian longs of our bitset
478 DataOutputStream dout = new DataOutputStream(out);
479 dout.writeByte(SignedBytes.checkedCast(strategy.ordinal()));
480 dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor
481 dout.writeInt(bits.data.length);
482 for (long value : bits.data) {
483 dout.writeLong(value);
484 }
485 }
486
487 /**
488 * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into a
489 * {@code BloomFilter<T>}.
490 *
491 * The {@code Funnel} to be used is not encoded in the stream, so it must be provided here.
492 * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to populate
493 * the original Bloom filter!
494 *
495 * @throws IOException if the InputStream throws an {@code IOException}, or if its data does not
496 * appear to be a BloomFilter serialized using the {@linkplain #writeTo(OutputStream)} method.
497 */
498 public static <T> BloomFilterAdvanced<T> readFrom(InputStream in, Funnel<T> funnel) throws IOException {
499 checkNotNull(in, "InputStream");
500 checkNotNull(funnel, "Funnel");
501 int strategyOrdinal = -1;
502 int numHashFunctions = -1;
503 int dataLength = -1;
504 try {
505 DataInputStream din = new DataInputStream(in);
506 // currently this assumes there is no negative ordinal; will have to be updated if we
507 // add non-stateless strategies (for which we've reserved negative ordinals; see
508 // Strategy.ordinal()).
509 strategyOrdinal = din.readByte();
510 numHashFunctions = UnsignedBytes.toInt(din.readByte());
511 dataLength = din.readInt();
512
513 BloomFilter.Strategy strategy = BloomFilterStrategies.values()[strategyOrdinal];
514 long[] data = new long[dataLength];
515 for (int i = 0; i < data.length; i++) {
516 data[i] = din.readLong();
517 }
518 return new BloomFilterAdvanced<T>(new BitArray(data), numHashFunctions, funnel, strategy);
519 } catch (RuntimeException e) {
520 String message =
521 "Unable to deserialize BloomFilter from InputStream."
522 + " strategyOrdinal: "
523 + strategyOrdinal
524 + " numHashFunctions: "
525 + numHashFunctions
526 + " dataLength: "
527 + dataLength;
528 throw new IOException(message, e);
529 }
530 }
531}
Note: See TracBrowser for help on using the repository browser.