source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/com/google/common/hash/BloomFilterAdvanced.java@ 31206

Last change on this file since 31206 was 31206, checked in by davidb, 7 years ago

More tweaking of Guava cloned code

  • Property svn:executable set to *
File size: 21.6 KB
Line 
1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15package com.google.common.hash;
16
17import static com.google.common.base.Preconditions.checkArgument;
18import static com.google.common.base.Preconditions.checkNotNull;
19
20import com.google.common.annotations.Beta;
21import com.google.common.annotations.VisibleForTesting;
22import com.google.common.base.Objects;
23import com.google.common.base.Predicate;
24import com.google.common.hash.BloomFilterStrategies.BitArray;
25import com.google.common.primitives.SignedBytes;
26import com.google.common.primitives.UnsignedBytes;
27//import com.google.errorprone.annotations.CanIgnoreReturnValue;
28import java.io.DataInputStream;
29import java.io.DataOutputStream;
30import java.io.IOException;
31import java.io.InputStream;
32import java.io.OutputStream;
33import java.io.Serializable;
34import javax.annotation.Nullable;
35
36import com.google.common.hash.BloomFilter;
37
38
39/**
40 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
41 * with one-sided error: if it claims that an element is contained in it, this might be in error,
42 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
43 *
44 * <p>If you are unfamiliar with Bloom filters, this nice
45 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
46 * how they work.
47 *
48 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability
49 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that
50 * has not actually been put in the {@code BloomFilter}.
51 *
52 * <p>Bloom filters are serializable. They also support a more compact serial representation via the
53 * {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be
54 * supported by future versions of this library. However, serial forms generated by newer versions
55 * of the code may not be readable by older versions of the code (e.g., a serialized bloom filter
56 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago).
57 *
58 * @param <T> the type of instances that the {@code BloomFilter} accepts
59 * @author Dimitris Andreou
60 * @author Kevin Bourrillion
61 * @since 11.0
62 */
63@Beta
64public final class BloomFilterAdvanced<T> implements Predicate<T>, Serializable {
65 /**
66 * A strategy to translate T instances, to {@code numHashFunctions} bit indexes.
67 *
68 * <p>Implementations should be collections of pure functions (i.e. stateless).
69 */
70 interface Strategy extends java.io.Serializable {
71
72 /**
73 * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element.
74 *
75 * <p>Returns whether any bits changed as a result of this operation.
76 */
77 <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
78
79 /**
80 * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element;
81 * returns {@code true} if and only if all selected bits are set.
82 */
83 <T> boolean mightContain(
84 T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
85
86 /**
87 * Identifier used to encode this strategy, when marshalled as part of a BloomFilter. Only
88 * values in the [-128, 127] range are valid for the compact serial form. Non-negative values
89 * are reserved for enums defined in BloomFilterStrategies; negative values are reserved for any
90 * custom, stateful strategy we may define (e.g. any kind of strategy that would depend on user
91 * input).
92 */
93 int ordinal();
94 }
95
96 /** The bit set of the BloomFilter (not necessarily power of 2!) */
97 private final BitArray bits;
98
99 /** Number of hashes per element */
100 private final int numHashFunctions;
101
102 /** The funnel to translate Ts to bytes */
103 private final Funnel<? super T> funnel;
104
105 /**
106 * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes.
107 */
108 private final Strategy strategy;
109
110 /**
111 * Creates a BloomFilter.
112 */
113 private BloomFilterAdvanced(
114 BitArray bits, int numHashFunctions, Funnel<? super T> funnel, Strategy strategy) {
115 checkArgument(numHashFunctions > 0, "numHashFunctions (%s) must be > 0", numHashFunctions);
116 checkArgument(
117 numHashFunctions <= 255, "numHashFunctions (%s) must be <= 255", numHashFunctions);
118 this.bits = checkNotNull(bits);
119 this.numHashFunctions = numHashFunctions;
120 this.funnel = checkNotNull(funnel);
121 this.strategy = checkNotNull(strategy);
122 }
123
124 /**
125 * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to
126 * this instance but shares no mutable state.
127 *
128 * @since 12.0
129 */
130 public BloomFilterAdvanced<T> copy() {
131 return new BloomFilterAdvanced<T>(bits.copy(), numHashFunctions, funnel, strategy);
132 }
133
134 /**
135 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
136 * {@code false} if this is <i>definitely</i> not the case.
137 */
138 public boolean mightContain(T object) {
139 return strategy.mightContain(object, funnel, numHashFunctions, bits);
140 }
141
142 /**
143 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain}
144 * instead.
145 */
146 @Deprecated
147 @Override
148 public boolean apply(T input) {
149 return mightContain(input);
150 }
151
152 /**
153 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
154 * {@link #mightContain(Object)} with the same element will always return {@code true}.
155 *
156 * @return true if the bloom filter's bits changed as a result of this operation. If the bits
157 * changed, this is <i>definitely</i> the first time {@code object} has been added to the
158 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} has
159 * been added to the filter. Note that {@code put(t)} always returns the <i>opposite</i>
160 * result to what {@code mightContain(t)} would have returned at the time it is called."
161 * @since 12.0 (present in 11.0 with {@code void} return type})
162 */
163 //@CanIgnoreReturnValue
164 public boolean put(T object) {
165 return strategy.put(object, funnel, numHashFunctions, bits);
166 }
167
168 /**
169 * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return
170 * {@code true} for an object that has not actually been put in the {@code BloomFilter}.
171 *
172 * <p>Ideally, this number should be close to the {@code fpp} parameter passed in
173 * {@linkplain #create(Funnel, int, double)}, or smaller. If it is significantly higher, it is
174 * usually the case that too many elements (more than expected) have been put in the
175 * {@code BloomFilter}, degenerating it.
176 *
177 * @since 14.0 (since 11.0 as expectedFalsePositiveProbability())
178 */
179 public double expectedFpp() {
180 // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!)
181 return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions);
182 }
183
184 /**
185 * Returns the number of bits in the underlying bit array.
186 */
187 @VisibleForTesting
188 long bitSize() {
189 return bits.bitSize();
190 }
191
192 /**
193 * Determines whether a given bloom filter is compatible with this bloom filter. For two bloom
194 * filters to be compatible, they must:
195 *
196 * <ul>
197 * <li>not be the same instance
198 * <li>have the same number of hash functions
199 * <li>have the same bit size
200 * <li>have the same strategy
201 * <li>have equal funnels
202 * <ul>
203 *
204 * @param that The bloom filter to check for compatibility.
205 * @since 15.0
206 */
207 public boolean isCompatible(BloomFilterAdvanced<T> that) {
208 checkNotNull(that);
209 return (this != that)
210 && (this.numHashFunctions == that.numHashFunctions)
211 && (this.bitSize() == that.bitSize())
212 && (this.strategy.equals(that.strategy))
213 && (this.funnel.equals(that.funnel));
214 }
215
216 /**
217 * Combines this bloom filter with another bloom filter by performing a bitwise OR of the
218 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the bloom
219 * filters are appropriately sized to avoid saturating them.
220 *
221 * @param that The bloom filter to combine this bloom filter with. It is not mutated.
222 * @throws IllegalArgumentException if {@code isCompatible(that) == false}
223 *
224 * @since 15.0
225 */
226 public void putAll(BloomFilterAdvanced<T> that) {
227 checkNotNull(that);
228 checkArgument(this != that, "Cannot combine a BloomFilter with itself.");
229 checkArgument(
230 this.numHashFunctions == that.numHashFunctions,
231 "BloomFilters must have the same number of hash functions (%s != %s)",
232 this.numHashFunctions,
233 that.numHashFunctions);
234 checkArgument(
235 this.bitSize() == that.bitSize(),
236 "BloomFilters must have the same size underlying bit arrays (%s != %s)",
237 this.bitSize(),
238 that.bitSize());
239 checkArgument(
240 this.strategy.equals(that.strategy),
241 "BloomFilters must have equal strategies (%s != %s)",
242 this.strategy,
243 that.strategy);
244 checkArgument(
245 this.funnel.equals(that.funnel),
246 "BloomFilters must have equal funnels (%s != %s)",
247 this.funnel,
248 that.funnel);
249 this.bits.putAll(that.bits);
250 }
251
252 @Override
253 public boolean equals(@Nullable Object object) {
254 if (object == this) {
255 return true;
256 }
257 if (object instanceof BloomFilterAdvanced) {
258 BloomFilterAdvanced<?> that = (BloomFilterAdvanced<?>) object;
259 return this.numHashFunctions == that.numHashFunctions
260 && this.funnel.equals(that.funnel)
261 && this.bits.equals(that.bits)
262 && this.strategy.equals(that.strategy);
263 }
264 return false;
265 }
266
267 @Override
268 public int hashCode() {
269 return Objects.hashCode(numHashFunctions, funnel, strategy, bits);
270 }
271
272 /**
273 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and
274 * expected false positive probability.
275 *
276 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
277 * will result in its saturation, and a sharp deterioration of its false positive probability.
278 *
279 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
280 * {@code Funnel<T>} is.
281 *
282 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
283 * ensuring proper serialization and deserialization, which is important since {@link #equals}
284 * also relies on object identity of funnels.
285 *
286 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
287 * @param expectedInsertions the number of expected insertions to the constructed
288 * {@code BloomFilter<T>}; must be positive
289 * @param fpp the desired false positive probability (must be positive and less than 1.0)
290 * @return a {@code BloomFilter}
291 */
292 public static <T> BloomFilterAdvanced<T> create(
293 Funnel<? super T> funnel, int expectedInsertions, double fpp) {
294 return create(funnel, (long) expectedInsertions, fpp);
295 }
296
297 /**
298 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and
299 * expected false positive probability.
300 *
301 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
302 * will result in its saturation, and a sharp deterioration of its false positive probability.
303 *
304 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
305 * {@code Funnel<T>} is.
306 *
307 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
308 * ensuring proper serialization and deserialization, which is important since {@link #equals}
309 * also relies on object identity of funnels.
310 *
311 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
312 * @param expectedInsertions the number of expected insertions to the constructed
313 * {@code BloomFilter<T>}; must be positive
314 * @param fpp the desired false positive probability (must be positive and less than 1.0)
315 * @return a {@code BloomFilter}
316 * @since 19.0
317 */
318 public static <T> BloomFilterAdvanced<T> create(
319 Funnel<? super T> funnel, long expectedInsertions, double fpp) {
320 return create(funnel, expectedInsertions, fpp, BloomFilterAdvancedStrategies.MURMUR128_MITZ_64);
321 }
322
323 @VisibleForTesting
324 static <T> BloomFilterAdvanced<T> create(
325 Funnel<? super T> funnel, long expectedInsertions, double fpp, Strategy strategy) {
326 checkNotNull(funnel);
327 checkArgument(
328 expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", expectedInsertions);
329 checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp);
330 checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp);
331 checkNotNull(strategy);
332
333 if (expectedInsertions == 0) {
334 expectedInsertions = 1;
335 }
336 /*
337 * TODO(user): Put a warning in the javadoc about tiny fpp values, since the resulting size
338 * is proportional to -log(p), but there is not much of a point after all, e.g.
339 * optimalM(1000, 0.0000000000000001) = 76680 which is less than 10kb. Who cares!
340 */
341 long numBits = optimalNumOfBits(expectedInsertions, fpp);
342 int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
343 try {
344 return new BloomFilterAdvanced<T>(new BitArray(numBits), numHashFunctions, funnel, strategy);
345 } catch (IllegalArgumentException e) {
346 throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
347 }
348 }
349
350 /**
351 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and a
352 * default expected false positive probability of 3%.
353 *
354 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
355 * will result in its saturation, and a sharp deterioration of its false positive probability.
356 *
357 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
358 * {@code Funnel<T>} is.
359 *
360 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
361 * ensuring proper serialization and deserialization, which is important since {@link #equals}
362 * also relies on object identity of funnels.
363 *
364 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
365 * @param expectedInsertions the number of expected insertions to the constructed
366 * {@code BloomFilter<T>}; must be positive
367 * @return a {@code BloomFilter}
368 */
369 public static <T> BloomFilterAdvanced<T> create(Funnel<? super T> funnel, int expectedInsertions) {
370 return create(funnel, (long) expectedInsertions);
371 }
372
373 /**
374 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of insertions and a
375 * default expected false positive probability of 3%.
376 *
377 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
378 * will result in its saturation, and a sharp deterioration of its false positive probability.
379 *
380 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
381 * {@code Funnel<T>} is.
382 *
383 * <p>It is recommended that the funnel be implemented as a Java enum. This has the benefit of
384 * ensuring proper serialization and deserialization, which is important since {@link #equals}
385 * also relies on object identity of funnels.
386 *
387 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
388 * @param expectedInsertions the number of expected insertions to the constructed
389 * {@code BloomFilter<T>}; must be positive
390 * @return a {@code BloomFilter}
391 * @since 19.0
392 */
393 public static <T> BloomFilterAdvanced<T> create(Funnel<? super T> funnel, long expectedInsertions) {
394 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
395 }
396
397 // Cheat sheet:
398 //
399 // m: total bits
400 // n: expected insertions
401 // b: m/n, bits per insertion
402 // p: expected false positive probability
403 //
404 // 1) Optimal k = b * ln2
405 // 2) p = (1 - e ^ (-kn/m))^k
406 // 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
407 // 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
408
409 /**
410 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
411 * expected insertions and total number of bits in the Bloom filter.
412 *
413 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
414 *
415 * @param n expected insertions (must be positive)
416 * @param m total number of bits in Bloom filter (must be positive)
417 */
418 @VisibleForTesting
419 static int optimalNumOfHashFunctions(long n, long m) {
420 // (m / n) * log(2), but avoid truncation due to division!
421 return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
422 }
423
424 /**
425 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
426 * expected insertions, the required false positive probability.
427 *
428 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
429 *
430 * @param n expected insertions (must be positive)
431 * @param p false positive rate (must be 0 < p < 1)
432 */
433 @VisibleForTesting
434 static long optimalNumOfBits(long n, double p) {
435 if (p == 0) {
436 p = Double.MIN_VALUE;
437 }
438 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
439 }
440
441 private Object writeReplace() {
442 return new SerialForm<T>(this);
443 }
444
445 private static class SerialForm<T> implements Serializable {
446 final long[] data;
447 final int numHashFunctions;
448 final Funnel<? super T> funnel;
449 final Strategy strategy;
450
451 SerialForm(BloomFilterAdvanced<T> bf) {
452 this.data = bf.bits.data;
453 this.numHashFunctions = bf.numHashFunctions;
454 this.funnel = bf.funnel;
455 this.strategy = bf.strategy;
456 }
457
458 Object readResolve() {
459 return new BloomFilterAdvanced<T>(new BitArray(data), numHashFunctions, funnel, strategy);
460 }
461
462 private static final long serialVersionUID = 1;
463 }
464
465 /**
466 * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java
467 * serialization). This has been measured to save at least 400 bytes compared to regular
468 * serialization.
469 *
470 * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter.
471 */
472 public void writeTo(OutputStream out) throws IOException {
473 // Serial form:
474 // 1 signed byte for the strategy
475 // 1 unsigned byte for the number of hash functions
476 // 1 big endian int, the number of longs in our bitset
477 // N big endian longs of our bitset
478 DataOutputStream dout = new DataOutputStream(out);
479 dout.writeByte(SignedBytes.checkedCast(strategy.ordinal()));
480 dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor
481 dout.writeInt(bits.data.length);
482 for (long value : bits.data) {
483 dout.writeLong(value);
484 }
485 }
486
487 /**
488 * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into a
489 * {@code BloomFilter<T>}.
490 *
491 * The {@code Funnel} to be used is not encoded in the stream, so it must be provided here.
492 * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to populate
493 * the original Bloom filter!
494 *
495 * @throws IOException if the InputStream throws an {@code IOException}, or if its data does not
496 * appear to be a BloomFilter serialized using the {@linkplain #writeTo(OutputStream)} method.
497 */
498 public static <T> BloomFilterAdvanced<T> readFrom(InputStream in, Funnel<T> funnel) throws IOException {
499 checkNotNull(in, "InputStream");
500 checkNotNull(funnel, "Funnel");
501 int strategyOrdinal = -1;
502 int numHashFunctions = -1;
503 int dataLength = -1;
504 try {
505 DataInputStream din = new DataInputStream(in);
506 // currently this assumes there is no negative ordinal; will have to be updated if we
507 // add non-stateless strategies (for which we've reserved negative ordinals; see
508 // Strategy.ordinal()).
509 strategyOrdinal = din.readByte();
510 numHashFunctions = UnsignedBytes.toInt(din.readByte());
511 dataLength = din.readInt();
512
513 Strategy strategy = BloomFilterAdvancedStrategies.values()[strategyOrdinal];
514 long[] data = new long[dataLength];
515 for (int i = 0; i < data.length; i++) {
516 data[i] = din.readLong();
517 }
518 return new BloomFilterAdvanced<T>(new BitArray(data), numHashFunctions, funnel, strategy);
519 } catch (RuntimeException e) {
520 String message =
521 "Unable to deserialize BloomFilter from InputStream."
522 + " strategyOrdinal: "
523 + strategyOrdinal
524 + " numHashFunctions: "
525 + numHashFunctions
526 + " dataLength: "
527 + dataLength;
528 throw new IOException(message, e);
529 }
530 }
531}
Note: See TracBrowser for help on using the repository browser.