source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/com/google/common/hash/BloomFilterAdvancedStrategies.java@ 31205

Last change on this file since 31205 was 31205, checked in by davidb, 7 years ago

Next added in part of new Guava code

  • Property svn:executable set to *
File size: 7.4 KB
Line 
1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15package com.google.common.hash;
16
17import static com.google.common.base.Preconditions.checkArgument;
18
19import com.google.common.math.LongMath;
20import com.google.common.primitives.Ints;
21import com.google.common.primitives.Longs;
22import java.math.RoundingMode;
23import java.util.Arrays;
24import javax.annotation.Nullable;
25
26/**
27 * Collections of strategies of generating the k * log(M) bits required for an element to be mapped
28 * to a BloomFilter of M bits and k hash functions. These strategies are part of the serialized form
29 * of the Bloom filters that use them, thus they must be preserved as is (no updates allowed, only
30 * introduction of new versions).
31 *
32 * Important: the order of the constants cannot change, and they cannot be deleted - we depend on
33 * their ordinal for BloomFilter serialization.
34 *
35 * @author Dimitris Andreou
36 * @author Kurt Alfred Kluever
37 */
38enum BloomFilterAdvancedStrategies implements BloomFilter.Strategy {
39 /**
40 * See "Less Hashing, Same Performance: Building a Better Bloom Filter" by Adam Kirsch and Michael
41 * Mitzenmacher. The paper argues that this trick doesn't significantly deteriorate the
42 * performance of a Bloom filter (yet only needs two 32bit hash functions).
43 */
44 MURMUR128_MITZ_32() {
45 @Override
46 public <T> boolean put(
47 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterStrategies.BitArray bits) {
48 long bitSize = bits.bitSize();
49 long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
50 int hash1 = (int) hash64;
51 int hash2 = (int) (hash64 >>> 32);
52
53 boolean bitsChanged = false;
54 for (int i = 1; i <= numHashFunctions; i++) {
55 int combinedHash = hash1 + (i * hash2);
56 // Flip all the bits if it's negative (guaranteed positive number)
57 if (combinedHash < 0) {
58 combinedHash = ~combinedHash;
59 }
60 bitsChanged |= bits.set(combinedHash % bitSize);
61 }
62 return bitsChanged;
63 }
64
65 @Override
66 public <T> boolean mightContain(
67 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterStrategies.BitArray bits) {
68 long bitSize = bits.bitSize();
69 long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
70 int hash1 = (int) hash64;
71 int hash2 = (int) (hash64 >>> 32);
72
73 for (int i = 1; i <= numHashFunctions; i++) {
74 int combinedHash = hash1 + (i * hash2);
75 // Flip all the bits if it's negative (guaranteed positive number)
76 if (combinedHash < 0) {
77 combinedHash = ~combinedHash;
78 }
79 if (!bits.get(combinedHash % bitSize)) {
80 return false;
81 }
82 }
83 return true;
84 }
85 },
86 /**
87 * This strategy uses all 128 bits of {@link Hashing#murmur3_128} when hashing. It looks different
88 * than the implementation in MURMUR128_MITZ_32 because we're avoiding the multiplication in the
89 * loop and doing a (much simpler) += hash2. We're also changing the index to a positive number by
90 * AND'ing with Long.MAX_VALUE instead of flipping the bits.
91 */
92 MURMUR128_MITZ_64() {
93 @Override
94 public <T> boolean put(
95 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterStrategies.BitArray bits) {
96 long bitSize = bits.bitSize();
97 byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
98 long hash1 = lowerEight(bytes);
99 long hash2 = upperEight(bytes);
100
101 boolean bitsChanged = false;
102 long combinedHash = hash1;
103 for (int i = 0; i < numHashFunctions; i++) {
104 // Make the combined hash positive and indexable
105 bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
106 combinedHash += hash2;
107 }
108 return bitsChanged;
109 }
110
111 @Override
112 public <T> boolean mightContain(
113 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterStrategies.BitArray bits) {
114 long bitSize = bits.bitSize();
115 byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
116 long hash1 = lowerEight(bytes);
117 long hash2 = upperEight(bytes);
118
119 long combinedHash = hash1;
120 for (int i = 0; i < numHashFunctions; i++) {
121 // Make the combined hash positive and indexable
122 if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
123 return false;
124 }
125 combinedHash += hash2;
126 }
127 return true;
128 }
129
130 private /* static */ long lowerEight(byte[] bytes) {
131 return Longs.fromBytes(
132 bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
133 }
134
135 private /* static */ long upperEight(byte[] bytes) {
136 return Longs.fromBytes(
137 bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
138 }
139 };
140
141 // Note: We use this instead of java.util.BitSet because we need access to the long[] data field
142 static final class BitArray {
143 final long[] data;
144 long bitCount;
145
146 BitArray(long bits) {
147 this(new long[Ints.checkedCast(LongMath.divide(bits, 64, RoundingMode.CEILING))]);
148 }
149
150 // Used by serialization
151 BitArray(long[] data) {
152 checkArgument(data.length > 0, "data length is zero!");
153 this.data = data;
154 long bitCount = 0;
155 for (long value : data) {
156 bitCount += Long.bitCount(value);
157 }
158 this.bitCount = bitCount;
159 }
160
161 /** Returns true if the bit changed value. */
162 boolean set(long index) {
163 if (!get(index)) {
164 data[(int) (index >>> 6)] |= (1L << index);
165 bitCount++;
166 return true;
167 }
168 return false;
169 }
170
171 boolean get(long index) {
172 return (data[(int) (index >>> 6)] & (1L << index)) != 0;
173 }
174
175 /** Number of bits */
176 long bitSize() {
177 return (long) data.length * Long.SIZE;
178 }
179
180 /** Number of set bits (1s) */
181 long bitCount() {
182 return bitCount;
183 }
184
185 BitArray copy() {
186 return new BitArray(data.clone());
187 }
188
189 /** Combines the two BitArrays using bitwise OR. */
190 void putAll(BitArray array) {
191 checkArgument(
192 data.length == array.data.length,
193 "BitArrays must be of equal length (%s != %s)",
194 data.length,
195 array.data.length);
196 bitCount = 0;
197 for (int i = 0; i < data.length; i++) {
198 data[i] |= array.data[i];
199 bitCount += Long.bitCount(data[i]);
200 }
201 }
202
203 @Override
204 public boolean equals(@Nullable Object o) {
205 if (o instanceof BitArray) {
206 BitArray bitArray = (BitArray) o;
207 return Arrays.equals(data, bitArray.data);
208 }
209 return false;
210 }
211
212 @Override
213 public int hashCode() {
214 return Arrays.hashCode(data);
215 }
216 }
217}
Note: See TracBrowser for help on using the repository browser.