source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/com/google/common/hash/BloomFilterAdvancedStrategies.java@ 31209

Last change on this file since 31209 was 31209, checked in by davidb, 7 years ago

checkArgument added in

  • Property svn:executable set to *
File size: 7.9 KB
Line 
1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15package com.google.common.hash;
16
17//import static com.google.common.base.Preconditions.checkArgument;
18
19import com.google.common.math.LongMath;
20import com.google.common.primitives.Ints;
21import com.google.common.primitives.Longs;
22import java.math.RoundingMode;
23import java.util.Arrays;
24import javax.annotation.Nullable;
25
26
27/**
28 * Collections of strategies of generating the k * log(M) bits required for an element to be mapped
29 * to a BloomFilter of M bits and k hash functions. These strategies are part of the serialized form
30 * of the Bloom filters that use them, thus they must be preserved as is (no updates allowed, only
31 * introduction of new versions).
32 *
33 * Important: the order of the constants cannot change, and they cannot be deleted - we depend on
34 * their ordinal for BloomFilter serialization.
35 *
36 * @author Dimitris Andreou
37 * @author Kurt Alfred Kluever
38 */
39enum BloomFilterAdvancedStrategies implements BloomFilterAdvanced.Strategy {
40 /**
41 * See "Less Hashing, Same Performance: Building a Better Bloom Filter" by Adam Kirsch and Michael
42 * Mitzenmacher. The paper argues that this trick doesn't significantly deteriorate the
43 * performance of a Bloom filter (yet only needs two 32bit hash functions).
44 */
45 MURMUR128_MITZ_32() {
46 @Override
47 public <T> boolean put(
48 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterAdvancedStrategies.BitArray bits) {
49 long bitSize = bits.bitSize();
50 long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
51 int hash1 = (int) hash64;
52 int hash2 = (int) (hash64 >>> 32);
53
54 boolean bitsChanged = false;
55 for (int i = 1; i <= numHashFunctions; i++) {
56 int combinedHash = hash1 + (i * hash2);
57 // Flip all the bits if it's negative (guaranteed positive number)
58 if (combinedHash < 0) {
59 combinedHash = ~combinedHash;
60 }
61 bitsChanged |= bits.set(combinedHash % bitSize);
62 }
63 return bitsChanged;
64 }
65
66 @Override
67 public <T> boolean mightContain(
68 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterAdvancedStrategies.BitArray bits) {
69 long bitSize = bits.bitSize();
70 long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
71 int hash1 = (int) hash64;
72 int hash2 = (int) (hash64 >>> 32);
73
74 for (int i = 1; i <= numHashFunctions; i++) {
75 int combinedHash = hash1 + (i * hash2);
76 // Flip all the bits if it's negative (guaranteed positive number)
77 if (combinedHash < 0) {
78 combinedHash = ~combinedHash;
79 }
80 if (!bits.get(combinedHash % bitSize)) {
81 return false;
82 }
83 }
84 return true;
85 }
86 },
87 /**
88 * This strategy uses all 128 bits of {@link Hashing#murmur3_128} when hashing. It looks different
89 * than the implementation in MURMUR128_MITZ_32 because we're avoiding the multiplication in the
90 * loop and doing a (much simpler) += hash2. We're also changing the index to a positive number by
91 * AND'ing with Long.MAX_VALUE instead of flipping the bits.
92 */
93 MURMUR128_MITZ_64() {
94 @Override
95 public <T> boolean put(
96 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterAdvancedStrategies.BitArray bits) {
97 long bitSize = bits.bitSize();
98 byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
99 long hash1 = lowerEight(bytes);
100 long hash2 = upperEight(bytes);
101
102 boolean bitsChanged = false;
103 long combinedHash = hash1;
104 for (int i = 0; i < numHashFunctions; i++) {
105 // Make the combined hash positive and indexable
106 bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
107 combinedHash += hash2;
108 }
109 return bitsChanged;
110 }
111
112 @Override
113 public <T> boolean mightContain(
114 T object, Funnel<? super T> funnel, int numHashFunctions, BloomFilterAdvancedStrategies.BitArray bits) {
115 long bitSize = bits.bitSize();
116 byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
117 long hash1 = lowerEight(bytes);
118 long hash2 = upperEight(bytes);
119
120 long combinedHash = hash1;
121 for (int i = 0; i < numHashFunctions; i++) {
122 // Make the combined hash positive and indexable
123 if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
124 return false;
125 }
126 combinedHash += hash2;
127 }
128 return true;
129 }
130
131 private /* static */ long lowerEight(byte[] bytes) {
132 return Longs.fromBytes(
133 bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
134 }
135
136 private /* static */ long upperEight(byte[] bytes) {
137 return Longs.fromBytes(
138 bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
139 }
140 };
141
142 // Note: We use this instead of java.util.BitSet because we need access to the long[] data field
143 static final class BitArray {
144 final long[] data;
145 long bitCount;
146
147 BitArray(long bits) {
148 this(new long[Ints.checkedCast(LongMath.divide(bits, 64, RoundingMode.CEILING))]);
149 }
150
151 // Used by serialization
152 BitArray(long[] data) {
153 checkArgument(data.length > 0, "data length is zero!");
154 this.data = data;
155 long bitCount = 0;
156 for (long value : data) {
157 bitCount += Long.bitCount(value);
158 }
159 this.bitCount = bitCount;
160 }
161
162 /** Returns true if the bit changed value. */
163 boolean set(long index) {
164 if (!get(index)) {
165 data[(int) (index >>> 6)] |= (1L << index);
166 bitCount++;
167 return true;
168 }
169 return false;
170 }
171
172 boolean get(long index) {
173 return (data[(int) (index >>> 6)] & (1L << index)) != 0;
174 }
175
176 /** Number of bits */
177 long bitSize() {
178 return (long) data.length * Long.SIZE;
179 }
180
181 /** Number of set bits (1s) */
182 long bitCount() {
183 return bitCount;
184 }
185
186 BitArray copy() {
187 return new BitArray(data.clone());
188 }
189
190 /** Combines the two BitArrays using bitwise OR. */
191 void putAll(BitArray array) {
192 checkArgument(
193 data.length == array.data.length,
194 "BitArrays must be of equal length (%s != %s)",
195 data.length,
196 array.data.length);
197 bitCount = 0;
198 for (int i = 0; i < data.length; i++) {
199 data[i] |= array.data[i];
200 bitCount += Long.bitCount(data[i]);
201 }
202 }
203
204 @Override
205 public boolean equals(@Nullable Object o) {
206 if (o instanceof BitArray) {
207 BitArray bitArray = (BitArray) o;
208 return Arrays.equals(data, bitArray.data);
209 }
210 return false;
211 }
212
213 @Override
214 public int hashCode() {
215 return Arrays.hashCode(data);
216 }
217
218 public static void checkArgument(boolean expression, @Nullable Object errorMessage) {
219 if (!expression) {
220 throw new IllegalArgumentException(String.valueOf(errorMessage));
221 }
222 }
223 public static void checkArgument(
224 boolean b, @Nullable String errorMessageTemplate, @Nullable Object p1, @Nullable Object p2) {
225 if (!b) {
226 throw new IllegalArgumentException(String.format(errorMessageTemplate, p1, p2));
227 }
228 }
229
230 }
231}
Note: See TracBrowser for help on using the repository browser.