Skip to content

Commit 41828f6

Browse files
committed
Add a new BF strategy. The old one is subtly broken. For more information, see: https://code.google.com/p/guava-libraries/issues/detail?id=1119
------------- Created by MOE: http://code.google.com/p/moe-java MOE_MIGRATED_REVID=63970075
1 parent 353b6e6 commit 41828f6

File tree

2 files changed

+114
-11
lines changed

2 files changed

+114
-11
lines changed

guava-tests/test/com/google/common/hash/BloomFilterTest.java

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
* @author Dimitris Andreou
3939
*/
4040
public class BloomFilterTest extends TestCase {
41-
4241
public void testLargeBloomFilterDoesntOverflow() {
4342
long numBits = Integer.MAX_VALUE;
4443
numBits++;
@@ -93,6 +92,47 @@ public void testCreateAndCheckMitz32BloomFilterWithKnownFalsePositives() {
9392
assertEquals(actualFpp, expectedFpp, 0.00015);
9493
}
9594

95+
public void testCreateAndCheckBloomFilterWithKnownFalsePositives64() {
96+
int numInsertions = 1000000;
97+
BloomFilter<CharSequence> bf = BloomFilter.create(
98+
Funnels.unencodedCharsFunnel(), numInsertions, 0.03,
99+
BloomFilterStrategies.MURMUR128_MITZ_64);
100+
101+
// Insert "numInsertions" even numbers into the BF.
102+
for (int i = 0; i < numInsertions * 2; i += 2) {
103+
bf.put(Integer.toString(i));
104+
}
105+
106+
// Assert that the BF "might" have all of the even numbers.
107+
for (int i = 0; i < numInsertions * 2; i += 2) {
108+
assertTrue(bf.mightContain(Integer.toString(i)));
109+
}
110+
111+
// Now we check for known false positives using a set of known false positives.
112+
// (These are all of the false positives under 900.)
113+
ImmutableSet<Integer> falsePositives = ImmutableSet.of(
114+
1, 143, 231, 287, 311, 319, 331, 421, 457, 547, 599, 659, 723);
115+
for (int i = 1; i < 900; i += 2) {
116+
if (!falsePositives.contains(i)) {
117+
assertFalse("BF should not contain " + i, bf.mightContain(Integer.toString(i)));
118+
}
119+
}
120+
121+
// Check that there are exactly 29651 false positives for this BF.
122+
int knownNumberOfFalsePositives = 29651;
123+
int numFpp = 0;
124+
for (int i = 1; i < numInsertions * 2; i += 2) {
125+
if (bf.mightContain(Integer.toString(i))) {
126+
numFpp++;
127+
}
128+
}
129+
assertEquals(knownNumberOfFalsePositives, numFpp);
130+
double actualFpp = (double) knownNumberOfFalsePositives / numInsertions;
131+
double expectedFpp = bf.expectedFpp();
132+
// The normal order of (expected, actual) is reversed here on purpose.
133+
assertEquals(actualFpp, expectedFpp, 0.00033);
134+
}
135+
96136
/**
97137
* Sanity checking with many combinations of false positive rates and expected insertions
98138
*/
@@ -361,7 +401,8 @@ public void testJavaSerialization() {
361401
* Only appending a new constant is allowed.
362402
*/
363403
public void testBloomFilterStrategies() {
364-
assertEquals(1, BloomFilterStrategies.values().length);
404+
assertEquals(2, BloomFilterStrategies.values().length);
365405
assertEquals(BloomFilterStrategies.MURMUR128_MITZ_32, BloomFilterStrategies.values()[0]);
406+
assertEquals(BloomFilterStrategies.MURMUR128_MITZ_64, BloomFilterStrategies.values()[1]);
366407
}
367408
}

guava/src/com/google/common/hash/BloomFilterStrategies.java

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import com.google.common.math.LongMath;
2020
import com.google.common.primitives.Ints;
21+
import com.google.common.primitives.Longs;
2122

2223
import java.math.RoundingMode;
2324
import java.util.Arrays;
@@ -32,6 +33,7 @@
3233
* on their ordinal for BloomFilter serialization.
3334
*
3435
* @author Dimitris Andreou
36+
* @author Kurt Alfred Kluever
3537
*/
3638
enum BloomFilterStrategies implements BloomFilter.Strategy {
3739
/**
@@ -42,40 +44,100 @@ enum BloomFilterStrategies implements BloomFilter.Strategy {
4244
MURMUR128_MITZ_32() {
4345
@Override public <T> boolean put(T object, Funnel<? super T> funnel,
4446
int numHashFunctions, BitArray bits) {
47+
long bitSize = bits.bitSize();
4548
long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
4649
int hash1 = (int) hash64;
4750
int hash2 = (int) (hash64 >>> 32);
51+
4852
boolean bitsChanged = false;
4953
for (int i = 1; i <= numHashFunctions; i++) {
50-
int nextHash = hash1 + i * hash2;
51-
if (nextHash < 0) {
52-
nextHash = ~nextHash;
54+
int combinedHash = hash1 + (i * hash2);
55+
// Flip all the bits if it's negative (guaranteed positive number)
56+
if (combinedHash < 0) {
57+
combinedHash = ~combinedHash;
5358
}
54-
bitsChanged |= bits.set(nextHash % bits.bitSize());
59+
bitsChanged |= bits.set(combinedHash % bitSize);
5560
}
5661
return bitsChanged;
5762
}
5863

5964
@Override public <T> boolean mightContain(T object, Funnel<? super T> funnel,
6065
int numHashFunctions, BitArray bits) {
66+
long bitSize = bits.bitSize();
6167
long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
6268
int hash1 = (int) hash64;
6369
int hash2 = (int) (hash64 >>> 32);
70+
6471
for (int i = 1; i <= numHashFunctions; i++) {
65-
int nextHash = hash1 + i * hash2;
66-
if (nextHash < 0) {
67-
nextHash = ~nextHash;
72+
int combinedHash = hash1 + (i * hash2);
73+
// Flip all the bits if it's negative (guaranteed positive number)
74+
if (combinedHash < 0) {
75+
combinedHash = ~combinedHash;
6876
}
69-
if (!bits.get(nextHash % bits.bitSize())) {
77+
if (!bits.get(combinedHash % bitSize)) {
7078
return false;
7179
}
7280
}
7381
return true;
7482
}
83+
},
84+
/**
85+
* This strategy uses all 128 bits of {@link Hashing#murmur3_128} when hashing. It looks
86+
* different than the implementation in MURMUR128_MITZ_32 because we're avoiding the
87+
* multiplication in the loop and doing a (much simpler) += hash2. We're also changing the
88+
* index to a positive number by AND'ing with Long.MAX_VALUE instead of flipping the bits.
89+
*/
90+
MURMUR128_MITZ_64() {
91+
@Override
92+
public <T> boolean put(T object, Funnel<? super T> funnel,
93+
int numHashFunctions, BitArray bits) {
94+
long bitSize = bits.bitSize();
95+
byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
96+
long hash1 = lowerEight(bytes);
97+
long hash2 = upperEight(bytes);
98+
99+
boolean bitsChanged = false;
100+
long combinedHash = hash1 + hash2;
101+
for (int i = 0; i < numHashFunctions; i++) {
102+
// Make the combined hash positive and indexable
103+
bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
104+
combinedHash += hash2;
105+
}
106+
return bitsChanged;
107+
}
108+
109+
@Override
110+
public <T> boolean mightContain(T object, Funnel<? super T> funnel,
111+
int numHashFunctions, BitArray bits) {
112+
long bitSize = bits.bitSize();
113+
byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
114+
long hash1 = lowerEight(bytes);
115+
long hash2 = upperEight(bytes);
116+
117+
long combinedHash = hash1 + hash2;
118+
for (int i = 0; i < numHashFunctions; i++) {
119+
// Make the combined hash positive and indexable
120+
if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
121+
return false;
122+
}
123+
combinedHash += hash2;
124+
}
125+
return true;
126+
}
127+
128+
private /* static */ long lowerEight(byte[] bytes) {
129+
return Longs.fromBytes(
130+
bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
131+
}
132+
133+
private /* static */ long upperEight(byte[] bytes) {
134+
return Longs.fromBytes(
135+
bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
136+
}
75137
};
76138

77139
// Note: We use this instead of java.util.BitSet because we need access to the long[] data field
78-
static class BitArray {
140+
static final class BitArray {
79141
final long[] data;
80142
long bitCount;
81143

0 commit comments

Comments
 (0)