Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

setup(
name='pybloof',
version='0.7.1',
version='0.7.3',
author='Jake Heinz',
author_email='[email protected]',
url="http://github.com/jhgg/pybloof",
Expand Down
64 changes: 62 additions & 2 deletions src/_pybloof.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import base64

from libc.string cimport memcpy

# 32-bit all-ones mask. _get_hash_buckets_for_long ANDs it with the second
# hash word before passing that word as the second argument of the follow-up
# MurmurHash3_x64_128_long call.
cdef unsigned int high = 0xFFFFFFFF

# Empty signed-char ('b') array.
# NOTE(review): presumably kept as a template for creating new arrays; its
# use is not visible in this part of the file -- confirm.
cdef array.array char_array_template = array.array('b', [])

cdef extern from "MurmurHash3.h" nogil:
Expand Down Expand Up @@ -47,16 +49,20 @@ cdef void _get_hash_buckets(key, unsigned long long * _bucket_indexes, unsigned
for i in range(hash_count):
_bucket_indexes[i] = llabs((hash1 + i * hash2) % max)\



@cython.boundscheck(False)
cdef void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes, unsigned int hash_count,
@cython.cdivision(True)
cdef inline void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes, unsigned int hash_count,
unsigned long max):
cdef unsigned long result[2]
cdef unsigned long hash1, hash2
cdef unsigned long i


MurmurHash3_x64_128_long(key, 0, &result)
hash1 = result[0]
MurmurHash3_x64_128_long(key, result[1] & 0xFFFFFFFF, result)
MurmurHash3_x64_128_long(key, result[1] & high, result)
hash2 = result[0]

for i in range(hash_count):
Expand All @@ -67,6 +73,34 @@ cdef char* fmt = '!III'
# Size in bytes of three unsigned ints.
# NOTE(review): presumably the length of the serialized header described by
# the '!III' struct format declared next to it -- confirm.
cdef ssize_t header_size = sizeof(unsigned int) * 3
# Compile-time constant; sizes the on-stack `_bucket_indexes[MAX_HASHES]`
# scratch buffers used when hashing an item.
DEF MAX_HASHES = 32


@cython.boundscheck(False)
cdef int _uniques_in_range(unsigned int start, unsigned int stop, int[:] bitarray,
                           unsigned long long * _bucket_indexes, unsigned int size,
                           unsigned int hash_count, int[:] flags, int[:] uniques):
    # Test every integer in [start, stop) against an expanded bit table.
    #
    # start, stop      half-open range of integers to test.
    # bitarray         one int per filter bit; non-zero means the bit is set.
    # _bucket_indexes  caller-provided scratch buffer with room for at least
    #                  hash_count entries.
    # size             passed to _get_hash_buckets_for_long as its `max`.
    # hash_count       number of bucket indexes computed and checked per item.
    # flags            output: flags[k] is 1 when every bucket of item
    #                  (start + k) is set in `bitarray`, otherwise 0.
    # uniques          output: the items whose flag is 1, packed from index 0.
    #
    # Returns how many entries of `uniques` were written.
    #
    # Bounds checking is disabled, so the caller must supply `flags` and
    # `uniques` with at least (stop - start) entries.
    # NOTE(review): `bitarray` presumably needs at least `size` entries,
    # assuming the bucket indexes are reduced modulo `size` -- confirm.
    # NOTE(review): `uniques` holds signed ints, so items above INT_MAX do not
    # fit in it -- confirm that callers never pass such ranges.
    cdef unsigned int i
    cdef unsigned int idx = 0  # next write position in `flags`
    cdef unsigned int off = 0  # next write position in `uniques`
    cdef int is_in

    for item in range(start, stop):
        is_in = 1
        _get_hash_buckets_for_long(item, _bucket_indexes, hash_count, size)
        for i in range(hash_count):
            # A single unset bucket is enough to reject the item.
            if not bitarray[_bucket_indexes[i]]:
                is_in = 0
                break
        flags[idx] = is_in
        if is_in:
            uniques[off] = item
            off += 1
        idx += 1
    return off


cdef class _BloomFilter:
cdef unsigned int _size
cdef unsigned int _hashes
Expand Down Expand Up @@ -213,9 +247,35 @@ cdef class UIntBloomFilter(_BloomFilter):

return True

@cython.boundscheck(False)
cdef _uniques_in_range(self, unsigned int start, unsigned int stop):
    # Check every integer in [start, stop) against this filter in one pass.
    #
    # Returns a tuple (flags, uniq, off):
    #   flags  array('i') with one entry per tested integer, 1 when all of
    #          its hash buckets are set and 0 otherwise.
    #   uniq   array('i') whose first `off` entries are the integers flagged 1.
    #   off    number of valid entries in `uniq`.
    #
    # An empty or reversed range (stop <= start) yields empty arrays and
    # off == 0.
    cdef unsigned long long _bucket_indexes[MAX_HASHES]
    cdef unsigned int idx
    cdef unsigned int count = 0
    # The arrays are seeded with a single 0 and then resized. Seeding them
    # with the length itself overflows the signed 'i' item type for large
    # values, and the seed value is never read anyway.
    cdef array.array flags = array.array('i', [0])
    cdef array.array uniq = array.array('i', [0])
    cdef array.array bitarray = array.array('i', [0])

    # Unsigned subtraction would wrap around for a reversed range.
    if stop > start:
        count = stop - start

    array.resize(flags, count)
    array.resize(uniq, count)
    array.resize(bitarray, self._size)

    # Expand the packed bit storage to one int per bit for the scan below.
    # NOTE(review): assumes unpack() yields one element per bit and that a
    # set bit compares equal to 255 -- confirm against the type of
    # self._bitarray and the library version in use.
    byte = self._bitarray.unpack()
    for idx in range(self._size):
        bitarray[idx] = byte[idx] == 255

    off = _uniques_in_range(start, stop, bitarray, _bucket_indexes,
                            self._size, self._hashes, flags, uniq)
    return flags, uniq, off

def __contains__(self, unsigned int item):
    # Backs the `item in bloom_filter` syntax by delegating to contains().
    found = self.contains(item)
    return found

def uniques_in_range(self, start, stop):
    """Test each integer in ``[start, stop)`` against the filter.

    Returns a 2-tuple: a list with one flag per tested integer, and a set
    built from the first ``off`` entries of the helper's second result,
    i.e. the integers that were flagged.
    """
    membership, candidates, count = self._uniques_in_range(start, stop)
    flag_list = list(membership)
    flagged_items = set(candidates[:count])
    return flag_list, flagged_items


cdef class StringBloomFilter(_BloomFilter):
cpdef add(self, item):
cdef unsigned long long _bucket_indexes[MAX_HASHES]
Expand Down