Intro to Bloom Filter

Implementation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import io
import hashlib
HASH_ALGORITHMS_NAMES = ['sha1', 'sha224', 'sha384', 'sha256', 'sha512', 'md5']
HASH_ALGORITHMS_FUNCTIONS = [hashlib.sha1, hashlib.sha224, hashlib.sha384, hashlib.sha256, hashlib.sha512, hashlib.md5]
HASH_ALGORITHMS_MAPPING = dict(zip(HASH_ALGORITHMS_NAMES, HASH_ALGORITHMS_FUNCTIONS))
SIZE_BIT_VECTOR = int(1e7)
bit_vec = [0] * SIZE_BIT_VECTOR
def get_hash(input_string, algorithm='sha256'):
hash_object = HASH_ALGORITHMS_MAPPING[algorithm](input_string)
return int(hash_object.hexdigest(),16)
def build_spell_checker(word_list_file='wordlist.txt'):
with io.open('wordlist.txt', mode='r', encoding='latin-1') as f:
for line in f:
for algorithm in HASH_ALGORITHMS_NAMES:
pos = get_hash(line.strip().encode('utf-8'), algorithm) % SIZE_BIT_VECTOR
bit_vec[pos] = 1
return bit_vec
def word_checker(word):
pos_list = []
for algorithm in HASH_ALGORITHMS_NAMES:
pos_list.append(get_hash(word.encode('utf-8'), algorithm) % SIZE_BIT_VECTOR)
return all([bit_vec[pos] == 1 for pos in pos_list])
if __name__ == '__main__':
bit_vec = build_spell_checker()
for word in ['adsfadsdfcx', 'cat', 'uk', 'ask']:
print(word_checker(word))

Check out https://wiki.python.org/moin/BitArrays for faster bit array implementation

nuances

The difference between Bloom Filter and LSH

Reference

Mining Massive Dataset: Review
A good introduction to bloom filter

https://en.wikipedia.org/wiki/Cryptographic_hash_function