Intro to Bloom Filter

Posted on 2018-05-03 | In learning_notes , algorithm |

Implementation

import io
import hashlib
HASH_ALGORITHMS_NAMES = ['sha1', 'sha224', 'sha384', 'sha256', 'sha512', 'md5']
HASH_ALGORITHMS_FUNCTIONS = [hashlib.sha1, hashlib.sha224, hashlib.sha384, hashlib.sha256, hashlib.sha512, hashlib.md5]
HASH_ALGORITHMS_MAPPING = dict(zip(HASH_ALGORITHMS_NAMES, HASH_ALGORITHMS_FUNCTIONS))
SIZE_BIT_VECTOR = int(1e7)
bit_vec = [0] * SIZE_BIT_VECTOR
def get_hash(input_string, algorithm='sha256'):
    hash_object = HASH_ALGORITHMS_MAPPING[algorithm](input_string)
    return int(hash_object.hexdigest(),16)
def build_spell_checker(word_list_file='wordlist.txt'):
    with io.open('wordlist.txt', mode='r', encoding='latin-1') as f:
        for line in f:
            for algorithm in HASH_ALGORITHMS_NAMES:
                pos = get_hash(line.strip().encode('utf-8'), algorithm) % SIZE_BIT_VECTOR
                bit_vec[pos] = 1
    return bit_vec
def word_checker(word):
    pos_list = []
    for algorithm in HASH_ALGORITHMS_NAMES:
        pos_list.append(get_hash(word.encode('utf-8'), algorithm) % SIZE_BIT_VECTOR)
    return all([bit_vec[pos] == 1 for pos in pos_list])
if __name__ == '__main__':
    bit_vec = build_spell_checker()
    for word in ['adsfadsdfcx', 'cat', 'uk', 'ask']:
        print(word_checker(word))

Check out https://wiki.python.org/moin/BitArrays for faster bit array implementation

nuances

The difference between Bloom Filter and LSH

Reference

Mining Massive Dataset: Review
A good introduction to bloom filter

https://en.wikipedia.org/wiki/Cryptographic_hash_function