• 布隆过滤器之Python+Redis


    pip install mmh3 bitarray

    如果安装时报 C++ 编译错误,可以先安装 Microsoft Visual C++ Build Tools。


    from bitarray import bitarray
    # 3rd party
    import mmh3
    class BloomFilter(set):
        """Fixed-size in-memory Bloom filter built on ``bitarray`` and ``mmh3``.

        Each item is hashed ``hash_count`` times (the loop counter is used as
        the MurmurHash3 seed) and the matching bits are set in a bit array of
        ``size`` bits. Membership tests can yield false positives but never
        false negatives.

        The class derives from ``set`` only for its interface; the inherited
        set storage stays empty and every operation below works on
        ``bit_array``.
        """

        def __init__(self, size, hash_count):
            """Create an empty filter.

            Args:
                size: number of bits in the underlying bit array.
                hash_count: number of hash functions applied to each item.
            """
            super(BloomFilter, self).__init__()
            self.bit_array = bitarray(size)
            # Depending on the bitarray release, bitarray(n) may hand back
            # uninitialised bits. Clear them explicitly so an empty filter
            # does not start out reporting random members.
            self.bit_array.setall(0)
            self.size = size
            self.hash_count = hash_count

        def __len__(self):
            """Return the size of the bit array (not the number of items added)."""
            return self.size

        def __iter__(self):
            """Iterate over the individual bits of the bit array."""
            return iter(self.bit_array)

        def _indexes(self, item):
            """Yield the bit positions that ``item`` maps to, one per hash seed."""
            for seed in range(self.hash_count):
                # Python's % always returns a non-negative result for a
                # positive modulus, so a negative hash still yields a valid
                # index.
                yield mmh3.hash(item, seed) % self.size

        def add(self, item):
            """Record ``item`` in the filter and return the filter itself."""
            for index in self._indexes(item):
                self.bit_array[index] = 1
            return self

        def __contains__(self, item):
            """Return True if ``item`` may have been added, False if it never was."""
            # all() stops at the first cleared bit, so misses are answered
            # without evaluating the remaining hashes.
            return all(self.bit_array[index] for index in self._indexes(item))
    def main():
        """Demonstrate the Bloom filter on two lists of animal names.

        Inserts the first list, checks that every inserted name is reported as
        present (a miss would be a false negative), then probes a second list
        of names that were never inserted and reports which of them come back
        as false positives.
        """
        bloom = BloomFilter(10000, 10)
        animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
                   'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
                   'chicken', 'dolphin', 'donkey', 'crow', 'crocodile']
        # First insertion of animals into the bloom filter
        for animal in animals:
            bloom.add(animal)
        # Membership existence for already inserted animals
        # There should not be any false negatives
        for animal in animals:
            if animal in bloom:
                print('{} is in bloom filter as expected'.format(animal))
            else:
                print('Something is terribly went wrong for {}'.format(animal))
                print('FALSE NEGATIVE!')
        # Membership existence for not inserted animals
        # There could be false positives
        other_animals = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox',
                         'whale', 'shark', 'fish', 'turkey', 'duck', 'dove',
                         'deer', 'elephant', 'frog', 'falcon', 'goat', 'gorilla',
                         'hawk' ]
        for other_animal in other_animals:
            if other_animal in bloom:
                print('{} is not in the bloom, but a false positive'.format(other_animal))
            else:
                print('{} is not in the bloom filter as expected'.format(other_animal))
    # Run the demo only when the file is executed as a script, not on import.
    if __name__ == '__main__':
        main()


    dog is in bloom filter as expected
    cat is in bloom filter as expected
    giraffe is in bloom filter as expected
    fly is in bloom filter as expected
    mosquito is in bloom filter as expected
    horse is in bloom filter as expected
    eagle is in bloom filter as expected
    bird is in bloom filter as expected
    bison is in bloom filter as expected
    boar is in bloom filter as expected
    butterfly is in bloom filter as expected
    ant is in bloom filter as expected
    anaconda is in bloom filter as expected
    bear is in bloom filter as expected
    chicken is in bloom filter as expected
    dolphin is in bloom filter as expected
    donkey is in bloom filter as expected
    crow is in bloom filter as expected
    crocodile is in bloom filter as expected
    badger is not in the bloom filter as expected
    cow is not in the bloom filter as expected
    pig is not in the bloom filter as expected
    sheep is not in the bloom, but a false positive
    bee is not in the bloom filter as expected
    wolf is not in the bloom filter as expected
    fox is not in the bloom filter as expected
    whale is not in the bloom filter as expected
    shark is not in the bloom, but a false positive
    fish is not in the bloom, but a false positive
    turkey is not in the bloom filter as expected
    duck is not in the bloom filter as expected
    dove is not in the bloom filter as expected
    deer is not in the bloom filter as expected
    elephant is not in the bloom, but a false positive
    frog is not in the bloom filter as expected
    falcon is not in the bloom filter as expected
    goat is not in the bloom filter as expected
    gorilla is not in the bloom filter as expected
    hawk is not in the bloom filter as expected










    python 基于redis实现的bloomfilter(布隆过滤器),BloomFilter_imooc




    import mmh3
    import redis
    import math
    import time
    class PyBloomFilter():
        """Bloom filter whose bits are stored in Redis string keys.

        The filter is sized from the expected number of items and the target
        false-positive rate. Values are spread over ``blocknum`` Redis keys
        named ``<key>_<n>``, where ``n`` is derived from the first character
        of the value. Within a key, bits are addressed by MurmurHash3 digests
        computed with the first ``k`` entries of ``SEEDS``.
        """

        # Pool of MurmurHash3 seeds; the first ``k`` of them are used.
        # If ``k`` exceeds the pool size, the slice in __init__ silently
        # yields fewer seeds.
        SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
                 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
                 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
                 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
                 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]

        def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
            """Size the filter and remember the Redis connection.

            Args:
                capacity: expected number of distinct items; must be positive.
                error_rate: target false-positive probability, strictly
                    between 0 and 1.
                conn: Redis client providing ``setbit`` and ``getbit``.
                key: prefix of the Redis keys that hold the bits.

            Raises:
                ValueError: if ``capacity`` or ``error_rate`` is out of range.
            """
            if capacity <= 0:
                raise ValueError('capacity must be a positive number')
            if not 0 < error_rate < 1:
                raise ValueError('error_rate must be strictly between 0 and 1')
            # Total number of bits required: n * log2(e) * log2(1/p).
            self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate))
            # Number of hash functions: (m/n) * ln 2. math.log(2) is ln 2;
            # math.log1p(2) would be ln 3 and over-count the hashes. Using
            # fewer seeds stays compatible with bits written with the larger
            # count, because the seeds used are a prefix of the old ones.
            self.k = math.ceil(math.log(2)*self.m/capacity)
            # Memory required, in MB.
            self.mem = math.ceil(self.m/8/1024/1024)
            # Number of 512 MB blocks required. The block is chosen from the
            # first character of the value, which is expected to be ASCII, so
            # at most 256 blocks can be addressed.
            self.blocknum = math.ceil(self.mem/512)
            self.seeds = self.SEEDS[0:self.k]
            self.key = key
            self.N = 2**31-1
            self.redis = conn

        def _block_name(self, value):
            """Return the Redis key of the block that ``value`` belongs to."""
            # value must be a non-empty str: value[0] raises IndexError on ''.
            return self.key + "_" + str(ord(value[0])%self.blocknum)

        def add(self, value):
            """Set every bit that ``value`` hashes to in its Redis block."""
            name = self._block_name(value)
            for offset in self.get_hashs(value):
                self.redis.setbit(name, offset, 1)

        def is_exist(self, value):
            """Return 1 if ``value`` may have been added, 0 if it never was."""
            name = self._block_name(value)
            for offset in self.get_hashs(value):
                # Stop at the first cleared bit to avoid further round trips.
                if not self.redis.getbit(name, offset):
                    return 0
            # Also reached when get_hashs() yields no offsets. add() sets no
            # bits for such a value either, so reporting it as present avoids
            # a false negative.
            return 1

        def get_hashs(self, value):
            """Return the bit offsets for ``value``, at most one per seed.

            Seeds whose digest is negative contribute no offset. That rule is
            kept on purpose: remapping those digests would probe bits that
            values already stored in Redis never set, producing false
            negatives.
            """
            offsets = list()
            for seed in self.seeds:
                # mmh3.hash returns a signed 32-bit integer by default.
                digest = mmh3.hash(value, seed)
                if digest >= 0:
                    offsets.append(self.N - digest)
            return offsets
    # Module-level Redis connection pool and client used by the usage example below.
    # NOTE(review): host is an empty string here — presumably a placeholder for
    # the real Redis server address; confirm before running.
    pool = redis.ConnectionPool(
        host='',
        port=6379,
        db=0,
    )
    conn = redis.StrictRedis(
        connection_pool=pool,
    )
    # Usage
    # if __name__ == "__main__":
    #     bf = PyBloomFilter(conn=conn)           # connect to Redis through the connection pool
    #     bf.add('www.jobbole.com')               # add a domain under the default Redis key
    #     bf.add('www.luyin.org')                 # add a domain under the default Redis key
    #     print(bf.is_exist('www.zhihu.com'))     # print whether the domain is recorded: 1 if present, 0 if not
    #     print(bf.is_exist('www.luyin.org'))     # print whether the domain is recorded: 1 if present, 0 if not
  • 相关阅读:
    mysql5.6 sql_mode设置
    centos6.5 mysql5.6主从复制
    linux 挂载windows共享文件夹
    Django model反向关联名称的方法(转)
  • 原文地址:https://www.cnblogs.com/yhll/p/9842514.html
Copyright © 2020-2023  润新知