• Bloom filters in Python


    Bloom filters in Python - 张沈鹏,在路上... - ITeye技术网站


    Python语言: 临时自用代码@代码发芽网

    #coding:utf-8

    # Bloom filters in Python

    # Adam Langley <agl@imperialviolet.org>

    # 给CountedBloom加了一个max_count 张沈鹏 <zsp007@gmail.com>

    # Bloom-Filter算法简介

    # http://www.googlechinablog.com/2007/07/bloom-filter.html

    # http://zh.wikipedia.org/wiki/%E5%B8%83%E9%9A%86%E8%BF%87%E6%BB%A4%E5%99%A8

    # 这个计算器可以帮你求最佳的参数

    # http://www.cc.gatech.edu/~manolios/bloom-filters/calculator.html

    # CountedBloom 的 buckets 参数对应于计算器的m,也就是"m denotes the number of bits in the Bloom filter"



    import array

    import struct



    # Byte-substitution table read by Bloom._hashfunc to diffuse the bytes of a value.
    # The file describes it as RC4-based. It starts out zeroed and is shuffled in place
    # by mixarray_init().
    # It is built from a list of ints rather than a str so that the initialiser is
    # accepted by both Python 2 and Python 3 (a str initialiser is rejected for
    # typecode 'B' on Python 3).
    mixarray = array.array ('B', [0] * 256)



    def mixarray_init (mixarray):
        '''Overwrite the first 256 entries of @mixarray, in place, with a shuffled
        permutation of 0..255.

        @mixarray is any mutable sequence with at least 256 integer slots. The result
        is deterministic: it does not depend on the previous contents. Nothing is
        returned.'''
        # Start from the identity permutation.
        for slot in range (256):
            mixarray[slot] = slot

        # Four full passes of swaps over the table. The swap partner starts at 7 and
        # is advanced by the value found at the current position.
        partner = 7
        for step in range (4 * 256):
            pos = step % 256
            current = mixarray[pos]
            partner = (partner + current) % 256
            mixarray[pos], mixarray[partner] = mixarray[partner], current



    # Shuffle the table once, when the module is loaded; Bloom._hashfunc reads it on every call.
    mixarray_init (mixarray)



    class Bloom (object):
        '''Bloom filters provide a fast and compact way of checking set membership. They
        do this by introducing a risk of a false positive (but there are no false
        negatives).

        The filter is an array of @bytes unsigned bytes, i.e. ``bytes * 8`` bits. Each
        value is mapped to @hashes bit positions. Values must be integers that struct's
        'I' (unsigned int) format accepts.

        For more information see http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html'''

        # bitmask[k] selects bit k of a byte, numbering from the most significant bit.
        bitmask = [0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01]

        def __init__ (self, bytes, hashes, data = None):
            '''@bytes is the size of the bloom filter in 8-bit bytes and @hashes is the
            number of hash functions to use. Consult the web page linked in the class
            docstring for values to use. If in doubt, bytes = num_elements and hashes = 4.

            @data, when given, is used as the bit array as-is (it is not copied) and must
            contain exactly @bytes entries; otherwise a zeroed array is allocated.'''
            self.hashes = hashes
            self.bytes = bytes

            if data is None:
                self.a = self._make_array(bytes)
            else:
                assert len(data) == bytes
                self.a = data

        def init_from_counted (self, cnt):
            '''Set the contents of this filter from the contents of the counted filter
            @cnt. You have to match sizes.

            @cnt.a holds two 4-bit counters per byte (high nibble first). Every non-zero
            counter sets the corresponding bit here; bits that are already set are left
            alone.

            Raises ValueError if this filter's bit count differs from @cnt's counter
            count.'''
            if self.bytes * 8 != len(cnt.a) * 2:
                raise ValueError('Filters are not the same size')
            for i, packed in enumerate(cnt.a):
                if packed & 0xf0:
                    self._set_bit(i * 2)
                if packed & 0x0f:
                    self._set_bit(i * 2 + 1)

        def _make_array (self, size):
            '''Return a zero-filled array of @size unsigned bytes.'''
            # Repeating a one-element array sizes the buffer directly, without building a
            # throwaway string of the same length, and works on both Python 2 and 3.
            return array.array('B', [0]) * size

        def _hashfunc (self, n, val):
            '''Apply the nth hash function to @val and return an unsigned integer.

            @val is packed with struct format 'I'. Each packed byte is offset by @n
            (mod 256) and substituted through the module-level mixarray table, and the
            substituted bytes are unpacked with the same format.'''
            # bytearray yields integers on both Python 2 and Python 3.
            packed = bytearray(struct.pack('I', val))
            mixed = [mixarray[(b + n) % 256] for b in packed]
            return struct.unpack('I', struct.pack('%dB' % len(mixed), *mixed))[0]

        def _bit_positions (self, val):
            '''Yield the bit index chosen by each of the hash functions for @val.'''
            nbits = self.bytes * 8
            for i in range(self.hashes):
                yield self._hashfunc(i, val) % nbits

        def _set_bit (self, n):
            '''Set bit @n of the filter.'''
            self.a[n // 8] |= self.bitmask[n % 8]

        def insert (self, val):
            '''Add @val to the filter by setting all of its bit positions.'''
            for n in self._bit_positions(val):
                self._set_bit(n)

        def __contains__ (self, val):
            '''Return True if every bit position of @val is set (possibly a false
            positive), and False if @val was definitely never inserted.'''
            return all(self.a[n // 8] & self.bitmask[n % 8]
                       for n in self._bit_positions(val))

    # Saturation value of a CountedBloom bucket: the largest count a 4-bit bucket can hold.
    # A bucket that has reached it is neither incremented nor decremented any more.
    MAX_COUNT = 15

    class CountedBloom (Bloom):
        '''Just like a Bloom filter, but provides counting (e.g. you can delete as well).
        This uses 4 bits per bucket, so is generally four times larger than the same
        non-counted bloom filter.

        Two buckets share one byte of self.a: even-numbered buckets live in the high
        nibble and odd-numbered buckets in the low nibble. A bucket that reaches
        MAX_COUNT is saturated and is never changed again, because its true count is no
        longer known.'''

        def __init__ (self, buckets, hashes):
            '''Please note that @buckets must be even. Also note that with a Bloom object
            you give the number of *bytes* and each byte is 8 buckets. Here you're giving
            the number of buckets. @hashes is the number of hash functions to use.'''
            assert buckets % 2 == 0
            self.hashes = hashes
            self.buckets = buckets

            self.a = self._make_array(buckets // 2)

        def _slots (self, val):
            '''Yield (byte index, bit shift) for each bucket that @val hashes to.'''
            for i in range(self.hashes):
                n = self._hashfunc(i, val) % self.buckets
                # Even bucket -> high nibble (shift 4); odd bucket -> low nibble (shift 0).
                yield n // 2, (0 if n % 2 else 4)

        def _load (self, byte, shift):
            '''Return the 4-bit counter stored at (@byte, @shift).'''
            return (self.a[byte] >> shift) & 0x0f

        def _store (self, byte, shift, count):
            '''Write @count (0..15) to the counter at (@byte, @shift), preserving the
            other counter that shares the byte.'''
            keep = 0xf0 >> shift   # mask of the neighbouring nibble
            self.a[byte] = (self.a[byte] & keep) | (count << shift)

        def insert (self, val):
            '''Add one occurrence of @val by incrementing each of its buckets, stopping
            at MAX_COUNT.'''
            for byte, shift in self._slots(val):
                count = self._load(byte, shift)
                if count < MAX_COUNT:   # we shouldn't increment it if it's at the maximum
                    self._store(byte, shift, count + 1)

        def __contains__ (self, val):
            '''Return True if every bucket of @val is non-zero (possibly a false
            positive), and False if @val is definitely absent.'''
            return all(self._load(byte, shift) for byte, shift in self._slots(val))

        def max_count (self, val):
            '''Return an upper bound on how many times @val is currently stored.

            This is the smallest counter among the buckets of @val, so 0 means @val is
            definitely absent. The result is capped at MAX_COUNT, which is also returned
            when every bucket is saturated.'''
            count_val = MAX_COUNT
            for byte, shift in self._slots(val):
                count_val = min(count_val, self._load(byte, shift))
                if count_val == 0:
                    return 0
            return count_val

        def __delitem__ (self, val):
            '''Remove one occurrence of @val by decrementing each of its buckets.

            Saturated buckets are left untouched. Raises KeyError, before anything is
            modified, if @val is not in the filter.'''
            if val not in self:
                raise KeyError(val)
            for byte, shift in self._slots(val):
                count = self._load(byte, shift)
                # Never decrement a saturated bucket, and never go below zero (possible
                # when two hash functions pick the same bucket).
                if 0 < count < MAX_COUNT:
                    self._store(byte, shift, count - 1)



    # Names exported by a star-import of this module.
    # NOTE(review): CountedBloom is not listed here -- confirm whether that is intentional.
    __all__ = ['Bloom']



    # Self-test, executed only when the file is run as a script.
    # It uses Python 2 print statements.
    if __name__ == '__main__':

        print 'Testing bloom filter: there should be no assertion failures'

        # --- Plain Bloom filter: 3 bytes (24 bits), 4 hash functions ---
        a = Bloom (3, 4)



        # Insert three values, dumping the bit array after each insert.
        a.insert (45)

        print a.a

        a.insert (17)

        print a.a

        a.insert (12)

        print a.a

        assert 45 in a



        # Every inserted value must be reported as present; 33 was never inserted.
        # NOTE(review): `not 33 in a` can only hold if 33 is not a false positive in
        # this small filter -- confirm.
        assert 45 in a

        assert not 33 in a

        assert 45 in a

        assert 17 in a

        assert 12 in a



        # Count how many of the values 0..254 the filter reports as present, then
        # print that count and the fraction of the 255 values probed.
        c = 0

        for x in range (255):

            if x in a:

                c += 1

        print c

        print float(c)/255





        # --- Counted Bloom filter: 24 four-bit buckets, 4 hash functions ---
        a = CountedBloom (24, 4)

        a.insert (45)

        print a.a

        a.insert (17)

        print a.a

        # 12 is inserted twice so that max_count(12) has something to report.
        a.insert (12)

        a.insert (12)

        print "a.max_count(12)", a.max_count(12)

        # NOTE(review): _hashfunc packs the value with struct.pack ('I', val), which
        # needs an integer, so inserting a str presumably raises struct.error -- confirm.
        a.insert ("张沈鹏")

        a.insert ("张沈鹏")

        a.insert ("张沈鹏")

        # NOTE(review): the label says zsp but the query is still 12 -- confirm which
        # value was meant.
        print "a.max_count(zsp)", a.max_count(12)

        print a.a

        assert 45 in a



        assert 45 in a

        assert not 33 in a

        assert 45 in a

        assert 17 in a

        assert 12 in a



        # Same probe of 0..254 as above, against the counted filter.
        c = 0

        for x in range (255):

            if x in a:

                c += 1

        print c

        print float(c)/255



        # Deleting goes through CountedBloom.__delitem__, which decrements the buckets of 45.
        # NOTE(review): the assert only holds if at least one of those buckets drops
        # to zero, i.e. is not shared with the other inserted values -- confirm.
        del a[45]

        assert not 45 in a



        # Convert the counted filter into a plain one: 3 bytes * 8 bits matches the
        # 24 buckets, which is the size check init_from_counted performs.
        a2 = Bloom (3, 4)

        a2.init_from_counted (a)



        print a2.a



        assert 17 in a2

        assert 12 in a2

        # NOTE(review): this re-checks `a`, not `a2` -- confirm which filter was intended.
        assert not 45 in a

  • 相关阅读:
    java中math类
    java中的值传递和引用传递(转)
    eclipse 字体和配色修改
    JAR WAR EAR包的区别
    java中基本类型
    tomcat 7.0 配置详解
    或许你需要一些可操作性更强的实践
    Assembly.CreateInstance()与Activator.CreateInstanc
    OOD沉思录2 类和对象的关系包含关系
    C#的插件开发
  • 原文地址:https://www.cnblogs.com/lexus/p/2399593.html
Copyright © 2020-2023  润新知