• Python爬虫学习——布隆过滤器


    布隆过滤器的实现方法1:自己实现

    参考 http://www.cnblogs.com/naive/p/5815433.html

    BloomFilter 的两个参数分别代表布隆过滤器的大小(位数组的长度)和 hash 函数的个数

    #coding:utf-8
    #!/usr/bin/env python
    
    from bitarray import bitarray
    # 3rd party
    import mmh3
    import scrapy
    from BeautifulSoup import BeautifulSoup as BS
    import os
    ls = os.linesep
    
    class BloomFilter(set):
        """Bloom filter backed by a ``bitarray`` and the ``mmh3`` hash.

        An item is mapped to ``hash_count`` bit positions by hashing it with
        the seeds ``0 .. hash_count - 1``.  ``add`` sets those bits, and a
        membership test reports True only when every one of them is set, so
        it can yield false positives but no false negative for an added item.

        The class derives from ``set`` but stores nothing in the underlying
        set: ``add``, ``in``, ``len()`` and iteration are all overridden to
        operate on the bit array instead.
        """

        def __init__(self, size, hash_count):
            """Create an empty filter.

            :param size: number of bits in the bit array.
            :param hash_count: number of hash seeds applied to each item.
            """
            super(BloomFilter, self).__init__()
            self.bit_array = bitarray(size)
            # Clear every bit explicitly so the filter starts empty no matter
            # how bitarray initialises a freshly allocated buffer.
            self.bit_array.setall(0)
            self.size = size
            self.hash_count = hash_count

        def __len__(self):
            """Return the size of the bit array, not the number of items added."""
            return self.size

        def __iter__(self):
            """Return an iterator over the individual bits of the filter."""
            return iter(self.bit_array)

        def _indexes(self, item):
            """Yield the bit position selected by each hash seed for ``item``."""
            for seed in range(self.hash_count):
                # The hash may be negative; with a positive size, Python's %
                # still produces an index in [0, size).
                yield mmh3.hash(item, seed) % self.size

        def add(self, item):
            """Set every bit position of ``item`` and return the filter itself."""
            for index in self._indexes(item):
                self.bit_array[index] = 1

            return self

        def __contains__(self, item):
            """Return True if ``item`` may have been added, False if it was not.

            ``all`` stops at the first clear bit, so a definite miss does not
            pay for the remaining hash computations.
            """
            return all(self.bit_array[index] for index in self._indexes(item))
    
    class DmozSpider(scrapy.Spider):
        """Spider for one Baidu Baike page whose callback demos ``BloomFilter``.

        ``parse`` does not read the downloaded response; it only exercises
        the hand-written bloom filter and prints what it finds.
        """

        name = "baidu"
        allowed_domains = ["baidu.com"]
        start_urls = [
            "http://baike.baidu.com/item/%E7%BA%B3%E5%85%B0%E6%98%8E%E7%8F%A0"
        ]

        def parse(self, response):
            """Fill a bloom filter with sample words and print membership checks.

            :param response: the page handed over by Scrapy (unused here).
            """
            # NOTE: a commented-out debugging snippet that dumped the response
            # HTML to a local file used to sit here.

            inserted = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse',
                        'eagle', 'bird', 'bison', 'boar', 'butterfly', 'ant',
                        'anaconda', 'bear', 'chicken', 'dolphin', 'donkey',
                        'crow', 'crocodile']
            never_inserted = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf',
                              'fox', 'whale', 'shark', 'fish', 'turkey', 'duck',
                              'dove', 'deer', 'elephant', 'frog', 'falcon',
                              'goat', 'gorilla', 'hawk']

            # 1000 bits, 10 hash functions per word.
            bloom = BloomFilter(1000, 10)
            for word in inserted:
                bloom.add(word)

            # Every inserted word must be reported as present: a bloom filter
            # is not supposed to produce false negatives.
            for word in inserted:
                if word not in bloom:
                    print('Something is terribly went wrong for {}'.format(word))
                    print('FALSE NEGATIVE!')
                    continue
                print('{} is in bloom filter as expected'.format(word))

            # Words that were never inserted may still be reported as present
            # (false positives).
            false_positive = '{} is not in the bloom, but a false positive'
            absent = '{} is not in the bloom filter as expected'
            for word in never_inserted:
                template = false_positive if word in bloom else absent
                print(template.format(word))
    

    布隆过滤器的实现方法2:使用pybloom

    参考 http://www.jianshu.com/p/f57187e2b5b9

    #coding:utf-8
    #!/usr/bin/env python
    
    from pybloom import BloomFilter
    
    import scrapy
    from BeautifulSoup import BeautifulSoup as BS
    import os
    ls = os.linesep
    
    class DmozSpider(scrapy.Spider):
        """Spider for one Baidu Baike page whose callback demos pybloom.

        ``parse`` does not read the downloaded response; it only exercises
        the ``BloomFilter`` imported from ``pybloom`` and prints the results.
        """

        name = "baidu"
        allowed_domains = ["baidu.com"]
        start_urls = [
            "http://baike.baidu.com/item/%E7%BA%B3%E5%85%B0%E6%98%8E%E7%8F%A0"
        ]

        def parse(self, response):
            """Fill a pybloom filter with sample words and print membership checks.

            :param response: the page handed over by Scrapy (unused here).
            """
            # NOTE: a commented-out debugging snippet that dumped the response
            # HTML to a local file used to sit here, together with an earlier
            # ``BloomFilter(100, 10)`` construction.

            present_words = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse',
                             'eagle', 'bird', 'bison', 'boar', 'butterfly', 'ant',
                             'anaconda', 'bear', 'chicken', 'dolphin', 'donkey',
                             'crow', 'crocodile']
            absent_words = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox',
                            'whale', 'shark', 'fish', 'turkey', 'duck', 'dove',
                            'deer', 'elephant', 'frog', 'falcon', 'goat',
                            'gorilla', 'hawk']

            # Presumably (capacity, error_rate) as in pybloom's constructor;
            # confirm against the installed pybloom version.
            bloom = BloomFilter(1000, 0.001)
            for entry in present_words:
                bloom.add(entry)

            # Inserted words: the filter should report every one of them.
            for entry in present_words:
                if entry not in bloom:
                    print('Something is terribly went wrong for {}'.format(entry))
                    print('FALSE NEGATIVE!')
                    continue
                print('{} is in bloom filter as expected'.format(entry))

            # Words never inserted: a hit here is a false positive.
            for entry in absent_words:
                if entry in bloom:
                    message = '{} is not in the bloom, but a false positive'
                else:
                    message = '{} is not in the bloom filter as expected'
                print(message.format(entry))
    

    输出

    dog is in bloom filter as expected
    cat is in bloom filter as expected
    giraffe is in bloom filter as expected
    fly is in bloom filter as expected
    mosquito is in bloom filter as expected
    horse is in bloom filter as expected
    eagle is in bloom filter as expected
    bird is in bloom filter as expected
    bison is in bloom filter as expected
    boar is in bloom filter as expected
    butterfly is in bloom filter as expected
    ant is in bloom filter as expected
    anaconda is in bloom filter as expected
    bear is in bloom filter as expected
    chicken is in bloom filter as expected
    dolphin is in bloom filter as expected
    donkey is in bloom filter as expected
    crow is in bloom filter as expected
    crocodile is in bloom filter as expected
    badger is not in the bloom filter as expected
    cow is not in the bloom filter as expected
    pig is not in the bloom filter as expected
    sheep is not in the bloom filter as expected
    bee is not in the bloom filter as expected
    wolf is not in the bloom filter as expected
    fox is not in the bloom filter as expected
    whale is not in the bloom filter as expected
    shark is not in the bloom filter as expected
    fish is not in the bloom filter as expected
    turkey is not in the bloom filter as expected
    duck is not in the bloom filter as expected
    dove is not in the bloom filter as expected
    deer is not in the bloom filter as expected
    elephant is not in the bloom filter as expected
    frog is not in the bloom filter as expected
    falcon is not in the bloom filter as expected
    goat is not in the bloom filter as expected
    gorilla is not in the bloom filter as expected
    hawk is not in the bloom filter as expected
    
  • 相关阅读:
    uni-app在小程序开发者工具:TypeError: Cannot read property ‘forceUpdate‘ of undefined
    windows部署多个tomcat并添加到服务开机自动启动
    区域填充算法和多边形填充的扫描线算法[转]
    如何在不规则多边形内均匀撒点的算法[转]
    基于Living Atlas数据为木里山体滑坡敏感性建模【转】
    重磅!前端开发技术之Vue架构知识分享[转]
    如何使用 IAM 策略授予对特定 AWS S3 文件夹的用户特定访问权限?
    XXL-JOB安装、配置、启动、停止教程
    centos7 部署YApi
    CentOS 7安装MySQL8.0
  • 原文地址:https://www.cnblogs.com/tonglin0325/p/7043886.html
Copyright © 2020-2023  润新知