• Storing crawler data in a MongoDB cache


    # Before doing anything else, start the MongoDB server, then create a new file mongo_cache.py
    import pickle
    import zlib
    from datetime import datetime, timedelta
    
    import requests
    from pymongo import MongoClient
    from bson.binary import Binary
    
    class MongoCache(object):
        """
        MongoDB-backed cache for downloaded pages.
        """
        def __init__(self, client=None, expires=timedelta(days=30)):
            # Reuse an injected client if one was passed in, otherwise connect locally
            self.client = client or MongoClient("localhost", 27017)
            self.db = self.client.cache
            # Index the timestamp field to speed up lookups and make it a TTL index:
            # once a document's timestamp is older than expireAfterSeconds, mongodb
            # deletes the expired document automatically
            self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())


        def __setitem__(self, key, value):
            # Pickle and zlib-compress the value, and stamp it with the current UTC time
            record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
            # upsert=True makes update_one insert when the key is absent and update when it
            # exists; the $set operator overwrites the stored fields
            self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

        def __getitem__(self, item):
            # Look up the cached page by URL, which is stored as the document's _id
            record = self.db.webpage.find_one({"_id": item})
            if record:
                # Found: decompress and unpickle the stored result
                return pickle.loads(zlib.decompress(record["result"]))
            else:
                raise KeyError(item + " does not exist")  # not in the cache

        def __contains__(self, item):
            try:
                self[item]  # delegates to __getitem__
            except KeyError:
                return False  # a KeyError means the URL is not cached
            else:
                return True  # the database already holds the downloaded content

        def clear(self):
            self.db.webpage.drop()
    
    if __name__ == '__main__':
        mongo_cache = MongoCache()
        url = 'http://tieba.baidu.com/f?kw=猫&red_tag=1'
        response = requests.get(url)
        mongo_cache[url] = response.text
        print(mongo_cache[url])
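With the class in place, the typical pattern is to consult the cache before downloading a page again. The sketch below shows that cache-first flow; the helper name cached_get is just an illustration, not part of the original post (it can live in the same mongo_cache.py, which already imports requests):

    def cached_get(url, cache=None):
        """Return the page body for url, downloading only on a cache miss."""
        cache = cache or MongoCache()
        if url in cache:       # __contains__ delegates to __getitem__
            return cache[url]  # hit: the stored result is decompressed and unpickled
        text = requests.get(url).text
        cache[url] = text      # miss: store via __setitem__ (upsert)
        return text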
    # Now create a second file that imports and exercises the cache
    import requests
    
    import mongo_cache
    
    download_url = "http://tieba.baidu.com/f?kw=猫&red_tag=2"
    download_response = requests.get(download_url)
    m_cache = mongo_cache.MongoCache()
    m_cache[download_url] = download_response.content
    print(m_cache[download_url].decode('utf-8'))
    print(download_url in m_cache)
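One caveat: mongod removes expired documents with a background task that runs roughly once every 60 seconds, so a page may briefly outlive its TTL. To confirm the TTL index was created as expected, you can list the collection's indexes (a quick check, assuming the same local instance as above):

    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    # index_information() maps index name -> index spec; the timestamp
    # index should report expireAfterSeconds=2592000.0 (30 days)
    for name, spec in client.cache.webpage.index_information().items():
        print(name, spec)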
    

      
