• Storing crawler data in a MongoDB cache


    # Before doing anything else, start the MongoDB server, then create a new file mongo_cache.py
    import pickle
    import zlib
    from datetime import datetime, timedelta
    
    import requests
    from pymongo import MongoClient
    from bson.binary import Binary
    
    class MongoCache(object):
        """
        MongoDB-backed cache for downloaded pages.
        """
        def __init__(self, client=None, expires=timedelta(days=30)):
            # Reuse an injected client if one was passed in, otherwise connect locally
            self.client = client or MongoClient("localhost", 27017)
            self.db = self.client.cache
            # Index the timestamp field to speed up lookups and make it a TTL index:
            # once a document's timestamp is older than expireAfterSeconds, mongodb
            # deletes the expired document automatically
            self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())


        def __setitem__(self, key, value):
            # Pickle and zlib-compress the value, and stamp it with the current UTC time
            record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
            # upsert=True makes update_one insert when the key is absent and update when it
            # exists; the $set operator overwrites the stored fields
            self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

        def __getitem__(self, item):
            # Look up the cached page by URL, which is stored as the document's _id
            record = self.db.webpage.find_one({"_id": item})
            if record:
                # Found: decompress and unpickle the stored result
                return pickle.loads(zlib.decompress(record["result"]))
            else:
                raise KeyError(item + " does not exist")  # not in the cache

        def __contains__(self, item):
            try:
                self[item]  # delegates to __getitem__
            except KeyError:
                return False  # a KeyError means the URL is not cached
            else:
                return True  # the database already holds the downloaded content

        def clear(self):
            self.db.webpage.drop()
    
    if __name__ == '__main__':
        mongo_cache = MongoCache()
        url = 'http://tieba.baidu.com/f?kw=猫&red_tag=1'
        response = requests.get(url)
        mongo_cache[url] = response.text
        print(mongo_cache[url])
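With the class in place, the typical pattern is to consult the cache before downloading a page again. The sketch below shows that cache-first flow; the helper name cached_get is just an illustration, not part of the original post (it can live in the same mongo_cache.py, which already imports requests):

    def cached_get(url, cache=None):
        """Return the page body for url, downloading only on a cache miss."""
        cache = cache or MongoCache()
        if url in cache:       # __contains__ delegates to __getitem__
            return cache[url]  # hit: the stored result is decompressed and unpickled
        text = requests.get(url).text
        cache[url] = text      # miss: store via __setitem__ (upsert)
        return text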
    # Now create a second file that imports and exercises the cache
    import requests
    
    import mongo_cache
    
    download_url = "http://tieba.baidu.com/f?kw=猫&red_tag=2"
    download_response = requests.get(download_url)
    m_cache = mongo_cache.MongoCache()
    m_cache[download_url] = download_response.content
    print(m_cache[download_url].decode('utf-8'))
    print(download_url in m_cache)
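One caveat: mongod removes expired documents with a background task that runs roughly once every 60 seconds, so a page may briefly outlive its TTL. To confirm the TTL index was created as expected, you can list the collection's indexes (a quick check, assuming the same local instance as above):

    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    # index_information() maps index name -> index spec; the timestamp
    # index should report expireAfterSeconds=2592000.0 (30 days)
    for name, spec in client.cache.webpage.index_information().items():
        print(name, spec)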
    

      
