from pymongo import MongoClient
import os

base_dir = os.getcwd()
class MongoPipeline(object):
    # Pipeline that saves scraped items to a MongoDB database.
    collection = 'douban'  # name of the MongoDB collection

    def __init__(self, mongo_uri, db_name, db_user, db_pass):
        self.mongo_uri = mongo_uri
        self.db_name = db_name
        self.db_user = db_user
        self.db_pass = db_pass

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook so the pipeline can access settings; here we
        # read the database URI, name, and credentials from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            db_name=crawler.settings.get('DB_NAME'),
            db_user=crawler.settings.get('DB_USER'),
            db_pass=crawler.settings.get('DB_PASS'))

    def open_spider(self, spider):  # called when the spider opens: connect to the database
        # Database.authenticate() was removed in PyMongo 4.x; pass the
        # credentials to MongoClient instead, authenticating against the
        # same database as before via authSource.
        self.client = MongoClient(self.mongo_uri,
                                  username=self.db_user,
                                  password=self.db_pass,
                                  authSource=self.db_name)
        self.zfdb = self.client[self.db_name]

    def close_spider(self, spider):  # called when the spider closes: close the connection
        self.client.close()

    def process_item(self, item, spider):
        # Collection.insert() was removed in PyMongo 4.x; use insert_one().
        self.zfdb[self.collection].insert_one({"title": item["title"].strip()})
        return item
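
For reference, a minimal settings.py sketch showing how this pipeline could be wired up. The module path 'myproject.pipelines.MongoPipeline', the priority 300, and all connection values below are illustrative assumptions, not taken from the source:

# settings.py (hypothetical values -- adjust to your own project)
MONGO_URI = 'mongodb://localhost:27017'  # assumed local MongoDB instance
DB_NAME = 'scrapy_data'                  # assumed database name
DB_USER = 'spider_user'                  # assumed username
DB_PASS = 'spider_pass'                  # assumed password
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,  # hypothetical module path; 300 is a typical mid-range priority
}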