pipelines.py
import json

from pymongo import MongoClient
from scrapy.utils.project import get_project_settings  # scrapy.conf was removed in newer Scrapy versions


class SunPipeline(object):
    def open_spider(self, spider):
        # ensure_ascii=False writes non-ASCII text, so open the file as utf-8
        self.file = open('dongguan.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines) keeps the output parseable;
        # joining objects with ', ' would not produce valid JSON
        str_data = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(str_data)
        return item

    def close_spider(self, spider):
        self.file.close()


class MongoPipeline(object):
    def __init__(self):
        # Read the database parameters
        settings = get_project_settings()
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        dbname = settings['MONGO_DBNAME']
        colname = settings['MONGO_COLNAME']
        # Connect to the database
        self.client = MongoClient(host, port)
        # Select the database
        self.db = self.client[dbname]
        # Select the collection
        self.col = self.db[colname]

    def process_item(self, item, spider):
        data = dict(item)
        # insert() is deprecated in pymongo 3+; use insert_one()
        self.col.insert_one(data)
        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes,
        # rather than relying on __del__
        self.client.close()
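Both pipelines serialize whatever fields the spider's item defines via dict(item). The original items.py is not shown in this section, so the sketch below is hypothetical; the field names (title, url) are assumptions for illustration only.

items.py
import scrapy


class SunItem(scrapy.Item):
    # Hypothetical fields for illustration; declare the fields your spider actually fills
    title = scrapy.Field()
    url = scrapy.Field()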
settings.py
BOT_NAME = 'Sun'

SPIDER_MODULES = ['Sun.spiders']
NEWSPIDER_MODULE = 'Sun.spiders'

MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DBNAME = 'Sun'
MONGO_COLNAME = 'dongguan'

ITEM_PIPELINES = {
    'Sun.pipelines.SunPipeline': 300,
    'Sun.pipelines.MongoPipeline': 301,
}
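After the crawl finishes, the stored data can be checked directly with pymongo. A minimal sketch, assuming MongoDB is running locally with the connection parameters configured above:

from pymongo import MongoClient

# Connect with the same parameters as settings.py
client = MongoClient('127.0.0.1', 27017)
col = client['Sun']['dongguan']
# Print how many items were stored, plus one sample document
print(col.count_documents({}))
print(col.find_one())
client.close()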