Scrapy Crawler Framework (2)
Saving the data to a JSON file
Enable the pipeline in settings.py; the number is its priority (the smaller the value, the higher the priority). A sketch with several pipelines follows the snippet below.
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qsbkSpider.pipelines.QsbkspiderPipeline': 300,
}
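When more than one pipeline is enabled, every item passes through all of them in ascending order of these numbers. A minimal sketch, assuming a hypothetical second class JsonWriterPipeline existed in pipelines.py:

ITEM_PIPELINES = {
    'qsbkSpider.pipelines.QsbkspiderPipeline': 300,  # lower number, runs first
    'qsbkSpider.pipelines.JsonWriterPipeline': 400,  # hypothetical, runs second
}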
qsbk.py
# -*- coding: utf-8 -*-
import scrapy


class QsbkSpider(scrapy.Spider):
    name = 'qsbk'
    allowed_domains = ['www.yicommunity.com']
    start_urls = ['http://www.yicommunity.com/']

    def parse(self, response):
        print("=" * 80)
        contents = response.xpath('//div[@class="col1"]/div')
        print(contents)
        print("=" * 80)
        for content in contents:
            author = content.xpath("./div[@class='author']/text()").get()
            word = content.xpath("./div[@class='content']/text()").get()
            print(author, word)
            duanzi = {"author": author, "word": word}
            # yield turns parse into a generator; items come back one at a time as it is iterated
            yield duanzi  # handed to the engine, which passes it on to the pipeline
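Because of the yield, calling parse does not run its body immediately; it returns a generator, and the Scrapy engine pulls items from it one by one. A standalone sketch of the same idea in plain Python (the names are made up for illustration):

def make_items():
    for i in range(3):
        # execution pauses at each yield and resumes on the next iteration
        yield {"author": f"author {i}", "word": f"word {i}"}

gen = make_items()   # nothing runs yet; we only get a generator object
for item in gen:     # each pass resumes the function until the next yield
    print(item)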
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class QsbkspiderPipeline(object):
    def __init__(self):  # open the output file once, when the pipeline is created
        self.fp = open("duanzi.json", "w", encoding='utf-8')

    def process_item(self, item, spider):
        item_json = json.dumps(item)     # serialise the dict to a JSON string
        self.fp.write(item_json + '\n')  # one item per line
        return item

    def open_spider(self, spider):
        print("Spider started!")

    def close_spider(self, spider):
        self.fp.close()
        print("Spider finished!")
Result of running it
scrapy crawl qsbk
A duanzi.json file is generated at the same time.
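Each crawled item ends up as one JSON object per line in duanzi.json. With the default json.dumps settings the Chinese text is escaped; a hypothetical line (placeholder values, not real output) would look roughly like:

{"author": "some author", "word": "some joke text"}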