• Persisting data with pipelines


    Note: pipelines are shared by every spider in the project. If you want behavior specific to one spider, use the spider argument inside the pipeline and handle it yourself (a small sketch follows this note).
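    A minimal sketch of that per-spider check, assuming a spider named 'chouti'; the pipeline class name and log path below are only illustrative:

    #pipelines.py
    class SpiderAwarePipeline(object):
        def process_item(self, item, spider):
            # only persist items produced by the 'chouti' spider; everything else passes through
            if spider.name == 'chouti':
                with open('chouti.log', 'a+') as f:  # illustrative path
                    f.write(item['href'] + '\n')
            return item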

    Workflow:

    1. Write the pipeline class first (the project template generates a default one):

    #pipelines.py
    class XXXPipeline(object):
        def process_item(self, item, spider):
            return item

    2. Write the Item class:

    #items.py
    class XdbItem(scrapy.Item):
        href = scrapy.Field()
        title = scrapy.Field()

    3. Register the pipeline in settings:

    #settings.py
    ITEM_PIPELINES = {
       'xdb.pipelines.XdbPipeline': 300,
    }

    4. In the spider: each time yield runs, process_item is called once (a minimal sketch follows this list)

    yield an Item object
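    A minimal sketch of step 4, reusing the XdbItem class from step 2; the spider name and URL are placeholders:

    #demo_spider.py
    import scrapy
    from xdb.items import XdbItem

    class DemoSpider(scrapy.Spider):
        name = 'demo'                          # placeholder spider name
        start_urls = ['http://example.com/']   # placeholder URL

        def parse(self, response):
            # each yielded XdbItem is handed to every enabled pipeline's process_item
            yield XdbItem(title='demo title', href='http://example.com/')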

    Full example:

    pipelines.py

    """
    源码内容:
        1. 判断当前XdbPipeline类中是否有from_crawler
            有:
                obj = FilePipeline.from_crawler(....)
            否:
                obj = FilePipeline()
        2. obj.open_spider()
        
        3. obj.process_item()/obj.process_item()/obj.process_item()/obj.process_item()/obj.process_item()
        
        4. obj.close_spider()
    """
    from scrapy.exceptions import DropItem
    
    class FilePipeline(object):
    
        def __init__(self,path):
            self.f = None
            self.path = path
    
        @classmethod
        def from_crawler(cls, crawler):
            """
            Called once at startup to create the pipeline object.
            :param crawler:
            :return:
            """
            print('File.from_crawler')
            path = crawler.settings.get('HREF_FILE_PATH')
            return cls(path)
    
        def open_spider(self,spider):
            """
            Called when the spider starts running.
            :param spider:
            :return:
            """
            # if spider.name == 'chouti':
            print('File.open_spider')
            self.f = open(self.path,'a+')
    
        def process_item(self, item, spider):
            # f = open('xx.log','a+')
            # f.write(item['href'] + '\n')
            # f.close()
            print('File', item['href'])
            self.f.write(item['href'] + '\n')
            # return item  # would hand the item to the next pipeline's process_item (e.g. DbPipeline)
            raise DropItem()  # process_item of later pipelines will not run for this item
    
        def close_spider(self,spider):
            """
            Called when the spider is closed.
            :param spider:
            :return:
            """
            print('File.close_spider')
            self.f.close()
    
    
    class DbPipeline(object):
        def __init__(self,path):
            self.f = None
            self.path = path
    
        @classmethod
        def from_crawler(cls, crawler):
            """
            Called once at startup to create the pipeline object.
            :param crawler:
            :return:
            """
            print('DB.from_crawler')
            path = crawler.settings.get('HREF_DB_PATH')
            return cls(path)
    
        def open_spider(self,spider):
            """
            Called when the spider starts running.
            :param spider:
            :return:
            """
            print('Db.open_spider')
            self.f = open(self.path,'a+')
    
        def process_item(self, item, spider):
            # f = open('xx.log','a+')
            # f.write(item['href'] + '\n')
            # f.close()
            print('Db', item)
            # self.f.write(item['href'] + '\n')
            return item
    
        def close_spider(self,spider):
            """
            Called when the spider is closed.
            :param spider:
            :return:
            """
            print('Db.close_spider')
            self.f.close()

    settings.py (lower ITEM_PIPELINES values run first, so FilePipeline handles each item before DbPipeline; because FilePipeline raises DropItem, DbPipeline's process_item never runs in this example)

    ITEM_PIPELINES = {
       'xdb.pipelines.FilePipeline': 300,
       'xdb.pipelines.DbPipeline': 301,
    }
    
    HREF_FILE_PATH = "news.log"
    HREF_DB_PATH = "db.log"

    items.py

    import scrapy
    
    class XdbItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        href = scrapy.Field()
        title = scrapy.Field()

    chouti.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.response.html import HtmlResponse
    # import sys,os,io
    # sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
    from xdb.items import XdbItem
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['http://chouti.com/']
    
        def parse(self, response):
            # print(response, type(response))  # HtmlResponse object
            # print(response.text)
            """
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text,'html.parser')
            content_list = soup.find('div',attrs={'id':'content-list'})
            """
            # search the descendants for the div whose id is content-list
            item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
            for item in item_list:
                text = item.xpath('.//a/text()').extract_first()
                href = item.xpath('.//a/@href').extract_first()
                yield XdbItem(title=text,href=href)
    
            """
            page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
            for page in page_list:
                from scrapy.http import Request
                page = "https://dig.chouti.com" + page
                yield Request(url=page,callback=self.parse) # https://dig.chouti.com/all/hot/recent/2
            """