• scrapy之持久化存储


    下面演示如何基于管道将数据存储到 MySQL、Redis 和本地文件

    代码实现流程

    1. 将解析到的页面数据存储到items对象
    2. 使用yield关键字将items提交给管道文件进行处理
    3. 在管道文件中编写代码完成数据存储的操作
    4. 在配置文件中开启管道操作

    代码实现

    items:存储解析到的页面数据

    pipelines:处理持久化存储的相关操作

    下面以抓取糗百的段子为例:

    爬虫相关操作

    # -*- coding: utf-8 -*-
    import scrapy
    from qiubai.items import QiubaiItem
    
    
    class QiubaiSpiderSpider(scrapy.Spider):
        """Spider that collects the author and text of each post on the start page."""

        name = 'qiubai_spider'
        # Domain filtering is left disabled, as in the original listing:
        # allowed_domains = ['www.qiushibaike.com/text']
        start_urls = ['https://www.qiushibaike.com/text/']

        def parse(self, response):
            """
            Parse the listing page and hand one item per post to the pipelines.

            :param response: the downloaded page for one of ``start_urls``
            :return: generator of ``QiubaiItem`` objects
            """
            # Every direct child <div> of the content container is one post.
            for post in response.xpath('//div[@id="content-left"]/div'):
                # xpath() returns Selector objects; extract_first() pulls out the
                # first matched string (it is shorthand for extract()[0]).
                author_nodes = post.xpath('./div/a[2]/h2/text()')
                content_nodes = post.xpath('.//div[@class="content"]/span/text()')

                # Build the item and submit it to the item pipelines.
                yield QiubaiItem(
                    author=author_nodes.extract_first(),
                    content=content_nodes.extract_first(),
                )

    储存解析到的页面数据:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class QiubaiItem(scrapy.Item):
        """Item holding the two values the spider extracts for each post."""
        # define the fields for your item here like:
        # name = scrapy.Field()
    
        # Declared fields: the spider assigns item['author'] and item['content'].
        author = scrapy.Field()
        content = scrapy.Field()

    处理持久化存储

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json

    import pymysql
    import redis
    
    
    class QiubaiPipeline(object):
        """
        Item pipeline that pushes every scraped item onto the Redis list ``data``.

        Each item is stored as a JSON string, because redis-py only accepts
        bytes, str, int and float values; passing a dict raises ``DataError``
        on redis-py 3.0 and later.
        """

        conn = None

        def open_spider(self, spider):
            """Create the Redis client when the spider starts."""
            print('redis连接开始')
            self.conn = redis.Redis(host='127.0.0.1', port=6379)

        def process_item(self, item, spider):
            """
            Serialize the item and push it onto the ``data`` list.

            :param item: the item received from the spider
            :param spider: the spider that produced the item (unused)
            :return: the unchanged item, so later pipelines still receive it
            """
            record = {
                'author': item['author'],
                'content': item['content'],
            }
            # ensure_ascii=False keeps Chinese text readable in Redis instead of
            # storing \uXXXX escape sequences.
            self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
            return item

        def close_spider(self, spider):
            """Report that the spider has finished using Redis."""
            print('redis链接结束')
    
    
    class QiubaiByMysql(object):
        """
        Item pipeline that stores every scraped item in the MySQL table ``qiubai``.

        Values are passed to the driver as query parameters rather than being
        formatted into the SQL text, so quotes inside scraped content can
        neither break the statement nor inject SQL.
        """

        conn = None
        cursor = None

        def open_spider(self, spider):
            """Open the MySQL connection when the spider starts."""
            print('mysql链接开始')
            # NOTE(review): credentials are hard-coded here for the tutorial;
            # real projects should read them from settings or the environment.
            # utf8mb4 is requested explicitly so Chinese text and emoji are
            # stored correctly regardless of the driver's default charset.
            self.conn = pymysql.Connect(
                host='127.0.0.1',
                port=3306,
                user='root',
                password='112233',
                db='qiubai',
                charset='utf8mb4',
            )

        def process_item(self, item, spider):
            """
            Insert one item as a row and commit it.

            A failed insert is printed and rolled back instead of being raised,
            so one bad row does not stop the crawl.

            :param item: the item received from the previous pipeline
            :param spider: the spider that produced the item (unused)
            :return: the unchanged item, so later pipelines still receive it
            """
            # Create the cursor once and reuse it; the original opened a new
            # cursor for every item and never closed the previous ones.
            if self.cursor is None:
                self.cursor = self.conn.cursor()

            sql = 'insert into qiubai values (%s, %s)'
            try:
                # The driver escapes the parameters itself.
                self.cursor.execute(sql, (item['author'], item['content']))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            else:
                # Only report success once the row has actually been committed.
                print('数据已写入mysql')
            return item

        def close_spider(self, spider):
            """Close the cursor and the connection, if they were ever opened."""
            print('mysql链接结束')
            # The cursor stays None when no item was processed.
            if self.cursor is not None:
                self.cursor.close()
            if self.conn is not None:
                self.conn.close()
    
    
    class QiubaiByFiles(object):
        """
        Item pipeline that appends every scraped item to a local text file.

        Each item is written as ``author:content`` followed by three newlines.
        """

        fp = None

        def open_spider(self, spider):
            """Open the output file (truncating it) when the spider starts."""
            print('打开文件')
            self.fp = open('../qiubai.txt', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            """
            Write one item to the output file.

            :param item: the item received from the previous pipeline
            :param spider: the spider that produced the item (unused)
            :return: the unchanged item, so later pipelines still receive it
            """
            print('数据已经写入到文件')
            # extract_first() yields None when the XPath matches nothing, so
            # missing values are written as empty strings instead of raising
            # TypeError on concatenation.
            author = item['author'] or ''
            content = item['content'] or ''

            # The separator must be written with escapes: the original listing
            # contained raw line breaks inside the quotes, which is a syntax error.
            self.fp.write(author + ':' + content + '\n\n\n')
            return item

        def close_spider(self, spider):
            """Close the output file if it was opened."""
            print('关闭文件')
            if self.fp is not None:
                self.fp.close()

    配置文件的编写

    # Project name and the package Scrapy searches for spiders.
    BOT_NAME = 'qiubai'

    NEWSPIDER_MODULE = 'qiubai.spiders'
    SPIDER_MODULES = [NEWSPIDER_MODULE]

    # Browser-like User-Agent string sent with every request.
    USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    )

    # robots.txt rules are not honoured by this project.
    ROBOTSTXT_OBEY = False

    # Enabled item pipelines; pipelines with lower numbers run first.
    ITEM_PIPELINES = {
        'qiubai.pipelines.QiubaiPipeline': 300,
        'qiubai.pipelines.QiubaiByFiles': 400,
        'qiubai.pipelines.QiubaiByMysql': 500,
    }
  • 相关阅读:
    编程实践56
    诫子篇
    编程实践58
    编程实践55
    C#Process类应该声明个什么引用空间啊 找不到类型或命名空间名称“Process”(是否缺少 using 指令或程序集引用?) 如何解决?
    课堂题目54
    jQuery学习笔记jQuery的动画
    Asp.net生成各种网页快捷方式[转贴]
    jQuery学习笔记Helloworld
    FreeTextBox配置
  • 原文地址:https://www.cnblogs.com/lshedward/p/10697116.html
Copyright © 2020-2023  润新知