• Using the Scrapy Framework


    • Installing Scrapy
      • pip install pywin32
      • Download the Twisted wheel, then run: pip install <path to the Twisted wheel>
      • pip install scrapy
    • Basic usage of Scrapy
      • Change into the directory where the project should be created
      • scrapy startproject my_first_spider  creates a new project
      • scrapy genspider first www.xxx.com  generates a spider named first (the name used later with scrapy crawl first)
      • The directory structure is as follows
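        A sketch of the standard layout that startproject and genspider generate for this project (file names match the code below; middlewares.py is created but not used in this example):
        my_first_spider/
        ├── scrapy.cfg                  # deployment configuration
        └── my_first_spider/
            ├── __init__.py
            ├── items.py                # item definition (MyFirstSpiderItem)
            ├── middlewares.py
            ├── pipelines.py            # item pipelines (file / MySQL / Redis writers)
            ├── settings.py             # project settings (ITEM_PIPELINES, USER_AGENT, ...)
            └── spiders/
                ├── __init__.py
                └── first.py            # the spider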
      • # -*- coding: utf-8 -*-
        import scrapy
        from my_first_spider.items import MyFirstSpiderItem


        class FirstSpider(scrapy.Spider):
            # name of this spider (used with `scrapy crawl first`)
            name = 'first'

            # domains the spider is allowed to crawl
            # allowed_domains = ['www.xxx.com']

            # the first URL to crawl
            start_urls = ['https://www.zhipin.com/c101010100/?query=python开发&page=1&ka=page-1']

            url = 'https://www.zhipin.com/c101010100/?query=python开发&page=%d&ka=page-1'
            page = 1

            # parse callback: extracts data with XPath; response is the required parameter
            def parse(self, response):
                li_list = response.xpath('//div[@class="job-list"]/ul/li')
                for li in li_list:
                    job = li.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
                    salary = li.xpath('./div/div[1]/h3/a/span/text()').extract_first()
                    company = li.xpath('./div/div[2]/div/h3/a/text()').extract_first()

                    item = MyFirstSpiderItem()

                    item['job'] = job
                    item['salary'] = salary
                    item['company'] = company

                    yield item  # hand the item over to the item pipelines for persistence

                # paginated crawling: request the next page with the same callback
                if self.page <= 7:
                    print(f'page {self.page} done, crawling page {self.page + 1}')
                    self.page += 1
                    yield scrapy.Request(url=self.url % self.page, callback=self.parse)
        first.py
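      • The XPath expressions above can be tried interactively before putting them into parse. A minimal sketch using scrapy shell (assuming the page is reachable and still uses this markup; the site may reject requests without a browser-like User-Agent, see USER_AGENT in settings.py below):
        scrapy shell "https://www.zhipin.com/c101010100/?query=python开发&page=1&ka=page-1"

        # inside the shell, response is already available:
        >>> li_list = response.xpath('//div[@class="job-list"]/ul/li')
        >>> li_list[0].xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
        >>> li_list[0].xpath('./div/div[1]/h3/a/span/text()').extract_first()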
      • # -*- coding: utf-8 -*-
        
        # Define here the models for your scraped items
        #
        # See documentation in:
        # https://doc.scrapy.org/en/latest/topics/items.html
        
        import scrapy
        
        
        # Interacts with the pipelines: the scraped data is wrapped in an item object, and that object is what gets passed between components
        class MyFirstSpiderItem(scrapy.Item):
            # define the fields for your item here like:
            # name = scrapy.Field()
            job = scrapy.Field()
            salary = scrapy.Field()
            company = scrapy.Field()
        items.py
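      • A scrapy.Item behaves like a dict restricted to the declared fields. A small sketch of how the spider uses it (the values are made up for illustration):
        item = MyFirstSpiderItem()
        item['job'] = 'python developer'     # declared fields are set dict-style
        item['salary'] = '15k-25k'
        item['company'] = 'Example Co.'
        # item['city'] = 'Beijing'           # would raise KeyError: 'city' is not a declared field
        print(dict(item))                    # {'job': 'python developer', 'salary': '15k-25k', 'company': 'Example Co.'}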
      • # -*- coding: utf-8 -*-

        # Define your item pipelines here
        #
        # Don't forget to add your pipeline to the ITEM_PIPELINES setting
        # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

        import json

        import pymysql
        import redis


        # Pipeline that writes items to boss.txt
        class MyFirstSpiderPipeline(object):

            fp = None

            def open_spider(self, spider):
                print('spider started')
                self.fp = open('./boss.txt', 'w', encoding='utf8')

            def close_spider(self, spider):
                self.fp.close()
                print('spider finished')

            def process_item(self, item, spider):
                self.fp.write(item['job'] + ':' + item['salary'] + ':' + item['company'] + '\n')
                return item


        # Pipeline that writes items to MySQL
        class MySqlPipeline(object):
            conn = None
            cursor = None

            # Step 1: when the spider starts, open_spider runs first (open the MySQL connection here)
            def open_spider(self, spider):
                self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root',
                                            db='scrapy', charset='utf8')
                print('MySQL connection opened, starting to write')

            # Step 2: process_item performs the write; if other pipelines follow this one,
            # the item must be returned
            def process_item(self, item, spider):
                self.cursor = self.conn.cursor()
                try:
                    # note: parameterized queries (cursor.execute(sql, params)) are safer than string formatting
                    self.cursor.execute(
                        'INSERT INTO boss (job_name, salary, company) VALUES ("%s", "%s", "%s")'
                        % (item['job'], item['salary'], item['company']))
                    self.conn.commit()
                except Exception as e:
                    self.conn.rollback()
                    print('error occurred, transaction rolled back')
                    print(e)
                return item

            # Step 3: when the spider finishes, close_spider runs (close the cursor and the MySQL connection)
            def close_spider(self, spider):
                self.cursor.close()
                self.conn.close()
                print('MySQL writing finished')


        # Pipeline that writes items to Redis
        class RedisPipeline(object):

            conn = None

            def open_spider(self, spider):
                self.conn = redis.Redis(host='127.0.0.1', port=6379)

            # process_item performs the write; if other pipelines follow this one,
            # the item must be returned
            def process_item(self, item, spider):
                dic = {
                    'job_name': item['job'],
                    'salary': item['salary'],
                    'company': item['company']
                }
                try:
                    self.conn.lpush('boss', json.dumps(dic))
                    print('written to redis')
                except Exception as e:
                    print('redis write failed', e)
                return item
        pipelines.py
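      • The MySQL pipeline assumes a local database named scrapy with a boss table whose columns are job_name, salary and company (names taken from the INSERT above). A minimal sketch of creating them with pymysql (the id column and the VARCHAR lengths are assumptions):
        import pymysql

        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', charset='utf8')
        cursor = conn.cursor()
        cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy DEFAULT CHARACTER SET utf8')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS scrapy.boss (
                id INT AUTO_INCREMENT PRIMARY KEY,
                job_name VARCHAR(255),
                salary VARCHAR(64),
                company VARCHAR(255)
            )
        ''')
        conn.commit()
        cursor.close()
        conn.close()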
      • # Configure item pipelines
        # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
        ITEM_PIPELINES = {
           'my_first_spider.pipelines.MyFirstSpiderPipeline': 300,
           'my_first_spider.pipelines.RedisPipeline': 301,
           'my_first_spider.pipelines.MySqlPipeline': 302,
        }
        
        
        BOT_NAME = 'my_first_spider'
        
        SPIDER_MODULES = ['my_first_spider.spiders']
        NEWSPIDER_MODULE = 'my_first_spider.spiders'
        
        
        # Crawl responsibly by identifying yourself (and your website) on the user-agent
        USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        
        # Obey robots.txt rules
        ROBOTSTXT_OBEY = False
        settings.py
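      • In ITEM_PIPELINES, lower numbers run earlier, so the file pipeline (300) sees each item before the Redis (301) and MySQL (302) pipelines; that is why every process_item returns the item. Settings can also be overridden for a single spider through the standard custom_settings class attribute; a small sketch reusing this project's pipeline path:
        import scrapy

        class FirstSpider(scrapy.Spider):
            name = 'first'
            # per-spider overrides take precedence over settings.py
            custom_settings = {
                'ITEM_PIPELINES': {
                    'my_first_spider.pipelines.MyFirstSpiderPipeline': 300,
                },
            }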
      • Run scrapy crawl first --nolog at the command line; the --nolog flag suppresses the log output
      • The crawl completes successfully
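      • To check what the Redis pipeline stored, a minimal sketch that reads the boss list back (assuming the same local Redis instance as in pipelines.py):
        import json

        import redis

        conn = redis.Redis(host='127.0.0.1', port=6379)
        # lpush pushes to the head of the list, so lrange returns the newest items first
        for raw in conn.lrange('boss', 0, -1):
            print(json.loads(raw))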
  • Original article: https://www.cnblogs.com/Treasuremy/p/10457725.html