• 基于scrapy的一些实例


    一.爬取斗鱼主播

     1. 爬虫文件

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    from Douyu.items import DouyuItem
    
    class DouyuSpider(scrapy.Spider):
        """Crawl Douyu's vertical-room JSON API page by page, yielding one
        DouyuItem (nickname + avatar URL) per anchor."""
        name = 'douyu'
        # allowed_domains = ['www.xxx.com']
        baseurl = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
        # Paging offset: starts at 0 and advances by 20 (the page size) per request.
        offset = 0
        start_urls = [baseurl + str(offset)]

        def parse(self, response):
            # Decode the JSON body and pull out the room list.
            data = json.loads(response.text)['data']

            # An empty list means we have paged past the last record: stop here,
            # which also terminates the self-recursive request chain below.
            if not data:
                return
            # Each element of `data` is a dict describing one anchor.
            for each in data:
                # Wrap the fields in an item object so the pipelines receive them.
                item = DouyuItem()
                item['name'] = each['nickname']
                item['img_url'] = each['vertical_src']
                # Must yield, otherwise the pipeline files never get the data.
                yield item

            # Fetch the next page; the empty-data guard above is the exit condition.
            self.offset += 20
            url = self.baseurl + str(self.offset)
            yield scrapy.Request(url=url, callback=self.parse)

      2. item

    import scrapy
    
    
    class DouyuItem(scrapy.Item):
        """Container for one Douyu anchor record scraped by DouyuSpider."""
        name = scrapy.Field()     # anchor nickname
        img_url = scrapy.Field()  # avatar image URL

      3. pipeline

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from scrapy.pipelines.images import ImagesPipeline
    from Douyu.settings import IMAGES_STORE as images_store
    import os
    import scrapy
    
    # Text storage: append one "name:img_url" line per item to douyu.txt.
    class DouyuPipeline(object):
        f = None  # file handle, opened once per spider run

        def open_spider(self, spider):
            # Opened once when the spider starts; closed in close_spider.
            self.f = open('./douyu.txt', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # Persist one record, then return the item so later pipelines
            # (the image pipeline registered at priority 301) still see it.
            name = item['name']
            img_url = item['img_url']
            self.f.write(name + ":" + img_url + "\n")
            return item

        def close_spider(self, spider):
            self.f.close()
    
    
    # Image storage: download each item's avatar via Scrapy's ImagesPipeline.
    # NOTE(review): the class name keeps the original "Pipieline" typo because
    # settings.py registers it under that exact dotted path.
    class ImagesPipieline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # Request the image URL carried by the item; the downloaded file is
            # saved under IMAGES_STORE (configured in settings.py).
            img_url = item['img_url']

            yield scrapy.Request(img_url)

        # Rename the stored file after download.
        def item_completed(self, results, item, info):
            # results: list of (success, info_dict) tuples, one per download;
            # keep only the relative paths of the successful ones.
            img_path = [x['path'] for ok, x in results if ok]
            # Guard against a failed download (empty list would raise IndexError).
            if img_path:
                os.rename(os.path.join(images_store, img_path[0]),
                          os.path.join(images_store, item['name'] + '.jpg'))
            # Return the item so any later pipelines still receive it
            # (Scrapy drops the item if item_completed returns None).
            return item

      4. settings

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for Douyu project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'Douyu'
    
    SPIDER_MODULES = ['Douyu.spiders']
    NEWSPIDER_MODULE = 'Douyu.spiders'
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    # }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'Douyu.middlewares.DouyuSpiderMiddleware': 543,
    # }
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
    # }
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # }
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    # NOTE: 'ImagesPipieline' matches the (typo'd) class name in pipelines.py —
    # the dotted path here must spell it exactly the same way.
    ITEM_PIPELINES = {
        'Douyu.pipelines.DouyuPipeline': 300,
        'Douyu.pipelines.ImagesPipieline': 301,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    # Storage directory for downloaded images (read by ImagesPipeline).
    # Setting names must be upper-case and spelled exactly: IMAGES_STORE.
    IMAGES_STORE ='D:/scrapy/Douyu/imgs/'
    View Code
  • 相关阅读:
    on asp.net
    总结
    CSS的一点使用体会
    existence way of The malicious software
    算法空山幽谷的佳人
    杀毒软件工程师看的书籍
    经典sql语句大全
    客户提的一个需求
    机器什么时候能够学习?
    当实证资产定价遇上机器学习
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10523027.html
Copyright © 2020-2023  润新知