• Scrapy file pipeline


    Install Scrapy

    pip install scrapy

    Create a new project

    (python36) E:\www>scrapy startproject fileDownload
    New Scrapy project 'fileDownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:
        E:\www\fileDownload
    
    You can start your first spider with:
        cd fileDownload
        scrapy genspider example example.com
    
    (python36) E:\www>
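
    The spider used below is a CrawlSpider. It could be scaffolded from the built-in crawl template, for example as follows (a sketch; the exact command the author used is not shown):

    (python36) E:\www>cd fileDownload
    (python36) E:\www\fileDownload>scrapy genspider -t crawl pexels www.pexels.com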
    

    Edit the spider to extract the content

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from fileDownload.items import FiledownloadItem
    
    class PexelsSpider(CrawlSpider):
        name = 'pexels'
        allowed_domains = ['www.pexels.com']
        start_urls = ['https://www.pexels.com/photo/white-concrete-building-2559175/']
    
        rules = (
            Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            print(response.url)
            # Collect the image URLs on the page whose src contains 'photos'
            url = response.xpath("//img[contains(@src,'photos')]/@src").extract()
            item = FiledownloadItem()
            try:
                item['file_urls'] = url
                print("Extracted image URLs: " + str(url))  # url is a list, so convert before concatenating
                yield item
            except Exception as e:
                print(str(e))
    

    Configure the item

    class FiledownloadItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        file_urls = scrapy.Field()  # URLs to download
        files = scrapy.Field()      # filled in by the files pipeline with the download results
    

      

    settings.py

    Enable the files pipeline:

    'scrapy.pipelines.files.FilesPipeline': 2  # built-in files pipeline

    FILES_STORE = ''  # storage path

    In the item, define both fields:

    file_urls = scrapy.Field()

    files = scrapy.Field()

    In the spider, put the download URLs into the item's file_urls field so they are passed on to the pipeline.
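
    Putting these settings together, a minimal settings.py fragment could look like the sketch below (the storage path is only an example):

    # settings.py -- minimal configuration for the built-in files pipeline
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 2,
    }
    # Downloaded files are saved under this directory (example path)
    FILES_STORE = 'E:/www/fileDownload/downloads'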

    Override the files pipeline so files keep their original image names

    In pipelines.py, create your own files pipeline that inherits from FilesPipeline:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    from scrapy.pipelines.files import FilesPipeline
    
    
    class FiledownloadPipeline(object):
        def process_item(self, item, spider):
            # Strip query strings from the URLs before they reach the files pipeline
            tmp = item['file_urls']
            item['file_urls'] = []
    
            for i in tmp:
                if "?" in i:
                    item['file_urls'].append(i.split('?')[0])
                else:
                    item['file_urls'].append(i)
            print(item)
            return item
    
    
    class MyFilesPipeline(FilesPipeline):
        def file_path(self, request, response=None, info=None):
            # Use the last URL segment (the original file name) instead of the default SHA1-hash name
            file_path = request.url
            file_path = file_path.split('/')[-1]
            print("Downloading file " + file_path)
            return 'full/%s' % file_path
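
    To see what the two pipelines do to a URL, the snippet below applies the same string operations to a made-up image URL (the URL is hypothetical):

    # Illustration only -- the URL is made up
    url = "https://images.pexels.com/photos/2559175/white-concrete-building.jpg?auto=compress&w=1200"
    cleaned = url.split('?')[0]          # FiledownloadPipeline: drop the query string
    file_name = cleaned.split('/')[-1]   # MyFilesPipeline.file_path: keep the original name
    print(cleaned)                       # .../white-concrete-building.jpg (no query string)
    print(file_name)                     # white-concrete-building.jpg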
    

    In settings.py, enable the custom pipelines instead of the built-in one:

    ITEM_PIPELINES = {
        'fileDownload.pipelines.FiledownloadPipeline': 1,
        'fileDownload.pipelines.MyFilesPipeline': 2,
        #'scrapy.pipelines.files.FilesPipeline':2
    }
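
    With the pipelines enabled, the crawl can be started from the project directory (a sketch; the downloaded images end up under FILES_STORE/full/ with their original names):

    (python36) E:\www\fileDownload>scrapy crawl pexels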
    

    Crawling a complete image set

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class AngelSpider(CrawlSpider):
        name = 'angel'
        allowed_domains = ['angelimg.spbeen.com']
        start_urls = ['http://angelimg.spbeen.com/']
    
        base_url = "http://angelimg.spbeen.com"
        rules = (
            Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'), callback='parse_item', follow=False),
        )
    
        def parse_item(self, response):
            # Reuse the item passed along via meta, or create a fresh one on the first page of a set
            item = response.meta.get('item', False)
            if not item:
                item = {}
                item['files'] = []
                item['file_urls'] = []
            print(response.url)
            img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
            item['file_urls'].append(img_url)
    
            # If there is a next page, request it and carry the item along;
            # otherwise hand the finished item to the pipeline
            next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
    
            if next_url:
                next_url = self.base_url + next_url
                yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
            else:
                print(item)
                yield item
    
        def parse_next_response(self, response):
            item = response.meta.get('item')
            print(item, response.url)
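
    The set crawler is started the same way (a sketch, assuming the spider lives in the same project and uses the same FilesPipeline configuration in settings.py):

    (python36) E:\www\fileDownload>scrapy crawl angel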
    

      

    GitHub repository

    https://github.com/brady-wang/spider-fileDownload

      
