• 爬虫学习之-文件管道重写


    如果希望文件管道按原有的文件名保存文件,需要重写文件管道的 file_path 方法。

    pipelines 文件(pipelines.py)

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    from scrapy.pipelines.files import FilesPipeline
    class OveridePipeline(FilesPipeline):
        """Files pipeline that stores each download under its original file name.

        Overrides ``file_path`` so every file is placed at ``pexels/<name>``
        rather than at the path the base class would choose.
        """

        def file_path(self, request, response=None, info=None, *, item=None):
            """Return the storage path for the file fetched by ``request``.

            Args:
                request: The download request; only ``request.url`` is read.
                response: Unused; accepted for compatibility with the base class.
                info: Unused; accepted for compatibility with the base class.
                item: Unused; accepted because newer Scrapy releases pass the
                    item being processed as a keyword argument.

            Returns:
                ``"pexels/<file name>"``, where the file name is the last
                segment of the URL path, with ``.png`` appended when that
                segment contains no dot.
            """
            # Function-scope import keeps this class self-contained.
            from urllib.parse import urlparse

            # Use the URL *path* only, so a query string or fragment
            # (e.g. "?auto=compress") never leaks into the file name; "?" is
            # not a legal character in Windows file names.
            file_name = urlparse(request.url).path.split('/')[-1]
            if "." not in file_name:
                file_name = file_name + '.png'
            return "pexels/" + file_name
    
    
    class ImagesPipeline(object):
        """Item pipeline that removes query strings from an item's file URLs."""

        def process_item(self, item, spider):
            """Rewrite ``item['file_urls']`` with every URL cut at its first "?".

            URLs that contain no "?" are kept unchanged. The cleaned list is
            printed, and the same ``item`` object is returned. ``spider`` is
            not used.
            """
            # str.split always yields at least one piece, so a URL without a
            # "?" comes back whole and needs no separate branch.
            cleaned_urls = [url.split("?")[0] for url in item['file_urls']]
            item['file_urls'] = cleaned_urls
            print("下载图片:", item['file_urls'])
            return item
    

      settings 配置(settings.py)

    # Enabled item pipelines, listed in the order they run (lower value first).
    ITEM_PIPELINES = {
        # Strips query strings from item['file_urls'] before any download.
        'images.pipelines.ImagesPipeline': 1,
        # FilesPipeline subclass, used here in place of the stock
        # scrapy.pipelines.images.ImagesPipeline / scrapy.pipelines.files.FilesPipeline.
        'images.pipelines.OveridePipeline': 3,
    }

    # Directory under which downloaded files are stored.
    FILES_STORE = 'd:/crawl'
    

      spider文件

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from ..items import ImagesItem
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.pipelines.files import FilesPipeline
    from scrapy.pipelines.media import MediaPipeline
    class PexSpider(CrawlSpider):
        """Crawl spider that follows pexels.com photo links and collects image URLs."""

        name = 'pex'
        allowed_domains = ['www.pexels.com']
        start_urls = [
            'https://www.pexels.com/photo/vehicle-on-road-along-green-grass-during-night-714023/',
        ]

        # Follow every link whose URL matches /photo/ and pass each fetched
        # page to parse_item.
        rules = (
            Rule(
                LinkExtractor(allow=r'/photo/'),
                callback='parse_item',
                follow=True,
            ),
        )

        def parse_item(self, response):
            """Return an ImagesItem whose ``file_urls`` holds the matched image sources.

            The sources are the ``src`` attributes of ``<img>`` elements whose
            class is exactly ``image-section__image js-photo-zoom``.
            """
            photo_srcs = response.xpath(
                "//img[@class='image-section__image js-photo-zoom']/@src"
            ).extract()
            item = ImagesItem()
            item['file_urls'] = photo_srcs
            return item
    

      items 文件(items.py)

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ImagesItem(scrapy.Item):
        """Item carrying the file URLs scraped from a photo page."""
        # URLs of the files to download; set by the spider's parse_item.
        file_urls = scrapy.Field()
        # NOTE(review): presumably filled in by the files pipeline with the
        # download results — confirm against the Scrapy media pipeline docs.
        files = scrapy.Field()
    

      

  • 相关阅读:
    ubuntu如何以删除文件夹?
    Ubuntu下安装lrzsz
    SSH服务器拒绝密码检测
    ubuntu下安装、启动和卸载SSH
    VirtualBox下Ubuntu利用桥接方式上网
    Python网络编程笔记二
    Python网络编程笔记一
    Python反射笔记
    Python之time模块和datatime模块
    Python正则表达式之findall疑点
  • 原文地址:https://www.cnblogs.com/brady-wang/p/9695422.html
Copyright © 2020-2023  润新知