• Scrapy Learning - 8 - ItemLoader


    Using ItemLoader
    Purpose
      Makes it easy to manage, maintain, and reuse XPath/CSS extraction rules in one place

    Example
    ItemLoader + image processing
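
    The items.py code below relies on three built-in processors: MapCompose runs every extracted value through a chain of functions, TakeFirst returns the first non-empty value, and Join concatenates the values with a separator. A quick standalone sketch of their behaviour (the import path matches the one used in this post; in newer Scrapy versions these processors are provided by the separate itemloaders package):

        from scrapy.loader.processors import MapCompose, Join, TakeFirst

        proc = MapCompose(str.strip, lambda v: v + "-jobbole")
        print(proc([" python ", " scrapy "]))        # ['python-jobbole', 'scrapy-jobbole']
        print(TakeFirst()(["", "first", "second"]))  # 'first' (empty values are skipped)
        print(Join(",")(["a", "b", "c"]))            # 'a,b,c'
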
    # items.py
    
        import scrapy
        from scrapy.loader.processors import MapCompose, Join, TakeFirst
        from scrapy.loader import ItemLoader
        import datetime
        import re
    
        def date_convert(value):
            try:
                create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
            except Exception as e:
                create_date = datetime.datetime.now().date()
    
            return create_date
    
        def get_nums(value):
            match_re = re.match(r".*?(\d+).*", value)
            if match_re:
                nums = int(match_re.group(1))
            else:
                nums = 0
    
            return nums
    
        def return_value(value):
            return value
    
        def remove_comment_tags(value):
            # strip the comment-count text that gets extracted together with the tags
            if "评论" in value:
                return ""
            else:
                return value
    
        class ArticleItemLoader(ItemLoader):
            # custom ItemLoader: apply TakeFirst as the default output processor for every field
            default_output_processor = TakeFirst()
    
        class ArticlespiderItem(scrapy.Item):
            # the title field value will end up with -jobbole-abc appended
            title = scrapy.Field(
                input_processor=MapCompose(lambda x:x+"-jobbole", lambda y:y+"-abc")
            )
            create_date = scrapy.Field(
                input_processor=MapCompose(date_convert),
            )
            url = scrapy.Field()
            url_object_id = scrapy.Field()
            # front_image_url must remain a list, so the default TakeFirst output processor
            # cannot be used; override output_processor so the original list is kept
            front_image_url = scrapy.Field(
                output_processor=MapCompose(return_value)
            )
            front_image_path = scrapy.Field()
            praise_nums = scrapy.Field(
                input_processor=MapCompose(get_nums)
            )
            comment_nums = scrapy.Field(
                input_processor=MapCompose(get_nums)
            )
            fav_nums = scrapy.Field(
                input_processor=MapCompose(get_nums)
            )
            tags = scrapy.Field(
                input_processor=MapCompose(remove_comment_tags),
                output_processor=Join(",")
            )
            content = scrapy.Field()
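
    With the processors and the custom loader in place, a spider callback fills the fields through add_css/add_value instead of assigning parsed values by hand; the processors declared in items.py only run when load_item() is called. A minimal sketch follows (the spider name, start URL, CSS selectors, and the front_image_url meta key are illustrative assumptions, not code from the original post):

    # spider (illustrative sketch)

        import scrapy
        from ArticleSpider.items import ArticlespiderItem, ArticleItemLoader

        class JobboleSpider(scrapy.Spider):                      # assumed spider name
            name = "jobbole"
            start_urls = ["http://blog.jobbole.com/all-posts/"]  # assumed URL

            def parse(self, response):
                item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
                # add_css/add_value only register values; the input/output processors
                # declared in items.py run when load_item() is called
                item_loader.add_css("title", ".entry-header h1::text")                   # assumed selector
                item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")  # assumed selector
                item_loader.add_value("url", response.url)
                item_loader.add_value("front_image_url", [response.meta.get("front_image_url", "")])
                item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")       # assumed selector
                yield item_loader.load_item()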
    
    # pipelines.py
    
        from scrapy.pipelines.images import ImagesPipeline
    
        class ArticleImagePipeline(ImagesPipeline):
            
            def item_completed(self, results, item, info):
                if "front_image_url" in item:
                    # results is a list of (success, info_dict) tuples, one per image
                    for ok, value in results:
                        if ok:
                            item['front_image_path'] = value['path']
                # Return the item after the path has been filled in: because of the
                # priorities configured in ITEM_PIPELINES, this pipeline passes the
                # item on to the next pipeline in the chain.
                return item
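
    For reference, the results argument passed to item_completed is a list of two-item tuples, one per requested image: a success flag plus an info dict. On success the dict contains the original url, the path relative to IMAGES_STORE, and a checksum. Roughly (values here are illustrative):

        # shape of `results` (illustrative values):
        # [
        #     (True, {'url': 'http://example.com/cover.jpg',
        #             'path': 'full/0a1b2c3d....jpg',
        #             'checksum': '...'}),
        # ]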
    
    # Enabling our own pipelines (settings.py)
    
        ITEM_PIPELINES = {
            'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
            'ArticleSpider.pipelines.ArticleImagePipeline': 1,
        }
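
    For the ImagesPipeline subclass above to actually download anything, the image settings also have to point at the item field that holds the URLs and at a storage directory. A minimal sketch for settings.py (the storage location is an assumption about the project layout):

        import os

        # item field that contains the list of image URLs
        IMAGES_URLS_FIELD = "front_image_url"
        # directory where downloaded images are stored
        # (assumed: an images/ folder next to settings.py)
        project_dir = os.path.abspath(os.path.dirname(__file__))
        IMAGES_STORE = os.path.join(project_dir, "images")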
  • Original post: https://www.cnblogs.com/cq146637/p/9053398.html