• 用scrapy爬取搜狗Lofter图片


    用scrapy爬取搜狗Lofter图片

    # -*- coding: utf-8 -*-
    import json
    
    import scrapy
    from scrapy.http import Request
    from urllib import parse
    from scrapy.loader import ItemLoader
    
    from tutorial.items import LofterSpiderItem
    
    
    class LofterSpider(scrapy.Spider):
        """Collect LOFTER picture URLs from Sogou's paged JSON picture endpoint.

        ``parse`` receives the landing page listed in ``start_urls`` and only uses
        it to issue the first JSON request. ``parse_url`` then yields one
        ``LofterSpiderItem`` per picture and follows the next page until the
        endpoint stops returning pictures.
        """

        name = "lofter"
        allowed_domains = ["pic.sogou.com"]
        start_urls = ['http://pic.sogou.com/']

        # Offset step between consecutive pages. It must stay equal to the
        # ``len`` query parameter hard-coded in ``start_answer_url`` below.
        page_size = 15

        # Paged JSON endpoint; ``{0}`` is replaced by the offset of the first
        # picture of the requested page.
        start_answer_url = "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=LOFTER&tag=%E5%85%A8%E9%83%A8&start={0}&len=15"

        headers = {
            "HOST": "pic.sogou.com",
            "Referer": "http://pic.sogou.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        def parse(self, response):
            """Ignore the landing page body and request the first JSON page."""
            yield scrapy.Request(
                self.start_answer_url.format(0),
                headers=self.headers,
                callback=self.parse_url,
            )

        def parse_url(self, response):
            """Yield one item per picture in the JSON page, then request the next page.

            Pagination stops when the response is not a JSON object, when it
            carries no pictures, or when it does not report ``startIndex``;
            otherwise the spider would keep requesting pages forever.
            """
            try:
                ans_json = json.loads(response.text)
            except ValueError:
                self.logger.warning(
                    "Response from %s is not valid JSON; stopping pagination",
                    response.url,
                )
                return

            if not isinstance(ans_json, dict):
                self.logger.warning(
                    "Unexpected JSON payload from %s; stopping pagination",
                    response.url,
                )
                return

            pictures = ans_json.get('all_items') or []
            if not pictures:
                # An empty page means the listing is exhausted.
                return

            for ans in pictures:
                image_url = ans.get('ori_pic_url')
                if not image_url:
                    # Skip entries without an original picture URL instead of
                    # aborting the whole page.
                    continue
                item_loader = ItemLoader(item=LofterSpiderItem(), response=response)
                item_loader.add_value("lofter_image_url", image_url)
                yield item_loader.load_item()

            # NOTE(review): assumes ``startIndex`` is an integer offset echoed
            # back by the endpoint - confirm against a live response.
            start_index = ans_json.get('startIndex')
            if start_index is None:
                self.logger.warning(
                    "No startIndex in response from %s; stopping pagination",
                    response.url,
                )
                return

            yield scrapy.Request(
                self.start_answer_url.format(start_index + self.page_size),
                headers=self.headers,
                callback=self.parse_url,
            )
    
    
    

    settings.py

    # Enabled item pipelines mapped to their order value; listed here in
    # ascending order, so the image pipeline (1) precedes the generic one (300).
    ITEM_PIPELINES = {
        'tutorial.pipelines.TutorialImagePipeline': 1,
        'tutorial.pipelines.TutorialPipeline': 300,
    }

    # IMAGES_URLS_FIELD is intentionally left unset; the earlier value
    # "front_image_url" is disabled.

    # Absolute path of the directory that contains this settings module.
    project_dir = os.path.abspath(os.path.dirname(__file__))

    # Downloaded images are stored in the "image" folder next to this module.
    IMAGES_STORE = os.path.join(project_dir, 'image')
    

    items.py

    class LofterSpiderItem(scrapy.Item):
        """Item with a single field holding the URL of a LOFTER picture."""

        # NOTE(review): ``return_value`` is defined outside this snippet;
        # presumably it returns its argument unchanged so the loader output
        # stays a list of URLs - confirm against its definition.
        lofter_image_url = scrapy.Field(output_processor=MapCompose(return_value))
    
  • 相关阅读:
    [NOIP2002 提高组] 均分纸牌
    洛谷 P1303 A*B Problem
    OpenJudge 1.6.5 年龄与疾病
    hdu 3340 线段树思路活用
    poj 2464 线段树统计区间..两棵树
    hdu 4419 矩形面积覆盖颜色
    经典动态规划 dp Rqnoj 57
    最基础二维线段树 hdu 1823 (简单)
    hdu 3564 线段树+dp
    spoj 1557 线段树 区间最大连续和 (不重复数)
  • 原文地址:https://www.cnblogs.com/luozhiyun/p/8127259.html
Copyright © 2020-2023  润新知