• scrapy--doutu


      年轻人都爱斗图,可是有时候手头的表情图数量比较少,就想办法收藏其他人的图片。如果能直接从 doutula 网页里爬取图片,那是一件很棒的事。看别人写的爬斗图的爬虫程序有点麻烦,于是自己也来动动手,写了一个简单、实用的版本,给大家分享一下。先给大家看看成果,喜欢的话,就开始吧!

    1.doutu.py

    # -*- coding: utf-8 -*-
    import scrapy
    from doutu.items import DoutuItem
    from scrapy.linkextractors import LinkExtractor
    import pdb
    
    class DoutuSpider(scrapy.Spider):
        """Spider that walks doutula.com and emits one item per picture page.

        Flow: ``parse`` (listing page) -> ``parse_pager`` (collection page)
        -> ``parse_img`` (single picture page, yields a ``DoutuItem``).
        """

        name = 'Doutu'
        #allowed_domains = ['www.doutula.com']
        start_urls = ['http://www.doutula.com/']

        def parse(self, response):
            """Follow collection links and pagination links on a listing page.

            Yields:
                scrapy.Request: requests for collection pages (handled by
                ``parse_pager``) and for further listing pages (handled by
                ``parse`` again).
            """
            le = LinkExtractor(restrict_css='div.col-sm-9')
            links = le.extract_links(response)
            # Only the 2nd to 4th extracted links are followed.
            for link in links[1:4]:
                yield scrapy.Request(link.url, callback=self.parse_pager)

            # Pagination: re-enter parse() for every page link found.
            le1 = LinkExtractor(restrict_css='ul.pagination')
            links1 = le1.extract_links(response)
            for link1 in links1:
                yield scrapy.Request(link1.url, callback=self.parse)

        def parse_pager(self, response):
            """Follow every link inside ``div.pic-content`` to its picture page.

            Yields:
                scrapy.Request: one request per extracted link, handled by
                ``parse_img``.
            """
            le2 = LinkExtractor(restrict_css='div.pic-content')
            links2 = le2.extract_links(response)
            for link2 in links2:
                yield scrapy.Request(link2.url, callback=self.parse_img)

        def parse_img(self, response):
            """Extract the first slide image URL and yield it as a ``DoutuItem``.

            Pages without a matching ``<img>`` are skipped instead of raising
            ``IndexError``.

            Yields:
                DoutuItem: item whose ``image_urls`` field holds a single URL
                string (the pipeline in this project passes that value
                directly to ``scrapy.Request``).
            """
            src = response.xpath(
                '//div[@class="swiper-slide"]//img/@src'
            ).extract_first()
            if not src:
                return

            doutu = DoutuItem()
            # urljoin leaves absolute URLs untouched and resolves relative or
            # protocol-relative ones against the page URL.
            doutu['image_urls'] = response.urljoin(src)
            # The item must be yielded, otherwise no pipeline ever sees it.
            yield doutu

    2.items.py

    import scrapy
    
    class DoutuItem(scrapy.Item):
        """Item describing one picture scraped from doutula."""
        # URL of the picture to download; DoutuPipeline.get_media_requests
        # passes this value straight to scrapy.Request.
        image_urls = scrapy.Field()
        # Not written by any code shown in this post; presumably reserved for
        # the download results of Scrapy's ImagesPipeline -- TODO confirm.
        images = scrapy.Field()

    3.pipelines.py

    import pdb
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline
    
    class DoutuPipeline(ImagesPipeline):
        """Image pipeline that downloads the picture referenced by each item."""

        def get_media_requests(self, item, info):
            """Yield the download request for the item's picture.

            ``item['image_urls']`` is used as a single URL, not a list.
            """
            yield scrapy.Request(item['image_urls'])

        def item_completed(self, results, item, info):
            """Drop the item when none of its downloads succeeded.

            Args:
                results: iterable of ``(ok, data)`` pairs; for successful
                    downloads ``data['path']`` is read.
                item: the item whose media requests have finished.
                info: pipeline bookkeeping object (unused here).

            Returns:
                The unchanged item when at least one download succeeded.

            Raises:
                DropItem: when no download succeeded.
            """
            # Imported locally: the module's import block does not bring
            # DropItem into scope, so raising it used to fail with NameError.
            from scrapy.exceptions import DropItem

            # `ok` tells whether that particular download succeeded.
            image_paths = [x['path'] for ok, x in results if ok]

            if not image_paths:
                raise DropItem("Item contains no images")
            return item

    4.settings.py

    # Directory where downloaded images are stored.
    # NOTE(review): the path separators look lost (possibly
    # r'C:\Desktop\doutula'); value kept as published -- confirm before use.
    IMAGES_STORE = r'C:Desktopdoutula'

    # Pool of browser User-Agent strings. Scrapy itself ignores this name; it
    # is kept so a rotating downloader middleware can pick from it.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    # Scrapy's USER_AGENT setting is a single string; the original set literal
    # of many strings was not a valid value. Use the first entry of the pool.
    USER_AGENT = USER_AGENTS[0]

    CONCURRENT_REQUESTS = 16    # up to 16 requests in flight at once
    DOWNLOAD_DELAY = 0.2        # seconds to wait between consecutive requests to the same site

    # Thumbnail sizes; defining this setting makes thumbnails get generated.
    IMAGES_THUMBS = {
        'small': (50, 50),      # thumbnail variant named 'small'
        'big': (200, 200),      # thumbnail variant named 'big'
    }

    ROBOTSTXT_OBEY = False      # do not obey robots.txt

    COOKIES_ENABLED = False     # disable cookies

    ITEM_PIPELINES = {
        'doutu.pipelines.DoutuPipeline': 1,     # priority: lower runs first (Scrapy convention is 0-1000)
    }

    遇到问题的小伙伴,欢迎在下面留言。

  • 相关阅读:
    线性分类器之感知机算法
    字符串包含判断
    王家林 云计算分布式大数据Hadoop实战高手之路从零开始 第二讲:全球最详细的从零起步搭建Hadoop单机和伪分布式开发环境图文教程
    王家林 第六讲Hadoop图文训练课程:使用HDFS命令行工具操作Hadoop分布式集群初体验
    王家林的“云计算分布式大数据Hadoop实战高手之路从零开始”的第五讲Hadoop图文训练课程:解决典型Hadoop分布式集群环境搭建问题
    王家林的 第三讲Hadoop图文训练课程:证明Hadoop工作的正确性和可靠性只需4步图文并茂的过程
    王家林 第四讲Hadoop图文训练课程:实战构建真正的Hadoop分布式集群环境
    麻雀GUIv1.0整理好咯,发个开源上来。
    body设置背景色异常
    safari浏览器placeholder垂直居中
  • 原文地址:https://www.cnblogs.com/eilinge/p/9401753.html
Copyright © 2020-2023  润新知