• scrapy(四): 爬取二级页面的内容


    scrapy爬取二级页面的内容

    1.定义数据结构items.py文件

    # -*- coding: utf-8 -*-
    '''
    file: items.py
    '''
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class TupianprojectItem(scrapy.Item):
        """Container for the data scraped from one picture detail page.

        Every attribute below is declared as a ``scrapy.Field``; the spider
        fills them in by key (e.g. ``item['title']``).
        """
        # define the fields for your item here like:
        # name = scrapy.Field()
        # Picture title
        title = scrapy.Field()
        # Publish time
        publish_time = scrapy.Field()
        # Number of views
        look = scrapy.Field()
        # Number of times the picture was added to favourites
        collect = scrapy.Field()
        # Number of downloads
        download = scrapy.Field()
        # URL of the image file
        image_url = scrapy.Field()
    
    
    

    2.爬虫文件

    # -*- coding: utf-8 -*-
    import scrapy
    
    from tupianproject.items import TupianprojectItem
    
    
    class ImageSpider(scrapy.Spider):
        """Crawl the 699pic.com "people" listing and scrape each picture's detail page.

        ``parse`` handles a listing (first-level) page: it schedules a request
        for every detail link found on it and then for the next listing page.
        ``parse_detail`` handles a detail (second-level) page and yields one
        ``TupianprojectItem`` per picture.
        """
        name = 'image'
        allowed_domains = ['699pic.com']
        start_urls = ['http://699pic.com/people-1-0-0-0-0-0-0.html']

        # Template for listing pages; the placeholder is the page number.
        url = 'http://699pic.com/people-{}-0-0-0-0-0-0.html'
        # Number of the listing page that was requested most recently.
        page = 1

        def parse(self, response):
            """Schedule every detail page linked from a listing page, then the next listing.

            Yields:
                ``scrapy.Request`` objects for the detail pages (handled by
                ``parse_detail``) and, while the page limit is not reached,
                for the following listing page (handled by ``parse``).
            """
            # Collect the links to all picture detail pages on this listing page.
            image_detail_url_list = response.xpath('//div[@class="list"]/a/@href').extract()
            for image_detail_url in image_detail_url_list:
                # urljoin leaves absolute URLs untouched and resolves relative or
                # protocol-relative hrefs, which scrapy.Request would reject.
                yield scrapy.Request(
                    url=response.urljoin(image_detail_url),
                    callback=self.parse_detail,
                )

            # Follow the next listing page. The counter is incremented before
            # the request is built, so pages 2, 3 and 4 are requested in turn.
            if self.page <= 3:
                self.page += 1
                url = self.url.format(self.page)
                yield scrapy.Request(url=url, callback=self.parse)

        def _node_text(self, response, query):
            """Return the full text content of the first node matching *query*.

            Returns ``None`` instead of raising ``IndexError`` when nothing
            matches, so one missing element does not discard the whole item.
            """
            return response.xpath(query).xpath('string(.)').extract_first()

        def parse_detail(self, response):
            """Extract the picture metadata from a detail page and yield it as an item."""
            item = TupianprojectItem()
            # Title
            item['title'] = response.xpath('//div[@class="photo-view"]/h1/text()').extract_first()
            # Publish time
            item['publish_time'] = self._node_text(
                response, '//div[@class="photo-view"]/div/span[@class="publicityt"]')
            # Number of views
            item['look'] = response.xpath('//div[@class="photo-view"]/div/span[@class="look"]/read/text()').extract_first()
            # Number of favourites
            item['collect'] = self._node_text(
                response, '//div[@class="photo-view"]/div/span[@class="collect"]')
            # Number of downloads; the raw text is padded with newlines and tabs.
            download = self._node_text(
                response, '//div[@class="photo-view"]/div/span[@class="download"]')
            item['download'] = download.strip('\n\t') if download is not None else None
            # URL of the image file
            item['image_url'] = response.xpath('//div[@class="huabu"]//img/@src').extract_first()
            # Hand the item over to the item pipelines.
            yield item
    
    
    

    3.管道文件

    # -*- coding: utf-8 -*-
    '''
    file: pipelines.py
    '''
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json
    import urllib.request
    import os
    
    class TupianprojectPipeline(object):
        """Store every scraped item as one JSON line and download its image.

        Items are appended to ``tupian.json`` (one JSON object per line) and
        the picture each item points to is saved under ``./people``.
        """

        def open_spider(self, spider):
            """Open the JSON output file when the spider starts."""
            self.fp = open('tupian.json', 'w', encoding='utf8')

        def process_item(self, item, spider):
            """Write *item* to the JSON file, download its image and return the item."""
            d = dict(item)
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            string = json.dumps(d, ensure_ascii=False)
            self.fp.write(string + '\n')

            # Download the picture itself.
            # NOTE(review): urlretrieve blocks while the file is transferred.
            self.download(item)
            return item

        def download(self, item):
            """Save the image referenced by ``item['image_url']`` into ``./people``.

            The file is named ``<title><extension>``, where the extension is
            taken from the path part of the URL. Items without an image URL
            are skipped. Errors raised by ``urlretrieve`` propagate to the
            caller.
            """
            from urllib.parse import urlsplit

            image_url = item.get('image_url')
            if not image_url:
                # Nothing to download for this item.
                return
            if image_url.startswith('//'):
                # Protocol-relative URL: urllib needs an explicit scheme.
                image_url = 'http:' + image_url

            dirname = './people'
            # urlretrieve does not create missing directories.
            os.makedirs(dirname, exist_ok=True)

            # Use only the URL path so a query string cannot leak into the name.
            url_path = urlsplit(image_url).path
            suffix = os.path.splitext(url_path)[1]

            title = (item.get('title') or '').strip()
            if not title:
                # Fall back to the file name used by the URL itself.
                title = os.path.splitext(os.path.basename(url_path))[0] or 'untitled'
            # Path separators in the title would point outside of dirname.
            for separator in ('/', '\\'):
                title = title.replace(separator, '_')

            filename = title + suffix
            filepath = os.path.join(dirname, filename)
            urllib.request.urlretrieve(image_url, filepath)

        def close_spider(self, spider):
            """Close the JSON output file when the spider finishes."""
            self.fp.close()
    
    
  • 相关阅读:
    微信小程序
    js
    js
    uni
    uni/微信小程序
    uni/微信小程序
    ES6...扩展运算符(数组或类数组对象)
    微信小程序
    微信小程序
    玩转storm
  • 原文地址:https://www.cnblogs.com/lpdeboke/p/12964947.html
Copyright © 2020-2023  润新知