使用爬虫抓取王者荣耀英雄皮肤

1：创建爬虫项目

scrapy startproject wzry

2：创建爬虫

scrapy genspider jishudaniu example.com

3：启动爬虫

scrapy crawl jishudaniu

# -*- coding: utf-8 -*-
import scrapy
import os
import urllib.request

class JishudaniuSpider(scrapy.Spider):
    """Spider that saves hero skin images found via the pvp.qq.com hero list.

    ``parse`` walks the hero list page and schedules one request per hero
    detail page. ``detial_parse`` reads the hero name, hero number and skin
    names from a detail page and downloads one image per skin into
    ``<image_root><hero name>/<skin name>.jpg``.
    """

    name = "jishudaniu"
    #allowed_domains = ["example.com"]

    # Starting point of the crawl.
    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

    # Root directory (with trailing separator) that receives one sub-directory
    # per hero. The backslashes are escaped on purpose: written as
    # "E:\wzryimg\" the final backslash escapes the closing quote and the
    # module fails to compile with a SyntaxError.
    image_root = "E:\\wzryimg\\"

    def parse(self, response):
        """Yield a request for every hero detail page linked from the list.

        Args:
            response: The downloaded hero list page.

        Yields:
            scrapy.Request objects whose callback is ``detial_parse``.
        """
        host_name = "https://pvp.qq.com/web201605/"
        # A leading "//" matches the element anywhere in the HTML document.
        hero_list = response.xpath(
            '//div[@class="herolist-box"]/div[@class="herolist-content"]/ul/li/a'
        )
        for link in hero_list:
            # "./" is relative to the current <a> node.
            hrefs = link.xpath('./@href').extract()
            if not hrefs:
                # An anchor without an href cannot be followed; skip it
                # instead of raising IndexError and aborting the whole list.
                continue
            detial_url = host_name + hrefs[0]
            yield scrapy.Request(detial_url, self.detial_parse)

    def detial_parse(self, response):
        """Download every skin image of the hero shown on a detail page.

        The hero name and number are parsed from the text of the tenth
        ``<script>`` element in ``<body>``; the parsing expects two
        comma-separated ``name = 'value'`` assignments (name first, number
        second). Skin names come from the ``data-imgname`` attribute, a
        ``|``-separated list whose entries are cut at the first ``&``.

        Pages that do not match this layout are logged and skipped rather
        than raising IndexError.

        NOTE(review): ``urllib.request.urlretrieve`` is a blocking call made
        from inside a Scrapy callback; consider an item pipeline if download
        throughput matters.

        Args:
            response: The downloaded hero detail page.
        """
        scripts = response.xpath('/html/body/script[10]/text()').extract()
        if not scripts:
            self.logger.warning("hero info script not found: %s", response.url)
            return

        fields = scripts[0].split(",")
        try:
            heroName = fields[0].replace("'", "").split(" = ")[1]
            heroNo = (
                fields[1].replace("'", "").replace(";", "").split(" = ")[1].strip()
            )
        except IndexError:
            self.logger.warning("unexpected hero info format: %s", response.url)
            return

        skin_attrs = response.xpath('//div[@class="pic-pf"]/ul/@data-imgname').extract()
        if not skin_attrs:
            self.logger.warning("skin list not found: %s", response.url)
            return
        # Each entry looks like "<skin name>&<something>"; keep only the name.
        skin_names = [entry.split("&")[0] for entry in skin_attrs[0].split("|")]

        heroSkinLinksTemplate = (
            "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/"
            f"{heroNo}/{heroNo}-bigskin-"
        )

        hero_dir = self.image_root + heroName
        # exist_ok avoids the separate exists() check and its race window.
        os.makedirs(hero_dir, exist_ok=True)

        # Skin images are numbered from 1 in the same order as the names.
        for index, skinname in enumerate(skin_names, start=1):
            fileName = os.path.join(hero_dir, skinname + ".jpg")
            skin_url = "{}{}.jpg".format(heroSkinLinksTemplate, index)
            self.logger.info("downloading %s", skin_url)
            try:
                urllib.request.urlretrieve(skin_url, filename=fileName)
            except OSError as err:
                # URLError/HTTPError derive from OSError; one failed image
                # should not prevent the remaining skins from being saved.
                self.logger.warning("failed to download %s: %s", skin_url, err)

相关阅读:
JAVA LinkedList和ArrayList的使用及性能分析
学习笔记—Node中的模块调试
学习笔记—Node的核心模块
学习笔记—Node中VM模块详解
学习笔记—Node中require的实现
入园了
【引用】asp.net服务器推送(ServerPush)和客户端拉拽技术
ajax xmlHttp.responseXML取不到值问题备忘
oracle实时插值速度突然变慢问题解决办法
[转帖作者： fuyuncat 来源： www.HelloDBA.com ]Oracle IO问题解析

原文地址：https://www.cnblogs.com/xqschool/p/14131026.html