• Python--爬取彼岸网站的图片


    爬取彼岸网站上图片信息,并将图片下载下来分类保存

    思路: 先获取不同类别的链接信息,再获取不同类别图片分页的链接,进入图片详情页面获取图片下载地址,下载图片并分类保存

    代码如下:

    # encoding:utf-8
    import os, time
    from urllib.parse import urljoin

    import requests
    from lxml import etree
    
    # Request headers sent with every HTTP call; the User-Agent is a desktop
    # Chrome identification string.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}
    # Site root: fetched as the home page and used as the prefix for the
    # hrefs/srcs scraped from the pages.
    base_url = "http://pic.netbian.com/"
    
    def send_request(url, timeout=10):
        """Fetch ``url`` with the shared request headers and return the raw body.

        Args:
            url: Absolute URL to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block indefinitely on a stalled
                connection.

        Returns:
            The response body as ``bytes``.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx response status.
        """
        # Pause before every request so the crawl is throttled.
        time.sleep(2)
        response = requests.get(url=url, headers=header, timeout=timeout)
        # Fail loudly on an error status instead of handing an error page to
        # the HTML parser or writing it to disk as an image.
        response.raise_for_status()
        return response.content
    
    def get_catrgories_link():
        """Crawl every category linked from the home page.

        Fetches ``base_url``, reads the anchors inside the
        ``classify clearfix`` div, and passes each anchor's title and absolute
        URL to ``name_link``.
        """
        resp = send_request(base_url)
        html = etree.HTML(resp)
        for anchor in html.xpath("//div[@class='classify clearfix']/a"):
            hrefs = anchor.xpath("./@href")
            titles = anchor.xpath("./@title")
            # Skip anchors lacking either attribute instead of raising
            # IndexError and aborting the whole crawl.
            if not hrefs or not titles:
                continue
            # urljoin resolves root-relative hrefs without producing a double
            # slash and leaves already-absolute hrefs intact.
            name_link(titles[0], urljoin(base_url, hrefs[0]))
    
    def name_link(dir_name, link, max_pages=4):
        """Walk the first pages of one category and process every listed image.

        Args:
            dir_name: Category title, passed through as the target folder name.
            link: URL of the category's first page; later pages are addressed
                as ``link + "index_<n>.html"``.
            max_pages: Number of listing pages to crawl (defaults to 4).
        """
        for page in range(1, max_pages + 1):
            # The first page lives at the category URL itself.
            url = link if page == 1 else link + "index_" + str(page) + ".html"
            resp = send_request(url)
            html = etree.HTML(resp)
            for item in html.xpath("//div[@class='slist']/ul/li"):
                names = item.xpath("./a/img/@alt")
                hrefs = item.xpath("./a/@href")
                # Skip list items that do not have the expected link/image
                # structure instead of raising IndexError.
                if not names or not hrefs:
                    continue
                # urljoin avoids the double slash that plain concatenation
                # produces for root-relative hrefs.
                img_name_url(dir_name, names[0], urljoin(base_url, hrefs[0]))
    
    def img_name_url(dir_name, img_name, img_link):
        """Open an image detail page, locate the image file, and download it.

        Args:
            dir_name: Category title, passed through as the target folder name.
            img_name: Image title, passed through as the file name.
            img_link: URL of the image's detail page.
        """
        resp = send_request(img_link)
        html = etree.HTML(resp)
        sources = html.xpath("//*[@id='img']/img/@src")
        if not sources:
            # No image element on the page: report and move on rather than
            # raising IndexError and aborting the whole crawl.
            print('未找到{}的图片地址, 已跳过'.format(img_name))
            return
        # urljoin avoids the double slash that plain concatenation produces
        # for root-relative srcs and keeps absolute srcs intact.
        download(dir_name, img_name, urljoin(base_url, sources[0]))
    
    # Running number of the image currently being downloaded (1-based).
    count = 1
    def download(dir_name, img_name, image_url):
        """Download one image and save it as ``彼岸图库/<dir_name>/<img_name>.jpg``.

        Args:
            dir_name: Category title used as the sub-folder name.
            img_name: Image title used as the file name; characters that are
                not allowed in file names are replaced with ``_``.
            image_url: Absolute URL of the image file.
        """
        global count
        path = os.path.join("彼岸图库", dir_name)
        if not os.path.exists(path):
            os.makedirs(path)
            # Report the folder that was created (the category), not the image.
            print('-------[{}]文件夹已经创建成功,开始下载图片-------'.format(dir_name))
        print('正在下载{}, 这是第{}张图片'.format(img_name, count))
        rep = send_request(image_url)
        # Image titles come from page markup and may contain path separators
        # or other characters that would make open() fail.
        safe_name = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in img_name)
        with open(os.path.join(path, safe_name + ".jpg"), 'wb') as f:
            f.write(rep)
        print('{}已经成功下载, 这是第{}张图片'.format(img_name, count))
        # Advance only after the success message so both messages for one
        # image show the same number.
        count += 1
    
    if __name__ == "__main__":
        # Start the crawl only when executed as a script, so importing this
        # module does not trigger network requests and file writes.
        get_catrgories_link()
  • 相关阅读:
    Bootstrap学习笔记系列2-------Bootstrap简单表格处理
    Bootstrap学习笔记系列1-------Bootstrap网格系统
    前端代码规范
    Dev TreeList设置焦点失败解决方法
    las数据集加载las数据
    c# 文件另存为代码
    Dev 饼图
    ASP.NET MVC Json的序列化和反序列化
    服务器重启后导致访问ArcServer地图服务须登录
    jQuery回调函数
  • 原文地址:https://www.cnblogs.com/zhouzetian/p/12814916.html
Copyright © 2020-2023  润新知