• Python crawler: scraping 10,000+ images of every kind, organized by category (version 2)


    #
    # date: 2021/01/16
    # author: eihouwang
    # Target site: http://www.netbian.com/
    # Environment: PyCharm 2020.3.2, Python 3.8
    # Modules (libraries) used: os, re, requests, urllib, time
    # Overall approach:
    # 1. Grab the category labels from the home page and create a folder for each, e.g. rili, dongman, fengjing
    # 2. Open one page of a single label and collect the list of image detail pages (e.g. pine forest road, pretty girl, beautiful winter); the pages in that list do not yet hold the high-resolution images
    # 3. Open each detail page, extract the URL of the actual 1920x1080 image and download it
    # 4. Save the image locally
    # 5. That completes the collection for one label
    # 6. Repeat for every label
    # Note: this crawler is single-threaded, so a full run takes several hours. Before running, make sure the folders used in the code do not already exist and let the code create them, otherwise images may end up in the wrong category
    # Only labels that can be downloaded without logging in were selected; labels that require login are not collected
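    #
    # Illustration of steps 1-3, using the label "rili" and a made-up image id (for orientation only, not from the original post):
    #   label "rili"                   -> folder E:\czxt\rili\ and label URL http://www.netbian.com/rili
    #   page 2 of that label           -> http://www.netbian.com/rili/index_2.htm
    #   thumbnail link /desk/23410.htm -> 1920x1080 detail page http://www.netbian.com/desk/23410-1920x1080.htm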

    import os
    import re
    import time
    import requests
    from urllib.parse import urljoin
    
    # Global variables
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0"}
    
    # Fetch a page and return its HTML (None on failure)
    def get_html(url):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r.text
        return None
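
    # A slightly hardened variant of get_html (a sketch, not in the original script; the name get_html_safe is chosen here):
    # a timeout plus a few retries keep one stalled request from hanging the whole run.
    def get_html_safe(url, retries=3):
        for _ in range(retries):
            try:
                r = requests.get(url, headers=headers, timeout=10)
                if r.status_code == 200:
                    return r.text
            except requests.RequestException:
                time.sleep(1)
        return None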
    
    # Get the list of category labels and create the matching folders (returns the sorted label URL list and folder list)
    def get_lable_list(url):
        html = get_html(url)
        result = re.findall('/em(.*?)LOL', html)
        result2 = re.findall('"/(.*?)/"', result[0])
        lable_list = []
        dirnames = []
        for i in result2:
            urlnew = urljoin(url, i)
            lable_list.append(urlnew)
            if "/" in i:
                i = i.replace("/", "\\")
            # print(i)
            path = "E:\\czxt\\" + i + "\\"
            # print(path)
            try:
                os.makedirs(path)
                # print("创建{}成功".format(path))
            except:
                pass
            dirnames.append(path)
        # print(sorted(lable_list));print(sorted(dirnames))
        return sorted(lable_list), sorted(dirnames)
    
    # Get the list of list pages for a single label
    def get_onelable_list(url):
        html = get_html(url)
        onelable_list = []
        try:
            totalpages = int(re.findall(r'/span(.*?)>(\d+)</a(.*?)class', html)[0][1])
        except:
            totalpages = 1
        print(totalpages)
        for i in range(1, totalpages + 1):
            if i == 1:
                urlnew = url
            else:
                urlnew = url + '/index_' + str(i) + '.htm'
            onelable_list.append(urlnew)
            print(urlnew)
        return onelable_list
    
    # Download the images linked from one list page
    def get_one_page(url, k=0, dirnames=[]):
        baseurl = "http://www.netbian.com/"
        html = get_html(url)
        result = re.findall('desk/(.*?).htm"', html)
        n = 0
        for i in result:
            urlnew = baseurl + "desk/" + i + "-1920x1080.htm"
            htmlnew = get_html(urlnew)
            try:
                pattern = re.compile('left(.*?)href="(.*?)"', re.S)
                picurl = pattern.findall(htmlnew)[0][1]
                path1 = picurl.split('/')[-1]
                path = dirnames[k] + path1
                if os.path.exists(path):
                    print('Already downloaded!')
                    continue
                # print(picurl)
                r = requests.get(picurl)
                store_one_pic(path, r.content)
                n += 1
                print(path, end="--->")
                print('Image ({}) downloaded'.format(n))
            except:
                pass
        print('Downloaded ({}) valid images from this page'.format(n))
    
    def store_one_pic(path, content):
        with open(path, 'wb') as f:
            f.write(content)
    
    def main():
        url = "http://www.netbian.com/"
        lable_list, dirnames = get_lable_list(url)
        print('--------------- Found {} category labels, downloading each label in turn ---------------'.format(len(lable_list)))
        time.sleep(3)
        num = 0
        l = 0
        for k in lable_list:
            num += 1
            onelable_list = get_onelable_list(k)
            print('Fetching label ({}), ({}) pages in total'.format(num, len(onelable_list)))
            num2 = 0
            for m in onelable_list:
                num2 += 1
                print('Fetching data for page ({})'.format(num2))
                get_one_page(m, k=l, dirnames=dirnames)
            l += 1
    
    if __name__ == '__main__':
        main()
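
The note at the top of the script points out that the crawler is single-threaded, so a full run takes several hours. As a rough sketch (not part of the original script; the function name download_label_parallel is made up here), the list pages of one label could be handed to a small thread pool from the standard library, reusing get_one_page unchanged:

    from concurrent.futures import ThreadPoolExecutor

    # Sketch: fetch the list pages of one label concurrently.
    # get_one_page keeps its original signature (url, k, dirnames); each page writes
    # to different files, so no extra locking is needed.
    def download_label_parallel(onelable_list, k, dirnames, workers=8):
        with ThreadPoolExecutor(max_workers=workers) as pool:
            for m in onelable_list:
                pool.submit(get_one_page, m, k=k, dirnames=dirnames)
        # Leaving the with-block waits for all submitted pages to finish.

The pool size only bounds how many pages are fetched at once; raising it too far mostly shifts the bottleneck to the server and risks getting the client throttled.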
• Original post: https://www.cnblogs.com/eihouwang/p/14288718.html