• Python: batch-downloading wallpapers, saving them, and building a wallpaper site with Flask


    Tip: replace the cookies and request headers below with fresh ones of your own, and create an img folder next to the script before downloading to hold the images.

    1. Fetch the wallpaper data

    """
    思路
    1. 请求网页,拿到源代码 ok   requests.get(网址)
    2. 将源代码解析成网页        lxml的etree.HTML(源代码)
    3. 从网页中提取数据         HTML.xpath('复制来的xpath/text()')
    4. 翻页,全站数据爬取       翻页一般修改url就可以实现
    5. 数据保存
    """
    from pprint import pprint
    import csv
    
    """
    1. 获取小图片页面的大图页面的地址
    2. 将域名拼接上去,构成大图片的页面地址
    3. 请求大图片的地址,将大图片的网址保存
    4. 下载图片
    """
    
    import requests
    from lxml import etree
    
    baseUrl = 'http://www.netbian.com'
    
    # Pretend to be a browser; replace the cookies and headers with your own
    cookies = {
        '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
        'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js',
        'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781',
        'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781',
    }
    
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Referer': 'http://www.netbian.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    }
    
    
    def page(url):
        res = requests.get(url, headers=headers, cookies=cookies)
    
        html = res.text
        HTML = etree.HTML(html)
    
        # Extract the detail-page href of each list item
        for i in range(1, 21):
            if i == 3:  # position 3 is not a normal wallpaper entry, skip it
                continue
    
            href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i))
    
            # Extract the thumbnail URL
            small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i))
    
            # Prepend the domain
            detail_url = baseUrl + href[0]
            # print(detail_url)
    
            # Request the detail page to get the full-size image URL
            detail = requests.get(detail_url, headers=headers, cookies=cookies)
            detail.encoding = "gbk"
            detail_html = detail.text
            detail_HTML = etree.HTML(detail_html)
    
            # Extract the full-size image URL
            big = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')
    
            # Extract the image title
            title = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title')
            print(title, small, big)
            data.append(
                {"title": title[0], "small": small[0], "big": big[0], "category": category}
            )
    
    
    if __name__ == '__main__':
        category = "动漫"
        data = []
        for p in range(2, 5):  # pages 2-4; page 1 would be the plain index.htm
            main_url = "http://www.netbian.com/dongman/"
            url = main_url + 'index_{}.htm'.format(p)
            page(url)
    
        pprint(data)
    
        # Save the data
        # 1. Build the header row
        header_list = ["title", "small", "big", "category"]
        # Open the output file
        with open("img_data_a.csv", 'w', encoding="utf-8-sig", newline="") as f:
            # Create a csv dict writer
            writer = csv.DictWriter(f, header_list)
            # Write the header row (if you switch to append mode 'a', write it on
            # the first run only and comment this line out afterwards)
            writer.writeheader()
            # Write the rows
            writer.writerows(data)

    2. Batch download

    # Imports (os is used below to create the output folder)
    import csv
    import os
    import requests
    import time
    
    # Pretend to be a browser; replace the cookies and headers with your own
    cookies = {
        '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
        'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js',
        'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781',
        'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781',
    }
    
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Referer': 'http://www.netbian.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    }
    
    
    def progressbar(url, filepath='./output.jpg'):  # filepath must include the file extension
        start = time.time()  # download start time
        response = requests.get(url, stream=True, headers=headers, cookies=cookies)  # stream=True is required
        size = 0  # bytes downloaded so far
        chunk_size = 1024  # bytes per chunk
        try:
            if response.status_code == 200:  # request succeeded
                content_size = int(response.headers['content-length'])  # total size in bytes
                print('Downloading, [file size]: {size:.2f} MB'.format(
                    size=content_size / chunk_size / 1024))
                with open(filepath, 'wb') as file:  # write chunks and draw a progress bar
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        print('\r' + '[progress]: %s%.2f%%' % (
                            '>' * int(size * 50 / content_size), float(size / content_size * 100)), end=' ')
            end = time.time()  # download end time
            print('Done! Elapsed: %.2f s' % (end - start))
        except Exception as e:
            print('Download failed:', e)  # do not swallow errors silently
    
    
    # Make sure the img output folder exists
    os.makedirs("img", exist_ok=True)
    
    # Open the CSV produced in step 1
    with open("img_data_a.csv", encoding="utf-8-sig", mode="r") as f:
        # Build a csv.DictReader over the open file
        reader = csv.DictReader(f)
    
        # Download each row
        for row in reader:
            title = row.get("title")
            big_url = row.get("big")
            print("Downloading:", title)
            # Plain download, no progress bar:
            # res = requests.get(big_url, headers=headers, cookies=cookies)
            # with open("img/{}.jpg".format(title), 'wb') as f:
            #     f.write(res.content)
    
            # Download with the progress bar; files land in the img folder
            progressbar(url=big_url, filepath="img/{}.jpg".format(title))

    3. Crawl wallpaper URLs for the whole site with a thread pool

    import requests
    from lxml import etree
    from concurrent.futures import ThreadPoolExecutor
    import csv
    
    baseUrl = 'http://www.netbian.com'
    
    # Pretend to be a browser; replace the cookies and headers with your own
    
    cookies = {
        '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
        'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042',
        'trenvecookieclassrecord': '^%^2C4^%^2C',
        'trenvecookieinforecord': '^%^2C4-14978^%^2C',
        'yjs_js_security_passport': '9376682f8d181fc1c094828cbcf9858097ffe69e_1652876557_js',
        'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652876558',
    }
    
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Referer': 'http://www.netbian.com/fengjing/index_3.htm',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    }
    
    
    def home_page(url, cat):
        res = requests.get(url, headers=headers, cookies=cookies)
        html = res.text
        HTML = etree.HTML(html)
        lis = HTML.xpath('//*[@id="main"]/div[3]/ul//li')
        # XPath li positions are 1-based, so cover all len(lis) items
        for i in range(1, len(lis) + 1):
            # Extract the detail-page href of each list item
            if i == 3:  # position 3 is not a normal wallpaper entry, skip it
                continue
    
            # Use the loop index, not a hardcoded position
            href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i))
            small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i))[0]
            # Prepend the domain
            detail_url = baseUrl + href[0]
            print(detail_url)
            detail_data = {'small': small}
    
            detail(detail_url, detail_data, cat)
    
    
    def detail(detail_url, detail_data, cat):
        res = requests.get(detail_url, headers=headers, cookies=cookies)
        res.encoding = "gbk"
        html = res.text
        HTML = etree.HTML(html)
        img_url = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')[0]
        title = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title')[0]
        detail_data["title"] = title
        detail_data["big"] = img_url
        detail_data['category'] = cat
        print(cat, title)
        data.append(detail_data)
    
    
    def job(url, cat):
        print(url)
        for page in range(1, 10000):  # keep going until a page fails to parse
            try:
                if page == 1:
                    home_page(url, cat)  # page 1 is the plain index page
                else:
                    url_next = url + 'index_{}.htm'.format(page)
                    home_page(url_next, cat)
            except Exception:
                print("No more pages")
                return
    
    
    def save():
        # Header row
        header_list = ["title", "category", "small", 'big']
        # newline='' avoids blank rows on Windows; utf-8-sig keeps Excel happy
        with open("img_data_all.csv", 'w', encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, header_list)
            writer.writeheader()
    
            # Write the rows
            writer.writerows(data)
    
    
    if __name__ == '__main__':
        category = [
            "/rili/",
            "/dongman/",
            "/fengjing/",
            "/meinv/",
            "/youxi/",
            "/yingshi/",
            "/dongtai/",
            "/weimei/",
            "/sheji/",
            "/keai/",
            "/qiche/",
            "/huahui/",
            "/dongwu/",
            "/jieri/",
            "/renwu/",
            "/meishi/",
            "/shuiguo/",
            "/jianzhu/",
            "/tiyu/",
            "/junshi/",
            "/feizhuliu/",
            "/qita/",
            "/s/wangzherongyao/",
            "/s/huyan/",
            "/s/lol/", ]
        data = []
        pool = ThreadPoolExecutor(50)
        for lei in category:
            url = baseUrl + lei
            # job(url, lei)  # single-threaded alternative
            # Pass the category into each job so threads don't race on a shared global
            pool.submit(job, url, lei)
        pool.shutdown()  # blocks until all submitted jobs finish
        print("-- crawl finished --".center(30, "*"))
        print("Writing CSV...")
        save()
        print("Done")

    4. Store the data in a database
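
    The original post leaves this step empty. Below is a minimal sketch of one way to do it with the standard-library sqlite3 module, assuming the img_data_all.csv produced in step 3; the file name wallpapers.db and the table name wallpaper are illustrative choices, not from the original.

    # A sketch, assuming img_data_all.csv from step 3; wallpapers.db and the
    # table name "wallpaper" are illustrative, not from the original post.
    import csv
    import sqlite3
    
    conn = sqlite3.connect("wallpapers.db")
    conn.execute("""
        CREATE TABLE IF NOT EXISTS wallpaper (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            category TEXT,
            small TEXT,
            big TEXT
        )
    """)
    
    # Read the crawled CSV and insert every row
    with open("img_data_all.csv", encoding="utf-8-sig") as f:
        rows = [(r["title"], r["category"], r["small"], r["big"])
                for r in csv.DictReader(f)]
    
    conn.executemany(
        "INSERT INTO wallpaper (title, category, small, big) VALUES (?, ?, ?, ?)",
        rows)
    conn.commit()
    conn.close()
    print("Inserted", len(rows), "rows")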

    5. Build the wallpaper site with Flask
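
    This step is also left open in the original. Below is a minimal sketch of a Flask app over the wallpapers.db from step 4; the route layout and the inline template are illustrative assumptions, not the original author's site.

    # A sketch, not the original site: a tiny Flask app serving thumbnails
    # from the wallpapers.db built in step 4.
    import sqlite3
    
    from flask import Flask, render_template_string
    
    app = Flask(__name__)
    
    # An inline template keeps the sketch self-contained
    PAGE = """
    <h1>Wallpapers</h1>
    {% if category %}<h2>{{ category }}</h2>{% endif %}
    {% for w in wallpapers %}
      <a href="{{ w['big'] }}" title="{{ w['title'] }}">
        <img src="{{ w['small'] }}" alt="{{ w['title'] }}" width="200">
      </a>
    {% endfor %}
    """
    
    def query(sql, args=()):
        # One connection per request; sqlite3.Row gives dict-style access
        conn = sqlite3.connect("wallpapers.db")
        conn.row_factory = sqlite3.Row
        rows = conn.execute(sql, args).fetchall()
        conn.close()
        return rows
    
    @app.route("/")
    def index():
        rows = query("SELECT * FROM wallpaper LIMIT 60")
        return render_template_string(PAGE, wallpapers=rows, category=None)
    
    @app.route("/category/<path:name>")
    def by_category(name):
        # The crawler stores categories like "/dongman/", so rebuild that form
        rows = query("SELECT * FROM wallpaper WHERE category = ? LIMIT 60",
                     ("/{}/".format(name.strip("/")),))
        return render_template_string(PAGE, wallpapers=rows, category=name)
    
    if __name__ == "__main__":
        app.run(debug=True)

    Note that netbian may reject hotlinked images that arrive without a matching Referer header; if thumbnails fail to load, serve the files downloaded into img/ instead, for example from Flask's static folder.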

  • Original article: https://www.cnblogs.com/zwnsyw/p/16295765.html