• urllib结合 concurrent.futures 多线程下载文件。


    示例:

    #!/usr/bin/env python3
    # -*- coding:utf-8 -*-
    #  @Time: 2020/12/16 10:42
    #  @Author:zhangmingda
    #  @File: urllib_multi_download.py
    #  @Software: PyCharm
    #  Description: 使用urllib 模块 实现多线程下载某个文件测试
    
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from urllib.request import urlopen
    from urllib.request import Request
    from urllib.request import quote
    import json
    import math
    import os
    
    class DownLoader(object):
        """Download a single HTTP(S) file in parallel using ranged requests.

        The file is split into fixed-size byte ranges, every range is fetched
        by a worker thread into its own part file (``<name>.<n>``, relative to
        the current working directory), and the parts are then concatenated
        into the final file.
        """

        def __init__(self, part_size=1024 * 1024 * 10, part_thread_num=10,
                     buffer_size=64 * 1024):
            """
            :param part_size: size in bytes of each ranged part (default 10 MiB)
            :param part_thread_num: number of worker threads in the pool
            :param buffer_size: bytes read per call while streaming data
            :raises ValueError: if any of the arguments is not positive
            """
            for arg_name, arg_value in (('part_size', part_size),
                                        ('part_thread_num', part_thread_num),
                                        ('buffer_size', buffer_size)):
                if arg_value <= 0:
                    raise ValueError('%s must be positive, got %r' % (arg_name, arg_value))
            self.part_size = part_size
            self.part_thread_num = part_thread_num
            self.BUFFER_SIZE = buffer_size

        def _copy_stream(self, src_fd, dst_fd):
            """Copy everything from src_fd to dst_fd in BUFFER_SIZE chunks."""
            # iter() with a sentinel stops at the first empty read (EOF).
            for data in iter(lambda: src_fd.read(self.BUFFER_SIZE), b''):
                dst_fd.write(data)

        def _plan_parts(self, finally_filename, length):
            """Return [(part_filename, offset, end_bytes), ...] covering length bytes.

            Both offset and end_bytes are inclusive, as required by the HTTP
            Range header. Parts are numbered from 1. A length of 0 yields an
            empty list.
            """
            parts = []
            for part_num, offset in enumerate(range(0, length, self.part_size), start=1):
                # The last part may be shorter than part_size.
                end_bytes = min(offset + self.part_size, length) - 1
                parts.append((finally_filename + '.' + str(part_num), offset, end_bytes))
            return parts

        def download_part(self, encode_url, part_filename, offset, end_bytes):
            """
            Download one byte range of the remote file into a part file.

            Errors raised by urlopen or by file I/O are not caught here.

            :param encode_url: URL-encoded network address
            :param part_filename: name of the local part file to write
            :param offset: first byte to download (inclusive)
            :param end_bytes: last byte to download (inclusive)
            :return: dict {part_filename: bool}; True when the size of the
                     written part file equals the size of the requested range
            """
            # Build the Range request header.
            range_header = {
                'Range': 'bytes=%s-%s' % (offset, end_bytes)
            }
            print(range_header)
            expected_file_size = end_bytes - offset + 1
            part_req = Request(encode_url, headers=range_header)
            with open(part_filename, 'wb') as local_part_fd:
                with urlopen(part_req) as req_fd:
                    self._copy_stream(req_fd, local_part_fd)

            # Verify the part by size; a mismatch is reported, not raised.
            actual_file_size = os.stat(part_filename).st_size
            cur_task_ret = expected_file_size == actual_file_size
            if cur_task_ret:
                print('%s 与预期块儿文件大小相符' % part_filename)
            else:
                print('%s 与预期块儿文件大小 不符,预期%s字节,实际得到%s 字节' % (
                    part_filename, expected_file_size, actual_file_size))

            return {part_filename: cur_task_ret}

        def download(self, url):
            """
            Download url into the current directory using parallel ranged requests.

            The local file name is the basename of url. Part files are removed
            only when the merged file has the expected size.

            :param url: address of the file to download (not yet URL-encoded)
            :raises ValueError: if no file name can be derived from url
            :raises RuntimeError: if the server sends no Content-Length, or if
                                  a part is missing or failed its size check
            """
            finally_filename = os.path.basename(url)
            if not finally_filename:
                raise ValueError('cannot derive a local file name from url: %s' % url)
            # Percent-encode the URL, keeping its reserved characters intact.
            encode_url = quote(url, safe=";/?:@&=+$,")
            print(encode_url)
            # Issue a request just to learn the total content length.
            req = Request(encode_url)
            with urlopen(req) as fp:
                print(fp.getheaders())
                length = fp.getheader('Content-Length')
            if length is None:
                # Without a total size the byte ranges cannot be planned.
                raise RuntimeError('%s: server did not send Content-Length' % finally_filename)
            length = int(length)
            print(type(length))
            print('length:', length)

            # Work out the byte range and part file name of every chunk.
            parts = self._plan_parts(finally_filename, length)
            chunk_count = len(parts)

            # Download all chunks in the pool; the context manager shuts the
            # pool down (and joins its threads) even if a chunk raises.
            multi_chunk_download_result = {}
            with ThreadPoolExecutor(max_workers=self.part_thread_num) as tp:
                thread_list = [tp.submit(self.download_part, encode_url, *part)
                               for part in parts]
                for part_thread in as_completed(thread_list):
                    multi_chunk_download_result.update(part_thread.result())

            # Summary
            print('下载总结')
            # Every planned chunk must have reported a result.
            if len(multi_chunk_download_result) != chunk_count:
                raise RuntimeError(
                    "%s part miss,expect=%d,actual=%d" % (finally_filename, chunk_count, len(multi_chunk_download_result)))
            # Every chunk must have passed its size check.
            for part_filename, part_ok in multi_chunk_download_result.items():
                if not part_ok:
                    raise RuntimeError("%s part download has fail" % part_filename)
            # All good: concatenate the parts in order into the final file.
            with open(finally_filename, 'wb') as local_fd:
                for part_filename, _offset, _end_bytes in parts:
                    with open(part_filename, 'rb') as part_fd:
                        self._copy_stream(part_fd, local_fd)

            final_size = os.stat(finally_filename).st_size
            if length == final_size:
                print('%s  下载完成,文件大小相符' % finally_filename)
                for part_filename in multi_chunk_download_result:
                    os.remove(part_filename)
            else:
                print('%s  下载完成,但大小不符,content_length:%s  下载后大小 %s' % (finally_filename, length, final_size))
    
    
    if __name__ == '__main__':
        # Demo run: fetch one sample file with the default downloader settings.
        url = 'https://ks3-cn-beijing.ksyun.com/zhangmingda/111-3333333.Python安装与命令行操作.mp4'
        print(url)
        DownLoader().download(url)
  • 相关阅读:
    常用设计模式
    文件上传相关报错: The current request is not a multipart request或is a MultipartResolver configured?
    Intellij IDEA 与 Gitlab 实现代码上传与下载
    Oracle两表关联,只取B表的第一条记录
    notepad++ 调整行间距
    Ubuntu18.04直接安装python3.7或者升级自带的python3.6版本之后导致终端无法打开的解决办法
    黑苹果之DELL台式机安装Mac OS X 10.13.6版本操作系统
    Windows Ping | Tracert 's Bat 脚本并行测试
    centos 7 修改 sshd | 禁止 root 登录及 sshd 端口脚本定义
    C语言中malloc函数的理解
  • 原文地址:https://www.cnblogs.com/zhangmingda/p/14144016.html
Copyright © 2020-2023  润新知