• Python: copying a directory tree with multiple processes and multiple threads


    Version 1: copying with shutil

      # -*- coding: utf-8 -*-
      # @author: Tele
      # @Time  : 2019/04/02 3:09 PM
      # To improve:
      # 1. Use raw I/O for the copy logic
      # 2. Copy large files with multiple threads inside each process


      import time
      import re
      import os
      import shutil
      import multiprocessing


      # Walk a directory tree and collect its files
      def walk_file(file):
          file_list = list()
          for root, dirs, files in os.walk(file):
              # Collect the files
              for f in files:
                  file_list.append(f)
          return file_list


      # Count the files under a directory
      def get_file_count(dir):
          return len(walk_file(dir))


      def copy(src, target, queue):
          target_number = 1
          if os.path.isdir(src):
              target_number = get_file_count(src)
              shutil.copytree(src, target)
          else:
              shutil.copyfile(src, target)
          # Report the number of copied files through the queue
          queue.put(target_number)

      def copy_dir(src, desc):
          total_number = get_file_count(src)
          # Normalize trailing separators
          src = check_separator(src)
          desc = check_separator(desc)
          # print("src:", src)
          # print("desc:", desc)

          file_dir_list = [src + "/" + i for i in os.listdir(src)]
          if os.path.exists(desc):
              shutil.rmtree(desc)
          pool = multiprocessing.Pool(3)

          # The queue must come from a Manager when used with a pool
          queue = multiprocessing.Manager().Queue()

          # One process per top-level file/directory
          for f_name in file_dir_list:
              # Strip "<drive>:/<dir>" from f_name and graft the rest onto desc
              target = desc + "/" + f_name[index_list("/", f_name)[1] + 1:]
              # print(target)
              # Create the parent directory of target
              parent_path = os.path.split(target)[0]
              if not os.path.exists(parent_path):
                  os.makedirs(parent_path)
              pool.apply_async(copy, args=(f_name, target, queue,))

          start = time.time()
          pool.close()
          # pool.join()  # not needed: the progress loop below blocks until all done
          count = 0
          while True:
              count += queue.get()
              # "%%" prints a literal %; "\r" returns to the start of the line
              # so each update overwrites the previous one
              print("\rCopy progress: %.2f %%" % (count * 100 / total_number), end="")
              if count >= total_number:
                  break
          end = time.time()
          print()
          print("Elapsed -----", (end - start), "s")


      # Find every index at which a character occurs in a string
      def index_list(c, s):
          return [i.start() for i in re.finditer(c, s)]


      # Strip a trailing "/" from the path, if present
      def check_separator(path):
          if path.rindex("/") == len(path) - 1:
              return path[0:path.rindex("/")]
          return path


      def main():
          copy_dir("f:/ftp_mypc/", "e:/ftp_mypc/")


      if __name__ == '__main__':
          main()

    This still has a few rough edges. Large files could be copied with multiple threads inside each process, but shutil gives us no way to split a file into byte ranges, which leads to version 2 below.
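    The idea version 2 builds on is that a raw file handle can be repositioned with seek(), so each worker can copy just its own byte range. Here is a minimal, single-threaded sketch of that idea (the names copy_range/copy_by_ranges, the 50 MB chunk size, and the separate handles per range are illustrative choices, not taken from the original code):

      import math
      import os

      CHUNK = 52428800  # 50 MB per range


      # Copy one byte range of src into dst at the same offset.
      # Opening separate handles per range means no lock is needed.
      def copy_range(src, dst, start, length, buffer=1024):
          with open(src, "rb") as rs, open(dst, "r+b") as ws:
              rs.seek(start)
              ws.seek(start)
              remaining = length
              while remaining > 0:
                  content = rs.read(min(buffer, remaining))
                  if not content:
                      break
                  ws.write(content)
                  remaining -= len(content)


      def copy_by_ranges(src, dst):
          size = os.path.getsize(src)
          # Pre-create dst at full size so every range can seek-and-write into it
          with open(dst, "wb") as f:
              f.truncate(size)
          for i in range(math.ceil(size / CHUNK)):
              start = i * CHUNK
              copy_range(src, dst, start, min(CHUNK, size - start))

    Each range could equally be submitted to a thread pool; version 2 instead shares one pair of handles among its threads, which is why it needs a lock.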

    Version 2:

      # -*- coding: utf-8 -*-
      # @author: Tele
      # @Time  : 2019/04/02 3:09 PM
      # Copy a directory with multiple processes; large files are copied with
      # multiple threads inside each process.
      # When the processes come from a pool, the message queue must be created
      # with multiprocessing.Manager().Queue()

      import time
      import re
      import os
      import shutil
      import multiprocessing
      import math
      from concurrent.futures import ThreadPoolExecutor, wait

      # Threshold for a "large" file: 209715200 bytes = 200 MB
      MAX_SINGLE_FILE_SIZE = 209715200
      mutex = multiprocessing.Lock()
      executor = ThreadPoolExecutor(max_workers=3)


      # Walk a directory tree and collect its files
      def walk_file(file):
          file_list = list()
          for root, dirs, files in os.walk(file):
              # Collect the files
              for f in files:
                  file_list.append(f)

              # Count empty directories as well
              for d in dirs:
                  if len(os.listdir(os.path.join(root, d))) == 0:
                      file_list.append(d)
          return file_list


      # Count the files (including empty directories)
      def get_file_count(dir):
          return len(walk_file(dir))

      def copy(src, target, queue):
          target_number = 1
          buffer = 1024
          # Directory: walk it and copy file by file
          if os.path.isdir(src):
              target_number = get_file_count(src)
              # Mapping a source path to a target path by swapping the drive
              # letter assumes src and target share the same path below the
              # drive, e.g. f:/ftp_mypc -> e:/ftp_mypc
              drive = os.path.splitdrive(target)[0]
              for root, dirs, files in os.walk(src):
                  # Copy the files
                  for f in files:
                      dst = drive + os.path.splitdrive(os.path.join(root, f))[1]
                      copy_single_file(buffer, os.path.join(root, f), dst)
                  # Recreate empty directories
                  for d in dirs:
                      dst = drive + os.path.splitdrive(os.path.join(root, d))[1]
                      # Make sure the directory hierarchy exists
                      if not os.path.exists(dst):
                          os.makedirs(dst)
          else:
              copy_single_file(buffer, src, target)
          # Report the number of copied files through the queue
          queue.put(target_number)

      # Copy a single file
      def copy_single_file(buffer, src, target):
          file_size = os.path.getsize(src)
          rs = open(src, "rb")

          # Make sure the parent directory of target exists
          parent_path = os.path.split(target)[0]
          if not os.path.exists(parent_path):
              os.makedirs(parent_path)

          ws = open(target, "wb")
          # Small files are read and written directly
          if file_size <= MAX_SINGLE_FILE_SIZE:
              while True:
                  content = rs.read(buffer)
                  ws.write(content)
                  if len(content) == 0:
                      break
              ws.flush()
          else:
              # Bytes copied by each thread: 52428800 bytes = 50 MB
              PER_THREAD_SIZE = 52428800
              # Build one task per chunk and submit it to the thread pool
              chunk_count = math.ceil(file_size / PER_THREAD_SIZE)
              task_list = list()
              for i in range(chunk_count):
                  byte_size = PER_THREAD_SIZE
                  # The last thread copies whatever remains
                  if i == chunk_count - 1:
                      byte_size = file_size - i * PER_THREAD_SIZE
                  start = i * PER_THREAD_SIZE
                  t = executor.submit(copy_file_thread, start, byte_size, rs, ws)
                  task_list.append(t)
              wait(task_list)
          if rs:
              rs.close()
          if ws:
              ws.close()

      # Thread worker: copy one byte range
      def copy_file_thread(start, byte_size, rs, ws):
          # The threads share one pair of file handles, so each thread must
          # hold the lock while it seeks and copies its whole chunk
          mutex.acquire()
          buffer = 1024
          count = 0
          rs.seek(start)
          ws.seek(start)
          while True:
              if count + buffer <= byte_size:
                  content = rs.read(buffer)
                  count += len(content)
                  write(content, ws)
              else:
                  content = rs.read(byte_size - count)
                  count += len(content)
                  write(content, ws)
                  break
          # global total_count
          # total_count += byte_size
          # print("\rCopy progress: %.2f %%" % (total_count * 100 / file_size), end="")
          mutex.release()


      def write(content, ws):
          ws.write(content)
          ws.flush()

      def copy_dir(src, desc):
          # Total number of items to copy (including empty directories)
          total_number = get_file_count(src)
          # Normalize trailing separators
          src = check_separator(src)
          desc = check_separator(desc)
          # print("src:", src)
          # print("desc:", desc)

          file_dir_list = [src + "/" + i for i in os.listdir(src)]
          if os.path.exists(desc):
              shutil.rmtree(desc)

          # Process pool
          pool = multiprocessing.Pool(3)

          # The queue must come from a Manager when used with a pool
          queue = multiprocessing.Manager().Queue()

          # One process per top-level file/directory
          for f_name in file_dir_list:
              target = os.path.splitdrive(desc)[0] + "/" + os.path.splitdrive(f_name)[1]
              # target = desc + "/" + f_name[index_list("/", f_name)[1] + 1:]
              # print(target)
              # Create the parent directory of target
              parent_path = os.path.split(target)[0]
              if not os.path.exists(parent_path):
                  os.makedirs(parent_path)
              pool.apply_async(copy, args=(f_name, target, queue))

          start = time.time()
          pool.close()
          # pool.join()  # not needed: the progress loop below blocks until all done
          count = 0
          while True:
              count += queue.get()
              # "%%" prints a literal %; "\r" returns to the start of the line
              # so each update overwrites the previous one
              print("\rProgress: %.2f %%" % (count * 100 / total_number), end="")
              if count >= total_number:
                  break

          executor.shutdown()
          end = time.time()
          print()
          print("Elapsed -----", (end - start), "s")


      # Find every index at which a character occurs in a string
      def index_list(c, s):
          return [i.start() for i in re.finditer(c, s)]


      # Strip a trailing "/" from the path, if present
      def check_separator(path):
          if path.rindex("/") == len(path) - 1:
              return path[0:path.rindex("/")]
          return path


      def main():
          copy_dir("f:/ftp_mypc/", "e:/ftp_mypc/")


      if __name__ == '__main__':
          main()
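    One detail in version 2 deserves emphasis: the header comment about the queue. A plain multiprocessing.Queue() cannot be passed to pool workers as an argument; pickling it raises RuntimeError, and with apply_async the error only surfaces when the result is fetched. A Manager().Queue() is a picklable proxy and works. A minimal sketch of the difference (illustrative only; the exact error text may vary across Python versions):

      import multiprocessing


      def worker(q):
          q.put(1)


      if __name__ == "__main__":
          pool = multiprocessing.Pool(2)

          # A Manager queue is a proxy object and survives pickling
          ok_queue = multiprocessing.Manager().Queue()
          ok_result = pool.apply_async(worker, args=(ok_queue,))
          ok_result.get()
          print("manager queue delivered:", ok_queue.get())

          # A plain Queue refuses to be pickled into a pool task;
          # apply_async hides the error until .get() is called
          bad_queue = multiprocessing.Queue()
          bad_result = pool.apply_async(worker, args=(bad_queue,))
          try:
              bad_result.get()
          except RuntimeError as e:
              print("plain queue failed:", e)

          pool.close()
          pool.join()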
  • Original post: https://www.cnblogs.com/tele-share/p/10656811.html