• readzip_add_maxL3 multithreading


    #!/usr/bin/env python
    import os
    import numpy as np
    import py7zr
    import shutil
    import pandas as pd
    import time
    import threading
    import multiprocessing
    #from threading import Thread, Lock
    # process 7z-compressed tick-by-tick trade data
    
    path = r'G:\datas of status\tick-by-tick trade'  # location of the raw tick data
    pathsave = 'G:\\datas of status\\python codes\\'  # location for temporary/output files

    pathTemp = r'G:\datas of status\python codes\everyday_data\temp'

    listM = np.array(os.listdir(path))  # list the month folders
    print(listM)
    listM = np.char.add(path + "\\", listM)  # full paths to the month folders
    
    
    def fun_time_l2(a, b):  # 1 if price a is at or below threshold b, else 0
        if float(a)<=float(b) :
            return 1
        else:
            return 0
    
    def read_files(filename):  # parse one tick file and compute its statistics
        #print(filename)
        df1 = pd.DataFrame()
        with open(filename, "r") as f:
            listT = []
            for line in f:
                listT.append(line)
            df1 = pd.DataFrame(listT)
    
        index = df1.loc[df1[0].str.contains("find")].index
        if len(index) > 0:  # drop non-data lines containing "find"
            df1 = df1.drop(index=index)
        # print(df1[13870:13890])
    
        df1 = pd.DataFrame(df1[0].str.strip())
        # print(df1)
        df1 = pd.DataFrame(df1[0].str.split("\t", expand=True))  # tab-separated columns: time, price, vol
        # print(df1[1].str.strip())
        # print(df1[2].str.strip())
        # print(df1[1].astype("int")*df1[2].astype("int"))
    
        df1[3] = df1[1].astype("int") * df1[2].astype("int")  # amount = price * vol
        df1.columns = ["time", "price", "vol", "amount"]
        vol_t = abs(df1["vol"].astype("int64")).sum()
        amount_t = abs(df1["amount"].astype("int64")).sum()
    
        # sell-side trades (negative amount) bucketed by size
        df_f_xiao = df1[(df1["amount"].astype("int") < 0) & (df1["amount"].astype("int") > -40000)]
        df_f_zhong = df1[(df1["amount"].astype("int") <= -40000) & (df1["amount"].astype("int") > -200000)]
        df_f_da = df1[(df1["amount"].astype("int") <= -200000) & (df1["amount"].astype("int") > -1000000)]
        df_f_te_da = df1[(df1["amount"].astype("int") <= -1000000)]

        f_xiao = df_f_xiao["amount"].astype("int64").sum()
        f_zhong = df_f_zhong["amount"].astype("int64").sum()
        f_da = df_f_da["amount"].astype("int64").sum()
        f_te_da = df_f_te_da["amount"].astype("int64").sum()
    
        # buy-side trades (positive amount) bucketed by size
        df_z_xiao = df1[(df1["amount"].astype("int") > 0) & (df1["amount"].astype("int") < 40000)]
        df_z_zhong = df1[(df1["amount"].astype("int") >= 40000) & (df1["amount"].astype("int") < 200000)]
        df_z_da = df1[(df1["amount"].astype("int") >= 200000) & (df1["amount"].astype("int") < 1000000)]
        df_z_te_da = df1[(df1["amount"].astype("int") >= 1000000)]

        z_xiao = df_z_xiao["amount"].astype("int64").sum()
        z_zhong = df_z_zhong["amount"].astype("int64").sum()
        z_da = df_z_da["amount"].astype("int64").sum()
        z_te_da = df_z_te_da["amount"].astype("int64").sum()
    
        # added: compute the daily low and the volume traded near it
    
        min_L = df1["price"].astype("int").min()
        sum_V = abs(df1["vol"].astype("int")).sum()
        min_2 = min_L * 1.02
    
        df_min_2 = df1[(df1["price"].astype("int") < min_2)]
    
        sum_min_2_v = abs(df_min_2["vol"].astype("int64")).sum()
        re_min_L2 = abs(sum_min_2_v) / sum_V * 100
    
        # added: count the minutes whose low price stays within 2% of the daily low
        df_time_all = pd.DataFrame()
        df_time_all["time"] = df1["time"].str[:-2]
        df_time_all["price"] = df1["price"]
    
        df_time_all_only = df_time_all.drop_duplicates(subset=['time'], keep='first', inplace=False)
        df_time_all_only = df_time_all_only.reset_index(drop=True)
        for time_do in df_time_all_only["time"]:  # per-minute low price (one scan per minute; slow)
            df_time_t = df_time_all[df_time_all["time"] == time_do]
            df_time_all_only.loc[df_time_all_only["time"] == time_do, "price"] = df_time_t["price"].min()
    
        df_time_all_only["add_times"] = df_time_all_only["price"].apply(lambda x: fun_time_l2(x, min_2))
        time_l2 = df_time_all_only["add_times"].sum()
        # print()
    
        # print(re_min_L2)
    
        # print(sum_V)
        # sum_V = abs(df1[2]).sum()
        # min_2 = min_L * 1.02
        # print(min_2)
    
        # print(sum_V)
    
        '''
        print(vol_t)
        print(amount_t)
    
        print(f_xiao)
        print(f_zhong)
        print(f_da )
        print(f_te_da)
        print(z_xiao)
        print(z_zhong)
        print(z_da )
        print(z_te_da)
        '''
        list_return = [vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da, f_xiao, f_zhong, f_da, f_te_da, re_min_L2, time_l2]
        return list_return
    
    
    
    
    
    
    
    
    #tempname=r'G:\datas of status\python codes\20200428\SH600000.txt'
    #read_files(tempname)
    
    def run(df_only_name1, semaphore):  # worker: parse a batch of tick files and write one temp CSV per thread
    
    
        semaphore.acquire()  # acquire a slot (limits concurrent threads)
    
        list_t1 = []
    
    
    
        for file in df_only_name1:
            (filepath, tempfilename) = os.path.split(file)
            (filename, extension) = os.path.splitext(tempfilename)
    
            if not os.path.getsize(file):  # skip empty files
                print("file size = 0")
                print(file)
            else:
                list_t = read_files(file)
                #print("hah")
                list_t.insert(0, filename)
                list_t1.append(list_t)
    
    
        #print(df_only_name1[0])
    
        #file_p = os.path.split(file_t)
        #print(str(threading.currentThread().ident))
        save_dfile = pathTemp + "\\" + str(threading.current_thread().ident) + ".csv"  # one temp CSV per worker thread
    
    
        npM = pd.DataFrame(list_t1)
        print(save_dfile)
    
    
    
        npM.columns = ["name", "vol", "amount", "z_xiao", "z_zhong", "z_da", "z_te_da", "f_xiao", "f_zhong", "f_da","f_te_da", "re_min_L2", "time_l2"]
    
        #print(save_dfile)
        #print(npM)
        npM.to_csv(save_dfile,sep=",",index=False,header=True)
        #print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    
    
        semaphore.release()  # release the slot
    
    
    
    
    
    def read_dirs(savedir):  # process every tick file in one day's folder
        files = np.array(os.listdir(savedir))
        file_names = np.char.add(savedir + "\\", files)
        listdir_return = []

        if os.path.exists(pathTemp):
            shutil.rmtree(pathTemp)  # remove any leftover temp folder
        os.mkdir(pathTemp)  # recreate it
    
        #========
        all_nums = len(file_names)
        every_batch = 1
        epochs = int(all_nums / every_batch)
        num_of_thread = 303
        # num = 1
        semaphore = threading.BoundedSemaphore(num_of_thread)  # allow at most num_of_thread threads to run concurrently
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        #print(epochs + 1)
        for i in range(epochs ):
            begin = i * every_batch
            end = begin + every_batch
    
            if all_nums <= end:
                end = all_nums
                #i=i+2
            df_only_name1 = file_names[begin:end]
    
    
            t = threading.Thread(target=run, args=(df_only_name1, semaphore))
            t.start()
            # print(i)
    
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        #============
    
    
        '''
        for file in file_names:
            (filepath, tempfilename) = os.path.split(file)
            (filename, extension) = os.path.splitext(tempfilename)
    
            if not os.path.getsize(file):  # check whether the file is empty
                print("file siz = 0")
                print(file)
            else:
                list_t = read_files(file)
                list_t.insert(0,filename)
                listdir_return.append(list_t)
        '''
        #=====================================
    
        while threading.active_count() != 1:
            print(threading.active_count())
    
            time.sleep(10)
            pass  # print threading.active_count()
        else:
            print('-----all threads done-----')
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    
    
        #=======================
        #print(listdir_return)
        exit(0)  # NOTE: stops here; the per-thread CSVs in pathTemp are the output, so the merge code below never runs
        npM = pd.DataFrame(listdir_return)
        npM.columns = ["name","vol","amount","z_xiao","z_zhong","z_da","z_te_da","f_xiao","f_zhong","f_da","f_te_da","re_min_L2","time_l2"]
        return npM
        #print(npM)
    
    def extract_files(filename):  # extract a 7z archive and process its contents
        with py7zr.SevenZipFile(filename, 'r') as archive:
            allfiles = archive.getnames()  # names of the files inside the archive
            #print(allfiles)
            tempdir = allfiles[0].split("/")[0]  # top-level folder inside the archive (the date)
            #print(tempdir)
            savedir = pathsave + str(tempdir)
            #print(pathsave)
            if os.path.exists(savedir):
                shutil.rmtree(savedir)  # remove any existing folder with the same name
            os.mkdir(savedir)  # recreate it
            #archive.extract(pathsave,allfiles[0:3])  # extract only part of the archive
            archive.extractall(pathsave)  # extract everything into pathsave
            #print(archive.extractall())
            pdM2 = read_dirs(savedir)
    
            shutil.rmtree(savedir)
            pdM2.insert(1,"date",tempdir,allow_duplicates=False)
            #print(pdM2)
            return pdM2
    
    
    
    
    
    def do_work(listD):  # process every daily 7z archive in listD, writing per-day and per-month CSVs
        pdM_all = pd.DataFrame(
            columns=["name", "date", "vol", "amount", "z_xiao", "z_zhong", "z_da", "z_te_da", "f_xiao", "f_zhong", "f_da",
                     "f_te_da","re_min_L2","time_l2"])
        for filename in listD:
            #filename = listD[0]
            print("=========")
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            pdD_t = extract_files(filename)
            #print(pdD_t["date"][0])
            save_dfile = pathsave + "\\" + "everyday_data" + "\\" + pdD_t["date"][0] + ".csv"
            #print(save_dfile)
            pdD_t = pdD_t.sort_values(by=['time_l2'], ascending=True)
            pdD_t.to_csv(save_dfile,sep=",",index=False,header=True)
            pdM_all = pd.concat([pdM_all, pdD_t])  # DataFrame.append was removed in newer pandas
            print(filename)
        #print(pdM_all)
        save_file = pathsave + str(pdM_all["date"].iloc[0])[0:6] + ".csv"  # one CSV per month (YYYYMM)
        print(save_file)
        #df.to_csv('/opt/births1880.csv', index=False, header=False)
        #pdM_all = pdM_all.sort_values(by=['re_min_L2'], ascending=True)
        pdM_all.to_csv(save_file, sep=",", index=False, header=True)
    
    
    
    
    def start_work():
        m = 0  # index of the month folder to start from (1~16, 16=202004, 15=202003)
        do_num = 1
        for n in range(do_num):
    
            i = m - n  # index of the month folder to process (1~16)
            print(listM[i])
            listD = np.array(os.listdir(listM[i]))  # all daily archives in that month folder

            print(listD)
            listD = np.char.add(listM[i] + "\\", listD)  # full paths to the daily archives
    
            print(listD)
            do_work(listD)
            print(i)
    #start_work()
    # below: process a single day's data that has already been extracted
    def do_one_day():
        tempdir = "20200718"#某天数据已解压的文件夹
        savedir = pathsave + tempdir
    
        pdM2 = read_dirs(savedir)
    
        pdM2.insert(1, "date", tempdir, allow_duplicates=False)
    
    
        save_dfile = pathsave + "\\" + "everyday_data" + "\\" + tempdir + ".csv"
        #save_dfile = pathsave + "\\" + "everyday_data" + "\\" + "20200710" + ".csv"
        # print(save_dfile)
        pdM2 = pdM2.sort_values(by=['time_l2'],ascending=True)
        pdM2.to_csv(save_dfile, sep=",", index=False, header=True)
    
    
    
    do_one_day()
    
    
    def do_one_file():
        file_name = "G:\datas of status\python codes\20200714\SH600000.txt"
        print(read_files(file_name))
    
    
    #do_one_file()
    

      Multithreaded; the per-minute time calculation can still be optimized.
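      A minimal sketch of that optimization, assuming df1 and min_2 are exactly the values computed earlier in read_files (the names "minute" and "low" are introduced here only for illustration): the per-minute loop over df_time_all_only can usually be collapsed into a single groupby.

        # hedged sketch: vectorised replacement for the per-minute loop in read_files
        # assumes df1 (string columns time/price/vol/amount) and min_2 (2% above the daily low) already exist
        minute = df1["time"].str[:-2]                              # drop the seconds, as in the original str[:-2]
        low = df1["price"].astype("int64").groupby(minute).min()   # lowest price within each minute
        time_l2 = (low <= min_2).sum()                             # minutes whose low stays within 2% of the daily low

      Since the per-file work is CPU-bound pandas code, the threads above largely serialise on the GIL; swapping the threading.Thread workers for a multiprocessing.Pool (the module is already imported but unused) is another plausible optimization, though that is an assumption about where the time goes rather than something measured here.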

  • Original article: https://www.cnblogs.com/rongye/p/13338557.html