#!/usr/bin/env python
"""Summarise tick-by-tick trade files that are stored as daily 7z archives.

Layout expected on disk (see ``path`` / ``pathsave`` below):

* ``path`` contains one sub-folder per month; each holds daily ``.7z`` files.
* Every archive unpacks to a single folder (named after the day) holding one
  text file per instrument.  Each record is ``time price vol`` separated by
  whitespace, with ``price`` and ``vol`` written as integers.

Running the script processes one month folder (``start_work``).
``do_one_day`` and ``do_one_file`` are manual helpers for data that has
already been unpacked.
"""
import os
import shutil
import time

import numpy as np
import pandas as pd

# Folder that holds the month folders of 7z tick data.
# NOTE(review): the literal was garbled in the reviewed copy (backslashes
# lost); reconstructed here - confirm against the real location.
path = r'G:\datas of status\tick-by-tick trade'
# Working folder: archives are unpacked here and result CSVs are written here.
pathsave = 'G:\\datas of status\\python codes\\'

# Exclusive upper bounds of the "xiao", "zhong" and "da" tiers, applied to the
# absolute trade amount (price * vol); anything at or above the last bound is
# "te_da".
_TIER_BOUNDS = (40000, 200000, 1000000)

# Order of the values returned by read_files().
_STAT_COLUMNS = [
    "vol", "amount",
    "z_xiao", "z_zhong", "z_da", "z_te_da",
    "f_xiao", "f_zhong", "f_da", "f_te_da",
    "re_min_L2", "time_l2",
]


def fun_time_l2(a, b):
    """Return 1 when ``a <= b`` (both compared as floats), otherwise 0."""
    return 1 if float(a) <= float(b) else 0


def _tier_sums(amount):
    """Sum signed amounts per size tier: [xiao, zhong, da, te_da].

    ``amount`` must not contain zeros; callers pass only the positive or only
    the negative amounts, so a zero amount lands in no tier.
    """
    size = amount.abs()
    small, medium, large = _TIER_BOUNDS
    return [
        amount[size < small].sum(),
        amount[(size >= small) & (size < medium)].sum(),
        amount[(size >= medium) & (size < large)].sum(),
        amount[size >= large].sum(),
    ]


def read_files(filename):
    """Parse one tick file and return its summary statistics.

    Lines containing the text ``find`` and blank lines are ignored.

    Returns a list ordered like ``_STAT_COLUMNS``:

    * ``vol`` / ``amount``: sums of ``|vol|`` and ``|price * vol|``.
    * ``z_*``: sums of the positive amounts per size tier.
    * ``f_*``: sums of the negative amounts per size tier (negative totals).
      (Presumably positive = buy and negative = sell - confirm with the
      data provider.)
    * ``re_min_L2``: percentage of total ``|vol|`` traded at a price below
      102 % of the file's lowest price (NaN when the total volume is 0).
    * ``time_l2``: number of time buckets (time string minus its last two
      characters) whose lowest price is at or below that 102 % level.

    Raises:
        ValueError: if the file holds no ``time price vol`` records.
    """
    print(filename)
    with open(filename, "r") as f:
        records = [line.split() for line in f if "find" not in line]
    # Splitting on any whitespace accepts tab- or space-separated files and
    # turns blank lines into empty records, which are dropped here.
    fields = pd.DataFrame([rec for rec in records if rec])
    if fields.shape[1] < 3:
        raise ValueError(
            "{}: no 'time price vol' records found".format(filename))

    tick_time = fields[0]
    # Explicit 64-bit integers: the platform default int is 32-bit on Windows
    # with NumPy < 2, and price * vol overflows that for large trades.
    price = fields[1].astype(np.int64)
    vol = fields[2].astype(np.int64)
    amount = price * vol

    vol_t = vol.abs().sum()
    amount_t = amount.abs().sum()
    z_xiao, z_zhong, z_da, z_te_da = _tier_sums(amount[amount > 0])
    f_xiao, f_zhong, f_da, f_te_da = _tier_sums(amount[amount < 0])

    # Share of volume traded within 2 % of the lowest price.
    min_2 = price.min() * 1.02
    sum_min_2_v = vol[price < min_2].abs().sum()
    re_min_L2 = sum_min_2_v / vol_t * 100

    # Lowest price per time bucket, computed on numeric prices in one pass.
    # (Taking the minimum of the raw strings would order "1030" before "999".)
    bucket = tick_time.str[:-2]
    bucket_low = price.groupby(bucket).min()
    time_l2 = (bucket_low <= min_2).sum()

    return [vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da,
            f_xiao, f_zhong, f_da, f_te_da, re_min_L2, time_l2]


def read_dirs(savedir):
    """Summarise every non-empty file in ``savedir``.

    Returns a DataFrame with a ``name`` column (file name without extension)
    followed by the ``read_files`` statistics; zero-byte files are reported
    and skipped.  The frame is empty (but has its columns) when nothing
    could be read.
    """
    rows = []
    for entry in os.listdir(savedir):
        file = os.path.join(savedir, entry)
        if not os.path.getsize(file):  # nothing to parse in a 0-byte file
            print("file siz = 0")
            print(file)
            continue
        stem = os.path.splitext(entry)[0]
        rows.append([stem] + read_files(file))
    return pd.DataFrame(rows, columns=["name"] + _STAT_COLUMNS)


def extract_files(filename):
    """Unpack one daily 7z archive, summarise it and remove the unpacked data.

    The archive's top-level folder name is used as the ``date`` column, which
    is inserted right after ``name``.

    Raises:
        ValueError: if the archive contains no entries.
    """
    # Imported here so the helpers that work on already-unpacked data can be
    # used without py7zr installed.
    import py7zr

    with py7zr.SevenZipFile(filename, 'r') as archive:
        allfiles = archive.getnames()  # names of the entries in the archive
        if not allfiles:
            raise ValueError("{}: archive is empty".format(filename))
        tempdir = allfiles[0].split("/")[0]  # top-level folder in the archive
        savedir = os.path.join(pathsave, str(tempdir))
        if os.path.exists(savedir):
            shutil.rmtree(savedir)  # drop leftovers from an earlier run
        os.makedirs(savedir, exist_ok=True)
        archive.extractall(pathsave)

    try:
        pdM2 = read_dirs(savedir)
    finally:
        # Always clean up the unpacked data, even when parsing fails.
        shutil.rmtree(savedir)
    pdM2.insert(1, "date", tempdir, allow_duplicates=False)
    return pdM2


def do_work(listD):
    """Process the daily archives in ``listD`` and write the result CSVs.

    Writes ``<pathsave>/everyday_data/<date>.csv`` per archive (sorted by
    ``time_l2``) and one ``<pathsave>/<yyyymm>.csv`` with all days combined,
    named after the first six characters of the first day's date.
    """
    daily_dir = os.path.join(pathsave, "everyday_data")
    os.makedirs(daily_dir, exist_ok=True)

    frames = []
    for filename in listD:
        print("=========")
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        pdD_t = extract_files(filename)
        if pdD_t.empty:  # archive held only empty files: nothing to save
            print(filename)
            continue
        save_dfile = os.path.join(daily_dir, pdD_t["date"].iloc[0] + ".csv")
        pdD_t = pdD_t.sort_values(by=['time_l2'], ascending=True)
        pdD_t.to_csv(save_dfile, sep=",", index=False, header=True)
        frames.append(pdD_t)
        print(filename)

    if not frames:
        return
    # One concat instead of DataFrame.append (removed in pandas 2.0).
    pdM_all = pd.concat(frames)
    month = frames[0]["date"].iloc[0][0:6]
    save_file = os.path.join(pathsave, month + ".csv")
    print(save_file)
    pdM_all.to_csv(save_file, sep=",", index=False, header=True)


def start_work(m=0, do_num=1):
    """Process ``do_num`` month folders, walking backwards from index ``m``.

    Folder ``m - n`` of ``os.listdir(path)`` is handled for
    ``n = 0 .. do_num - 1``; a negative index counts from the end of the
    listing.  (The original note mentions folders numbered 1~16 with
    16 = 202004 and 15 = 202003 - TODO confirm how that maps onto the index.)
    """
    month_names = np.array(os.listdir(path))  # month folders
    print(month_names)
    listM = np.array([os.path.join(path, name) for name in month_names])

    for n in range(do_num):
        i = m - n
        print(listM[i])
        day_names = np.array(os.listdir(listM[i]))  # daily archives
        print(day_names)
        listD = np.array([os.path.join(listM[i], name) for name in day_names])
        print(listD)
        do_work(listD)
        print(i)


def do_one_day(tempdir="20200718"):
    """Summarise one day whose archive is already unpacked under ``pathsave``.

    ``tempdir`` is the name of the unpacked folder and is also used as the
    date.  The result goes to ``<pathsave>/everyday_data/<tempdir>.csv``.
    """
    savedir = os.path.join(pathsave, tempdir)
    pdM2 = read_dirs(savedir)
    pdM2.insert(1, "date", tempdir, allow_duplicates=False)
    daily_dir = os.path.join(pathsave, "everyday_data")
    os.makedirs(daily_dir, exist_ok=True)
    save_dfile = os.path.join(daily_dir, tempdir + ".csv")
    pdM2 = pdM2.sort_values(by=['time_l2'], ascending=True)
    pdM2.to_csv(save_dfile, sep=",", index=False, header=True)


def do_one_file(file_name=None):
    """Print the statistics of a single tick file.

    Defaults to ``<pathsave>/20200714/SH600000.txt``.
    """
    if file_name is None:
        # NOTE(review): reconstructed from a garbled literal - confirm.
        file_name = os.path.join(pathsave, "20200714", "SH600000.txt")
    print(read_files(file_name))


if __name__ == "__main__":
    start_work()
# NOTE: single-threaded; the time-calculation part still needs optimizing.