Pandas
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import traceback
import math
class MSSQL:
host = "10.251.21.1XXX"
port = 3306
dbname = "AnyXXXX"
user = "root"
passwd = "AXXXXXX"
def connect_AS_DB():
try:
conn = create_engine ('mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(user=MSSQL.user,
password=MSSQL.passwd,
host=MSSQL.host,
port=int(MSSQL.port),
dbname=MSSQL.dbname),echo = False)
return conn
except Exception as e:
print("Connect to MSSQL exception:%s"% traceback.format_exc())
return None
if __name__ == '__main__':
#定义每次分割10行
lines=10
try:
conn =connect_AS_DB()
except Exception as E:
print(E)
df= pd.read_csv(r'C:\LocalTemp\Python 拆分大文件\操作日志.2022-4-9~2022-4-19.csv',iterator=False, error_bad_lines=True,encoding='utf-8',names=['DateTime', 'UserName', 'Catelog', 'Action', 'IP', 'Unknown','Record', 'Path', 'Client', 'obj_id'] )
# print(DataInfo_iterator["DateTime"])
for i in range(0, math.ceil(df.shape[0]/lines)):
DataInfo = df.iloc[i*lines:(i+1)*lines]
if conn:
try:
print(DataInfo["DateTime"])
print('*'*50+str(i)+'*'*50)
# DataInfo.to_sql('ASLog', conn, if_exists="append", index=False)
except Exception as e:
print(e)
else :
print("connect to sql error")
版本做了优化 支持多个文件的导入,同时解决第九列文件内含有“,”的问题
点击查看代码
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import traceback
import math
class MSSQL:
host = "10.251.21.XXX"
port = 3306
dbname = "Anyshare"
user = "root"
passwd = "AlXXX"
#链接Anyshare 日志数据库
def connect_AS_DB():
try:
conn = create_engine ('mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(user=MSSQL.user,
password=MSSQL.passwd,
host=MSSQL.host,
port=int(MSSQL.port),
dbname=MSSQL.dbname),echo = False)
return conn
except Exception as e:
print("Connect to MSSQL exception:%s"% traceback.format_exc())
return None
def Writeto_Sql(arr_Lists,columns_Num):
for arr in arr_Lists:
print(arr[0])
df=pd.DataFrame(arr,columns=columns_Num )
df.to_sql('ASLog', conn, if_exists="append", index=False)
# print(df)
#将以迭代返回列表的形式返回数据
def get_CleanFileInfo(File_list):
for file_Name in File_list:
with open(r'C:\LocalTemp\Python 拆分大文件\{file}'.format(file=file_Name),"r", encoding="utf-8") as f:
lines=f.readlines()
i=0
arr_List=[]
for line in lines:
i+=1
vals=line.split(',')
if len(vals)==10:
arr_List.append(vals)
# arr_List=vals
elif len(vals)>10:
arr_List.append(vals[:8]+[','.join(vals[8:-1])]+vals[-1:])
if i % 100==0: #数据库commit 需要消耗时间,将这里调整从1 调整到100后明显发现CPU 、内存消耗更多,速度也更快
yield(arr_List)
arr_List=[]
yield(arr_List)
if __name__ == '__main__':
try:
conn =connect_AS_DB()
except Exception as E:
print(E)
#数据有效列数
columns=['DateTime', 'UserName', 'Catelog', 'Action', 'IP', 'MAC','Record', 'Path', 'Client', 'obj_id']
#待导入文件列表
File_list=["操作日志.2022-1.csv","操作日志.2022-2.csv","操作日志.2022-3.csv","操作日志.2022-4-1~2022-4-3.csv","操作日志.2022-4-3~2022-4-9.csv","操作日志.2022-4-9~2022-4-19.csv"]
#获取返回后的文件信息列表,返回迭代器
infoline_iter=get_CleanFileInfo(File_list)
#将获取的信息写入数据库
Writeto_Sql(infoline_iter,columns)