只记录大体思路和我认为其中需要记录的地方。
正则匹配
正则匹配的模式很难记忆,即使记住了,也很难一次写出没有错误的匹配模式。不过,可以借助网上一些提供实时匹配对比的网站(如 regexr.com)来编写和调试匹配模式。
代码示意:
import os
import re
# Compiled pattern for "get_imgpath" timing lines. re.VERBOSE makes the
# engine ignore the whitespace and "#" comments inside the pattern.
# findall() yields one 6-tuple per match:
#   [0] whole match      [1] timestamp      [2] text before the keyword
#   [3] elapsed time     [4] image dir      [5] image path
get_imgpath_regex = re.compile(r'''(
    (\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2},\d+)   # timestamp
    \s                                           # separator
    (.*)?get_imgpath\sused\stime\sis\s           # leading info + keyword
    (\d+\.\d+)                                   # elapsed time
    \sargs\sis\s\(u'                             # separator (literal paren)
    (.*?)',\)\sresult\sis\s                      # image directory
    (.*?\.jpg)                                   # image path
    )''', re.VERBOSE)

# Plain substring that marks a line as worth running the regex on.
get_imgpath_flag = 'get_imgpath'
def main():
    """Collect the parsed fields of every ``get_imgpath`` line in the logs.

    Iterates over ``infile_list`` and appends one dict per regex match to
    ``get_imgpath_match``. Both names are expected to be defined elsewhere
    in the file; they are not part of this snippet.
    """
    for infile in infile_list:
        # parse each line using the compiled regex
        with open(infile, "r") as file:
            for line in file:
                # Check for the keyword first ...
                if get_imgpath_flag not in line:
                    continue
                # ... then extract the fields captured by the pattern.
                for groups in get_imgpath_regex.findall(line):
                    get_imgpath_match.append({
                        'date': groups[1],
                        'cost_time': groups[3],
                        'img_dir': groups[4],
                        'img_path': groups[5],
                    })
参数配置
采取的方式是将参数存储在一个单独的文件中,如 config.json:
{
    "FLAG" : {
        "SAVE_SPILT_LOG_FILE_FLAG" : false ,
        "SAVE_MERGE_LOG_FILE_FLAG" : false ,
        "USE_CURRENT_PATH" : false
    },
    "PATH" : {
        "INPUT_LOG_FILE_PATH" : "E:\\zwk\\Code\\logger_read\\data\\pro_data" ,
        "SAVE_SPILT_MERGE_LOG_PATH" : "E:\\zwk\\Code\\logger_read\\output\\spilt_merge_log" ,
        "OUTPUT_RESULT_PATH" : "E:\\zwk\\Code\\logger_read\\output"
    },
    "PARAMETERS" : {
        "windows_size" : 2 ,
        "duplicate_times" : 1
    }
}
再对参数进行解析:
import json
def main():
    """Resolve the configured paths and pick up the PATH / FLAG sections.

    Reads the module-level ``config`` dict. When
    ``config['FLAG']['USE_CURRENT_PATH']`` is truthy, the three entries of
    ``config['PATH']`` are overwritten in place with locations derived from
    the current working directory.
    """
    # outfile_path = os.path.normpath("output/filtered")
    # NOTE(review): nothing in this snippet assigns `parameters`; the
    # declaration is kept in case the omitted remainder of main() does.
    global parameters
    if config['FLAG']['USE_CURRENT_PATH']:
        pwd = os.getcwd()
        config['PATH']['INPUT_LOG_FILE_PATH'] = os.path.join(pwd, 'data')
        config['PATH']['SAVE_SPILT_MERGE_LOG_PATH'] = os.path.join(pwd, 'output')
        config['PATH']['OUTPUT_RESULT_PATH'] = pwd
    # Unused within this snippet; presumably consumed by code omitted here.
    output_file_path = config['PATH']
    flag = config['FLAG']
if __name__ == '__main__':
    # Look for config.json in the directory that holds this script, not in
    # the current working directory.
    this_folder = os.path.dirname(os.path.abspath(__file__))
    config_file = os.path.join(this_folder, 'config.json')
    exists_check = os.path.isfile(config_file)
    if not exists_check:
        print('Error: loss of config file, Exit !!!')
        # Actually stop here; falling through would make open() below
        # raise FileNotFoundError right after announcing an exit.
        raise SystemExit(1)
    with open(config_file, 'r') as f:
        config = json.load(f)
    # improve it, use as global variable
    parameters = config['PARAMETERS']
    main()
目录、文件名
这里示例一种我认为还不错的做法,主要优点是不受操作系统影响:
# Build the config path from this script's own directory with os.path
# helpers instead of hard-coding a separator.
this_folder = os.path.dirname(os.path.abspath(__file__))
config_file = os.path.join(this_folder, 'config.json')
exists_check = os.path.isfile(config_file)
if not exists_check:
    print('Error: loss of config file, Exit !!!')
    # Stop here; otherwise open() below raises FileNotFoundError.
    raise SystemExit(1)
with open(config_file, 'r') as f:
    config = json.load(f)
# ... (intermediate code omitted) ...
# write output to files
# exist_ok=True: no error if the output directory already exists.
os.makedirs(outfile['SAVE_SPILT_MERGE_LOG_PATH'], exist_ok=True)