Ubuntu 中 Apache2 的日志文件位于:
/var/log/apache2
代码:
# coding=utf-8
"""Summarise an Apache access log: bytes sent per remote address.

Usage: python <this script> /var/log/apache2/access.log
"""
import re
import sys

# Sample access-log line (Apache "combined" format):
# 127.0.0.1 - - [10/Jan/2017:10:08:16 +0800] "POST /cgi-bin/login.py HTTP/1.1" 200 335 "-" "curl/7.35.0"

# Verbose pattern for the leading fields of a log line. It must be a raw
# string so that \S, \s, \d and the escaped brackets reach the regex engine
# unchanged.
log_line_re = re.compile(r'''
    (?P<remote_address>\S+)     # IP address
    \s+                         # whitespace
    \S+                         # remote logname
    \s+                         # whitespace
    \S+                         # remote user
    \s+                         # whitespace
    \[[^\[\]]+\]                # time, e.g. [10/Jan/2017:10:08:16 +0800]
    \s+                         # whitespace
    "[^"]+"                     # first line of request
    \s+                         # whitespace
    (?P<status>\d+)             # HTTP status code
    \s+                         # whitespace
    (?P<bytes_sent>-|\d+)       # response size, "-" when nothing was sent
    \s*                         # optional trailing whitespace
''', re.VERBOSE)


def _unparsed_logline():
    """Return the placeholder record used for lines that cannot be parsed."""
    return {
        "remote_address": None,
        "status": None,
        "bytes_sent": "0",
    }


def dictify_logline_split(line):
    """Parse a log line by whitespace splitting (simple alternative parser).

    Relies on fixed field positions, so it only works for lines laid out
    like the sample above. Lines with fewer than ten fields yield the
    placeholder record instead of raising IndexError.

    Returns a dict with the string fields "remote_address", "status" and
    "bytes_sent".
    """
    fields = line.split()
    if len(fields) < 10:
        return _unparsed_logline()
    return {
        "remote_address": fields[0],
        "status": fields[8],
        "bytes_sent": fields[9],
    }


def dictify_logline(line):
    """Parse a log line with ``log_line_re``.

    Returns a dict with "remote_address", "status" and "bytes_sent" (all
    strings). A byte count of "-" is normalised to "0". Lines that do not
    match the pattern yield the placeholder record (None address and status,
    "0" bytes).
    """
    match = log_line_re.match(line)
    if match is None:
        return _unparsed_logline()
    fields = match.groupdict()
    if fields["bytes_sent"] == "-":
        fields["bytes_sent"] = "0"
    return fields


def generate_log_report(logfile):
    """Collect the bytes sent per remote address.

    ``logfile`` is any iterable of log lines (an open file works). Each
    parsed record is printed as it is read. Returns a dict mapping remote
    address to the list of byte counts seen for it, in input order. Records
    whose byte count is not an integer are skipped.
    """
    report_dict = {}
    for line in logfile:
        line_dict = dictify_logline(line)
        print(line_dict)
        try:
            bytes_sent = int(line_dict["bytes_sent"])
        except ValueError:
            continue
        # Accumulate the byte counts under the requesting IP address.
        report_dict.setdefault(line_dict["remote_address"], []).append(bytes_sent)
    return report_dict


def main():
    """Command-line entry point: print the report for the file in argv[1]."""
    if len(sys.argv) < 2:
        sys.exit(1)
    infile_name = sys.argv[1]
    try:
        infile = open(infile_name, 'r')
    except IOError:
        print('You must specify a valid file to parse')
        sys.exit(1)
    # The context manager closes the file even if parsing raises.
    with infile:
        log_report = generate_log_report(infile)
    print(log_report)


if __name__ == "__main__":
    main()
效果: