• Python 日志处理(二) 使用正则表达式处理Nginx 日志


     使用正则表达式来处理Nginx 日志

    一、 

    先对单行的日志进行分组正则匹配,返回匹配后的结果(字典格式):

    from datetime import datetime
    import re
    
    # A single sample line from an Nginx access log.
    logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
    
    #对每行匹配正则,提取匹配后的字典
    def extract(line):
        """Parse one Nginx access-log line into a dict of named fields.

        The pattern expects the layout
        ``addr - - [time] "request" status size "referer" "user agent"`` and
        captures ``remote_addr``, ``datetime``, ``request``, ``status``,
        ``size`` and ``user_agent``, all as strings. The referer is matched
        but not captured.

        Raises:
            ValueError: if ``line`` does not start with that layout.
        """
        # Raw string: the \d, \[ and \] escapes must reach the regex engine intact.
        pattern = r'''(?P<remote_addr>[\d.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "(?:[^"]+)" "(?P<user_agent>[^"]+)"'''
        regex = re.compile(pattern)
        matcher = regex.match(line)
        if matcher is None:
            # Fail with a clear message instead of AttributeError on None.
            raise ValueError('log line does not match the expected format: %r' % (line,))
        return matcher.groupdict()
    #日志格式key与对应的处理函数
    
    #写入新字典,key,value
    
    
    # Print the dict of named groups extracted from the sample line.
    print(extract(logline))
    

      输出结果:

    {'request': 'GET /o2o/media.html?menu=3 HTTP/1.1', 'size': '16691', 'remote_addr': '183.60.212.153', 'status': '200', 'datetime': '19/Feb/2013:10:23:29 +0800', 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}
    

      

    二、

    上面返回结果中再对部分内容细分处理,比如以下四部分:

    'request': 'GET /o2o/media.html?menu=3 HTTP/1.1'
    'size': '16691'
    'status': '200'
    'datetime': '19/Feb/2013:10:23:29 +0800'

    request可以再细分请求方式(method),请求地址(url),协议版本(protocol)
    size可以直接转换成整数,而不是字符串
    status也可以转换为整数
    datetime可以转换成其它格式(2013-02-19 10:23:29+08:00)

    时间格式化解析字符串

    %a 星期几的英文缩写 Sun, Mon, ..., Sat
    %A 星期几的英文全拼 Sunday, Monday, ..., Saturday
    %w 星期几的数字表示格式,0是星期天,1是星期一...6是星期六
    %d 天 01, 02, ..., 31
    %b 月份的英文缩写 Jan, Feb, ..., Dec
    %Y 年份的4位的十进制整数 Year 0001, 0002, ..., 2013, 2014, ..., 9998, 9999
    %H 小时 Hour(24小时制) 00, 01, ..., 23
    %I 小时 Hour(12小时制) 01, 02, ..., 12
    %M 分钟的零填充的十进制整数 Minute(00, 01, ..., 59)
    %S 秒的零填充的十进制整数 Second(00, 01, ..., 59)
    %z 时区偏移 UTC时区偏移大小 (empty), +0000, -0400, +1030

    from datetime import datetime
    import re
    
    # A single sample line from an Nginx access log.
    logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
    
    #对每行匹配正则,提取匹配后的字典
    def extract(line):
        """Parse one Nginx access-log line into a dict of named fields.

        The pattern expects the layout
        ``addr - - [time] "request" status size "referer" "user agent"`` and
        captures ``remote_addr``, ``datetime``, ``request``, ``status``,
        ``size`` and ``user_agent``, all as strings. The referer is matched
        but not captured.

        Raises:
            ValueError: if ``line`` does not start with that layout.
        """
        # Raw string: the \d, \[ and \] escapes must reach the regex engine intact.
        pattern = r'''(?P<remote_addr>[\d.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "(?:[^"]+)" "(?P<user_agent>[^"]+)"'''
        regex = re.compile(pattern)
        matcher = regex.match(line)
        if matcher is None:
            # Fail with a clear message instead of AttributeError on None.
            raise ValueError('log line does not match the expected format: %r' % (line,))
        return matcher.groupdict()
    
    #对request分别切割成请求方式(method),请求地址(url),协议版本(protocol)
    def convert_request(request):
        """Split a request line into its ``method``, ``url`` and ``protocol`` parts.

        The string is split on whitespace and the pieces are paired, in order,
        with the three field names. Pairing stops at the shorter side, so a
        request with fewer than three pieces yields fewer keys and any extra
        pieces are dropped.
        """
        field_names = ('method', 'url', 'protocol')
        pieces = request.split()
        return {name: piece for name, piece in zip(field_names, pieces)}
    
    
    def convert_time(timestr):
        """Convert an access-log timestamp string into a ``datetime``.

        ``timestr`` is parsed with the format ``%d/%b/%Y:%H:%M:%S %z``, for
        example ``19/Feb/2013:10:23:29 +0800``. The ``%z`` directive makes the
        result carry the parsed UTC offset.
        """
        return datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z')
    
    #日志格式key与对应的处理函数,进一步对日志格式化处理 'request': 'GET /o2o/media.html?menu=3 HTTP/1.1'
    # Maps a captured field name to the callable that converts its raw string
    # value; fields without an entry are left for the caller to handle.
    log_format_func = {
        'request': convert_request,   # split into method / url / protocol
        'size': int,                  # numeric string -> int
        'status': int,                # numeric string -> int
        'datetime': convert_time,     # timestamp string -> datetime
    }
    
    # Build a new dict: convert each extracted field with its registered
    # handler, and keep the raw string when no handler is registered.
    d = {}
    for k, v in extract(logline).items():
        handler = log_format_func.get(k)
        d[k] = v if handler is None else handler(v)

    print(d)
    

      输出结果:

    {'request': {'method': 'GET', 'protocol': 'HTTP/1.1', 'url': '/o2o/media.html?menu=3'}, 'remote_addr': '183.60.212.153', 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 'size': 16691, 'status': 200, 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}
    

      

    三、

    request 和 datetime处理的函数再简写成lambda 表达式

    from datetime import datetime
    import re
    
    
    # A single sample line from an Nginx access log.
    logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
    
    def extract(line):
        """Parse one Nginx access-log line and convert its fields.

        The pattern expects the layout
        ``addr - - [time] "request" status size "referer" "user agent"`` and
        captures ``remote_addr``, ``datetime``, ``request``, ``status``,
        ``size`` and ``user_agent``. Each captured value is passed through the
        converter registered for its key in the module-level ``ops`` dict;
        keys without a converter keep the raw matched string.

        Raises:
            ValueError: if ``line`` does not start with that layout.
        """
        # Raw string: the \d, \[ and \] escapes must reach the regex engine intact.
        pattern = r'''(?P<remote_addr>[\d.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "[^"]+" "(?P<user_agent>[^"]+)"'''
        regex = re.compile(pattern)
        matcher = regex.match(line)
        if matcher is None:
            # ValueError is still caught by callers using ``except Exception``.
            raise ValueError('No match')
        return {k: ops.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}
    
    
    # Per-field converters looked up by extract(); a field with no entry here
    # keeps its raw matched string.
    ops = {
        # '19/Feb/2013:10:23:29 +0800' -> datetime carrying the parsed UTC offset
        'datetime': lambda timestr: datetime.strptime(
            timestr, "%d/%b/%Y:%H:%M:%S %z"
        ),
        # 'GET /path HTTP/1.1' -> {'method': ..., 'url': ..., 'protocol': ...}
        'request': lambda request: dict(
            zip(('method', 'url', 'protocol'), request.split())
        ),
        'status': int,
        'size': int,
    }
    
    if __name__ == '__main__':
        # Parse the sample line and print the converted record as a single dict.
        log_pro = extract(logline)
        print(log_pro)
        # Optional: uncomment to print each field on its own line as well.
        # for k, v in log_pro.items():
        #     print(k, v)

      输出结果(第一行为 print(log_pro) 的输出,其后各行为遍历 log_pro.items() 逐项打印的结果):

    {'remote_addr': '183.60.212.153', 'request': {'url': '/o2o/media.html?menu=3', 'method': 'GET', 'protocol': 'HTTP/1.1'}, 'status': 200, 'size': 16691, 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}
    remote_addr: 183.60.212.153
    request: {'url': '/o2o/media.html?menu=3', 'method': 'GET', 'protocol': 'HTTP/1.1'}
    status: 200
    size: 16691
    datetime: 2013-02-19 10:23:29+08:00
    user_agent: Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)
    

      

  • 相关阅读:
    冲刺周期第一天
    05构建之法阅读笔记之三
    第十周进度表
    问题账户需求分析
    2016年秋季个人阅读计划
    课后作业--1:《软件需求与分析》博文读后感
    《人月神话》阅读笔记--3
    《人月神话》阅读笔记--02
    《人月神话》阅读笔记--01
    个人总结
  • 原文地址:https://www.cnblogs.com/i-honey/p/7783664.html
Copyright © 2020-2023  润新知