• 五、基于hadoop的nginx访问日志分析--userAgent和spider


    useragent:

    代码(不包含蜘蛛):

    # cat top_10_useragent.py 
    #!/usr/bin/env python
    # coding=utf-8
    
    from mrjob.job import MRJob
    from mrjob.step import MRStep
    from nginx_accesslog_parser import NginxLineParser
    
    import heapq
    
    class UserAgent(MRJob):
        """Count requests per user-agent value and emit the most frequent ones.

        Step 1 maps every log line to ``(user_agent, 1)`` and sums the counts
        per user agent. Step 2 receives all ``(count, user_agent)`` pairs
        under a single key and emits the ``TOP_N`` largest of them.
        """

        # Number of entries emitted by the final step.
        TOP_N = 10

        # Created once, when the class body executes, and reached through
        # ``self`` by every mapper call.
        nginx_line_parser = NginxLineParser()

        def mapper(self, _, line):
            """Yield ``(user_agent, 1)`` for a log line that has a user agent.

            The input key is ignored. Lines for which the parser reports
            ``http_user_agent`` as ``None`` produce no output.
            """
            self.nginx_line_parser.parse(line)
            field_item = self.nginx_line_parser.http_user_agent
            if field_item is not None:
                yield field_item, 1

        def reducer_sum(self, key, values):
            """Sum the per-line counts of one user agent.

            The result is emitted under the constant key ``None`` so that the
            next step sees every ``(count, user_agent)`` pair in one reducer
            call and can rank them globally.
            """
            yield None, (sum(values), key)

        def reducer_top100(self, _, values):
            """Yield ``count, user_agent`` for the ``TOP_N`` highest counts.

            Entries are produced largest first. NOTE: the method name is
            historical and is kept so existing references keep working; the
            number of entries is ``TOP_N`` (10), not 100.
            """
            for count, user_agent in heapq.nlargest(self.TOP_N, values):
                yield count, user_agent

        def steps(self):
            """Return the two-step pipeline: count per user agent, then rank."""
            return [
                MRStep(mapper=self.mapper, reducer=self.reducer_sum),
                MRStep(reducer=self.reducer_top100),
            ]
    
    def main():
        """Start the job by calling ``UserAgent.run()``."""
        UserAgent.run()
    
    # Run the job only when this file is executed as a script.
    if __name__ == '__main__':
        main()

    结果:

    # python3 top_10_useragent.py access_all.log-20161227 
    No configs found; falling back on auto-configuration
    Creating temp directory /tmp/top_10_useragent.root.20161228.090725.308144
    Running step 1 of 2...
    Running step 2 of 2...
    Streaming final output from /tmp/top_10_useragent.root.20161228.090725.308144/output...
    85262    "IE"
    79611    "Chrome"
    48560    "Other"
    10662    "Firefox"
    7927    "Mobile Safari UI/WKWebView"
    7182    "Sogou Explorer"
    6681    "QQ Browser"
    1988    "Mobile Safari"
    1781    "Maxthon"
    1404    "Edge"
    Removing temp directory /tmp/top_10_useragent.root.20161228.090725.308144...

    蜘蛛代码(top_10_spider.py):

    #!/usr/bin/env python
    # coding=utf-8
    
    from mrjob.job import MRJob
    from mrjob.step import MRStep
    from nginx_accesslog_parser import NginxLineParser
    
    import heapq
    
    class Spider(MRJob):
        """Count requests per ``user_agent_type`` and emit the most frequent.

        Step 1 maps every log line to ``(user_agent_type, 1)`` and sums the
        counts per value. Step 2 receives all ``(count, user_agent_type)``
        pairs under a single key and emits the ``TOP_N`` largest of them.
        """

        # Number of entries emitted by the final step.
        TOP_N = 10

        # Created once, when the class body executes, and reached through
        # ``self`` by every mapper call.
        nginx_line_parser = NginxLineParser()

        def mapper(self, _, line):
            """Yield ``(user_agent_type, 1)`` for a line that has that field.

            The input key is ignored. Lines for which the parser reports
            ``user_agent_type`` as ``None`` produce no output.
            """
            self.nginx_line_parser.parse(line)
            field_item = self.nginx_line_parser.user_agent_type
            if field_item is not None:
                yield field_item, 1

        def reducer_sum(self, key, values):
            """Sum the per-line counts of one ``user_agent_type`` value.

            The result is emitted under the constant key ``None`` so that the
            next step sees every ``(count, user_agent_type)`` pair in one
            reducer call and can rank them globally.
            """
            yield None, (sum(values), key)

        def reducer_top100(self, _, values):
            """Yield ``count, user_agent_type`` for the ``TOP_N`` highest counts.

            Entries are produced largest first. NOTE: the method name is
            historical and is kept so existing references keep working; the
            number of entries is ``TOP_N`` (10), not 100.
            """
            for count, agent_type in heapq.nlargest(self.TOP_N, values):
                yield count, agent_type

        def steps(self):
            """Return the two-step pipeline: count per agent type, then rank."""
            return [
                MRStep(mapper=self.mapper, reducer=self.reducer_sum),
                MRStep(reducer=self.reducer_top100),
            ]
    
    def main():
        """Start the job by calling ``Spider.run()``."""
        Spider.run()
    
    # Run the job only when this file is executed as a script.
    if __name__ == '__main__':
        main()

    执行结果:

    # python3 top_10_spider.py access_all.log-20161227 
    No configs found; falling back on auto-configuration
    Creating temp directory /tmp/top_10_spider.root.20161228.091326.295972
    Running step 1 of 2...
    Running step 2 of 2...
    Streaming final output from /tmp/top_10_spider.root.20161228.091326.295972/output...
    33542    "magpie-crawler"
    25880    "Other"
    16578    "Sogou web spider"
    6383    "bingbot"
    3688    "Baiduspider"
    1487    "Yahoo! Slurp"
    1096    "JikeSpider"
    731    "YisouSpider"
    648    "Baiduspider-image"
    470    "Googlebot"
    Removing temp directory /tmp/top_10_spider.root.20161228.091326.295972...
  • 相关阅读:
    【转】可见性、原子性和有序性问题:并发编程Bug的源头
    实例详解 Java 死锁与破解死锁
    flutter 网络权限配置
    sqlserver pandas 日期
    Rust中mut, &, &mut的区别
    flutter, 在当前页刷新前一页
    django设置debug=false时静态文件丢失
    flutter dialog刷新
    pandas > polars
    python 读pdf
  • 原文地址:https://www.cnblogs.com/xiaoming279/p/6230237.html
Copyright © 2020-2023  润新知