• 用python通过apache log 获取百度搜索来源关键词


    apache log格式

    127.0.0.1 - - [24/Feb/2011:19:20:27 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=44200 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:28 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
    127.0.0.1 - - [24/Feb/2011:19:20:28 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=85596 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:29 +0800] "GET /seo/index.php HTTP/1.1" 200 844
    127.0.0.1 - - [24/Feb/2011:19:20:29 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=88069 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:30 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
    127.0.0.1 - - [24/Feb/2011:19:20:30 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=65456 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/index.php HTTP/1.1" 200 844
    127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=91624 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
    127.0.0.1 - - [24/Feb/2011:19:20:31 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=68220 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/index.php HTTP/1.1" 200 844
    127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=37909 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/index2.php HTTP/1.1" 200 1228
    127.0.0.1 - - [24/Feb/2011:19:20:32 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index2.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=53594 HTTP/1.1" 200 88
    127.0.0.1 - - [24/Feb/2011:19:20:33 +0800] "GET /seo/index.php HTTP/1.1" 200 844
    127.0.0.1 - - [24/Feb/2011:19:20:33 +0800] "GET /seo/t.php?pt=Jerry%20Qu%27s%20HTML%20document%20%u6D4B%u8BD5%u4E2D%u6587&pu=http%3A//localhost/seo/index.php&ref=http%3A//www.baidu.com/s%3Fbs%3Ddocument.url%26f%3D8%26wd%3Dphp+referer&wh=1280x800&pid=93BHPILMEB&rnd=32830 HTTP/1.1" 200 88

    python代码

    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse

    # Print the Baidu search keyword ("wd") found in the referrer of every
    # /seo/t.php tracking hit recorded in the Apache access log.
    log = "seo.log"

    # Log lines that are not /seo/t.php tracking hits are collected here.
    contents = []

    # Compiled once, outside the loop; the raw string keeps the pattern text
    # identical while avoiding invalid-escape warnings for "\/", "\." and "\?".
    p = re.compile(r'.*"GET (\/seo\/t\.php\?.*) HTTP\/1\.1".*', re.IGNORECASE)

    # "with" closes the log even if parsing a line raises.
    with open(log, 'r') as reader:
        for line in reader:
            m = p.match(line)
            if not m:
                contents.append(line)
                continue

            # Request target of the tracking hit, e.g. /seo/t.php?pt=...&ref=...
            res_file = m.group(1)
            cs = urlparse.urlparse(res_file)
            s_q = urlparse.parse_qs(cs.query, True)
            if 'ref' not in s_q:
                # Hit without a referrer parameter: nothing to extract.
                continue

            # The "ref" value is itself a URL; its query string holds the keyword.
            ref = urlparse.urlparse(str(s_q['ref'][0]))
            ref_wd = urlparse.parse_qs(ref.query, True)
            if 'wd' in ref_wd:
                # Referrers without a "wd" parameter are skipped instead of
                # aborting the whole run with a KeyError.
                print(ref_wd['wd'][0])

    如果要同时统计 Google、SOSO、Baidu 三个搜索引擎的来源关键词，代码如下

    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse

    # Apache access log to analyse; the handle is consumed and closed by the
    # main loop further down.
    log = "seo.log"
    reader = open(log, 'r')

    # Search engines to recognise in referrers:
    #   'h' - referrer host name, compared for equality in get_q()
    #   'q' - '|'-separated names of the query parameters holding the keyword
    config = {
        's0': {'h': 'www.google.com.hk', 'q': 'q'},
        's1': {'h': 'www.baidu.com', 'q': 'wd|word'},
        's3': {'h': 'www.soso.com', 'q': 'w'},
    }
    def get_q(x, engines=None):
        """Return the keyword parameter names for the referrer's search engine.

        Args:
            x: Object with a ``netloc`` attribute (the caller passes the result
                of ``urlparse.urlparse`` applied to the referrer URL).
            engines: Optional mapping shaped like the module-level ``config``
                table: each value is a dict with 'h' (host name) and 'q'
                ('|'-separated parameter names). Defaults to ``config``.

        Returns:
            The list of parameter names for the entry whose 'h' equals
            ``x.netloc``, or an empty list when no entry matches, so callers
            can iterate over the result unconditionally.
        """
        if engines is None:
            engines = config
        for engine in engines.values():
            if x.netloc == engine['h']:
                return engine['q'].split('|')
        # Unknown host: previously fell through and returned None, which made
        # "for k in get_q(ref)" raise TypeError.
        return []

    # For every /seo/t.php tracking hit, print "<referrer host>:::<keyword>"
    # when the referrer belongs to one of the configured search engines.

    # Compiled once, outside the loop; the raw string keeps the pattern text
    # identical while avoiding invalid-escape warnings for "\/", "\." and "\?".
    p = re.compile(r'.*"GET (\/seo\/t\.php\?.*) HTTP\/1\.1".*', re.IGNORECASE)

    try:
        for line in reader:
            m = p.match(line)
            if not m:
                continue

            # Query string of the tracking request itself.
            s_t_u = urlparse.urlparse(m.group(1))
            s_t_u_qs = urlparse.parse_qs(s_t_u.query, True)
            if 'ref' not in s_t_u_qs:
                # Hit without a referrer parameter: nothing to extract.
                continue

            # The "ref" value is itself a URL; parse it a second time.
            ref = urlparse.urlparse(str(s_t_u_qs['ref'][0]))
            ref_qs = urlparse.parse_qs(ref.query, True)

            # get_q() may yield nothing for hosts outside the configured
            # engines; "or ()" keeps the loop safe in that case.
            for k in get_q(ref) or ():
                if k in ref_qs:
                    print(ref.netloc + ":::" + ref_qs[k][0])
    finally:
        # Close the log even if a line fails to parse.
        reader.close()

    结果如下

    ---------- Python ----------
    www.baidu.com:::php referer
    www.baidu.com:::php referer
    www.baidu.com:::php referer
    www.baidu.com:::php referer
    www.baidu.com:::php referer
    www.baidu.com:::php referer
    www.soso.com:::js urlencode
    www.baidu.com:::php referer
    www.google.com.hk:::urldecode js

    输出完毕 (耗时 0 秒) - 正常终止

  • 相关阅读:
    我的博客
    【git】给文件重命名的简便方法
    【git】通过几次commit来认识工作区和暂存区
    2018年2月份面试题
    【git】建git仓库
    【git】git的安装和最小配置
    selenium WebDriver的实现原理
    什么是selenium
    selenium的安装
    monkey停不下来了怎么整
  • 原文地址:https://www.cnblogs.com/greengnn/p/1964137.html
Copyright © 2020-2023  润新知