• python打造批量关键词排名查询工具


       自己做站点的时候,经常要查看收录情况和关键词排名,所以打造了这个批量关键词排名查询工具。

       

    #encoding:utf-8
    import urllib,re,random,time,sys,StringIO,socket
    try:
        import pycurl
    except:
        pass
    from bs4 import BeautifulSoup
    # Click-through-style weight assigned to each of the top-10 organic
    # Baidu positions; a host's total score is the sum of the weights of
    # every position it occupies.  The ten weights add up to ~100.
    score = {
        1: 28.56,
        2: 19.23,
        3: 10.20,
        4: 8.14,
        5: 7.50,
        6: 5.72,
        7: 4.01,
        8: 4.41,
        9: 5.53,
        10: 6.70,
    }
    
    #获取根域名,百度产品直接显示子域名
    def root_domain(url):
        """Return the registrable (root) domain of *url*.

        Baidu's own properties are reported with their full sub-domain, so
        any URL containing 'baidu.com' is returned unchanged.  Returns '-'
        when the host cannot be parsed.
        """
        if 'baidu.com' in url:
            return url
        try:
            host = url.replace('http://', '')
            # Two-level public suffixes keep three labels (foo.com.cn);
            # everything else keeps the last two labels (foo.com).
            # The original regex left its dots unescaped ('.' matched any
            # character), so it returned the wrong label for most hosts.
            labels = host.split('.')
            for suffix in ('.com.cn', '.org.cn', '.net.cn', '.gov.cn'):
                if suffix in host:
                    return '.'.join(labels[-3:])
            return '.'.join(labels[-2:])
        except Exception:
            return '-'
    
    def curl(url, debug=False, **kwargs):
        """Fetch *url* with pycurl and return the response body as a string.

        A random desktop User-Agent is chosen per call.  Extra pycurl
        options can be passed as keyword arguments named after the pycurl
        constant, e.g. curl(url, TIMEOUT=30).  On any error the request is
        retried forever unless debug=True, in which case the exception is
        re-raised.
        """
        user_agents = ['Mozilla/5.0 (Windows NT 5.1; rv:37.0) Gecko/20100101 Firefox/37.0','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36','Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36']
        user_agent = random.choice(user_agents)
        while 1:
            try:
                body = StringIO.StringIO()
                c = pycurl.Curl()
                c.setopt(pycurl.URL, url)
                c.setopt(pycurl.REFERER, url)
                c.setopt(pycurl.FOLLOWLOCATION, True)
                c.setopt(pycurl.TIMEOUT, 60)
                c.setopt(pycurl.ENCODING, 'gzip')
                c.setopt(pycurl.USERAGENT, user_agent)
                c.setopt(pycurl.NOSIGNAL, True)
                c.setopt(pycurl.WRITEFUNCTION, body.write)
                # Map keyword names onto pycurl option constants.
                for opt_name, opt_value in kwargs.items():
                    c.setopt(vars(pycurl)[opt_name], opt_value)
                c.perform()
                c.close()
                return body.getvalue()
            except Exception:
                # NOTE(review): unconditional retry can spin forever on a
                # permanently unreachable URL -- kept to preserve the
                # original best-effort behaviour.
                if debug:
                    raise
                continue
    
    
    def get_baidudata(keyword,rn):
        search_url = 'http://www.baidu.com/s?wd=%s&rn=%d'%(urllib.quote(keyword),rn)
        pagetext = curl(search_url)  #获取百度搜索结果源代码
    
        while 'http://verify.baidu.com' in pagetext: #判断 如果查询过程中出现验证码则提示并停止10分钟,然后重新查询
            print u"查询过程出现验证码,休息10分钟",keyword
            time.sleep(600)
            pagetext = curl(search_url)
        else:
            soup = BeautifulSoup(pagetext)
            data = soup.find_all("div",attrs={'class':'result c-container '})#提取自然排名结果
            return data
    
        return
    
    def get_rank_data(keyword,rn):
        """Return {root_domain: [position_score, ...]} for the top *rn*
        organic results of *keyword*."""
        data = get_baidudata(keyword,rn)  # organic result nodes
        items = {}
        for result in data:
            g = result.find_all("a",attrs={'class':'c-showurl'})  # display URL anchor
            if not g:
                continue
            site = re.search(r'([a-zA-Z0-9.-]+)',g[0].text)
            # group(1) replaces the original groups(1)[0] -- same value,
            # clearer intent.
            host = root_domain(site.group(1))
            # Baidu numbers organic results 1..rn in the div's id attribute.
            # NOTE(review): score only covers ranks 1-10, so rn > 10 would
            # raise KeyError here -- confirm rn is always <= 10.
            rank = int(result['id'])
            items.setdefault(host, []).append(score[rank])
        return items#返回单个词前十数据
    
    def get_keywords(filename):
        """Read *filename* and return its lines, whitespace-stripped, as a
        list.  Blank lines are kept as empty strings, matching the original
        readline-loop behaviour.  The file is closed even on error."""
        with open(filename, 'r') as kwfile:
            return [line.strip() for line in kwfile]
    
    def get_all_data(filename,rn):#单域名数据合并
        kw_list = get_keywords(filename)
        items = {}
        for i,kw in enumerate(kw_list,1):
            print i,kw
            item = get_rank_data(kw,rn)
            for host,rank in item.items():
                if host not in items.keys():
                    items[host] = rank
                else:
                    items[host].extend(rank)
        return items
    
    def get_score(filename,rn):
        """Aggregate all keyword data and append one CSV row per host to
        score.csv: host, keyword count, average score, total score.

        The two write() literals were corrupted in the published post (a
        raw line break where the '\\n' escape belongs); restored here.
        The file handle is now closed via `with`.
        """
        data = get_all_data(filename,rn)
        with open('score.csv','a+') as fh:
            fh.write('host,kws,average_score,host_score,\n')
            for host,rank_scores in data.items():
                if host is not None:
                    host = host.encode('utf-8')
                else:
                    host = 'error page'
                kws = len(rank_scores)          # number of keywords the host ranked for
                host_score = sum(rank_scores)   # total score (floats, so / is true division)
                average_score = host_score/kws  # mean score per keyword
                fh.write(host+','+str(kws)+','+str(average_score)+','+str(host_score)+'\n')
        return
    
    if __name__=="__main__":
        # NOTE(review): the filename is read but never used, and `file`
        # shadows the py2 builtin -- the post appears truncated here; the
        # original presumably continued with something like
        # get_score(file, 10).  Confirm against the full source.
        file=raw_input("请输入包含关键词的文件名:")
    

      

  • 相关阅读:
    构造回文的最小插入次数
    动态规划设计:最大子数组
    状态压缩:对动态规划进行降维打击
    团灭 LeetCode 股票买卖问题
    经典动态规划:戳气球
    贪心算法之区间调度问题
    微信分享 添加URL Schemes
    UITouch的用法
    HTTP协议详解
    经典SQL语句大全
  • 原文地址:https://www.cnblogs.com/68xi/p/9348395.html
Copyright © 2020-2023  润新知