• 百度词汇检索,计算PMI值


    '''词汇检索百度返回值,并且计算PMI值的类'''
    from bs4 import BeautifulSoup
    import requests
    import re
    import pandas as pd
    import time
    import numpy as np
    
    class PMI():
        """Query Baidu for result counts and derive a PMI-style score per word.

        The score of a word is ``log10(n_pf / (n_p * n_f))`` where ``n_f`` is the
        result count reported for the word alone, ``n_pf`` the count reported for
        the anchor word and the word searched together, and ``n_p`` a fixed count
        assigned to the anchor word.
        """

        # One digit followed by any mix of digits and thousands separators,
        # e.g. '100,000,000'.  Starting on a digit avoids matching a lone comma.
        _COUNT_RE = re.compile(r'\d[\d,]*')

        def __init__(self):
            # Search endpoint; the query text is appended directly after 'wd='.
            self.url = 'https://www.baidu.com/s?wd='

        def getHtml(self, url, timeout=10):
            """Download ``url`` and return the response body decoded as UTF-8.

            A browser-like User-Agent is sent because, per the original author's
            note, Baidu does not return the wanted content to requests that
            identify themselves as coming from Python.

            Args:
                url: Complete URL to fetch.
                timeout: Seconds to wait for the server before giving up, so a
                    stalled connection cannot block the run indefinitely.

            Returns:
                The page text, or ``None`` when the request failed (a message is
                printed in that case).
            """
            header = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
            }
            try:
                r = requests.get(url, headers=header, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException as exc:
                # Only network/HTTP failures are treated as "crawl failed";
                # anything else (KeyboardInterrupt, coding errors) propagates.
                print('爬取失败', exc)
                return None
            r.encoding = 'utf-8'
            return r.text

        @staticmethod
        def _parse_count(text):
            """Return the first comma-grouped number found in ``text`` as an int.

            Raises:
                ValueError: if ``text`` is empty/None or contains no digits.
            """
            match = PMI._COUNT_RE.search(text or '')
            if match is None:
                raise ValueError('no result count found in %r' % (text,))
            return int(match.group(0).replace(',', ''))

        def getNum(self, html):
            """Extract the reported number of search results from a result page.

            The count is read from the first ``<span class="nums_text">`` element,
            whose text looks like '百度为您找到相关结果约100,000,000个'.
            NOTE(review): this relies on Baidu's page markup - confirm the span is
            still present in current result pages.

            Raises:
                ValueError: if ``html`` is ``None`` (download failed), the span is
                    missing, or it holds no number.
            """
            if html is None:
                raise ValueError('no HTML to parse (the download failed)')
            soup = BeautifulSoup(html, 'html.parser')
            spans = soup.find_all('span', {'class': 'nums_text'})
            if not spans:
                raise ValueError('result-count element not found in the page')
            # get_text() still works when the span contains nested tags, where
            # .string would be None.
            return self._parse_count(spans[0].get_text())

        def retrieveNum(self, vocab):
            """Search Baidu for ``vocab`` and return the reported result count."""
            return self.getNum(self.getHtml(self.url + vocab))

        @staticmethod
        def _pmi_from_counts(n_pf, n_p, n_f):
            """Compute ``log10(n_pf / (n_p * n_f))`` with the zero cases handled.

            Returns NaN when either single-term count is not positive (the ratio
            is undefined) and ``-inf`` when the joint count is not positive, so a
            single word without hits does not abort a whole batch.
            """
            if n_p <= 0 or n_f <= 0:
                return np.nan
            if n_pf <= 0:
                return -np.inf
            return np.log10(n_pf / (n_p * n_f))

        def getPmi(self, vocab, anchor='手机', n_p=100000000):
            """Return the PMI-style score of ``vocab`` relative to ``anchor``.

            Args:
                vocab: Word to score.
                anchor: Word that ``vocab`` is searched together with.
                n_p: Result count assumed for ``anchor`` on its own.  The default
                    is the constant used by the original code.
                    NOTE(review): presumably the count Baidu reports for '手机' -
                    confirm before relying on absolute score values.

            Returns:
                The score as a float; see ``_pmi_from_counts`` for the zero cases.
            """
            n_f = self.retrieveNum(vocab)
            n_pf = self.retrieveNum(' '.join([anchor, vocab]))
            return self._pmi_from_counts(n_pf, n_p, n_f)

        def getPmiList(self, words_list):
            """Return the scores of all words in ``words_list``, in input order."""
            return [self.getPmi(word) for word in words_list]
    
    if __name__ =='__main__':
        # Demo run: score a small sample of words and report the elapsed
        # wall-clock time.  Every word costs two live Baidu requests.
        time_start = time.time()
        d = PMI()
        # A longer alternative sample from the original author:
        # ['鸡楚', '留香王者', '系列', '性能', '电池', '电', '视频', '游戏', '中华民族',
        #  '性价比', '王者', '卡', '天', '红米.', '老婆', '电池', '电', '王者', '时间',
        #  '游戏', '相机', '感触', '粉色', '妹妹']
        a = ['快递', '傻子', '总体', '物流', '验机', '物流', '游戏']
        pmi = d.getPmiList(a)
        # getPmiList already returns a list, so it is printed as is.
        print('PMI:', pmi)
        time_end = time.time()
        print('耗时%s秒' % (time_end - time_start))
    

      

  • 相关阅读:
    mybatis的延时加载缓存机制
    mybatis03
    事务
    codeforces-200B
    codeforces-339B
    codeforces-492B
    codeforces-266B
    codeforces-110A
    codeforces-887B
    codeforces-69A
  • 原文地址:https://www.cnblogs.com/zz22--/p/9729985.html
Copyright © 2020-2023  润新知