• 一个完整的大作业


    1.选一个自己感兴趣的主题。

    2.网络上爬取相关的数据。

    3.进行文本分析,生成词云。

    4.对文本分析结果解释说明。

    5.写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。

    import requests
    from bs4 import BeautifulSoup
    import re
    import pandas
    
    def getonecomment(username,js_text):
        """Extract one user's comment text and creation time from a raw response.

        Searches ``js_text`` for the first ``content`` field that follows
        ``<username>","userClient``. When one is found, the comment text is
        also appended to ``jd.txt`` so it can be analysed later.

        Args:
            username: Nickname exactly as it appears in ``js_text``.
            js_text: Raw body returned by the comment endpoint.

        Returns:
            A dict that always contains ``'username'``. ``'comment'`` is added
            when a comment is found, and ``'time'`` when its creation time is
            found as well.
        """
        comment = {'username': username}

        # Escape the nickname so it is matched literally and in full, even
        # when it contains regex metacharacters such as '*' or ']'.
        anchor = re.escape(username) + '","userClient'

        content_match = re.search(anchor + '.*?content":"(.*?)","creationTime', js_text)
        if content_match is None:
            return comment
        comment['comment'] = content_match.group(1)

        time_match = re.search(anchor + '.*?creationTime":"(.*?)","isTop', js_text)
        if time_match is not None:
            comment['time'] = time_match.group(1)

        # jd.txt is read back as UTF-8 further down, so write it as UTF-8 too
        # instead of relying on the platform default encoding.
        with open('jd.txt', 'a', encoding='utf-8') as f0:
            f0.write(comment['comment'])
        return comment
    
    def getpagecomments(js_text):
        """Build the comment records for every nickname found in one response.

        Each ``nickname`` value that sits between ``false,"nickname":"`` and
        ``","userClient`` in ``js_text`` is handed to ``getonecomment``; the
        resulting dicts are returned as a list in order of appearance.
        """
        nicknames = re.findall('false,"nickname":"(.*?)","userClient',js_text)
        return [getonecomment(name,js_text) for name in nicknames]
    
    def getcomments(url, pages=30):
        """Download and parse the comment pages of one product.

        The product id is taken from the last ``/<id>.html`` segment of
        ``url`` and substituted into the comment endpoint URL, which is then
        requested once per page index.

        Args:
            url: Product page URL ending in ``<id>.html``.
            pages: Number of comment pages to request, starting at page 0.
                Defaults to 30.

        Returns:
            A flat list of the comment dicts produced by ``getpagecomments``
            for every requested page.

        Raises:
            ValueError: If no ``<id>.html`` segment can be found in ``url``.
        """
        id_match = re.search(r'.*/(.*)\.html', url)
        if id_match is None:
            raise ValueError('cannot find a product id in url: {}'.format(url))
        url_id = id_match.group(1)

        commentsls = []
        for i in range(pages):
            # A timeout keeps one stalled request from hanging the whole crawl.
            js_text = requests.get("https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4635&productId={}&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1".format(url_id,i), timeout=10).text
            commentsls.extend(getpagecomments(js_text))
        return commentsls
        
        
    
    
    # Search-results page to crawl; the keyword is URL-encoded in the query string.
    url_main='https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA&enc=utf-8&suggest=1.rem.0.undefined&wq=%E5%8D%8E%E4%B8%BA&pvid=a30781dea7a8409aba07c6c86bb320ad'
    res = requests.get(url_main)
    res.encoding = 'UTF-8'

    soup = BeautifulSoup(res.text, 'html.parser')

    # Only the first <li> that wraps a '.gl-i-wrap' element is crawled: its
    # first link is followed and the loop stops right after.
    commentstotal = []
    for item in soup.select('li'):
        if not item.select('.gl-i-wrap'):
            continue
        url_page = "https:" + item.select('a')[0]['href']
        commentstotal.extend(getcomments(url_page))
        break

    # Persist the collected comment records as a spreadsheet.
    df = pandas.DataFrame(commentstotal)
    df.to_excel('jd.xlsx')
    import jieba
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    
    from collections import Counter

    # Load the comment text collected by the crawler.
    with open('jd.txt', 'r', encoding='utf-8') as f1:
        jd = f1.read()

    words = list(jieba.cut(jd))

    # Tokens left out of the frequency count: filler words, whitespace,
    # punctuation and markup/escape residue. '\n' and '\\' restore two entries
    # that were not valid string literals in the published listing
    # (NOTE(review): reconstructed from the broken literals - confirm).
    ul = {
        '那个', '', '整个', '我们', '怎么', '能够', '他们', '你们', '知道', '什么',
        '一个', '没有', '已经', '就是', '可以', '这个', '说道',
        ' ', 'u3000', '\n', '/', '"', "'", ',', ':', '.', '=', '>',
        '<', 'div', 'class', '\\', 'n',
    }

    # Count every remaining token in a single pass.
    dic = Counter(word for word in words if word not in ul)

    # (word, count) pairs ordered from most to least frequent.
    c = dic.most_common()
    # Slicing tolerates texts with fewer than 20 distinct words.
    top_words = c[:20]

    # Keep the original artefact: each top word repeated `count` times.
    with open('词云.txt', 'w', encoding='utf-8') as f1:
        for word, count in top_words:
            print((word, count))
            f1.write((word + ' ') * count)

    if top_words:
        # Set this to a font file that contains CJK glyphs if the Chinese
        # words do not render with the default font.
        font_path = None
        # Pass the counts straight in rather than re-tokenising the repeated
        # words, so each word is weighted by its own count.
        cy = WordCloud(font_path=font_path).generate_from_frequencies(dict(top_words))
        plt.imshow(cy)
        plt.axis("off")
        plt.show()
    else:
        print('no words left to draw after filtering')

     

  • 相关阅读:
    控制台——args参数的赋值方法
    整数排序的几种方法
    基于CentOS系统下的Oracle的安装
    QT的学习
    HDU 2104 hide handkerchief
    HDU 2103 Family Plan
    HDU 2115 I Love This Game
    HDU 2100 Lovekey
    猜数字游戏
    利用Hough变换识别图像中的直线
  • 原文地址:https://www.cnblogs.com/zeson/p/7774196.html
Copyright © 2020-2023  润新知