1. Choose a topic that interests you.
2. Crawl the relevant data from the web.
3. Analyse the text and generate a word cloud.
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post that includes the source code, the crawled data, and the analysis results, so that it forms a presentable piece of work.
import requests
from bs4 import BeautifulSoup
import re
import pandas


def getonecomment(username, js_text):
    # Extract one user's comment text and creation time from the raw JSONP response.
    comment = {'username': username}
    pattern_content = '{}","userClient.*?content":"(.*?)","creationTime'.format(re.escape(username))
    pattern_time = '{}","userClient.*?creationTime":"(.*?)","isTop'.format(re.escape(username))
    content = re.findall(pattern_content, js_text)
    if content:
        comment['comment'] = content[0]
        comment['time'] = re.findall(pattern_time, js_text)[0]
        # Append the comment text to jd.txt for the later word-cloud step.
        f0 = open('jd.txt', 'a', encoding='utf-8')
        f0.write(content[0] + '\n')
        f0.close()
    return comment


def getpagecomments(js_text):
    # Collect the comment of every nickname found on one page of the comment API.
    pagecomments = []
    for username in re.findall('false,"nickname":"(.*?)","userClient', js_text):
        pagecomments.append(getonecomment(username, js_text))
    return pagecomments


def getcomments(url):
    # Pull 30 pages of comments for one product; the product id is taken from the item URL.
    url_id = re.search(r'.*/(.*)\.html', url).group(1)
    commentsls = []
    for i in range(30):
        js_text = requests.get(
            "https://club.jd.com/comment/productPageComments.action?"
            "callback=fetchJSON_comment98vv4635&productId={}&score=0&sortType=5"
            "&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1".format(url_id, i)).text
        commentsls.extend(getpagecomments(js_text))
    return commentsls


# Search JD for "华为" (Huawei), take the first product in the result list,
# crawl its comments and save them to an Excel file.
url_main = 'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA&enc=utf-8&suggest=1.rem.0.undefined&wq=%E5%8D%8E%E4%B8%BA&pvid=a30781dea7a8409aba07c6c86bb320ad'
res = requests.get(url_main)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
commentstotal = []
for i in soup.select('li'):
    if len(i.select('.gl-i-wrap')) > 0:
        url_page = "https:" + i.select('a')[0]['href']
        commentstotal.extend(getcomments(url_page))
        break
df = pandas.DataFrame(commentstotal)
df.to_excel('jd.xlsx')
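The regular expressions above are brittle because they depend on the exact order of the fields inside the response. Since the comment endpoint returns JSONP (a JSON body wrapped in the fetchJSON_comment98vv4635(...) callback), one alternative is to strip the wrapper and parse the body with the json module. The following is only a sketch, assuming the response keeps the nickname, content and creationTime fields that the regular expressions above already rely on, and that the comment list lives under a "comments" key:

import json
import re
import requests

def getpagecomments_json(url_id, page):
    # Fetch one page of the comment API and parse it as JSON instead of
    # matching fields with regular expressions.
    js_text = requests.get(
        "https://club.jd.com/comment/productPageComments.action?"
        "callback=fetchJSON_comment98vv4635&productId={}&score=0&sortType=5"
        "&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1".format(url_id, page)).text
    # Strip the JSONP wrapper "fetchJSON_comment98vv4635( ... );" to get plain JSON.
    body = re.search(r'^[^(]*\((.*)\);?\s*$', js_text, re.S).group(1)
    data = json.loads(body)
    # "comments" as the key of the comment list is an assumption, not confirmed by the original code.
    return [{'username': c.get('nickname'),
             'comment': c.get('content'),
             'time': c.get('creationTime')}
            for c in data.get('comments', [])]

This keeps the rest of the crawler unchanged: getcomments() could call getpagecomments_json(url_id, i) instead of passing the raw response text through the regex-based helpers.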
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the comment text collected by the crawler.
f1 = open('jd.txt', 'r', encoding='utf-8')
jd = f1.read()
f1.close()

# Segment the Chinese text into words and remove stop words and punctuation.
words = list(jieba.cut(jd))
ul = {'那个', '但', '整个', '控', '跟', '再', '我们', '个', '看', '没', '们',
      '对', '怎么', '能够', '颗', '他们', '你们', '知道', '什么', '把',
      '一个', '吧', '系', '她', '没有', '已经', '就是', '可以', '被', '说',
      '这个', '得', '给', '还', '说道', '去', '下', '上', '好', '里', '会',
      '要', '到', '和', '让', '不', '那', '啊', '很', '有', '着', '都', '在',
      '这', '的', '了', '是', '就', '我', '也', '他', '你', '、', '”', '“',
      '。', '!', '?', ' ', '\u3000', ',', ' ', '/', '"', "'", ',', ':',
      '.', '=', '>', '<', 'div', 'class', '\\', 'n', ''}

# Count how often each remaining word occurs.
dic = {}
keys = set(words) - ul
for i in keys:
    dic[i] = words.count(i)
c = list(dic.items())
c.sort(key=lambda x: x[1], reverse=True)

# Write the 20 most frequent words to 词云.txt, repeating each word
# according to its count so that WordCloud sizes it proportionally.
f1 = open('词云.txt', 'w', encoding='utf-8')
for i in range(20):
    print(c[i])
    for words_count in range(c[i][1]):
        f1.write(c[i][0] + ' ')
f1.close()

f3 = open('词云.txt', 'r', encoding='utf-8')
cy_file = f3.read()
f3.close()

# A font containing Chinese glyphs is required, otherwise the words are
# rendered as empty boxes; adjust font_path to a font available on your system.
cy = WordCloud(font_path='simhei.ttf').generate(cy_file)
plt.imshow(cy)
plt.axis("off")
plt.show()
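Writing every word to 词云.txt as many times as it occurs only exists to feed WordCloud.generate(); the same picture can be produced directly from the frequency counts, which also avoids the O(n²) words.count() loop. A minimal sketch, assuming jd.txt already contains the crawled comments, ul is the stop-word set defined above (abbreviated here), and simhei.ttf is a Chinese-capable font on your system:

from collections import Counter

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Abbreviated placeholder; in practice reuse the full stop-word set ul defined above.
ul = {'的', '了', '是', '我', '也', ' ', ',', '。'}

with open('jd.txt', 'r', encoding='utf-8') as f:
    jd = f.read()

# Counter replaces the manual words.count() loop.
words = [w for w in jieba.cut(jd) if w not in ul]
freq = dict(Counter(words).most_common(20))

# generate_from_frequencies() takes the counts directly,
# so no intermediate 词云.txt file is needed.
cy = WordCloud(font_path='simhei.ttf').generate_from_frequencies(freq)
plt.imshow(cy)
plt.axis("off")
plt.show()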