1. Choose a topic or website you are interested in. (No two students may pick the same one.)
2. Write a crawler in Python to scrape data on the chosen topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
import re
from datetime import datetime

import jieba.analyse
import matplotlib.pyplot as plt
import pandas
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


def getNewDetail(newurl):
    # Parse one article page: title, author, publication time and body text.
    res = requests.get(newurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    div = soup.select_one("#lft-art")
    if div is None:
        div = soup.select_one(".main_art")
        if div is None:
            return None
    # Title
    title = div.select_one("h1").text
    info = div.select_one(".info_l").text
    # Author, e.g. "By Zhang Yi | chinadaily.com.cn | Updated: ..."
    a = re.match(r"\s*(By )*(.*?) \|", info)
    if a is None:
        author = None
    else:
        author = a.group(2)
    # Publication time
    time = datetime.strptime(re.search("Updated: (.{16})", info).group(1), "%Y-%m-%d %H:%M")
    # Body text
    content = div.select_one("#Content").text
    writeNewsDetail(content)
    return {"title": title, "author": author, "time": time, "content": content}


def getAll():
    # Crawl every list page of the China Daily government-and-policy channel
    # and save all parsed articles to an Excel file.
    newstotal = []
    newsurl = 'http://www.chinadaily.com.cn/china/governmentandpolicy'
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The last link in the pagination bar points at the last list page.
    last = soup.select_one("#div_currpage").select("a")[-1].attrs.get('href')
    num = re.match(r"http://www.chinadaily.com.cn/china/governmentandpolicy/page_(\d*)\.html", last).group(1)
    num = int(num, 10)
    for i in range(num):
        print(i)
        listurl = "http://www.chinadaily.com.cn/china/governmentandpolicy/page_{}.html".format(i + 1)
        newstotal.extend(getAPage(listurl))
    df = pandas.DataFrame(newstotal)
    df.to_excel('newsData.xlsx')


def getAPage(url):
    # Crawl one list page and return the parsed articles it links to.
    newsls = []
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    div = soup.select_one("#lft-art").select("div.mb10")
    for i in div:
        newurl = i.select_one("a").attrs.get('href')
        data = getNewDetail(newurl)
        if data is not None:
            # Control characters cannot be stored in xlsx cells;
            # log such articles instead of keeping them.
            if next(re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]').finditer(data["content"]), None):
                writeIllegalCharacterErrorUrl(newurl + "\n")
            else:
                newsls.append(data)
        else:
            # Pages with a different layout could not be parsed; log them.
            writeAnalysisErrorUrl(newurl + "\n")
    return newsls


def writeNewsDetail(content):
    # Append the article body to the corpus used for the word cloud.
    f = open('content.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()


def writeAnalysisErrorUrl(error):
    # Record URLs of pages that could not be parsed.
    f = open('AnalysisError.txt', 'a', encoding='utf-8')
    f.write(error)
    f.close()


def writeIllegalCharacterErrorUrl(error):
    # Record URLs of articles containing characters that xlsx rejects.
    f = open('IllegalCharacterError.txt', 'a', encoding='utf-8')
    f.write(error)
    f.close()


def getWordCloud():
    # Build a word cloud from the saved corpus using TextRank keyword weights.
    fo = open("content.txt", "r", encoding='utf-8')
    srt = fo.read()
    fo.close()
    result = jieba.analyse.textrank(srt, topK=50, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    my_wordcloud = WordCloud(font_path="1.ttf").generate_from_frequencies(keywords)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()


getAll()
getWordCloud()
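When run, the script writes the article table to newsData.xlsx, appends every article body to content.txt, logs unparseable pages to AnalysisError.txt and articles with illegal characters to IllegalCharacterError.txt, and finally displays the word cloud. Note that font_path="1.ttf" assumes a font file named 1.ttf sits next to the script; substitute any TrueType font available on your machine.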
4. Explain and interpret the text analysis results.
The analysis shows which China-related topics are currently hot internationally: since the corpus comes from China Daily's government-and-policy channel, the largest words in the cloud correspond to the government and policy themes that dominate its English-language coverage.
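To make that interpretation easier to check, here is a minimal sketch, not part of the original submission, that reuses the content.txt corpus written by the crawler and prints the highest-weighted TextRank keywords (the same terms the word cloud renders largest):

import jieba.analyse

# Read the corpus that writeNewsDetail() accumulated.
with open("content.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Print the 20 highest-weighted TextRank keywords and their weights.
for word, weight in jieba.analyse.textrank(text, topK=20, withWeight=True):
    print("{}\t{:.4f}".format(word, weight))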
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the data analysis approach and conclusions.
Implementation: read the pagination bar on the first news list page to get the total number of list pages; crawl every article linked from each list page; extract and save each article's title, author, time, and body; finally, generate the word cloud from the collected text.
Problems encountered and solutions: some article pages use a different layout that the parser cannot handle, so their URLs are logged to AnalysisError.txt; some articles could not be written to the xlsx file because of an illegal-character exception, which turned out to be caused by non-printable control characters in the body text, so those URLs are logged to IllegalCharacterError.txt (a sketch of that check follows below).
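For reference, a minimal sketch of that illegal-character check, using the same character class as getAPage above (these are the control characters that the xlsx format rejects); the strip_illegal_characters helper is an added illustration, not part of the submitted crawler:

import re

# Control characters that cannot be stored in an xlsx cell
# (same pattern as used in getAPage above).
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

def has_illegal_characters(text):
    # True if writing this text to Excel would raise an illegal-character error.
    return ILLEGAL_CHARACTERS_RE.search(text) is not None

def strip_illegal_characters(text):
    # Alternative fix: remove the offending characters instead of skipping the article.
    return ILLEGAL_CHARACTERS_RE.sub("", text)

Skipping the affected articles (as the crawler does) keeps the saved data untouched; stripping the characters instead would retain more articles in the spreadsheet.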
6. Finally, submit all scraped data plus the crawler and analysis source code.
All data: https://pan.baidu.com/s/1YXSpQwZ-_itlC2NfZyqyvg