爬虫的一个小实例
使用工具:
jieba分词;
Wordcloud词云
requests
-
源程序
# Bilibili danmaku (bullet-comment) scraper.
# Downloads the comment XML feed, extracts the text of every <d> tag,
# keeps only runs of Chinese characters and appends them to sign.txt,
# which the word-cloud script consumes afterwards.
import requests              # HTTP client used to fetch the comment XML
import pandas as pd          # tabular container for the scraped comments
from bs4 import BeautifulSoup
import datetime
import re
import jieba
from wordcloud import WordCloud
from imageio import imread
import matplotlib.pyplot as plt

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = 'http://comment.bilibili.com/125511694.xml'

response = requests.get(url=url, headers=header)   # send the request to the server
response.encoding = response.apparent_encoding     # let requests guess the charset of the feed
data = response.text                               # decoded document body

soup = BeautifulSoup(data, 'lxml')                 # parse the XML
d_list = soup.find_all('d')                        # each <d> element holds one danmaku comment

# One dict per comment so the list converts cleanly into a DataFrame
# (a 2-D table, similar to an Excel sheet).
dlst = [{'弹幕': d.text} for d in d_list]
df = pd.DataFrame(dlst)

# BUG FIX: the original pattern was r'[1-龥]+'.  That range starts at the
# DIGIT '1' (U+0031), so it also matched digits, Latin letters and most
# punctuation.  CJK Unified Ideographs span U+4E00 (一) .. U+9FA5 (龥).
# Compiled once, outside the loop, instead of on every iteration.
pat = re.compile(r'[一-龥]+')

# Context manager guarantees the file is closed even if a write fails.
with open('sign.txt', 'w', encoding='utf-8') as f:
    for i in df['弹幕'].values:
        filter_data = pat.findall(i)               # keep only the Chinese-character runs
        f.write("".join(filter_data))
改进版的弹幕源代码
# Improved danmaku word-cloud renderer.
# Reads the cleaned Chinese text produced by the scraper (sign.txt),
# segments it with jieba, and draws a word cloud shaped by a mask image.
import requests              # kept from the original import block; unused here
import pandas as pd
from bs4 import BeautifulSoup
import datetime
import re
import jieba                 # Chinese word segmentation
from wordcloud import WordCloud
from imageio import imread
import matplotlib.pyplot as plt

# Context manager closes the handle even if reading fails.
with open('sign.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# WordCloud expects whitespace-separated tokens, so join the segments.
result = " ".join(jieba.lcut(data))

# Mask image ("Peppa Pig"): words are drawn only inside the non-white area.
color_mask = imread('小猪佩奇.jpg')

wc = WordCloud(
    # BUG FIX: the original path r'C:WindowsFontssimkai.ttf' had lost its
    # backslashes and could never be resolved; without a CJK-capable font
    # every Chinese word renders as an empty box.
    font_path=r'C:\Windows\Fonts\simkai.ttf',
    width=1000,
    height=800,
    mask=color_mask,
    background_color='pink'
)
wc.generate(result)          # compute word frequencies and layout
wc.to_file('bili.jpg')       # persist the rendered image
plt.imshow(wc)               # also show it interactively
plt.show()