爬虫大作业

利用Python爬取微博数据生成词云图

import re
from lxml import etree


f = open('info.txt' ,'rb').read().decode('utf-8').replace('\','')
repost = re.findall('<div class="WB_text W_f14" node-type="feed_list_content" nick-name="UNIQ-王一博">(.*?)</div>', f)
dongtai = re.findall('<div class="WB_text W_f14" node-type="feed_list_content" >(.*?)</div>', f)
for i in repost:
    selector = etree.HTML(i)
    text = selector.xpath('string()')
    file = open('weico.txt', 'a+', encoding='utf-8')
    file.write(re.sub('s+', '', text))
for j in dongtai:
    selector = etree.HTML(j)
    text1 = selector.xpath('string()')
    file = open('weico.txt', 'a+', encoding='utf-8')
    file.write(re.sub('s+', '', text1))

import requests
import re

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
          'Cookie': 'SINAGLOBAL=5542715244496.104.1504275191526; SCF=AhTamjvKXKQ6KxI4fPK4OCo3IcJW7WJ3bzHYaxiXbv6jeaSCC-kZD4k1uL2yYVm2T4vbdkdILuCFmdacFfopvWQ.; SUHB=0OMV2fGIiDZRi3; SUB=_2AkMttnCgdcPxrAFVn_Edy2jraIhH-jyeYxlWAn7uJhMyOhgv7lhQqSVutBF-XLeJtSfazwW1-LaF7O_swC9yVA0q; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFzxiHi1Fma1.SV9rcCiS_p5JpVF020SoMES0q01Ke4; login_sid_t=d2e992ebac9ccd5b74acb6ce967b2943; _s_tentry=www.baidu.com; Apache=3225744868884.1685.1526545586367; ULV=1526545586373:11:2:1:3225744868884.1685.1526545586367:1525350236568; cross_origin_proto=SSL; UOR=blog.csdn.net,widget.weibo.com,www.baidu.com',}
Url = "https://weibo.com/u/5492443184?refer_flag=1001030101_&is_all=1#_rnd1526811587161"

def parse_page():
    resp = requests.get(Url, headers=header)
    txt_content = resp.content.decode("utf-8").replace('\','')
    html = re.findall("<script>FM.view(.*?)</script>", txt_content)
    for info in html:
        f = open('info.txt', 'a+', encoding='utf-8')
        f.write(info.replace('(','').replace(')',''))
        f.close()

if __name__ == '__main__':
    parse_page()

# -*- coding: utf-8 -*-
import jieba
from wordcloud import WordCloud, STOPWORDS


text = ''
f = open('weico.txt','rb').read().decode('utf-8')
text += ' '.join(jieba.lcut(f))
wc = WordCloud(
    width=1920,
    height=1080,
    margin=2,
    background_color='white',  # 设置背景颜色
    font_path='C:WindowsFontsSTZHONGS.TTF',  # 若是有中文的话，这句代码必须添加，不然会出现方框，不出现汉字
    max_words=200,  # 设置最大现实的字数
    stopwords=STOPWORDS,  # 设置停用词
    max_font_size=150,  # 设置字体最大值
    random_state=42  # 设置有多少种随机生成状态，即有多少种配色方案
)
wc.generate_from_text(text)
wc.to_file('weico.jpg')

运行结果

相关阅读:
Python之路(第十七篇)logging模块
 Python之路(第十五篇)sys模块、json模块、pickle模块、shelve模块
 Python之路(第十四篇)os模块
 Python之路(第十三篇)time模块、random模块、string模块、验证码练习
 Python之路(第十二篇)程序解耦、模块介绍导入安装、包
 Python编程笔记（第三篇）【补充】三元运算、文件处理、检测文件编码、递归、斐波那契数列、名称空间、作用域、生成器
 Python之路(第十一篇)装饰器
 Python之路(第十篇)迭代器协议、for循环机制、三元运算、列表解析式、生成器
 Python之路(第九篇)Python文件操作
 Python编程笔记（第二篇）二进制、字符编码、数据类型
原文地址：https://www.cnblogs.com/1940370572QQ/p/8987783.html

最新文章
嵌入式开发板LInux更新系统、安装软件、下载资源碰到的问题
 C#
C#
C#
C#
C#
C#
C#
Python
C#