• 《手牵手带你走进python世界》系列五


      import requests
      from bs4 import BeautifulSoup
      import datetime
      import pandas as pd
      import matplotlib.pyplot as plt
      import re
      import jieba
      import numpy as np
      from wordcloud import WordCloud, ImageColorGenerator
      
      url = "https://comment.bilibili.com/92542241.xml"
      r = requests.get(url)
      r.encoding = 'utf8'
      
      
      soup = BeautifulSoup(r.text,'lxml')
      d = soup.find_all('d')
      
      dlst = []
      n = 0
      for i in d:
          n += 1
          danmuku = {}
          danmuku['弹幕'] = i.text
          danmuku['网址'] = url
          danmuku['时间'] = datetime.date.today()
          dlst.append(danmuku)
      
      df = pd.DataFrame(dlst)
      
      with open('sign.txt','w',encoding='utf8') as f:
          for text in df['弹幕'].values:
              pattern = re.compile(r'[一-龥]+')
              filter_data = re.findall(pattern,text)
              f.write("".join(filter_data))
      
      with open('sign.txt', 'r', encoding='utf8') as f:
          data = f.read()
          segment = jieba.lcut(data)
          words_df = pd.DataFrame({"segment": segment})
      
      word_stat = words_df.groupby(by=['segment'])['segment'].agg({'计数':np.size})
      words_stat = word_stat.reset_index().sort_values(by=['计数'],ascending=False)
      
      wordcloud = WordCloud(
          font_path="/Library/Application Support/Apple/Fonts/iLife/BalegaRegular.otf",   # mac上没有该字体
          # font_path="C:WindowsFontssimkai.ttf",
          # 设置字体可以显示中文
          background_color="white",  # 背景颜色
          max_words=3000,  # 词云显示的最大词数
          max_font_size=200,  # 字体最大值
          random_state=100,
          width=1000, height=860, margin=2,
          # 设置图片默认的大小,但是如果使用背景图片的话,                                                   # 那么保存的图片大小将会按照其大小保存,margin为词语边缘距离
      )
      
      # 生成词云, 可以用generate输入全部文本,也可以我们计算好词频后使用generate_from_frequencies函数
      word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
      word_frequence_dict = {}
      for key in word_frequence:
          word_frequence_dict[key] = word_frequence[key]
      
      wordcloud.generate_from_frequencies(word_frequence_dict)
      # 从背景图片生成颜色值
      # image_colors = ImageColorGenerator(color_mask)
      # 重新上色
      # wordcloud.recolor(color_func=image_colors)
      # 保存图片
      wordcloud.to_file('output.png')
      plt.imshow(wordcloud)
      plt.axis("off")
      plt.show()
    
  • 相关阅读:
    分布式集群环境下运行Wordcount程序
    VM搭建hadoop分布式集群
    安装运行Hadoop
    网络问题
    Golang依赖工具
    会话进程组终端 · 守护进程
    Golang笔记
    [转]GDB
    [转]用户态与内核态
    【转】linux环境内存分配原理 malloc info
  • 原文地址:https://www.cnblogs.com/wuxiaoshi/p/11048761.html
Copyright © 2020-2023  润新知