• Web Crawler Final Project


    import requests
    from bs4 import BeautifulSoup
    import jieba
    import numpy as np
    from PIL import Image  # replaces scipy.misc.imread, which was removed in SciPy 1.2+
    from wordcloud import WordCloud, STOPWORDS
    
    
    def get_url(urls):
        # Build the list of news index pages; numbering starts at page 1.
        for n in range(1, 101):
            url = 'https://news.cnblogs.com/n/page/' + str(n) + '/'
            urls.append(url)
        return urls
    
    
    def get_info(url, content):
        res = requests.get(url, timeout=10)  # fail fast instead of hanging on a dead page
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # Each headline is an <h2> inside <div class="content">.
        news = soup.select('div[class="content"] h2')
        for nn in news:
            content.append(nn.get_text().strip('\n').split('\n'))
        return content
    
    
    urls = []
    single_content = []
    all_content = []
    # urls = get_url(urls)
    # for u in urls:
    #     all_content.append(get_info(u, single_content))
    # name = open('blog.txt', 'w', encoding='utf-8')
    # for cc in all_content[0]:
    #     name.write(str(cc[0]) + '\n')
    # name.close()
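
    # A hedged sketch, not in the original: the same crawl driver with a
    # short delay between requests, so the 100-page fetch stays polite to
    # the server. It reuses get_url/get_info; time is the standard library.
    import time

    def crawl_politely(delay=1.0):
        all_pages = []
        for u in get_url([]):
            all_pages.append(get_info(u, []))
            time.sleep(delay)  # pause between page requests
        return all_pages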
    
    
    def jieba_split():
        # Segment the crawled headlines with jieba and write them back out
        # space-separated, which is the form the word cloud step expects.
        with open('blog.txt', encoding='utf-8') as f:
            comment_text = f.read()
        cut_text = " ".join(jieba.cut(comment_text))
        with open('blog_split.txt', 'w', encoding='utf-8') as f:
            f.write(cut_text)
    
    # jieba_split()
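
    # For reference (an illustration, not in the original): jieba's default
    # accurate mode vs the cut_all full mode used in word_cloud() below,
    # on the sample sentence from jieba's own README:
    #   jieba.cut('我来到北京清华大学')                -> 我 / 来到 / 北京 / 清华大学
    #   jieba.cut('我来到北京清华大学', cut_all=True)  -> 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学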
    
    def word_counter():
        word_lists = []
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            for line in f:
                word_lists.extend(jieba.cut(line))
        # Write one "word:count" pair per line. list.count() rescans the
        # whole list for every distinct word, which is acceptable at this
        # scale (see the Counter sketch below for a one-pass version).
        counter_lines = []
        for w in set(word_lists):
            counter_lines.append(w + ':' + str(word_lists.count(w)) + '\n')
        with open('counter.txt', 'w', encoding='utf-8') as f:
            f.writelines(counter_lines)
    # word_counter()
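
    # A minimal one-pass alternative (an addition, not in the original):
    # collections.Counter builds the same frequency table in a single pass
    # instead of calling list.count() once per distinct word.
    from collections import Counter

    def word_counter_fast():
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            counts = Counter(f.read().split())
        with open('counter.txt', 'w', encoding='utf-8') as f:
            for word, num in counts.most_common():
                f.write(word + ':' + str(num) + '\n')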
    
    def word_cloud():
        s_words = open('counter.txt', 'r', encoding='utf-8').read()
        words = jieba.cut(s_words, cut_all=True)
        words_split = " ".join(words)
        print(words_split)
        # The mask must be a numpy array; PIL + numpy replaces the removed
        # scipy.misc.imread.
        background_pic = np.array(Image.open('hellokity.JPG'))
        word_c = WordCloud(
            width=1000,
            height=1000,
            margin=2,
            background_color='white',
            mask=background_pic,
            font_path='C:\\Windows\\Fonts\\STZHONGS.TTF',  # a Chinese font, else the tokens render as boxes
            stopwords=STOPWORDS,
            max_font_size=100,
            random_state=100
        )
        word_c.generate_from_text(words_split)
        word_c.to_file('kity.JPG')
    
    word_cloud()
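
A quick way to preview the result (a usage sketch, not in the original): matplotlib can render the finished cloud inline instead of, or in addition to, saving it with to_file().

    import matplotlib.pyplot as plt

    def show_cloud(cloud):
        # WordCloud objects convert to an image array, so imshow works directly.
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()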
