• Web Crawler Final Project


    import requests
    from bs4 import BeautifulSoup
    import jieba
    import numpy as np
    from PIL import Image  # replaces scipy.misc.imread, which was removed in SciPy 1.2+
    from wordcloud import WordCloud, STOPWORDS
    
    
    def get_url(urls):
        # Build the list of news index pages; numbering starts at page 1.
        for n in range(1, 101):
            url = 'https://news.cnblogs.com/n/page/' + str(n) + '/'
            urls.append(url)
        return urls
    
    
    def get_info(url, content):
        res = requests.get(url, timeout=10)  # fail fast instead of hanging on a dead page
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # Each headline is an <h2> inside <div class="content">.
        news = soup.select('div[class="content"] h2')
        for nn in news:
            content.append(nn.get_text().strip('\n').split('\n'))
        return content
    
    
    urls = []
    single_content = []
    all_content = []
    # urls = get_url(urls)
    # for u in urls:
    #     all_content.append(get_info(u, single_content))
    # name = open('blog.txt', 'w', encoding='utf-8')
    # for cc in all_content[0]:
    #     name.write(str(cc[0]) + '\n')
    # name.close()
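
    # A hedged sketch, not in the original: the same crawl driver with a
    # short delay between requests, so the 100-page fetch stays polite to
    # the server. It reuses get_url/get_info; time is the standard library.
    import time

    def crawl_politely(delay=1.0):
        all_pages = []
        for u in get_url([]):
            all_pages.append(get_info(u, []))
            time.sleep(delay)  # pause between page requests
        return all_pages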
    
    
    def jieba_split():
        # Segment the crawled headlines with jieba and write them back out
        # space-separated, which is the form the word cloud step expects.
        with open('blog.txt', encoding='utf-8') as f:
            comment_text = f.read()
        cut_text = " ".join(jieba.cut(comment_text))
        with open('blog_split.txt', 'w', encoding='utf-8') as f:
            f.write(cut_text)
    
    # jieba_split()
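
    # For reference (an illustration, not in the original): jieba's default
    # accurate mode vs the cut_all full mode used in word_cloud() below,
    # on the sample sentence from jieba's own README:
    #   jieba.cut('我来到北京清华大学')                -> 我 / 来到 / 北京 / 清华大学
    #   jieba.cut('我来到北京清华大学', cut_all=True)  -> 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学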
    
    def word_counter():
        word_lists = []
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            for line in f:
                word_lists.extend(jieba.cut(line))
        # Write one "word:count" pair per line. list.count() rescans the
        # whole list for every distinct word, which is acceptable at this
        # scale (see the Counter sketch below for a one-pass version).
        counter_lines = []
        for w in set(word_lists):
            counter_lines.append(w + ':' + str(word_lists.count(w)) + '\n')
        with open('counter.txt', 'w', encoding='utf-8') as f:
            f.writelines(counter_lines)
    # word_counter()
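
    # A minimal one-pass alternative (an addition, not in the original):
    # collections.Counter builds the same frequency table in a single pass
    # instead of calling list.count() once per distinct word.
    from collections import Counter

    def word_counter_fast():
        with open('blog_split.txt', 'r', encoding='utf-8') as f:
            counts = Counter(f.read().split())
        with open('counter.txt', 'w', encoding='utf-8') as f:
            for word, num in counts.most_common():
                f.write(word + ':' + str(num) + '\n')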
    
    def word_cloud():
        s_words = open('counter.txt', 'r', encoding='utf-8').read()
        words = jieba.cut(s_words, cut_all=True)
        words_split = " ".join(words)
        print(words_split)
        # The mask must be a numpy array; PIL + numpy replaces the removed
        # scipy.misc.imread.
        background_pic = np.array(Image.open('hellokity.JPG'))
        word_c = WordCloud(
            width=1000,
            height=1000,
            margin=2,
            background_color='white',
            mask=background_pic,
            font_path='C:\\Windows\\Fonts\\STZHONGS.TTF',  # a Chinese font, else the tokens render as boxes
            stopwords=STOPWORDS,
            max_font_size=100,
            random_state=100
        )
        word_c.generate_from_text(words_split)
        word_c.to_file('kity.JPG')
    
    word_cloud()
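
A quick way to preview the result (a usage sketch, not in the original): matplotlib can render the finished cloud inline instead of, or in addition to, saving it with to_file().

    import matplotlib.pyplot as plt

    def show_cloud(cloud):
        # WordCloud objects convert to an image array, so imshow works directly.
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()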
