• python学习_新增了一个jieba库和wordcloud文件生成词云


    版本:

      新增了一个jieba库和wordcloud文件生成词云

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    '''
    __author__ = '王益夫'
    __mtime__ = '2019/12/20'
    '''
    '''
    版本修改:
    V 1.0:get代码获取的文本内容,通过jieba库和词云进行分析
    '''
    import jieba
    from wordcloud import WordCloud
    from os import path
    import re
    import matplotlib.pyplot as plt
    #from scipy.misc import imread
    import imageio
    
    # Directory holding all data files, relative to this script's location.
    file_path = path.dirname(__file__) + r'/temp'
    # Input corpus (CCTV "Xinwen Lianbo" news transcript).
    file_name1 = r'新闻联播.txt'
    # Stop words: one word per line, excluded from the word cloud.
    file_name2 = r'StopWords.txt'
    # Custom dictionary words to register with jieba before segmentation.
    file_name3 = r'AddWords.txt'
    
    TextPath = file_path + '/' + file_name1
    StopWordsPath = file_path + '/' + file_name2
    AddWordsPath = file_path + '/' + file_name3
    print(AddWordsPath)
    
    
    def jiebaclearText(text):
        """Segment *text* with jieba and return the kept words joined by spaces.

        Words are dropped when they appear in the stop-word file
        (``StopWordsPath``, one word per line) or are a single character
        after stripping whitespace.

        :param text: raw Chinese text to segment
        :return: space-separated string of surviving segments
        """
        # Precise mode (cut_all=False): the most accurate split, suited for
        # text analysis.  Full mode (cut_all=True) emits every possible word
        # combination; jieba.cut_for_search() further splits long words for
        # search-engine recall.
        seg_list = jieba.cut(text, cut_all=False)
        liststr = "/".join(seg_list)

        # BUG FIX: the stop-word file is read inside a context manager and the
        # '\n' literal is restored (it had been corrupted into a raw newline).
        with open(StopWordsPath, encoding='utf-8', errors='ignore') as f_stop:
            # A set makes each membership test O(1) instead of O(n).
            stop_words = set(f_stop.read().split('\n'))

        mywordslist = []
        for myword in liststr.split('/'):
            word = myword.strip()
            if word not in stop_words and len(word) > 1:
                mywordslist.append(myword)
        return ' '.join(mywordslist)
    
    def addWordsRulls(text):
        """Harvest 《...》 book-title phrases from *text* into the add-words file.

        Every phrase wrapped in Chinese book-title marks (e.g. 《新闻联播》) is
        collected and appended, one per line, to ``AddWordsPath`` so jieba can
        later register them as custom dictionary words.

        :param text: raw text to scan
        :return: True when the scan succeeded, False when it raised
        """
        addwords_list = set()
        try:
            for result in re.findall('《[^》]+》', text):
                addwords_list.add(result)
            return True
        except Exception:
            # BUG FIX: the original `raise e` made the two lines below
            # unreachable dead code; the evident intent was to record a
            # failure marker and report False to the caller.
            addwords_list.add('EOR:ADD正则解析失败,未获取关键词!')
            return False
        finally:
            # Persist whatever was collected, even on the failure path.
            # ('\n' literal restored — it had been corrupted into a raw newline.)
            with open(AddWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_add:
                for line in addwords_list:
                    file_add.write(line + '\n')
    
    def StopWordsRulls(text):
        """Harvest full Chinese dates from *text* into the stop-words file.

        Dates of the form ``YYYY年M月D日`` carry no topical meaning for the
        word cloud, so they are appended, one per line, to ``StopWordsPath``.

        :param text: raw text to scan
        :return: True when the scan succeeded, False when it raised
        """
        Stopwords_list = set()
        try:
            # BUG FIX: the pattern had lost its backslashes ('d{4}年...'),
            # so it matched literal 'd' runs instead of digits.
            for result in re.findall(r'\d{4}年\d{1,2}月\d{1,2}日', text):
                print(result)
                Stopwords_list.add(result)
            return True
        except Exception:
            # BUG FIX: the original `raise e` made the two lines below
            # unreachable dead code; record the marker and report False.
            Stopwords_list.add('EOR:Stop正则解析失败,未获取关键词!')
            return False
        finally:
            # Persist whatever was collected, even on the failure path.
            # ('\n' literal restored — it had been corrupted into a raw newline.)
            with open(StopWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_Stop:
                for line in Stopwords_list:
                    file_Stop.write(line + '\n')
    
    def main():
        """Read the corpus, refine jieba's dictionaries, and render a word cloud.

        Pipeline: load the text file -> extract custom words and date stop
        words from it -> register custom words with jieba -> segment and
        filter the text -> generate a shaped word cloud, save it to
        ``test.jpg`` and display it with matplotlib.
        """
        with open(TextPath, encoding='utf-8', errors='ignore') as file_Text:
            text = file_Text.read()

        if addWordsRulls(text) and StopWordsRulls(text):
            with open(AddWordsPath, 'r', encoding='utf-8', errors='ignore') as file_read:
                # BUG FIX: set(file_read.read()) built a set of single
                # CHARACTERS, registering one-character "words" with jieba.
                # The file holds one custom word per line, so split on lines.
                for line in set(file_read.read().splitlines()):
                    if line:
                        jieba.add_word(line)

        text_text = jiebaclearText(text)

        # The image's non-white area defines the shape of the cloud.
        color_mask = imageio.imread(file_path + "/template.jpeg")
        cloud = WordCloud(
            # A CJK-capable font is required; the default font renders
            # Chinese characters as empty boxes.
            font_path="./temp/HYQiHei-25J.ttf",
            background_color='white',
            mask=color_mask,
            max_words=200,      # cap on vocabulary shown
            max_font_size=40    # largest font size used
        )
        word_cloud = cloud.generate(text_text)  # build the cloud
        word_cloud.to_file("test.jpg")          # save the image
        # Show the result on screen.
        plt.imshow(word_cloud)
        plt.axis('off')
        plt.show()
    
    # Run the pipeline only when executed as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    css | js 实现扩展卡片小demo
    ESLint如何配置
    (js描述的)数据结构[哈希表1.3](10)
    (js描述的)数据结构[哈希表1.2](9)
    VSCode——自定义VSCode背景图片
    VSCode 初次写vue项目并一键生成.vue模版
    (js描述的)数据结构[哈希表1.1](8)
    (js描述的)数据结构[字典](7)
    Vue 实战项目: 硅谷外卖(1)
    脑残式网络编程入门(六):什么是公网IP和内网IP?NAT转换又是什么鬼?
  • 原文地址:https://www.cnblogs.com/wyf-349/p/12124193.html
Copyright © 2020-2023  润新知