• jieba分词以及wordcloud词云


    1.从网上下载一份《天龙八部》的 txt 文档(保存为 天龙八部.txt)以及一份通用的 jieba 停用词表(保存为 停用词表.txt)

    2.下载一张背景图片,保存为 图片.jpg

    3.确认字体文件 C:/Windows/Fonts/simsun.ttc 存在(词云显示中文需要中文字体)

    # -*- coding:utf-8 -*-
    import jieba
    import jieba.analyse
    from PIL import Image
    import  numpy as np
    from wordcloud import WordCloud,ImageColorGenerator
    import  matplotlib.pyplot as plt
    # Chinese word segmentation: tokenize 天龙八部.txt, drop stop words, and
    # write the segmented text to 天龙八部分词.txt.
    # The stop words (one per line, whitespace-stripped) are kept in a set so
    # that each membership test in seg_sentence is O(1), and the file is closed
    # as soon as it has been read.
    with open("./停用词表.txt", encoding="utf-8") as stopwords_file:
        stopwords = {line.strip() for line in stopwords_file}
    def seg_sentence(sentence):
        """Segment one line of text with jieba and drop unwanted tokens.

        The line is stripped of surrounding whitespace and cut into words with
        ``jieba.cut``. Words found in the module-level ``stopwords`` collection,
        and tab characters, are discarded.

        Returns the remaining words joined by single spaces.
        """
        kept = []
        for token in jieba.cut(sentence.strip()):
            if token in stopwords or token == '\t':
                continue
            kept.append(token)
        return ' '.join(kept)
    # Segment the novel line by line and write one segmented line per input
    # line. The input is decoded as GB18030 and the output is written as UTF-8.
    # Both files are managed by ``with`` so they are closed even if
    # segmentation raises part-way through.
    with open("天龙八部分词.txt", "w", encoding='utf-8') as outputs, \
            open("./天龙八部.txt", 'r', encoding='GB18030') as novel:
        for line in novel:
            line_seg = seg_sentence(line)
            # Use the newline escape: a single-quoted literal cannot contain
            # a raw line break.
            outputs.write(line_seg + '\n')
    
    
    # Extract keywords with the TF-IDF algorithm; the result is a list of
    # (keyword, TF-IDF weight) pairs.
    with open("./天龙八部分词.txt", encoding="utf-8") as segmented_file:
        text = segmented_file.read()
    # Keep the 20 highest-weighted terms, restricted to the 'nr'
    # part-of-speech tag (person names in jieba's tag set).
    result = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=('nr',))
    print(result)
    
    # Convert the result list, e.g.
    # [('段誉', 0.5881865046044787), ('萧峰', 0.4631424402591722), ...],
    # into a {word: weight} dict, which is the input of the word cloud module.
    keywords = dict(result)
        
    # Word cloud background: the image is converted to an array and passed to
    # WordCloud as its mask. It is opened in a ``with`` block so the file
    # handle is released once the pixel data has been copied into the array.
    with Image.open('./图片.jpg') as image:
        graph = np.array(image)
    wc = WordCloud(font_path='C:/Windows/Fonts/simsun.ttc',
                   background_color="White",
                   max_words=15,
                   mask=graph)
    # Generate the word cloud from the {word: weight} mapping
    wc.generate_from_frequencies(keywords)
    plt.imshow(wc)
    # NOTE(review): image_color is built but never applied to the cloud.
    # Presumably the intent was wc.recolor(color_func=image_color) before
    # drawing; left unapplied here so the rendered output does not change.
    image_color = ImageColorGenerator(graph)
    plt.axis("off")
    plt.show()
    # Save the rendered cloud; this runs after the plot window is closed.
    wc.to_file('词云.jpg')
  • 相关阅读:
    hiveserver2 with kerberos authentication
    python Basic usage
    python Quicksort demo
    Python HeapSort
    mrunit for wordcount demo
    CCDH证书
    Hadoop question list
    Hadoop Yarn core concepts
    Hadoop Resource
    Hadoop could not find or load main class
  • 原文地址:https://www.cnblogs.com/students/p/10820942.html
Copyright © 2020-2023  润新知