• 用Python玩转词云


    第一步:引入相关的库包:

    #coding:utf-8
    __author__ = 'Administrator'
    import jieba    #分词包
    import numpy    #numpy计算包
    import codecs   #codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode 
    import pandas   
    import matplotlib.pyplot as plt
    %matplotlib inline
    
    from wordcloud import WordCloud#词云包

    第二步:读取西游记txt文件并用jieba进行分词:

    # Read the whole novel as UTF-8 text. The context manager closes the file
    # handle even if reading or decoding raises part-way through.
    with codecs.open(u"西游记.txt", 'r', 'utf-8') as file:
        content = file.read()

    # Register a user dictionary with jieba before segmenting.
    # NOTE(review): the dictionary file is named after a different novel
    # (红楼梦) than the text being read (西游记) - confirm this is intended.
    jieba.load_userdict(u"红楼梦分词.txt")

    # Segment the text and keep only tokens longer than one character that are
    # not a Windows line break. The original literal had been split across two
    # physical lines (a syntax error); since single-character tokens are already
    # rejected by the length check, the two-character '\r\n' is the only line
    # break this comparison can meaningfully exclude.
    segs = jieba.cut(content)
    segment = [seg for seg in segs if len(seg) > 1 and seg != '\r\n']

    第三步:统计分词结果并去掉停用词:

    # Wrap the token list in a one-column DataFrame so pandas can filter it.
    segmentDF = pandas.DataFrame({'segment': segment})
    segmentDF.head()  # notebook preview; has no effect when run as a script

    # Load the stop-word file into a single 'stopword' column.
    # The separator is spelled as the escape "\t" rather than an invisible
    # literal tab character, which is easily corrupted by editors and copy/paste.
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    stopwords = pandas.read_csv("stopwords.txt", index_col=False, quoting=3,
                                sep="\t", names=['stopword'])
    stopwords.head()  # notebook preview

    # Drop every token that appears in the stop-word file.
    segmentDF = segmentDF[~segmentDF.segment.isin(stopwords.stopword)]

    # Additional hand-written stop words, filtered the same way.
    # NOTE(review): nearly every entry below is an empty string, which looks
    # like the original characters were lost when this text was extracted -
    # restore the intended words from the author's source if available.
    wyStopWords=pandas.Series(['','','','','','','','','','','','','','','','',''
                               ,'','','','',
                               '','','','','','','','','','',
                               '','','','','','','','便','','','','','','','','','','" "'])
    segmentDF = segmentDF[~segmentDF.segment.isin(wyStopWords)]

    第四步:统计词频:

    # Count how often each token occurs, producing the columns
    # 'segment' and '计数' (count).
    # The original used agg({"计数": numpy.size}) on a grouped Series and
    # DataFrame.sort(columns=...); both were removed from pandas, so the count
    # is taken with GroupBy.size() and named while resetting the index.
    segStat = segmentDF.groupby(by=['segment']).size().reset_index(name="计数")
    # Most frequent tokens first.
    segStat = segStat.sort_values(by="计数", ascending=False)
    segStat  # notebook preview; has no effect when run as a script

     

    第五步:显示词云

    # Render with the SimHei font so Chinese glyphs can be drawn, on a black
    # background.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="black")
    # fit_words is documented as taking a mapping of word -> frequency, so the
    # first 1000 (word, count) rows are converted to a dict instead of being
    # passed as a raw iterator of tuples. name=None yields plain 2-tuples.
    topFrequencies = dict(segStat.head(1000).itertuples(index=False, name=None))
    wordcloud = wordcloud.fit_words(topFrequencies)
    plt.imshow(wordcloud)

     

    第六步:自定义词云形状

    import matplotlib.pyplot as plt
    from wordcloud import WordCloud,ImageColorGenerator

    # Load the image used both as the cloud's shape mask and as its colour
    # source. scipy.misc.imread was removed from SciPy, so the image is read
    # with matplotlib, which this file already depends on.
    bimg = plt.imread('3.jPG')

    # The font path is written as a raw string with its backslashes restored;
    # the previous value 'C:WindowsFontssimhei.ttf' had lost its separators and
    # did not name a real file.
    wordcloud = WordCloud(background_color="white", mask=bimg,
                          font_path=r'C:\Windows\Fonts\simhei.ttf')

    # fit_words is documented as taking a mapping of word -> frequency, so the
    # (word, count) rows are converted to a dict. name=None yields plain tuples.
    # NOTE(review): 39769 is presumably the total number of distinct tokens in
    # the author's run - confirm, or drop the head() limit.
    allFrequencies = dict(segStat.head(39769).itertuples(index=False, name=None))
    wordcloud = wordcloud.fit_words(allFrequencies)

    # Recolour each word from the pixels of the mask image, then display the
    # result without axis ticks.
    bimgColors = ImageColorGenerator(bimg)
    plt.axis("off")
    plt.imshow(wordcloud.recolor(color_func=bimgColors))
    plt.show()

  • 相关阅读:
    UVA756
    SP30906
    SP32900
    CF940F
    洛谷P5030
    洛谷P5142
    洛谷P2569
    网络流 24 题做题记录
    矩阵
    二分图
  • 原文地址:https://www.cnblogs.com/wuchuanying/p/6225179.html
Copyright © 2020-2023  润新知