1 def getTaxt(): 2 txt=open('hamlet.txt') 3 txt = txt.lower() 4 for ch in '!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~': 5 txt = txt.replace(ch, " ") #将文本中特殊字符替换为空格 6 return txt 7 8 hamletTxt = getText() 9 words = hamletTxt.split() 10 counts = {} 11 for word in words: 12 counts[word] = counts.get(word,0) + 1 13 items = list(counts.items()) 14 items.sort(key=lambda x:x[1], reverse=True) 15 for i in range(10): 16 word, count = items[i] 17 # print ("{0:<10}{1:>5}".format(word, count)) 输出出现最多的10个单词和其出现次数 18 print (word,count) #输出出现最多的10个单词
词云表示
1 import jieba 2 import wordcloud 3 import matplotlib.pyplot as plt 4 f = open("D:\360安全浏览器下载\hamlet.txt", "r", encoding="utf-8")#文件路经每个人都不一样,此程序可以在jupyter上运行 5 6 t = f.read() 7 f.close() 8 ls = jieba.lcut(t) 9 10 txt = " ".join(ls) 11 w = wordcloud.WordCloud( 12 width = 1000, height = 700, 13 background_color = "pink", 14 font_path = "msyh.ttc" 15 ) 16 myword=w.generate(txt) 17 plt.imshow(myword) 18 plt.axis("off") 19 plt.show() 20 w.to_file("g.png")