# -*- coding: utf-8 -*-
"""
Created on Thu Jul 27 15:30:27 2017
@author: toby
"""
import copy
import jieba
import pandas
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Path of a Chinese-capable TrueType font; WordCloud needs it to render CJK
# glyphs. NOTE(review): the backslashes were missing from the original path
# (likely stripped in a paste); restored here.
font = r'C:\Windows\Fonts\simfang.ttf'
# Input corpus to segment and analyse.
filename = "top20_USASales.txt"
def List_none_sense():
    """Load the stop-word ("nonsense" word) list from nonesense_words.txt.

    Returns:
        list[str]: every whitespace-separated token in the file, in order.
    """
    # 'with' guarantees the handle is closed -- the original opened the file
    # and never closed it. Platform default encoding is kept to preserve the
    # original behaviour; presumably the list is GBK or UTF-8 -- TODO confirm.
    with open("nonesense_words.txt", 'r') as fh:
        return [word for line in fh for word in line.split()]
#word遍历行,word处理,去除各种标点,怪异符号,最后计算word出现个数
#判断一个单词是否在垃圾词表里
def none_sense_words_Judeg(word, stop_words=None):
    """Return True if *word* is a stop ("nonsense") word.

    Args:
        word: token to test.
        stop_words: optional container to test membership against; when None
            (the default) the module-level ``list_noneSense_words`` is used,
            which keeps the original one-argument call sites working.
    Returns:
        bool: True when *word* is in the stop-word collection.
    """
    if stop_words is None:
        stop_words = list_noneSense_words
    # Direct membership test replaces the original if/else True/False.
    return word in stop_words
#过滤停用词列表
def filter_none_sense_words(list1):
    """Filter a (word, count) pair list against the stop-word list.

    Args:
        list1: iterable of (word, count) pairs.
    Returns:
        list: new list keeping only pairs whose word is not a stop word,
        is not a pure digit string, and is longer than one character.
    """
    return [
        pair for pair in list1
        if not none_sense_words_Judeg(pair[0])
        and not pair[0].isdigit()
        and len(pair[0]) > 1
    ]
#对文件分词
def fenci(filename):
    """Segment a text file with jieba and count word frequencies.

    Args:
        filename: path of the text file to read and segment.
    Returns:
        list[tuple[str, int]]: (word, count) pairs sorted by count, descending.
    """
    # Read the whole corpus as one string. 'with' closes the handle even on
    # error (the original opened with 'r+' and closed manually); plain 'r'
    # is used since the file is never written.
    with open(filename, 'r') as f:
        text = f.read()
    # Precise-mode segmentation.
    seg_list = jieba.cut(text, cut_all=False)
    # Frequency table. Whitespace inside each token is stripped first, so an
    # emptiness check subsumes the original's (paste-mangled) '\n' / '\r\n'
    # comparisons.
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg:
            tf[seg] = tf.get(seg, 0) + 1
    return sorted(tf.items(), key=lambda item: item[1], reverse=True)
#筛选排名大于三的
def more3_filter(list1):
    """Keep only (word, count) pairs whose count is at least 3.

    Args:
        list1: iterable of (word, count) pairs.
    Returns:
        list: new list of the pairs with count > 2.
    """
    return [pair for pair in list1 if pair[1] > 2]
# NOTE(review): dead code kept for reference only (it is commented out via a
# module-level triple-quoted string). As written it would raise IndexError on
# the last iteration (newList[i+1] when i is the final index) and it deletes
# from the list it is indexing into; rework both issues before re-enabling.
'''
# word similarity (merge a word's count into a longer word containing it)
def Similarity(theList):
    newList=copy.deepcopy(theList)
    for i in range(len(newList)):
        if newList[i][0] in newList[i+1][0]:
            name=newList[i][0]
            value=newList[i][1]+newList[i+1][1]
            newItem=(name,value)
            print(newItem)
            del newList[i]
            del newList[i+1]
            newList.append(newItem)
    return newList
'''
# ---- driver script ----------------------------------------------------------
# Build the stop-word list consumed by the filter helpers above; the global
# name must stay as-is because none_sense_words_Judeg reads it.
list_noneSense_words = List_none_sense()
# Segment the corpus into (word, count) pairs sorted by frequency.
list_words = fenci(filename)
# Drop stop words, digit strings and single-character tokens.
list_words2 = filter_none_sense_words(list_words)
# Keep only words occurring at least three times.
list_words3 = more3_filter(list_words2)
# Echo the leading entries to stdout (first 10 pairs).
list_words4 = list_words3[:10]
for pair in list_words4:
    print(pair)
# Persist the filtered counts to Excel via a pandas DataFrame.
df = pandas.DataFrame(list_words2)
df.to_excel("result.xlsx")
# WordCloud's generate_from_frequencies wants a {word: count} mapping,
# so convert the pair list to a dict.
list_words3 = dict(list_words3)
wc = WordCloud(collocations=False, font_path=font, width=1400,
               height=1400, margin=2).generate_from_frequencies(list_words3)
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('show_Chinese.png')  # save the rendered cloud as a PNG