# -*- coding: utf-8 -*-
"""
Created on Thu Jul 27 15:30:27 2017
@author: toby
"""
import copy
import jieba
import pandas
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Path of a Chinese-capable TrueType font; WordCloud needs it to render CJK
# glyphs. NOTE(review): the backslashes were missing from the original path
# (likely stripped in a paste); restored here.
font = r'C:\Windows\Fonts\simfang.ttf'
# Input corpus to segment and analyse.
filename = "top20_USASales.txt"
def List_none_sense():
    """Load the stop-word ("nonsense" word) list from nonesense_words.txt.

    Returns:
        list[str]: every whitespace-separated token in the file, in order.
    """
    # 'with' guarantees the handle is closed -- the original opened the file
    # and never closed it. Platform default encoding is kept to preserve the
    # original behaviour; presumably the list is GBK or UTF-8 -- TODO confirm.
    with open("nonesense_words.txt", 'r') as fh:
        return [word for line in fh for word in line.split()]
#word遍历行,word处理,去除各种标点,怪异符号,最后计算word出现个数
#判断一个单词是否在垃圾词表里
def none_sense_words_Judeg(word, stop_words=None):
    """Return True if *word* is a stop ("nonsense") word.

    Args:
        word: token to test.
        stop_words: optional container to test membership against; when None
            (the default) the module-level ``list_noneSense_words`` is used,
            which keeps the original one-argument call sites working.
    Returns:
        bool: True when *word* is in the stop-word collection.
    """
    if stop_words is None:
        stop_words = list_noneSense_words
    # Direct membership test replaces the original if/else True/False.
    return word in stop_words
#过滤停用词列表
def filter_none_sense_words(list1):
    """Filter a (word, count) pair list against the stop-word list.

    Args:
        list1: iterable of (word, count) pairs.
    Returns:
        list: new list keeping only pairs whose word is not a stop word,
        is not a pure digit string, and is longer than one character.
    """
    return [
        pair for pair in list1
        if not none_sense_words_Judeg(pair[0])
        and not pair[0].isdigit()
        and len(pair[0]) > 1
    ]
#对文件分词
def fenci(filename):
    """Segment a text file with jieba and count word frequencies.

    Args:
        filename: path of the text file to read and segment.
    Returns:
        list[tuple[str, int]]: (word, count) pairs sorted by count, descending.
    """
    # Read the whole corpus as one string. 'with' closes the handle even on
    # error (the original opened with 'r+' and closed manually); plain 'r'
    # is used since the file is never written.
    with open(filename, 'r') as f:
        text = f.read()
    # Precise-mode segmentation.
    seg_list = jieba.cut(text, cut_all=False)
    # Frequency table. Whitespace inside each token is stripped first, so an
    # emptiness check subsumes the original's (paste-mangled) '\n' / '\r\n'
    # comparisons.
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg:
            tf[seg] = tf.get(seg, 0) + 1
    return sorted(tf.items(), key=lambda item: item[1], reverse=True)
#筛选排名大于三的
def more3_filter(list1):
    """Keep only (word, count) pairs whose count is at least 3.

    Args:
        list1: iterable of (word, count) pairs.
    Returns:
        list: new list of the pairs with count > 2.
    """
    return [pair for pair in list1 if pair[1] > 2]
# NOTE(review): dead code kept for reference only (it is commented out via a
# module-level triple-quoted string). As written it would raise IndexError on
# the last iteration (newList[i+1] when i is the final index) and it deletes
# from the list it is indexing into; rework both issues before re-enabling.
'''
# word similarity (merge a word's count into a longer word containing it)
def Similarity(theList):
    newList=copy.deepcopy(theList)
    for i in range(len(newList)):
        if newList[i][0] in newList[i+1][0]:
            name=newList[i][0]
            value=newList[i][1]+newList[i+1][1]
            newItem=(name,value)
            print(newItem)
            del newList[i]
            del newList[i+1]
            newList.append(newItem)
    return newList
'''
# ---- driver script ----------------------------------------------------------
# Build the stop-word list consumed by the filter helpers above; the global
# name must stay as-is because none_sense_words_Judeg reads it.
list_noneSense_words = List_none_sense()
# Segment the corpus into (word, count) pairs sorted by frequency.
list_words = fenci(filename)
# Drop stop words, digit strings and single-character tokens.
list_words2 = filter_none_sense_words(list_words)
# Keep only words occurring at least three times.
list_words3 = more3_filter(list_words2)
# Echo the leading entries to stdout (first 10 pairs).
list_words4 = list_words3[:10]
for pair in list_words4:
    print(pair)
# Persist the filtered counts to Excel via a pandas DataFrame.
df = pandas.DataFrame(list_words2)
df.to_excel("result.xlsx")
# WordCloud's generate_from_frequencies wants a {word: count} mapping,
# so convert the pair list to a dict.
list_words3 = dict(list_words3)
wc = WordCloud(collocations=False, font_path=font, width=1400,
               height=1400, margin=2).generate_from_frequencies(list_words3)
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('show_Chinese.png')  # save the rendered cloud as a PNG