• Analysis of the Top 200 Best-Selling Drugs in the US



    # -*- coding: utf-8 -*-
    """
    Created on Thu Jul 27 15:30:27 2017
    
    @author: toby
    """
    import copy 
    import jieba
    import pandas
    import jieba.analyse
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    
    font = r'C:\Windows\Fonts\simfang.ttf'  # a Chinese font so the word cloud can render CJK glyphs
    
    filename="top20_USASales.txt"
     
    #load the stop-word ("nonsense word") list: one or more words per line
    def List_none_sense():
        list_noneSense_words=[]
        with open("nonesense_words.txt",'r',encoding='utf-8') as file_noneSense_words:
            for line in file_noneSense_words:
                for word in line.split():
                    list_noneSense_words.append(word)
        return list_noneSense_words
    
    #iterate over the words, strip punctuation and odd symbols, then count occurrences
    #check whether a word is in the stop-word list
    def none_sense_words_Judeg(word):
        return word in list_noneSense_words
    
    #filter stop words out of the (word, count) list
    def filter_none_sense_words(list1):
        list2=[]
        for i in list1:
            #keep a word only if it is not a stop word, not a number, and longer than one character
            if not none_sense_words_Judeg(i[0]) and not i[0].isdigit() and len(i[0])>1:
                list2.append(i)
        return list2
        
        
    #segment the file with jieba and count term frequencies
    def fenci(filename) :
        #read the whole file into one string
        with open(filename,'r',encoding='utf-8') as f:
            file_list = f.read()
        #segment in accurate (non-full) mode
        seg_list = list(jieba.cut(file_list,cut_all=False))
        #build a dict mapping each word to its occurrence count
        tf={}
        for seg in seg_list :
            seg = ''.join(seg.split())
            if seg != '' and seg != "\n" and seg != "\n\n":
                if seg in tf :
                    tf[seg] += 1
                else :
                    tf[seg] = 1
        #sort by count, descending
        return sorted(tf.items(),key=lambda item:item[1],reverse=True)
        '''
        f = open("result.txt","w+")
        for item in tf:
            f.write(item+"  "+str(tf[item])+"\n")
        f.close()
        '''
        
    #keep only words that occur at least three times
    def more3_filter(list1):
        list2=[]
        for i in list1:
            if i[1]>2:
                list2.append(i)
        return list2
        
    '''    
    #word similarity: merge counts when one word is a substring of the next (left disabled)
    def Similarity(theList):
        newList=copy.deepcopy(theList)
        for i in range(len(newList)):
            if newList[i][0] in newList[i+1][0]:
                name=newList[i][0]
                value=newList[i][1]+newList[i+1][1]
                newItem=(name,value)
                print(newItem)
                del newList[i]
                del newList[i+1]
                newList.append(newItem)
        return newList
    '''    
            
        
     
    #load the stop-word list
    list_noneSense_words=List_none_sense()
    #segment the file and count word frequencies
    list_words=fenci(filename)
    #filter out stop words
    list_words2=filter_none_sense_words(list_words)
    #keep words that occur at least three times
    list_words3=more3_filter(list_words2)
    #print the top 10
    list_words4=list_words3[:10]
    for i in list_words4:
        print(i)
        
    
    #write the results to Excel via a pandas DataFrame
    df=pandas.DataFrame(list_words2)
    df.to_excel("result.xlsx")    
        
    #WordCloud's frequencies argument must be a dict, so convert the (word, count) list
    list_words3=dict(list_words3)
    
    wc = WordCloud(collocations=False, font_path=font, width=1400, height=1400, margin=2).generate_from_frequencies(list_words3)
    
    plt.imshow(wc)
    
    plt.axis("off")
    plt.show()
    wc.to_file('show_Chinese.png')  # save the word cloud image
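
    The word-frequency loop in fenci() can also be written with collections.Counter from the standard library. The sketch below is an equivalent alternative, not part of the original script; it assumes the same UTF-8 input file used above.

    # A minimal Counter-based sketch of the counting step in fenci().
    from collections import Counter
    import jieba

    def fenci_counter(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()
        # keep only non-empty segments after stripping whitespace,
        # mirroring the seg != '' / newline checks in fenci()
        words = (w.strip() for w in jieba.cut(text, cut_all=False))
        counts = Counter(w for w in words if w)
        # most_common() returns (word, count) pairs sorted by count,
        # descending -- the same shape fenci() returns
        return counts.most_common()

    Since most_common() yields the same (word, count) tuples as the sorted(...) call, the downstream filters (filter_none_sense_words, more3_filter) would work on its output unchanged.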
    

  • Original post: https://www.cnblogs.com/webRobot/p/7429015.html