• 聚类之 K-Means(附代码)


     

     import os
    import sys as sys
    #reload(sys)
    #sys.setdefaultencoding('utf-8')
    from sklearn.cluster import KMeans
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np

    def tfidf_vector(corpus_path):
        """Build a TF-IDF matrix from a labelled, space-delimited corpus file.

        Every usable line of the corpus has exactly two fields separated by a
        single space: ``<category> <document>``. Lines that do not split into
        exactly two fields are ignored. The category field is parsed only to
        validate the line shape; it is not returned.

        Args:
            corpus_path: Path of the corpus file. It is read as UTF-8.

        Returns:
            A tuple ``(tfidf_train, word_dict)``: the TF-IDF matrix obtained by
            applying ``TfidfTransformer`` to the ``CountVectorizer`` counts, and
            a dict mapping each feature column index to its vocabulary term.
        """
        corpus_train = []
        # Use a context manager so the file handle is always closed, and pin
        # the encoding so decoding does not depend on the platform locale.
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for line in corpus_file:
                fields = line.strip().split(' ')
                if len(fields) == 2:
                    corpus_train.append(fields[1])
        print ("build train-corpus done!!")

        # Drop terms present in more than 40% or fewer than 1% of documents.
        count_v1 = CountVectorizer(max_df=0.4, min_df=0.01)
        counts_train = count_v1.fit_transform(corpus_train)

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back to the old name on older releases.
        get_names = getattr(count_v1, "get_feature_names_out", None)
        if get_names is None:
            get_names = count_v1.get_feature_names
        word_dict = dict(enumerate(get_names()))

        print ("the shape of train is ")
        print (repr(counts_train.shape))
        tfidf_train = TfidfTransformer().fit_transform(counts_train)
        return tfidf_train, word_dict

    def best_kmeans(tfidf_matrix,word_dict):
        """Plot an elbow curve to help choose the number of K-Means clusters.

        For every k in 1..9 a ``KMeans`` model is fitted on ``tfidf_matrix`` and
        the mean Euclidean distance from each sample to its nearest cluster
        centre is recorded. The values are then plotted against k and the
        figure is shown with ``plt.show()``.

        Args:
            tfidf_matrix: Feature matrix exposing ``toarray()`` and ``shape``
                (the sparse TF-IDF matrix produced by ``tfidf_vector``).
            word_dict: Unused; kept so existing callers keep working.

        Returns:
            None. The result is the displayed matplotlib figure.
        """
        K = range(1, 10)
        # The dense copy does not depend on k, so build it once rather than
        # once per iteration.
        dense_matrix = tfidf_matrix.toarray()
        meandistortions = []
        for k in K:
            # The original Python 2 style statement printed only k under
            # Python 3; print k and the separator together.
            print(k, '****' * 5)
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(tfidf_matrix)
            nearest_centre = cdist(dense_matrix, kmeans.cluster_centers_, 'euclidean').min(axis=1)
            meandistortions.append(nearest_centre.mean())
        plt.plot(K, meandistortions, 'bx-')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        # NOTE(review): the plotted value is a mean (unsquared) distance, not a
        # sum of squares; the label text is kept as-is for compatibility.
        plt.ylabel('Average within-cluster sum of squares')
        plt.title('Elbow for Kmeans clustering')
        plt.show()

    # Driver script: vectorise the corpus, show the elbow plot, then cluster.
    # Input corpus file read by tfidf_vector().
    corpus_train = "corpus_train.txt"
    # Presumably output paths for per-document and per-keyword cluster
    # results; they are only passed to cluster_kmeans() -- TODO confirm.
    cluster_docs = "cluster_result_document.txt"
    cluster_keywords = "cluster_result_keyword.txt"
    # Cluster count handed to cluster_kmeans() as its last argument.
    num_clusters = 7
    tfidf_train,word_dict=tfidf_vector(corpus_train)
    # Shows the elbow plot via plt.show() before clustering is run.
    best_kmeans(tfidf_train,word_dict)
    # NOTE(review): cluster_kmeans is not defined in the visible part of this
    # file; it must be defined elsewhere or this call raises NameError.
    cluster_kmeans(tfidf_train,word_dict,cluster_docs,cluster_keywords,num_clusters)

  • 相关阅读:
    44.分治算法练习:  一元三次方程求解
    44.分治算法练习:  一元三次方程求解
    44.分治算法练习:  一元三次方程求解
    MVC-04 视图(1)
    MVC-03 控制器(5)
    MVC-03 控制器(4)
    MVC-03 控制器(3)
    MVC-03 控制器(2)
    MVC-03 控制器(1)
    MVC-02 路由
  • 原文地址:https://www.cnblogs.com/hrnn/p/13406185.html
Copyright © 2020-2023  润新知