• [聚类算法]常用功能实现


    前言:聚类是非监督学习的主要任务之一,根据原理可分为:基于质心、基于密度、基于连通性、基于概率以及基于神经网络等多种类型。

    本文汇总了常用聚类算法及其评价指标,方便快速查询使用。(本文使用波士顿房价数据集:该数据集原本用于回归,这里把房价 MEDV 划分为 4 个区间,作为评价聚类结果的参考标签)

    以下为实验代码,运行后会打印各聚类算法的评价指标:

     1 from time import time
     2 
     3 import numpy as np
     4 import pandas as pd
     5 import matplotlib as mpl
     6 import matplotlib.pyplot as plt
     7 import sklearn
     8 from sklearn import datasets
     9 
    10 from sklearn.decomposition import PCA
    11 from sklearn.preprocessing import scale
    12 
    13 from sklearn import metrics
    14 from sklearn.cluster import KMeans
    15 from sklearn.cluster import MeanShift
    16 from sklearn.cluster import DBSCAN
    17 from sklearn.cluster import AgglomerativeClustering
    18 
    # 1. Load the data.
    # NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
    # running this script requires an older release or another data source.
    boston = sklearn.datasets.load_boston()
    x, y = boston.data, boston.target
    y = y.reshape(-1, 1)
    # The clustered matrix contains the features plus the target column.
    data = np.hstack([x, y])

    # 2. Standardise the features (and the combined matrix).
    x = scale(x)
    data = scale(data)

    # 3. Inspect the data.
    name_data = boston.feature_names
    # print(name_data)

    df_x = pd.DataFrame(x, columns=name_data)
    # Keep MEDV as float: the bin edges below are fractional, so casting the
    # target to an integer dtype would move samples across bin boundaries.
    df_y = pd.DataFrame(y, columns=['MEDV'])
    df = pd.concat([df_x, df_y], axis=1)

    # Original author's note: 506 rows, no nulls, float64 columns.
    # print(df.head())
    # print(df.info())
    # print(df['MEDV'].describe())

    # 4. Build reference labels by splitting MEDV into 4 classes:
    #    0: <= 17.025, 1: (17.025, 21.2], 2: (21.2, 25], 3: > 25
    medv_bin_edges = [17.025, 21.2, 25]
    n_clusters = len(medv_bin_edges) + 1  # number of clusters (4)

    # Bin the raw target once. right=True makes every upper edge inclusive,
    # matching the intervals above, and avoids overwriting values that later
    # comparisons would still need to read.
    labels = np.digitize(y.ravel(), bins=medv_bin_edges, right=True)
    # df_y ends up holding the class ids, as in the original script.
    df_y['MEDV'] = labels
    50 
    def bench_k_means(estimator, name, data, method, true_labels=None):
        """Fit a clustering estimator and print one row of evaluation metrics.

        The row contains: method, principle, fit time, homogeneity,
        completeness, V-measure, adjusted Rand index, adjusted mutual
        information and silhouette coefficient.

        Args:
            estimator: Unfitted clustering estimator that exposes ``labels_``
                after ``fit``.
            name: Clustering principle shown in the second column.
            data: Sample matrix passed to ``fit`` and to the silhouette score.
            method: Algorithm name shown in the first column.
            true_labels: Reference class labels for the label-based metrics.
                Defaults to the module-level ``labels``.
        """
        if true_labels is None:
            true_labels = labels

        t0 = time()
        estimator.fit(data)
        elapsed = time() - t0  # time only the fit, not the metric computation
        predicted = estimator.labels_

        try:
            # A fixed random_state makes the 300-sample subsample reproducible.
            silhouette = metrics.silhouette_score(data, predicted,
                                                  metric='euclidean',
                                                  sample_size=300,
                                                  random_state=0)
        except ValueError:
            # The silhouette is undefined when the scored labels contain fewer
            # than two clusters; report NaN instead of aborting the whole table.
            silhouette = float('nan')

        print('%-9s\t%-9s\t%.2fs\t\t%.3f\t\t\t%.3f\t\t%.3f\t\t%.3f\t\t\t%.3f\t\t\t%.3f'
              % (method, name, elapsed,
                 metrics.homogeneity_score(true_labels, predicted),
                 metrics.completeness_score(true_labels, predicted),
                 metrics.v_measure_score(true_labels, predicted),
                 metrics.adjusted_rand_score(true_labels, predicted),
                 metrics.adjusted_mutual_info_score(true_labels, predicted,
                                                    average_method='arithmetic'),
                 silhouette))
    66 
    # 5. Benchmark each clustering configuration; every call prints one table row.
    rule = 115 * '_'
    print(rule)
    print('聚类方式		聚类原理		执行时间		同质性得分		完整性评分	v-测量得分	调整后兰德指数	调整的相互信息	轮廓系数')

    # 5.1 / 5.2 KMeans with k-means++ and with random initialisation.
    for init_mode in ('k-means++', 'random'):
        bench_k_means(
            KMeans(init=init_mode, n_clusters=n_clusters, n_init=10),
            name="质心", data=data, method='KMeans')

    # 5.3 KMeans seeded with the PCA components, so a single run is enough.
    pca = PCA(n_components=n_clusters).fit(data)
    pca_seeded_kmeans = KMeans(init=pca.components_, n_clusters=n_clusters,
                               n_init=1)
    bench_k_means(pca_seeded_kmeans, name="质心", data=data, method='KMeans')

    # 5.4 MeanShift, 5.5 DBSCAN, 5.6 hierarchical (agglomerative) clustering.
    remaining_runs = (
        ('MeanShift', "密度", MeanShift()),
        ('DBSCAN', "密度", DBSCAN(eps=3, min_samples=2)),
        ('HCA', "连通性", AgglomerativeClustering(n_clusters=n_clusters)),
    )
    for method_name, principle, clusterer in remaining_runs:
        bench_k_means(clusterer, name=principle, data=data, method=method_name)

    print(rule)
  • 相关阅读:
    kubernetes集群之资源配额(Resource Quotas)
    kubernetes之subpath的使用
    kubernetes之RBAC介绍
    python-日志模块
    pip安装模块提示Command "python setup.py egg_info" failed with error code 1
    TCP/IP协议讲解
    魔镜—58可视化数据智能平台架构与实践
    支付宝开源非侵入式 Android 自动化测试工具 Soloπ
    诗人“九歌”开源
    神奇的Kivy,让Python快速开发移动app
  • 原文地址:https://www.cnblogs.com/asenyang/p/11214725.html
Copyright © 2020-2023  润新知