• 数据处理之聚类分析


    # -*- coding: utf-8 -*-
    from sklearn.cluster import KMeans
    from sklearn.externals import joblib
    import numpy ,time,pdb
    import matplotlib.pyplot as plt
    from sklearn.cluster import MeanShift, estimate_bandwidth
    import numpy as np

    if __name__ == '__main__':
        # Mean-shift clustering demo: read 2-D points from a comma-separated
        # text file, cluster them with MeanShift, print the per-point labels,
        # the number of "large" clusters and the cluster centers, then plot the
        # points (one marker style per cluster) with the large clusters'
        # centers overlaid.
        DATA_PATH = './MOC_X1000_20170811110600_MHAIL_00.txt'
        # A cluster counts as "large" only when it has more than this many
        # members.
        # NOTE(review): an earlier comment described this cut-off as 3, but the
        # code has always compared against 45 -- confirm the intended value.
        MIN_CLUSTER_SIZE = 45

        # step 1: load data
        print("step 1: load data...")

        dataSet = []
        # The context manager guarantees the file is closed even if a row
        # fails to parse.
        with open(DATA_PATH) as fileIn:
            next(fileIn, None)  # the first line is skipped (treated as a header)
            for line in fileIn:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                lineArr = line.split(',')
                # Only the first two comma-separated fields are used.
                dataSet.append([float(lineArr[0]), float(lineArr[1])])

        numSamples = len(dataSet)
        if numSamples == 0:
            # Fail early with a clear message instead of an obscure error from
            # the bandwidth estimation below.
            raise SystemExit("no data rows found in " + DATA_PATH)

        X = np.array(dataSet)  # convert the list of rows into an array
        # bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
        bandwidth = estimate_bandwidth(X, quantile=0.5, n_samples=numSamples)
        clf = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                        cluster_all=True).fit(X)

        labels = clf.labels_
        # Show the cluster each point was assigned to.  The %-format keeps the
        # output identical under Python 2 and Python 3.
        print("%s %s" % (labels, type(labels)))

        # Count the members of every cluster.  bincount sizes itself from the
        # data, so any number of clusters is handled; labels are non-negative
        # because the model is fitted with cluster_all=True.
        cluster_sizes = np.bincount(labels)
        big_clusters = np.flatnonzero(cluster_sizes > MIN_CLUSTER_SIZE)
        k = len(big_clusters)
        print(k)

        # Draw every sample; points in the same cluster share a marker style.
        # The format strings name a marker and a color but no line style, so
        # plotting a whole cluster in one call draws markers only.  Styles are
        # reused cyclically when there are more clusters than styles.
        point_marks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        for label in np.unique(labels):
            members = X[labels == label]
            plt.plot(members[:, 0], members[:, 1],
                     point_marks[label % len(point_marks)])

        # Draw the centers of the large clusters with bigger, distinct markers.
        # Centers are selected by cluster label so that exactly the clusters
        # that passed the size threshold are shown.
        center_marks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
        centroids = clf.cluster_centers_
        for label in big_clusters:
            plt.plot(centroids[label][0], centroids[label][1],
                     center_marks[label % len(center_marks)], markersize=12)

        print(centroids)  # show the coordinates of the cluster centers
        plt.show()

  • 相关阅读:
    day02_1spring3
    day01_2spring3
    动态代理的介绍
    day04_1hibernate
    day03_2hibernate
    Oracle11gR2安装完成后不手动配置监听的使用方法
    css的样式和选择符的优先权
    调用css时,link和@import url的区别
    jquery 获取和修改img标签的src属性
    正则表达式实现6-10位密码由数字和字母混合组成
  • 原文地址:https://www.cnblogs.com/xiaoxiaoshuaishuai0219/p/8353648.html
Copyright © 2020-2023  润新知