• <第一周> city中国城市聚类 testdata学生上网聚类 例子


    中国城市聚类###

    # -*- coding: utf-8 -*-
    kmeans算法
    """
    Created on Thu May 18 22:55:45 2017
    
    @author: sfzyk
    """
    import numpy as np
    #import sklearn as skl
    from sklearn.cluster import KMeans
    import os 
    # Switch the process working directory so that the relative data-file name
    # used further down resolves inside the data folder.
    # NOTE(review): the path separators appear to be missing from this raw
    # string (presumably it was D:\mechine_learning\mooc_data) -- confirm the
    # intended directory before running.
    os.chdir(r"D:mechine_learningmooc_data")
    def loaddata(file, encoding=None):
        """Read a comma-separated table of named numeric rows.

        Every non-blank line is split on ``,``: the first field is kept as the
        row name and each remaining field is converted with ``float``.

        Args:
            file: Path of the text file to read.
            encoding: Text encoding forwarded to ``open``. ``None`` (the
                default) keeps the platform default encoding.

        Returns:
            A ``(city_name, city_data)`` tuple. ``city_name`` is a list of the
            first fields; ``city_data`` is a list of lists of floats, in the
            same row order.

        Raises:
            ValueError: If a value field cannot be converted to ``float``.
        """
        city_name = []
        city_data = []
        # The context manager closes the file even when a row fails to parse.
        with open(file, encoding=encoding) as fr:
            for line in fr:
                # Blank lines (e.g. a trailing newline at end of file) would
                # otherwise yield a bogus name with an empty data row.
                if not line.strip():
                    continue
                # Drop only the line terminator ("\n" or "\r\n") so the name
                # field keeps any other characters exactly as written.
                fields = line.rstrip("\r\n").split(",")
                city_name.append(fields[0])
                city_data.append([float(value) for value in fields[1:]])
        return city_name, city_data
    # Load the table: one name plus a row of float values per input line.
    city_name, city_data = loaddata("31省市居民家庭消费水平-city.txt")

    # Fit K-Means with ten clusters; label[k] is the cluster index of row k.
    km = KMeans(n_clusters=10)
    label = km.fit_predict(city_data)

    # One number per cluster: the sum of that cluster centre's coordinates.
    expenses = np.sum(km.cluster_centers_, axis=1)

    # Each entry starts with the cluster's summed-centre value, followed by
    # the names of the rows assigned to that cluster.
    city_cluster = [[expenses[idx]] for idx in range(km.n_clusters)]
    for cluster_idx, name in zip(label, city_name):
        city_cluster[cluster_idx].append(name)

    # Order the clusters by their summed-centre value, smallest first.
    city_cluster.sort(key=lambda members: members[0])

    for members in city_cluster:
        print(members)
    
    

    学生上网数据聚类###

    DBSCAN算法

    # -*- coding: utf-8 -*-
    """
    Created on Mon May 22 16:24:53 2017
    
    @author: sfzyk
    """
    import numpy as np
    import sklearn as skl
    from sklearn import metrics
    import matplotlib.pyplot as plt
    
     
    # Imported explicitly: ``import sklearn as skl`` alone does not guarantee
    # that the ``sklearn.cluster`` submodule has been loaded.
    from sklearn.cluster import DBSCAN

    # mac2id maps the value in column 2 of each record to the index of that
    # key's entry in onlinetimes.
    mac2id=dict()
    onlinetimes=[]
    # The "# -*- coding -*-" line at the top of a script only declares the
    # encoding of the source file itself; the encoding of a data file still
    # has to be passed to open() explicitly.
    with open("学生月上网时间分布-TestData.txt",encoding='utf-8') as f:
        for line in f:
            # Skip blank lines, which would otherwise raise IndexError below.
            if not line.strip():
                continue
            fields=line.split(',')
            mac=fields[2]
            onlinetime=int(fields[6])
            # Column 4 is split on a space and then on ':'; the leading part
            # of the second token is kept (presumably the hour of a
            # "date HH:MM:SS" timestamp -- confirm against the data file).
            starttime=int(fields[4].split(' ')[1].split(':')[0])
            if mac not in mac2id:
                mac2id[mac]=len(onlinetimes)
                onlinetimes.append((starttime,onlinetime))
            else:
                # A repeated key overwrites its earlier record. Store a plain
                # tuple, exactly like the append above, so every entry has the
                # same shape when converted to an array.
                onlinetimes[mac2id[mac]]=(starttime,onlinetime)
    real_X=np.array(onlinetimes).reshape((-1,2))

    # Cluster on the first column only (the start value), kept 2-D as (n, 1).
    X=real_X[:,0:1]
    dbscan=DBSCAN(eps=0.03,min_samples=20).fit(X)
    labels=dbscan.labels_

    # DBSCAN labels noise samples as -1.
    ratio=len(labels[labels==-1])/len(labels)
    print("noise ratio %f"%ratio)

    # The noise label is not a cluster, so it is excluded from the count.
    n_clusters_ = len(set(labels))-(1 if -1 in labels else 0)

    print("Estimated number of clusters:%d "%n_clusters_)

    print("Silhouette coefficient:%0.3f"%metrics.silhouette_score(X,labels))

    for i in range(n_clusters_):
        print("Clusters ",i,":")
        # flatten() turns the (n, 1) column into a 1-D sequence for printing.
        print(list(X[labels==i].flatten()))
    # Histogram of the clustered column using 24 bins.
    plt.hist(X,24)
        
    

    这里的 Silhouette coefficient 即轮廓系数,用于评价聚类效果:取值范围为 [-1, 1],越接近 1 表示聚类效果越好。

  • 相关阅读:
    HCS803对接鼎甲整机备份(LANBASE方式)
    面向对象语言中的设计模式——策略模式
    面向对象语言中的设计模式——工厂模式
    消息队列
    springboot使用feign
    平衡二叉树(AVL)Java
    PHP可变长度参数列表的实用技巧
    mysql表分区简述
    关于mac13寸电脑pyautogui定位不到坐标的问题
    vue 监听图片加载完成事件
  • 原文地址:https://www.cnblogs.com/sfzyk/p/6890389.html
Copyright © 2020-2023  润新知