• 层级聚类(Hierarchical Clustering)


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    from numpy import *
    
    """
    Code for hierarchical clustering, modified from 
    Programming Collective Intelligence by Toby Segaran 
    (O'Reilly Media 2007, page 33). 
    """
    
    
    class cluster_node:
        """One node of the binary tree produced by hierarchical clustering.

        The constructor stores each argument unchanged on the instance:

        * ``vec``      -- feature vector represented by this node
        * ``left``     -- left child node, or None
        * ``right``    -- right child node, or None
        * ``distance`` -- distance value recorded for this node (default 0.0)
        * ``id``       -- identifier chosen by the caller (default None)
        * ``count``    -- stored as given (default 1)
        """

        def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
            # Object-oriented constructor: plain attribute assignment, no validation.
            self.vec = vec
            self.id = id
            self.left, self.right = left, right
            self.distance = distance
            # Per the original note, count is meant only for weighted averaging.
            self.count = count
    
    
    def L2dist(v1, v2):
        """Return the Euclidean (L2) distance between ``v1`` and ``v2``.

        The arguments must support elementwise subtraction and ``** 2``,
        e.g. NumPy arrays of the same shape.
        """
        squared_diff = (v1 - v2) ** 2
        total = sum(squared_diff)
        return sqrt(total)
    
    
    def L1dist(v1, v2):
        """Return the Manhattan (L1) distance between ``v1`` and ``v2``.

        The arguments must support elementwise subtraction, e.g. NumPy
        arrays of the same shape.
        """
        absolute_diff = abs(v1 - v2)
        return sum(absolute_diff)
    
    
    # def Chi2dist(v1,v2):
    #     return sqrt(sum((v1-v2)**2))
    
    def hcluster(features, distance=L2dist):
        """Agglomeratively cluster the rows of ``features`` into a binary tree.

        Every row starts as its own leaf ``cluster_node`` whose id is the row
        index.  The two nodes whose vectors are closest under ``distance`` are
        repeatedly replaced by a parent node whose vector is the midpoint of the
        two, until a single root is left.

        Args:
            features: indexable collection of rows; ``features[i]`` is passed to
                ``array`` for ``i`` in ``range(len(features))``.
            distance: callable taking two vectors and returning a value that can
                be compared with ``<``.  Defaults to ``L2dist``.

        Returns:
            The root ``cluster_node``.  Merged nodes receive ids -1, -2, ... in
            creation order and record the distance between their two children.

        Raises:
            IndexError: if ``features`` is empty (there is no root to return).
        """
        # Cache of pairwise distances keyed by (id, id).  Node ids are never
        # reused, so a cached value stays valid while both nodes exist.
        distances = {}
        currentclustid = -1

        # clusters are initially just the individual rows (id == row index)
        clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]

        while len(clust) > 1:
            lowestpair = (0, 1)
            # None marks "nothing examined yet": the first pair, (0, 1), always
            # seeds the search through the cache instead of being recomputed on
            # every pass.  Only a strictly smaller distance replaces the current
            # best, so ties keep the earliest pair.
            closest = None

            # loop through every pair looking for the smallest distance
            for i in range(len(clust)):
                node_i = clust[i]
                for j in range(i + 1, len(clust)):
                    node_j = clust[j]
                    key = (node_i.id, node_j.id)
                    if key not in distances:
                        distances[key] = distance(node_i.vec, node_j.vec)
                    d = distances[key]

                    if closest is None or d < closest:
                        closest = d
                        lowestpair = (i, j)

            left_node = clust[lowestpair[0]]
            right_node = clust[lowestpair[1]]

            # the merged cluster is represented by the unweighted midpoint of
            # its two children, computed with array arithmetic
            mergevec = (left_node.vec + right_node.vec) / 2.0

            # create the new cluster: children, their distance, and a fresh id
            newcluster = cluster_node(array(mergevec), left=left_node,
                                      right=right_node,
                                      distance=closest, id=currentclustid)

            # cluster ids that weren't in the original set are negative
            currentclustid -= 1
            # remove the higher index first so the lower index is still valid
            del clust[lowestpair[1]]
            del clust[lowestpair[0]]
            clust.append(newcluster)

        # the single remaining node is the root of the whole tree
        return clust[0]
    
    
    def extract_clusters(clust, dist):
        """Return the top-most sub-trees of ``clust`` whose distance is below ``dist``.

        The tree is walked from the root.  A node whose ``distance`` attribute is
        strictly less than ``dist`` is reported and not descended into; any
        other node is replaced by the results from its non-None children.

        Args:
            clust: root node; nodes expose ``distance``, ``left`` and ``right``.
            dist: distance threshold (exclusive).

        Returns:
            List of nodes in left-to-right order.  A childless node whose
            distance is not below ``dist`` contributes nothing.
        """
        found = []
        # Explicit stack instead of recursion so that very deep (chain-shaped)
        # trees do not hit the interpreter's recursion limit.
        pending = [clust]
        while pending:
            node = pending.pop()
            if node.distance < dist:
                # we have found a cluster subtree
                found.append(node)
                continue
            # Push right before left so the left branch is processed first.
            if node.right is not None:
                pending.append(node.right)
            if node.left is not None:
                pending.append(node.left)
        return found
    
    
    def get_cluster_elements(clust):
        """Return the ids of the leaf elements contained in a cluster sub-tree.

        A node with a non-negative ``id`` is treated as a leaf and contributes
        that id without its children being inspected; a node with a negative id
        contributes the ids found under its non-None children.

        Args:
            clust: root node; nodes expose ``id``, ``left`` and ``right``.

        Returns:
            List of leaf ids in left-to-right order.
        """
        ids = []
        # Explicit stack: avoids the recursion limit on deep trees and the
        # repeated list concatenation of the recursive formulation.
        pending = [clust]
        while pending:
            node = pending.pop()
            if node.id >= 0:
                # non-negative id means that this is a leaf
                ids.append(node.id)
                continue
            # Push right before left so the left branch is processed first.
            if node.right is not None:
                pending.append(node.right)
            if node.left is not None:
                pending.append(node.left)
        return ids
    
    
    def printclust(clust, labels=None, n=0):
        """Print the cluster tree as an indented hierarchy, one node per line.

        Each line is indented by two spaces per level.  A node with a negative
        id is printed as ``-``; any other node is printed as its id, or as
        ``labels[id]`` when ``labels`` is given.  A node is printed before its
        left sub-tree, which is printed before its right sub-tree.

        Args:
            clust: root node; nodes expose ``id``, ``left`` and ``right``.
            labels: optional sequence indexed by leaf id.
            n: indentation level of the root.
        """
        # Explicit stack of (node, level) pairs instead of recursion.
        pending = [(clust, n)]
        while pending:
            node, level = pending.pop()

            if node.id < 0:
                # negative id means that this is branch
                text = '-'
            elif labels is None:
                # `is None` rather than `== None`: the latter is an elementwise
                # comparison when labels is a NumPy array.
                text = node.id
            else:
                text = labels[node.id]

            # A single parenthesised argument prints the same text whether
            # `print` is the Python 2 statement or the Python 3 function.
            print('%s%s' % ('  ' * level, text))

            # Push right before left so the left branch is printed first.
            if node.right is not None:
                pending.append((node.right, level + 1))
            if node.left is not None:
                pending.append((node.left, level + 1))
    
    
    def getheight(clust):
        """Return the drawing height of a cluster tree, i.e. its number of endpoints.

        An endpoint (a node with neither child) has height 1; the height of a
        branch is the sum of the heights of its two children.

        Args:
            clust: root node; nodes expose ``left`` and ``right``.  Every
                non-endpoint node is expected to have both children.

        Returns:
            The number of endpoint nodes under ``clust`` (1 for an endpoint).
        """
        endpoints = 0
        # Explicit stack instead of recursion so deep (chain-shaped) trees do
        # not exceed the interpreter's recursion limit.
        pending = [clust]
        while pending:
            node = pending.pop()
            if node.left is None and node.right is None:
                # Is this an endpoint? Then it contributes a height of 1
                endpoints += 1
            else:
                # Otherwise the height is the sum of the heights of each branch
                pending.append(node.left)
                pending.append(node.right)
        return endpoints
    
    
    def getdepth(clust):
        """Return the total distance depth of a cluster tree.

        An endpoint (a node with neither child) has depth 0.  The depth of a
        branch is the greater of its two children's depths plus the branch's
        own ``distance``.

        Args:
            clust: root node; nodes expose ``left``, ``right`` and ``distance``.
                Every non-endpoint node is expected to have both children.

        Returns:
            0 for an endpoint, otherwise the accumulated distance along the
            deepest path.
        """
        # Iterative post-order walk (no recursion limit on deep trees).
        # `depths` holds the finished child results; `pending` holds
        # (node, children_done) pairs.
        depths = []
        pending = [(clust, False)]
        while pending:
            node, children_done = pending.pop()
            if node.left is None and node.right is None:
                # The distance of an endpoint is 0
                depths.append(0)
            elif children_done:
                # Children were pushed left-then-right, so pop right first.
                right_depth = depths.pop()
                left_depth = depths.pop()
                # Explicit comparison instead of `max`: this module star-imports
                # NumPy, and depending on the NumPy version that can rebind
                # `max` to numpy.max, whose second argument is an axis.
                deeper = right_depth if right_depth > left_depth else left_depth
                # The distance of a branch is the greater of its two sides
                # plus its own distance
                depths.append(deeper + node.distance)
            else:
                # Revisit this node once both children have been evaluated.
                pending.append((node, True))
                pending.append((node.right, False))
                pending.append((node.left, False))
        return depths[0]
    

      

  • 相关阅读:
    Business Objects 基础
    常用的bw基础知识
    SAP BW传输请求操作步骤
    FI/CO 财务基础知识
    SAP财务常用数据源概览
    HANA 和 SAP NetWeaver BW
    Request.QueryString中文乱码
    完全备份类型
    SQL Server备份属于I/O密集型操作
    SQL Server 通过发布订阅 实现数据库同步
  • 原文地址:https://www.cnblogs.com/wlc297984368/p/7471138.html
Copyright © 2020-2023  润新知