• 对需要聚类的数据使用canopy做初步的计算


    进行K-means(K均值)聚类的时候,需要自己指定cluster的数目(即K值)。

    这个cluster数目一般是通过canopy算法进行预处理来确定的。

    canopy具体描述可以参考这里

    下面是 golang语言的一个实现(对经纬度距离计算进行cluster)。

    package main
    
    import (
        "fmt"
        "math"
    )
    
    const (
        // EARTH_RADIUS is the Earth radius in kilometers; GreatCircleDistance
        // multiplies the central angle (in radians) by it to get a distance.
        EARTH_RADIUS = 6371
    )
    
    // Point is a geographic coordinate. Both fields are in degrees;
    // GreatCircleDistance converts them to radians before use.
    type Point struct {
        lat float64 // latitude in degrees
        lng float64 // longitude in degrees
    }
    
    // Pop splits points into its first element and the remaining elements.
    // For an empty (or nil) slice it returns the zero Point and a nil slice.
    // The returned slice shares its backing array with points.
    func Pop(points []Point) (p Point, newPoints []Point) {
        if len(points) == 0 {
            return Point{}, nil
        }
        return points[0], points[1:]
    }
    
    // Push appends p to the end of points and returns the resulting slice.
    func Push(p Point, points []Point) []Point {
        return append(points, p)
    }
    
    // GreatCircleDistance returns the haversine (great-circle) distance between
    // p1 and p2 in kilometers. Point coordinates are given in degrees.
    //
    // Formula adapted from http://www.movable-type.co.uk/scripts/latlong.html
    func GreatCircleDistance(p1, p2 Point) float64 {
        const degToRad = math.Pi / 180.0

        dLat := (p2.lat - p1.lat) * degToRad
        dLon := (p2.lng - p1.lng) * degToRad

        lat1 := p1.lat * degToRad
        lat2 := p2.lat * degToRad

        sinLat := math.Sin(dLat / 2)
        sinLon := math.Sin(dLon / 2)

        a := sinLat*sinLat + sinLon*sinLon*math.Cos(lat1)*math.Cos(lat2)
        // Floating-point rounding can push a marginally outside [0, 1] for
        // (nearly) antipodal points, which would make Sqrt(1-a) NaN. Clamp so
        // the result is always a finite distance for finite inputs.
        a = math.Max(0, math.Min(1, a))

        c := 2 * math.Atan2(math.Sqrt(a), math.Sqrt(1-a))
        return EARTH_RADIUS * c
    }
    
    /*
    while(还存在没有强标记的数据点){
        选择一个没有强标记的数据点p
        把p看作一个新Canopy c的中心
        离p距离<x1的所有点都认为在c中,给这些点做上弱标记  //纳入canopy,有可能会纳入其它canopy
        离p距离<x2的所有点都认为在c中,给这些点做上强标记  //不会再纳入其它canopy
    }
    */

    //目前只实现了经纬度以及经纬度的距离计算,这里可以是一个向量 func CanopyCluster(points []Point, x1, x2 float64) { var tmp []Point var cluster [][]Point for len(points) > 0 { var center Point center, points = Pop(points) index := len(cluster) var cpList []Point cpList = append(cpList, center) cluster = append(cluster, cpList) var cur Point for len(points) > 0 { cur, points = Pop(points) distance := GreatCircleDistance(center, cur) if distance <= x1 { cluster[index] = append(cluster[index], cur) if distance > x2 { tmp = Push(cur, tmp) } } else { tmp = Push(cur, tmp) } } fmt.Printf("current number of items in this canopy %d ", center) var t []Point points = tmp tmp = t } for k, c := range cluster { fmt.Println("canopy", k, "has", len(c), "items:") for _, v := range c { fmt.Println(" ", v.lat, v.lng) } } } func main() { pointsList := []Point{ {34.28637, -110.12059}, {34.28638, -110.1206}, {34.29077, -110.12078}, {34.29111, -110.11941}, {34.29113, -110.11938}, {34.29116, -110.1194}, {34.29145, -110.12043}, {34.29146, -110.12063}, {34.29154, -110.11873}, {34.3141, -110.11556}, {34.31411, -110.11557}, {34.31411, -110.11556}, {34.31412, -110.11556}, {34.31412, -110.11557}, {34.31415, -110.11552}, {34.31415, -110.11556}, } CanopyCluster(pointsList, 1.0, 0.8) }
  • 相关阅读:
    ATM+购物车
    subprocess,re,logging模块
    json,pickle,collections,openpyxl模块
    time,datatime,random,os,sys,hashlib模块
    1.内置函数剩余部分 map reduce filter 2.函数递归 3.模块
    生成器,面向过程编程,三元表达式,列表生成式,生成器表达式,匿名函数,内置函数
    Ajax数据对接出问题了?ThingJS解决方法在这里
    测试
    简单低成本的物联网开发平台-ThingJS
    用ThingJS之CityBuilder快搭3D场景,可视化开发必备
  • 原文地址:https://www.cnblogs.com/zhangqingping/p/5120531.html
Copyright © 2020-2023  润新知