• Python 实现的 K-means 算法(原创)


      1 #! /usr/bin/env python
    2 # -*- coding: utf-8 -*-
    3 import os
    4 import sys
    5 import cmath
    6 import os.path
    7
    8 class KMeans:
    9 '''
    10 @descriptions: K-means Algorithm implementation.
    11 @filename: Filename of input data.
    12 @knums: Clusters number.
    13 '''
    14 def __init__(self, filename, knums):
    15 self._filename = filename;
    16 self._knums = knums
    17 self._dimension = 0
    18 """self._samples := [(seqx, x1, x2, ..., xn),
    19 (seqy, y1, y2, ..., yn),
    20 ...,
    21 (seqz, z1, z2, ..., zn)]"""
    22 self._samples= []
    23 """self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)],
    24 [],
    25 ...,
    26 []]"""
    27 self._clusters = []
    28
    29 self._open(self._filename)
    30 self._normalize()
    31 #print self._samples
    32 self._select(self._knums)
    33
    34
    35 def _normalize(self):
    36 """
    37 @description: Normalize the attributes of input data.
    38 """
    39 new_samples = []
    40 for t in xrange(len(self._samples)):
    41 st = list(self._samples[t])
    42 new_samples.append(st)
    43
    44 for t in xrange(len(self._samples)):
    45 self._samples.pop()
    46
    47 for d in xrange(1, (self._dimension + 1)):
    48 container_att = []
    49 for idx in xrange(len(new_samples)):
    50 att = new_samples[idx][d]
    51 container_att.append(att)
    52
    53 max_att = max(container_att)
    54 min_att = min(container_att)
    55
    56 for idx in xrange(len(new_samples)):
    57 new_att = (new_samples[idx][d] - min_att) / (max_att - min_att)
    58 new_samples[idx][d] = new_att
    59
    60 for t in xrange(len(new_samples)):
    61 st = tuple(new_samples[t])
    62 self._samples.append(st)
    63
    64
    65
    66 def _open(self, filename):
    67 """
    68 @descriptions: Open the data file and fill each item into memory.
    69 @filename : Filename of input data.
    70 """
    71 data_file= open(self._filename, "r")
    72 data_lines= data_file.readlines();
    73 for line in data_lines:
    74 string_samples = line.split("")
    75 integer_samples= []
    76
    77 integer_samples.append(int(string_samples[0]))
    78
    79 for e in string_samples[1:]:
    80 integer_samples.append(float(e))
    81 samples = tuple(integer_samples)
    82 self._samples.append(samples)
    83 #print self._samples
    84 self._dimension = len(self._samples[0]) - 1
    85 #print self._dimension
    86
    87
    88 def _select(self, knums):
    89 """
    90 @descriptions: Choose the first knums cluster center.
    91 @knums : Clusters number.
    92 """
    93 for i in xrange(knums):
    94 selected = self._samples[i]
    95 temp = list(selected)
    96 temp[0] = 0
    97 self._clusters.append([])
    98 self._clusters[i].append(temp)
    99 #print self._clusters
    100
    101
    102 def _distance(self, va, vb):
    103 '''
    104 @description: Return the (distance ** 2) of tuple va and tuple vb.
    105 @va : tuple va (x1, x2, ..., xn)
    106 @vb : tuple vb (y1, y2, ..., yn)
    107 '''
    108 distance = 0
    109 for i in xrange(self._dimension):
    110 distance += (va[i] - vb[i]) * (va[i] - vb[i])
    111 #print distance
    112
    113 return distance
    114
    115
    116 def _means(self, va):
    117 """
    118 @description: Return the means of va.
    119 @va : A tuple of list va, with the form [(flagx, x1, x2, ..., xn),
    120 (flagy, y1, y2, ..., yn),
    121 (flagz, z1, z2, ..., zn), ...]
    122 """
    123 if (len(va) == 0):
    124 return va
    125
    126 means_cluster = []
    127 means_cluster.append(1)#Indicate that the means has changed.
    128
    129 #print va
    130 for d in xrange(self._dimension):
    131 tmp = 0
    132 for i in xrange(len(va)):
    133 tmp += va[i][d+1]
    134 means_cluster.append(tmp/len(va))
    135 means = tuple(means_cluster)
    136
    137 return means
    138
    139 def _equal(self, ta, tb):
    140 """
    141 @description: Check if tuple ta equals to tuple tb.
    142 @ta : Tuple ta.(flagx, x1, x2, ..., xn)
    143 @tb : Tuple tb.(flagy, y1, y1, ..., ym)
    144 """
    145 if (len(ta) != len(tb)):
    146 return False
    147
    148 for i in xrange(1, len(ta)):
    149 if (ta[i] != tb[i]):
    150 return False
    151
    152 return True
    153
    154 def flush(self, filename):
    155 """
    156 @description: Flush data the disk.
    157 @filename : Filename of output data.
    158 """
    159 foutput = open(filename, "w")
    160
    161 for c in xrange(self._knums):
    162 foutput.write("Group %d" % c)
    163 for e in self._clusters[c][1:]:
    164 foutput.write("%s" % repr(e))
    165 foutput.write("\n\n\n")
    166 print("Done.")
    167 foutput.close()
    168
    169 def _reconstruct(self, idx):
    170 """
    171 @description: Reconstruct the cluster points.
    172
    173 @idx : Index of clusters, where clusters has the form as follows:
    174 self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)],
    175 [],
    176 ...,
    177 []]
    178 """
    179 new_cluster = []
    180 new_cluster.append(0)
    181 for old_value in self._clusters[idx][0][1:]:
    182 new_cluster.append(old_value)
    183 for i in xrange(len(self._clusters[idx])):
    184 self._clusters[idx].pop()
    185 self._clusters[idx].insert(0, new_cluster)
    186
    187
    188 def process(self):
    189 """
    190 @description: Process data, calculating k-means and clustering.
    191 """
    192 while True:
    193 K = 0
    194 for e in self._samples:
    195 #print e
    196 shortest = -1
    197 for k in xrange(self._knums):
    198 #for k in _clusters[]
    199 #print e
    200 #print self._clusters[k][0]
    201 distance = self._distance(e[1:], self._clusters[k][0][1:])
    202 #print distance
    203 if (distance < 0.000001):
    204 # add e to the k-th cluster.
    205 self._clusters[k].append(e)
    206 break
    207 else:
    208 if (shortest == -1):
    209 shortest = distance
    210 else:
    211 if (shortest > distance):
    212 shortest = distance
    213 K = k
    214 if (k != self._knums - 1):
    215 continue
    216 else:
    217 # add e to the k-th cluster
    218 self._clusters[K].append(e)
    219 #print self._clusters
    220
    221 for k in xrange(self._knums):
    222 new_ktuple = self._means(self._clusters[k][1:])
    223 if (len(new_ktuple) == 0):
    224 continue
    225 if (self._equal(self._clusters[k][0], new_ktuple) == False):
    226 self._clusters[k].pop(0)
    227 self._clusters[k].insert(0, new_ktuple)
    228
    229 else:
    230 continue
    231
    232 flag = 0
    233 for idx in xrange(self._knums):
    234 if (self._clusters[idx][0][0] == 1):
    235 flag = 1
    236 break
    237 else:
    238 continue
    239
    240 if (flag == 1):
    241 for idx in xrange(self._knums):
    242 self._reconstruct(idx)
    243 else:
    244 break
    245
    246
    247 if __name__ =="__main__":
    248 ikmeans = KMeans("./iris-1.dat", 3)
    249 ikmeans.process()
    250 ikmeans.flush("./k-means-out.dat")

    K-means 算法的 Python 代码,编写和调试花了差不多一天的时间,希望对大家有用。关于 K-means 聚类算法和 ISODATA 算法的解释,见下一篇博文。

  • 相关阅读:
    kafka 流式计算
    解决山地车令人讨厌的中轴异响及其他异响问题
    go语言通道详解
    使用Spring Cloud连接不同服务
    并发之痛 Thread,Goroutine,Actor
    用go语言实现线程池
    golang go语言通道类型的通道示例 通道的通道
    Java 网络IO编程总结(BIO、NIO、AIO均含完整实例代码)
    spring5 reactive
    Go 语言和 Scala 语言对比
  • 原文地址:https://www.cnblogs.com/haippy/p/2183979.html
Copyright © 2020-2023  润新知