1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
- <span style="color: rgb(128, 128, 128); font-style: italic;">
#!/usr/bin/python
- <span style="color: rgb(255, 119, 0); font-weight: bold;">from</span> <span style="color: rgb(220, 20, 60);">__future__</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> with_statement
- <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> <span style="color: rgb(220, 20, 60);">cPickle</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">as</span> <span style="color: rgb(220, 20, 60);">pickle</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">from</span> matplotlib <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> pyplot
- <span style="color: rgb(255, 119, 0); font-weight: bold;">from</span> numpy <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> zeros, <span style="color: rgb(220, 20, 60);">array</span>, tile
- <span style="color: rgb(255, 119, 0); font-weight: bold;">from</span> scipy.<span style="color: black;">linalg</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> norm
- <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> numpy.<span style="color: black;">matlib</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">as</span> ml
- <span style="color: rgb(255, 119, 0); font-weight: bold;">import</span> <span style="color: rgb(220, 20, 60);">random</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">def</span> kmeans<span style="color: black;">(</span>X, k, observer=<span style="color: rgb(8, 0, 0);">None</span>, threshold=1e-15, maxiter=<span style="color: rgb(255, 69, 0);">300</span><span style="color: black;">)</span>:
- N = <span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>X<span style="color: black;">)</span>
- labels = zeros<span style="color: black;">(</span>N, dtype=<span style="color: rgb(8, 0, 0);">int</span><span style="color: black;">)</span>
- centers = <span style="color: rgb(220, 20, 60);">array</span><span style="color: black;">(</span><span style="color: rgb(220, 20, 60);">random</span>.<span style="color: black;">sample</span><span style="color: black;">(</span>X, k<span style="color: black;">)</span><span style="color: black;">)</span>
- <span style="color: rgb(8, 0, 0);">iter</span> = <span style="color: rgb(255, 69, 0);">0</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">def</span> calc_J<span style="color: black;">(</span><span style="color: black;">)</span>:
- <span style="color: rgb(8, 0, 0);">sum</span> = <span style="color: rgb(255, 69, 0);">0</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">for</span> i <span style="color: rgb(255, 119, 0); font-weight: bold;">in</span> <span style="color: rgb(8, 0, 0);">xrange</span><span style="color: black;">(</span>N<span style="color: black;">)</span>:
- <span style="color: rgb(8, 0, 0);">sum</span> += norm<span style="color: black;">(</span>X<span style="color: black;">[</span>i<span style="color: black;">]</span>-centers<span style="color: black;">[</span>labels<span style="color: black;">[</span>i<span style="color: black;">]</span><span style="color: black;">]</span><span style="color: black;">)</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">return</span> <span style="color: rgb(8, 0, 0);">sum</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">def</span> distmat<span style="color: black;">(</span>X, Y<span style="color: black;">)</span>:
- n = <span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>X<span style="color: black;">)</span>
- m = <span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>Y<span style="color: black;">)</span>
- xx = ml.<span style="color: rgb(8, 0, 0);">sum</span><span style="color: black;">(</span>X<span style="color: rgb(102, 204, 102);">*</span>X, axis=<span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">)</span>
- yy = ml.<span style="color: rgb(8, 0, 0);">sum</span><span style="color: black;">(</span>Y<span style="color: rgb(102, 204, 102);">*</span>Y, axis=<span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">)</span>
- xy = ml.<span style="color: black;">dot</span><span style="color: black;">(</span>X, Y.<span style="color: black;">T</span><span style="color: black;">)</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">return</span> tile<span style="color: black;">(</span>xx, <span style="color: black;">(</span>m, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">)</span><span style="color: black;">)</span>.<span style="color: black;">T</span>+tile<span style="color: black;">(</span>yy, <span style="color: black;">(</span>n, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">)</span><span style="color: black;">)</span> - <span style="color: rgb(255, 69, 0);">2</span><span style="color: rgb(102, 204, 102);">*</span>xy
-
- Jprev = calc_J<span style="color: black;">(</span><span style="color: black;">)</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">while</span> <span style="color: rgb(8, 0, 0);">True</span>:
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(255, 119, 0); font-weight: bold;">if</span> observer <span style="color: rgb(255, 119, 0); font-weight: bold;">is</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">not</span> <span style="color: rgb(8, 0, 0);">None</span>:
- observer<span style="color: black;">(</span><span style="color: rgb(8, 0, 0);">iter</span>, labels, centers<span style="color: black;">)</span>
-
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- dist = distmat<span style="color: black;">(</span>X, centers<span style="color: black;">)</span>
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- labels = dist.<span style="color: black;">argmin</span><span style="color: black;">(</span>axis=<span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">)</span>
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(255, 119, 0); font-weight: bold;">for</span> j <span style="color: rgb(255, 119, 0); font-weight: bold;">in</span> <span style="color: rgb(8, 0, 0);">range</span><span style="color: black;">(</span>k<span style="color: black;">)</span>:
- idx_j = <span style="color: black;">(</span>labels == j<span style="color: black;">)</span>.<span style="color: black;">nonzero</span><span style="color: black;">(</span><span style="color: black;">)</span>
- centers<span style="color: black;">[</span>j<span style="color: black;">]</span> = X<span style="color: black;">[</span>idx_j<span style="color: black;">]</span>.<span style="color: black;">mean</span><span style="color: black;">(</span>axis=<span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">)</span>
-
- J = calc_J<span style="color: black;">(</span><span style="color: black;">)</span>
- <span style="color: rgb(8, 0, 0);">iter</span> += <span style="color: rgb(255, 69, 0);">1</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">if</span> Jprev-J <span style="color: rgb(102, 204, 102);"><</span> threshold:
- <span style="color: rgb(255, 119, 0); font-weight: bold;">break</span>
- Jprev = J
- <span style="color: rgb(255, 119, 0); font-weight: bold;">if</span> <span style="color: rgb(8, 0, 0);">iter</span> <span style="color: rgb(102, 204, 102);">></span>= maxiter:
- <span style="color: rgb(255, 119, 0); font-weight: bold;">break</span>
-
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(255, 119, 0); font-weight: bold;">if</span> observer <span style="color: rgb(255, 119, 0); font-weight: bold;">is</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">not</span> <span style="color: rgb(8, 0, 0);">None</span>:
- observer<span style="color: black;">(</span><span style="color: rgb(8, 0, 0);">iter</span>, labels, centers<span style="color: black;">)</span>
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">if</span> __name__ == <span style="color: rgb(72, 61, 139);">'__main__'</span>:
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- <span style="color: rgb(255, 119, 0); font-weight: bold;">with</span> <span style="color: rgb(8, 0, 0);">open</span><span style="color: black;">(</span><span style="color: rgb(72, 61, 139);">'cluster.pkl'</span><span style="color: black;">)</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">as</span> inf:
- samples = <span style="color: rgb(220, 20, 60);">pickle</span>.<span style="color: black;">load</span><span style="color: black;">(</span>inf<span style="color: black;">)</span>
- N = <span style="color: rgb(255, 69, 0);">0</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">for</span> smp <span style="color: rgb(255, 119, 0); font-weight: bold;">in</span> samples:
- N += <span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>smp<span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span><span style="color: black;">)</span>
- X = zeros<span style="color: black;">(</span><span style="color: black;">(</span>N, <span style="color: rgb(255, 69, 0);">2</span><span style="color: black;">)</span><span style="color: black;">)</span>
- idxfrm = <span style="color: rgb(255, 69, 0);">0</span>
- <span style="color: rgb(255, 119, 0); font-weight: bold;">for</span> i <span style="color: rgb(255, 119, 0); font-weight: bold;">in</span> <span style="color: rgb(8, 0, 0);">range</span><span style="color: black;">(</span><span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>samples<span style="color: black;">)</span><span style="color: black;">)</span>:
- idxto = idxfrm + <span style="color: rgb(8, 0, 0);">len</span><span style="color: black;">(</span>samples<span style="color: black;">[</span>i<span style="color: black;">]</span><span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span><span style="color: black;">)</span>
- X<span style="color: black;">[</span>idxfrm:idxto, <span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span> = samples<span style="color: black;">[</span>i<span style="color: black;">]</span><span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span>
- X<span style="color: black;">[</span>idxfrm:idxto, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">]</span> = samples<span style="color: black;">[</span>i<span style="color: black;">]</span><span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">]</span>
- idxfrm = idxto
-
- <span style="color: rgb(255, 119, 0); font-weight: bold;">def</span> observer<span style="color: black;">(</span><span style="color: rgb(8, 0, 0);">iter</span>, labels, centers<span style="color: black;">)</span>:
- <span style="color: rgb(255, 119, 0); font-weight: bold;">print</span> <span style="color: rgb(72, 61, 139);">"iter %d."</span> <span style="color: rgb(102, 204, 102);">%</span> <span style="color: rgb(8, 0, 0);">iter</span>
- colors = <span style="color: rgb(220, 20, 60);">array</span><span style="color: black;">(</span><span style="color: black;">[</span><span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">1</span>, <span style="color: rgb(255, 69, 0);">0</span>, <span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span>, <span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">0</span>, <span style="color: rgb(255, 69, 0);">1</span>, <span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span>, <span style="color: black;">[</span><span style="color: rgb(255, 69, 0);">0</span>, <span style="color: rgb(255, 69, 0);">0</span>, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">]</span><span style="color: black;">]</span><span style="color: black;">)</span>
- pyplot.<span style="color: black;">plot</span><span style="color: black;">(</span>hold=<span style="color: rgb(8, 0, 0);">False</span><span style="color: black;">)</span> <span style="color: rgb(128, 128, 128); font-style: italic;">
- pyplot.<span style="color: black;">hold</span><span style="color: black;">(</span><span style="color: rgb(8, 0, 0);">True</span><span style="color: black;">)</span>
-
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- data_colors=<span style="color: black;">[</span>colors<span style="color: black;">[</span>lbl<span style="color: black;">]</span> <span style="color: rgb(255, 119, 0); font-weight: bold;">for</span> lbl <span style="color: rgb(255, 119, 0); font-weight: bold;">in</span> labels<span style="color: black;">]</span>
- pyplot.<span style="color: black;">scatter</span><span style="color: black;">(</span>X<span style="color: black;">[</span>:, <span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span>, X<span style="color: black;">[</span>:, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">]</span>, c=data_colors, alpha=<span style="color: rgb(255, 69, 0);">0.5</span><span style="color: black;">)</span>
- <span style="color: rgb(128, 128, 128); font-style: italic;">
- pyplot.<span style="color: black;">scatter</span><span style="color: black;">(</span>centers<span style="color: black;">[</span>:, <span style="color: rgb(255, 69, 0);">0</span><span style="color: black;">]</span>, centers<span style="color: black;">[</span>:, <span style="color: rgb(255, 69, 0);">1</span><span style="color: black;">]</span>, s=<span style="color: rgb(255, 69, 0);">200</span>, c=colors<span style="color: black;">)</span>
-
- pyplot.<span style="color: black;">savefig</span><span style="color: black;">(</span><span style="color: rgb(72, 61, 139);">'kmeans/iter_%02d.png'</span> <span style="color: rgb(102, 204, 102);">%</span> <span style="color: rgb(8, 0, 0);">iter</span>, format=<span style="color: rgb(72, 61, 139);">'png'</span><span style="color: black;">)</span>
-
- kmeans<span style="color: black;">(</span>X, <span style="color: rgb(255, 69, 0);">3</span>, observer=observer<span style="color: black;">)</span>
from __future__ import with_statement
import cPickle as pickle
from matplotlib import pyplot
from numpy import zeros, array, tile
from scipy.linalg import norm
import numpy.matlib as ml
import random
def kmeans(X, k, observer=None, threshold=1e-15, maxiter=300):
    """Cluster the rows of X into k groups with Lloyd's k-means iteration.

    Parameters
    ----------
    X : array-like, shape (N, d)
        One sample per row. A float copy is made; the caller's data is
        never modified.
    k : int
        Number of clusters; must satisfy 1 <= k <= N.
    observer : callable or None
        If given, it is called as ``observer(iteration, labels, centers)``
        at the start of every iteration and once more after the loop ends.
        ``centers`` is the live array that later iterations update in place.
    threshold : float
        Iteration stops once the objective decreases by less than this
        amount between two consecutive iterations.
    maxiter : int
        Upper bound on the number of iterations.

    Returns
    -------
    (labels, centers)
        ``labels`` is an integer array of length N holding the index of the
        center assigned to each sample; ``centers`` is a (k, d) float array.

    Raises
    ------
    ValueError
        If k is not between 1 and the number of samples.
    """
    # Work on a float ndarray so that means are not truncated for integer
    # input and so that elementwise arithmetic below behaves uniformly.
    X = array(X, dtype=float)
    N = len(X)
    if not 1 <= k <= N:
        raise ValueError("k must be between 1 and the number of samples")

    labels = zeros(N, dtype=int)
    # Draw k distinct row indices; sampling the ndarray itself is rejected by
    # random.sample on recent Python versions because it is not a Sequence.
    centers = X[random.sample(range(N), k)]
    iteration = 0

    def calc_J():
        # Objective: total squared distance of every sample to its center.
        # The squared form is the quantity that each assignment/update step
        # cannot increase, which the stopping test below relies on.
        diff = X - centers[labels]
        return (diff * diff).sum()

    def distmat(A, B):
        # Pairwise squared distances via |a - b|^2 = |a|^2 + |b|^2 - 2 a.b
        aa = (A * A).sum(axis=1)
        bb = (B * B).sum(axis=1)
        return aa[:, None] + bb[None, :] - 2 * A.dot(B.T)

    Jprev = calc_J()
    while True:
        # notify the observer
        if observer is not None:
            observer(iteration, labels, centers)

        # assign every sample to its nearest center
        labels = distmat(X, centers).argmin(axis=1)

        # re-calculate each center as the mean of its members
        for j in range(k):
            members = X[labels == j]
            # An empty cluster has no mean; keep its previous center rather
            # than overwriting it with NaN.
            if len(members) > 0:
                centers[j] = members.mean(axis=0)

        J = calc_J()
        iteration += 1

        if Jprev - J < threshold:
            break
        Jprev = J
        if iteration >= maxiter:
            break

    # final notification
    if observer is not None:
        observer(iteration, labels, centers)

    return labels, centers
if __name__ == '__main__':
    import os

    # Load previously generated points.
    # NOTE: unpickling can execute arbitrary code; only load a cluster.pkl
    # that comes from a trusted source.
    # Pickle data is binary, so the file is opened in 'rb' mode.
    with open('cluster.pkl', 'rb') as inf:
        samples = pickle.load(inf)

    # Flatten the groups into one (N, 2) array: column 0 is filled from
    # smp[0] and column 1 from smp[1] of every group, in order.
    N = sum(len(smp[0]) for smp in samples)
    X = zeros((N, 2))
    start = 0
    for smp in samples:
        stop = start + len(smp[0])
        X[start:stop, 0] = smp[0]
        X[start:stop, 1] = smp[1]
        start = stop

    # savefig does not create missing directories, so make sure the output
    # directory exists before the first frame is written.
    if not os.path.isdir('kmeans'):
        os.makedirs('kmeans')

    def observer(iteration, labels, centers):
        """Print progress and save a scatter plot of the current clustering."""
        print("iter %d." % iteration)
        # One RGB color per cluster; this demo runs with exactly 3 clusters.
        colors = array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        # Clear the previous frame. pyplot.hold() and the hold= keyword were
        # removed from Matplotlib, and drawing onto a cleared figure
        # accumulates artists by default.
        pyplot.clf()
        # draw points, colored by their assigned cluster
        data_colors = [colors[lbl] for lbl in labels]
        pyplot.scatter(X[:, 0], X[:, 1], c=data_colors, alpha=0.5)
        # draw centers
        pyplot.scatter(centers[:, 0], centers[:, 1], s=200, c=colors)

        pyplot.savefig('kmeans/iter_%02d.png' % iteration, format='png')

    kmeans(X, 3, observer=observer)
|