mi@0
|
1 #!/usr/bin/env python
|
mi@0
|
2 """Class that implements X-means."""
|
mi@0
|
3
|
mitian@18
|
4 import sys, os
|
mi@0
|
5 import argparse
|
mi@0
|
6 import numpy as np
|
mi@0
|
7 import logging
|
mi@0
|
8 import time
|
mi@0
|
9 import pylab as plt
|
mi@0
|
10 import scipy.cluster.vq as vq
|
mi@0
|
11 from scipy.spatial import distance
|
mi@0
|
12
|
mi@0
|
13
|
mitian@18
|
14 class XMeans(object):
|
mitian@18
|
15 def __init__(self, X, init_K=2, plot=False):
|
mitian@18
|
16 self.X = X
|
mitian@18
|
17 self.init_K = init_K
|
mitian@18
|
18 self.plot = plot
|
mi@0
|
19
|
mitian@18
|
20 def estimate_K_xmeans(self, th=0.2, maxK = 10):
|
mitian@18
|
21 """Estimates K running X-means algorithm (Pelleg & Moore, 2000)."""
|
mi@0
|
22
|
mitian@18
|
23 # Run initial K-means
|
mitian@18
|
24 means, labels = self.run_kmeans(self.X, self.init_K)
|
mi@0
|
25
|
mitian@18
|
26 # Run X-means algorithm
|
mitian@18
|
27 stop = False
|
mitian@18
|
28 curr_K = self.init_K
|
mitian@18
|
29 while not stop:
|
mitian@18
|
30 stop = True
|
mitian@18
|
31 final_means = []
|
mitian@18
|
32 for k in xrange(curr_K):
|
mitian@18
|
33 # Find the data that corresponds to the k-th cluster
|
mitian@18
|
34 D = self.get_clustered_data(self.X, labels, k)
|
mitian@18
|
35 if len(D) == 0 or D.shape[0] == 1:
|
mitian@18
|
36 continue
|
mi@0
|
37
|
mitian@18
|
38 # Whiten and find whitened mean
|
mitian@18
|
39 stdD = np.std(D, axis=0)
|
mitian@18
|
40 #D = vq.whiten(D)
|
mitian@18
|
41 D /= stdD # Same as line above
|
mitian@18
|
42 mean = D.mean(axis=0)
|
mi@0
|
43
|
mitian@18
|
44 # Cluster this subspace by half (K=2)
|
mitian@18
|
45 half_means, half_labels = self.run_kmeans(D, K=2)
|
mi@0
|
46
|
mitian@18
|
47 # Compute BICs
|
mitian@18
|
48 bic1 = self.compute_bic(D, [mean], K=1,
|
mitian@18
|
49 labels=np.zeros(D.shape[0]),
|
mitian@18
|
50 R=D.shape[0])
|
mitian@18
|
51 bic2 = self.compute_bic(D, half_means, K=2,
|
mitian@18
|
52 labels=half_labels, R=D.shape[0])
|
mi@0
|
53
|
mitian@18
|
54 # Split or not
|
mitian@18
|
55 max_bic = np.max([np.abs(bic1), np.abs(bic2)])
|
mitian@18
|
56 norm_bic1 = bic1 / max_bic
|
mitian@18
|
57 norm_bic2 = bic2 / max_bic
|
mitian@18
|
58 diff_bic = np.abs(norm_bic1 - norm_bic2)
|
mi@0
|
59
|
mitian@18
|
60 # Split!
|
mitian@18
|
61 print "diff_bic", diff_bic
|
mitian@18
|
62 if diff_bic > th:
|
mitian@18
|
63 final_means.append(half_means[0] * stdD)
|
mitian@18
|
64 final_means.append(half_means[1] * stdD)
|
mitian@18
|
65 curr_K += 1
|
mitian@18
|
66 stop = False
|
mitian@18
|
67 # Don't split
|
mitian@18
|
68 else:
|
mitian@18
|
69 final_means.append(mean * stdD)
|
mi@0
|
70
|
mitian@18
|
71 final_means = np.asarray(final_means)
|
mi@0
|
72
|
mitian@18
|
73 print "Estimated K: ", curr_K
|
mitian@18
|
74 if self.plot:
|
mitian@18
|
75 plt.scatter(self.X[:, 0], self.X[:, 1])
|
mitian@18
|
76 plt.scatter(final_means[:, 0], final_means[:, 1], color="y")
|
mitian@18
|
77 plt.show()
|
mi@0
|
78
|
mitian@18
|
79 if curr_K >= maxK or self.X.shape[-1] != final_means.shape[-1]:
|
mitian@18
|
80 stop = True
|
mitian@18
|
81 else:
|
mitian@18
|
82 labels, dist = vq.vq(self.X, final_means)
|
mi@0
|
83
|
mitian@18
|
84 return curr_K
|
mi@0
|
85
|
mitian@18
|
86 def estimate_K_knee(self, th=.015, maxK=12):
|
mitian@18
|
87 """Estimates the K using K-means and BIC, by sweeping various K and
|
mitian@18
|
88 choosing the optimal BIC."""
|
mitian@18
|
89 # Sweep K-means
|
mitian@18
|
90 if self.X.shape[0] < maxK:
|
mitian@18
|
91 maxK = self.X.shape[0]
|
mitian@18
|
92 if maxK < 2:
|
mitian@18
|
93 maxK = 2
|
mitian@18
|
94 K = np.arange(1, maxK)
|
mitian@18
|
95 bics = []
|
mitian@18
|
96 for k in K:
|
mitian@18
|
97 means, labels = self.run_kmeans(self.X, k)
|
mitian@18
|
98 print 'means, labels', means.shape, len(labels)
|
mitian@18
|
99 bic = self.compute_bic(self.X, means, labels, K=k,
|
mitian@18
|
100 R=self.X.shape[0])
|
mitian@18
|
101 bics.append(bic)
|
mitian@18
|
102 diff_bics = np.diff(bics)
|
mitian@18
|
103 finalK = K[-1]
|
mi@0
|
104
|
mitian@18
|
105 if len(bics) == 1:
|
mitian@18
|
106 finalK = 2
|
mitian@18
|
107 else:
|
mitian@18
|
108 # Normalize
|
mitian@18
|
109 bics = np.asarray(bics)
|
mitian@18
|
110 bics -= bics.min()
|
mitian@18
|
111 #bics /= bics.max()
|
mitian@18
|
112 diff_bics -= diff_bics.min()
|
mitian@18
|
113 #diff_bics /= diff_bics.max()
|
mi@0
|
114
|
mitian@18
|
115 #print bics, diff_bics
|
mi@0
|
116
|
mitian@18
|
117 # Find optimum K
|
mitian@18
|
118 for i in xrange(len(K[:-1])):
|
mitian@18
|
119 #if bics[i] > diff_bics[i]:
|
mitian@18
|
120 if diff_bics[i] < th and K[i] != 1:
|
mitian@18
|
121 finalK = K[i]
|
mitian@18
|
122 break
|
mi@0
|
123
|
mitian@18
|
124 logging.info("Estimated Unique Number of Segments: %d" % finalK)
|
mitian@18
|
125 if self.plot:
|
mitian@18
|
126 plt.subplot(2, 1, 1)
|
mitian@18
|
127 plt.plot(K, bics, label="BIC")
|
mitian@18
|
128 plt.plot(K[:-1], diff_bics, label="BIC diff")
|
mitian@18
|
129 plt.legend(loc=2)
|
mitian@18
|
130 plt.subplot(2, 1, 2)
|
mitian@18
|
131 plt.scatter(self.X[:, 0], self.X[:, 1])
|
mitian@18
|
132 plt.show()
|
mi@0
|
133
|
mitian@18
|
134 return finalK
|
mi@0
|
135
|
mitian@18
|
136 def get_clustered_data(self, X, labels, label_index):
|
mitian@18
|
137 """Returns the data with a specific label_index, using the previously
|
mitian@18
|
138 learned labels."""
|
mitian@18
|
139 D = X[np.argwhere(labels == label_index)]
|
mitian@18
|
140 return D.reshape((D.shape[0], D.shape[-1]))
|
mi@0
|
141
|
mitian@18
|
142 def run_kmeans(self, X, K):
|
mitian@18
|
143 """Runs k-means and returns the labels assigned to the data."""
|
mitian@18
|
144 wX = vq.whiten(X)
|
mitian@18
|
145 means, dist = vq.kmeans(wX, K, iter=100)
|
mitian@18
|
146 labels, dist = vq.vq(wX, means)
|
mitian@18
|
147 return means, labels
|
mi@0
|
148
|
mitian@18
|
149 def compute_bic(self, D, means, labels, K, R):
|
mitian@18
|
150 """Computes the Bayesian Information Criterion."""
|
mitian@18
|
151 D = vq.whiten(D)
|
mitian@18
|
152 Rn = D.shape[0]
|
mitian@18
|
153 M = D.shape[1]
|
mi@0
|
154
|
mitian@18
|
155 if R == K:
|
mitian@18
|
156 return 1
|
mi@0
|
157
|
mitian@18
|
158 # Maximum likelihood estimate (MLE)
|
mitian@18
|
159 mle_var = 0
|
mitian@18
|
160 for k in xrange(len(means)):
|
mitian@18
|
161 X = D[np.argwhere(labels == k)]
|
mitian@18
|
162 X = X.reshape((X.shape[0], X.shape[-1]))
|
mitian@18
|
163 for x in X:
|
mitian@18
|
164 mle_var += distance.euclidean(x, means[k])
|
mitian@18
|
165 #print x, means[k], mle_var
|
mitian@18
|
166 mle_var /= (float(R - K))
|
mi@0
|
167
|
mitian@18
|
168 # Log-likelihood of the data
|
mitian@18
|
169 l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
|
mitian@18
|
170 (Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)
|
mi@0
|
171
|
mitian@18
|
172 # Params of BIC
|
mitian@18
|
173 p = (K-1) + M * K + mle_var
|
mi@0
|
174
|
mitian@18
|
175 #print "BIC:", l_D, p, R, K
|
mi@0
|
176
|
mitian@18
|
177 # Return the bic
|
mitian@18
|
178 return l_D - p/2. * np.log(R)
|
mi@0
|
179
|
mitian@18
|
180 @classmethod
|
mitian@18
|
181 def generate_2d_data(self, N=100, K=5):
|
mitian@18
|
182 """Generates N*K 2D data points with K means and N data points
|
mitian@18
|
183 for each mean."""
|
mitian@18
|
184 # Seed the random
|
mitian@18
|
185 np.random.seed(seed=int(time.time()))
|
mi@0
|
186
|
mitian@18
|
187 # Amount of spread of the centroids
|
mitian@18
|
188 spread = 30
|
mi@0
|
189
|
mitian@18
|
190 # Generate random data
|
mitian@18
|
191 X = np.empty((0, 2))
|
mitian@18
|
192 for i in xrange(K):
|
mitian@18
|
193 mean = np.array([np.random.random()*spread,
|
mitian@18
|
194 np.random.random()*spread])
|
mitian@18
|
195 x = np.random.normal(0.0, scale=1.0, size=(N, 2)) + mean
|
mitian@18
|
196 X = np.append(X, x, axis=0)
|
mi@0
|
197
|
mitian@18
|
198 return X
|
mi@0
|
199
|
mi@0
|
200
|
mi@0
|
def test_kmeans(K=5):
    """Sanity-check plain K-means on synthetic 2-D data and plot it.

    Generates four Gaussian blobs, whitens them, fits a K-cluster
    codebook, and scatters data and centroids together.
    """
    data = XMeans.generate_2d_data(K=4)
    whitened = vq.whiten(data)
    codebook, _ = vq.kmeans(whitened, K, iter=100)

    plt.scatter(whitened[:, 0], whitened[:, 1])
    plt.scatter(codebook[:, 0], codebook[:, 1], color="m")
    plt.show()
|
mi@0
|
210
|
mi@0
|
211
|
mi@0
|
def main(args):
    """Generate synthetic 2-D data with ``args.k`` clusters and report the
    K estimated by both the X-means and the knee-point methods.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``k`` (int, true number of clusters) and ``plot``
        (bool, whether to plot results).
    """
    X = XMeans.generate_2d_data(K=args.k)
    xmeans = XMeans(X, init_K=2, plot=args.plot)
    est_K = xmeans.estimate_K_xmeans()
    est_K_knee = xmeans.estimate_K_knee()
    # Python 2 print statements converted to the print() function so the
    # script runs under Python 3 (same output text).
    print("Estimated x-means K:", est_K)
    print("Estimated Knee Point Detection K:", est_K_knee)
|
mi@0
|
220
|
mi@0
|
if __name__ == '__main__':
    # Command-line entry point: parse arguments and hand off to main().
    parser = argparse.ArgumentParser(description="Runs x-means")
    parser.add_argument("k", metavar="k", type=int,
                        help="Number of clusters to estimate.")
    parser.add_argument("-p", dest="plot", action="store_true",
                        default=False, help="Plot the results")
    main(parser.parse_args())
|