import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

class Model():
    def __init__(self, m, c):
        self.mean = m
        self.cov = c

class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        K: Pre-specified number of clusters.
        data: Input data. A list of GMMs.
        self.centroids: A K-length list of cluster centroids.
        '''

        self.data = data
        self.w = 0.8
        self.bias = 500
        rand_points = np.random.choice(len(self.data), K)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

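    # self.w and self.bias set the pseudo-count budget that update_centroid
    # uses when blending a centroid component with an observed component.
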
    def fit(self):
        '''
        Cluster the input observations into K clusters.
        The algorithm stops when no prominent difference is found in the
        average distance between each observation GMM and the cluster it is
        assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is
            assigned to. If a sample is not assigned to any cluster, its
            index will be NaN.
        '''

        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        # delta_stop = 1e-4
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter
        while iter_count > 0:
            # assign each GMM to its closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                self.labels[i] = self.centroids.index(cc)
                self.data[i].label = self.centroids.index(cc)
                # centroid re-estimation
                self.update_centroid(cc, m)
            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()
            # Terminate the loop when 1. the change in the mean distance between
            # each sample and its centroid (overall_distance) falls below the
            # threshold delta_stop; 2. the number of data points in each cluster
            # exceeds the given threshold (minPts).
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
                # if ((diff < delta_stop) and not np.isnan(prev_overall_distance)):
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            # if np.isinf(prev_overall_distance):
            #     prev_overall_distance = 0.0  # Otherwise whenever the next distance value is not nan, condition is met
            iter_count -= 1

        if iter_count == 0:
            print 'Stop loop after max number of iterations reached, distance:', overall_distance

        # Split and merge the formed clusters based on the mean distances between
        # each centroid and its member data points (within_cluster_distances).
        # Also to be handled iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])

        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)

        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Label singular data as noise.\n', labels

        return labels

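    # Note on the stopping rule in fit(): `0 < diff < delta_stop` fires only
    # once the mean distance has decreased, but by less than delta_stop,
    # before the max_iter assignment passes run out.
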
    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model among the k model clusters.'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        # if prn and any([np.isinf(d) for d in dist]):
        #     print "inf encountered in find best match. argmin returns:", np.argmin(dist)
        #     print dist
        if all([np.isinf(d) for d in dist]):
            # print "All distances are inf..."
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update a cluster model given a GMM from the observation sequence.'''
        # print "c pre-update:", c
        C = len(x.components)

        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                # dist_matrix[i, j] = skl_gmm(x.components[i].means, c.components[j].means, x.components[i].covars, c.components[j].covars, 1, 1)
                dist_matrix[j, i] = dist_matrix[i, j]
        # print dist_matrix
        min_idx = np.argmin(dist_matrix, axis=0)
        # print min_idx
        # sum_weights = sum(c.weights)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low. Skipping update for this component."
                continue
            try:
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                print "Warning: sample size estimates too small.. ", n0, n1
                if n0 == 0 or n1 == 0:
                    # a zero count would cause a division by zero in update_component
                    continue
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weights update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        for i in xrange(C):
            c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        # print "c post-update:", c
        return c

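    # Component correspondence in update_centroid is greedy: each centroid
    # component i is paired with the observation component at minimum SKL
    # distance (min_idx[i]), then blended with pseudo-counts n0 and n1, so a
    # centroid takes only a weighted step towards each observation.
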
    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean / covariance.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        mz_est_b = (n1 * m1 + n2 * m2) / nz

        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)

        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T

        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)

        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)

        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz

        return (mz_est_b, cov_z_est)

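    # update_component pools first and second moments: with effective sample
    # sizes n1 and n2 the merged mean is (n1*m1 + n2*m2) / (n1 + n2), and the
    # merged covariance comes from pooling E[x x^T] ~ k*cov + m*m^T of both
    # components, then subtracting the outer product of the pooled mean; the
    # (n-1)/n factors convert between biased and unbiased estimates.
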
    def evaluate(self):
        '''Find stopping criteria using the Bregman loss function.
        The stopping condition is satisfied when the change in the mean
        sample-to-centroid distance falls below a threshold (see fit).'''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            # print 'c-m', d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = distances / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # Init new centroids
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2)]
        new_label_list = [new_label, new_label + 1]

        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign the new label to the original gmm data
                pos = new_centroids.index(cc)
                self.data[old_members[i]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1

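    # Note: split_cluster currently makes a single assignment pass over the
    # affected points (iter_count = 1); as the comment in fit() notes, the
    # splitting is eventually to be handled iteratively.
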
class FeatureObj():
    __slots__ = ['key', 'audio', 'timestamps', 'features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list

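# For example, 100 feature frames with gmmWindow=10 and stepsize=1 give
# int((100 - 10 + 1) / 1) = 91 windowed GMMs.
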
def pairwiseSKL(gmm_list):
    '''Compute pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric=lambda x, y: x.skl_distance_full(y))
    # Cap NaN / inf divergences at a finite value so downstream clustering works
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
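
# Example usage (a sketch): the pipeline assumes GmmDistance objects expose
# skl_distance_full(), a `components` list and an update() method, as the code
# above relies on:
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=5, stepsize=2)
#     skl_matrix = pairwiseSKL(gmm_list)
#     km = Kmeans(gmm_list, K=5)
#     labels = km.fit()  # -1 marks singular (noise) points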
# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col + 1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:\n" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, discard negative features, and ensure
#     # numerical stability by adding a small constant to the denominator
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     # k-means clustering of GMMs
#     KmeansClustering = Kmeans(gmm_list, K=5)
#     labels = KmeansClustering.fit()
#
#     f1 = np.array(zip(timestamps[:len(labels)], labels))
#
#     np.savetxt(join(options.OUTPUT, 'kmeans') + '.csv', f1, delimiter=',')
#
# if __name__ == '__main__':
#     main()