# utils/kmeans.py  (rev 19:890cfe424f4a, mitian, 2015-12-11)
import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

class Model(object):
    def __init__(self, m, c):
        self.mean = m
        self.cov = c

class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        data: Input data. A list of GMMs.
        K: Pre-specified number of clusters.
        self.centroids: A K-length list of cluster centroids.
        '''

        self.data = data
        self.w = 0.8
        self.bias = 500
        # draw K distinct data points as initial centroids
        rand_points = np.random.choice(len(self.data), K, replace=False)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

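    # A construction sketch (index values are illustrative): given a list of
    # GmmDistance objects `gmms`, Kmeans(gmms, K=5) draws five random data
    # points as initial centroids, while
    # Kmeans(gmms, K=5, initial_centroids=[0, 3, 7, 11, 19]) seeds the
    # centroids from the given data indices instead.
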
    def fit(self):
        '''
        Cluster the input observations into K clusters.
        The algorithm stops when no prominent difference is found in the
        average distance between each observation GMM and the cluster it is
        assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is
            assigned to. Singular data points are labelled -1 (noise); a
            sample never assigned to any cluster keeps index NaN.
        '''

        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter
        while iter_count > 0:
            # assign each GMM to its closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                ix = self.centroids.index(cc)
                self.labels[i] = ix
                self.data[i].label = ix
                # centroid re-estimation
                self.update_centroid(cc, m)
            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()
            # Terminate the loop when the change in the mean distance between
            # each sample and its assigned centroid (overall_distance) falls
            # below delta_stop. Cluster sizes (data_count) are tracked against
            # the minPts threshold used in the split stage below.
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            iter_count -= 1

        if iter_count == 0:
            print 'Stopped after the maximum number of iterations was reached, distance:', overall_distance

        # Split the formed clusters based on the spread of distances between
        # each centroid and its member data points (within_cluster_std).
        # Also to be handled iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])

        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)

        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Labelled singular data points as noise.\n', labels

        return labels

    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model among the k cluster models.
        (prn is a debugging flag, currently unused.)'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        if all([np.isinf(d) for d in dist]):
            # all distances are inf: no usable match
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update a cluster model given a GMM from the observation sequence.'''
        C = len(x.components)

        # pairwise component distances between the observed GMM and the centroid
        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                dist_matrix[j, i] = dist_matrix[i, j]
        # for each centroid component, the closest observed component
        min_idx = np.argmin(dist_matrix, axis=0)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low, skipping update for this component."
                continue
            try:
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                # update_component divides by n0 and n1, so skip degenerate sizes
                print "Warning: sample size estimates too small:", n0, n1
                continue
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weight update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        if sum_weights > 0:
            for i in xrange(C):
                c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        return c

    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean / covariance of two pooled Gaussian components.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        mz_est_b = (n1 * m1 + n2 * m2) / nz

        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)

        # 1) second-moment matrices of the two components
        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T

        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)

        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)

        # 3) covariance from the pooled moments
        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz

        return (mz_est_b, cov_z_est)
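
    # In equation form, with nz = n1 + n2 the update above pools the moments
    # of the two components:
    #
    #   pooled mean:           mz = (n1*m1 + n2*m2) / nz
    #   second moments:        Mi = ((ni-1)/ni) * Ci + mi*mi^T,  i in {1, 2}
    #   pooled second moment:  Mz = (n1*M1 + n2*M2) / (nz - 1)
    #   pooled covariance:     Cz = Mz - ((nz-1)/nz) * m'*m'^T
    #
    # where m' = (n1*m1 + n2*m2) / (nz - 1) is the mean estimate used only
    # inside the covariance computation. As a worked check, two 1-d components
    # with m1 = 0, m2 = 2, C1 = C2 = 1 and n1 = n2 = 10 pool to exactly
    # mz = 1 and Cz = 2 (the shared spread plus the between-mean spread).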

    def evaluate(self):
        '''Evaluate the stopping criteria using the Bregman loss function.
        The stopping condition is satisfied when the mean distance between
        each sample and its assigned centroid stops decreasing prominently
        (see fit).'''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = distances / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # initialise the new centroids from two distinct random members
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2, replace=False)]
        new_label_list = [new_label, new_label + 1]

        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign the new label to the original GMM data
                pos = new_centroids.index(cc)
                self.data[old_members[i]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1


class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Fit a GMM to each sliding window over the feature matrix.'''
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list
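
# Worked example of the windowing arithmetic above: a 100-frame feature matrix
# with gmmWindow=10 and stepsize=2 yields int((100 - 10 + 2) / 2) = 46 GMMs,
# each fitted on 10 consecutive frames.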

def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # clamp undefined or unbounded divergences to a large finite value
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
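
# A small usage sketch (shapes illustrative): the returned matrix is symmetric,
# so it can feed distance-based methods directly, e.g.
#
#   gmms = getGMMs(feature_matrix, gmmWindow=10, stepsize=2)
#   D = pairwiseSKL(gmms)   # D.shape == (len(gmms), len(gmms)); D[i, j] == D[j, i]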


# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_results', type="str", help="Writing clustering results to...")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col+1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:\n" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, get rid of negative features, and ensure
#     # numerical stability by adding a small constant
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     # k-means clustering of GMMs
#     KmeansClustering = Kmeans(gmm_list, K=5)
#     labels = KmeansClustering.fit()
#
#     f1 = np.array(zip(timestamps[:len(labels)], labels))
#
#     np.savetxt(join(options.OUTPUT, 'kmeans') + '.csv', f1, delimiter=',')
#
# if __name__ == '__main__':
#     main()
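
# A minimal synthetic smoke test, kept commented out like main() above. Unlike
# main(), it needs no feature CSVs on disk; it assumes only this module's own
# imports (GmmMetrics.GmmDistance fitted on a (frames x dims) array, as in
# getGMMs). All parameter values are illustrative.
#
# if __name__ == '__main__':
#     np.random.seed(0)
#     feature_matrix = np.random.rand(200, 4)  # 200 frames, 4 features
#     gmm_list = getGMMs(feature_matrix, gmmWindow=10, stepsize=2)  # 96 GMMs
#     print 'number of GMMs:', len(gmm_list)
#     km = Kmeans(gmm_list, K=5)
#     labels = km.fit()  # one cluster index per GMM; -1 marks singular points
#     print labels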