# utils/kmeans.py  (rev 19:890cfe424f4a, mitian, 2015-12-11)
import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

class Model(object):
    def __init__(self, m, c):
        self.mean = m
        self.cov = c

class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        data: Input data. A list of GMMs.
        K: Pre-specified number of clusters.
        self.centroids: A K-length list of cluster centroids.
        '''

        self.data = data
        self.w = 0.8
        self.bias = 500
        # draw K distinct data points as initial centroids
        rand_points = np.random.choice(len(self.data), K, replace=False)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

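    # A construction sketch (index values are illustrative): given a list of
    # GmmDistance objects `gmms`, Kmeans(gmms, K=5) draws five random data
    # points as initial centroids, while
    # Kmeans(gmms, K=5, initial_centroids=[0, 3, 7, 11, 19]) seeds the
    # centroids from the given data indices instead.
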
    def fit(self):
        '''
        Cluster the input observations into K clusters.
        The algorithm stops when no prominent difference is found in the
        average distance between each observation GMM and the cluster it is
        assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is
            assigned to. Singular data points are labelled -1 (noise); a
            sample never assigned to any cluster keeps index NaN.
        '''

        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter
        while iter_count > 0:
            # assign each GMM to its closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                ix = self.centroids.index(cc)
                self.labels[i] = ix
                self.data[i].label = ix
                # centroid re-estimation
                self.update_centroid(cc, m)
            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()
            # Terminate the loop when the change in the mean distance between
            # each sample and its assigned centroid (overall_distance) falls
            # below delta_stop. Cluster sizes (data_count) are tracked against
            # the minPts threshold used in the split stage below.
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            iter_count -= 1

        if iter_count == 0:
            print 'Stopped after the maximum number of iterations was reached, distance:', overall_distance

        # Split the formed clusters based on the spread of distances between
        # each centroid and its member data points (within_cluster_std).
        # Also to be handled iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])

        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)

        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Labelled singular data points as noise.\n', labels

        return labels

    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model among the k cluster models.
        (prn is a debugging flag, currently unused.)'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        if all([np.isinf(d) for d in dist]):
            # all distances are inf: no usable match
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update a cluster model given a GMM from the observation sequence.'''
        C = len(x.components)

        # pairwise component distances between the observed GMM and the centroid
        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                dist_matrix[j, i] = dist_matrix[i, j]
        # for each centroid component, the closest observed component
        min_idx = np.argmin(dist_matrix, axis=0)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low, skipping update for this component."
                continue
            try:
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                # update_component divides by n0 and n1, so skip degenerate sizes
                print "Warning: sample size estimates too small:", n0, n1
                continue
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weight update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        if sum_weights > 0:
            for i in xrange(C):
                c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        return c

    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean / covariance of two pooled Gaussian components.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        mz_est_b = (n1 * m1 + n2 * m2) / nz

        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)

        # 1) second-moment matrices of the two components
        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T

        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)

        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)

        # 3) covariance from the pooled moments
        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz

        return (mz_est_b, cov_z_est)
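
    # In equation form, with nz = n1 + n2 the update above pools the moments
    # of the two components:
    #
    #   pooled mean:           mz = (n1*m1 + n2*m2) / nz
    #   second moments:        Mi = ((ni-1)/ni) * Ci + mi*mi^T,  i in {1, 2}
    #   pooled second moment:  Mz = (n1*M1 + n2*M2) / (nz - 1)
    #   pooled covariance:     Cz = Mz - ((nz-1)/nz) * m'*m'^T
    #
    # where m' = (n1*m1 + n2*m2) / (nz - 1) is the mean estimate used only
    # inside the covariance computation. As a worked check, two 1-d components
    # with m1 = 0, m2 = 2, C1 = C2 = 1 and n1 = n2 = 10 pool to exactly
    # mz = 1 and Cz = 2 (the shared spread plus the between-mean spread).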

    def evaluate(self):
        '''Evaluate the stopping criteria using the Bregman loss function.
        The stopping condition is satisfied when the mean distance between
        each sample and its assigned centroid stops decreasing prominently
        (see fit).'''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = distances / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # initialise the new centroids from two distinct random members
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2, replace=False)]
        new_label_list = [new_label, new_label + 1]

        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign the new label to the original GMM data
                pos = new_centroids.index(cc)
                self.data[old_members[i]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1


class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Fit a GMM to each sliding window over the feature matrix.'''
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list
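
# Worked example of the windowing arithmetic above: a 100-frame feature matrix
# with gmmWindow=10 and stepsize=2 yields int((100 - 10 + 2) / 2) = 46 GMMs,
# each fitted on 10 consecutive frames.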

def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # clamp undefined or unbounded divergences to a large finite value
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
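
# A small usage sketch (shapes illustrative): the returned matrix is symmetric,
# so it can feed distance-based methods directly, e.g.
#
#   gmms = getGMMs(feature_matrix, gmmWindow=10, stepsize=2)
#   D = pairwiseSKL(gmms)   # D.shape == (len(gmms), len(gmms)); D[i, j] == D[j, i]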


# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_results', type="str", help="Writing clustering results to...")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col+1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:\n" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, get rid of negative features, and ensure
#     # numerical stability by adding a small constant
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     # k-means clustering of GMMs
#     KmeansClustering = Kmeans(gmm_list, K=5)
#     labels = KmeansClustering.fit()
#
#     f1 = np.array(zip(timestamps[:len(labels)], labels))
#
#     np.savetxt(join(options.OUTPUT, 'kmeans') + '.csv', f1, delimiter=',')
#
# if __name__ == '__main__':
#     main()
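
# A minimal synthetic smoke test, kept commented out like main() above. Unlike
# main(), it needs no feature CSVs on disk; it assumes only this module's own
# imports (GmmMetrics.GmmDistance fitted on a (frames x dims) array, as in
# getGMMs). All parameter values are illustrative.
#
# if __name__ == '__main__':
#     np.random.seed(0)
#     feature_matrix = np.random.rand(200, 4)  # 200 frames, 4 features
#     gmm_list = getGMMs(feature_matrix, gmmWindow=10, stepsize=2)  # 96 GMMs
#     print 'number of GMMs:', len(gmm_list)
#     km = Kmeans(gmm_list, K=5)
#     labels = km.fit()  # one cluster index per GMM; -1 marks singular points
#     print labels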