view utils/kmeans.py @ 0:26838b1f560f

initial commit of a segmenter project

author      mi tian
date        Thu, 02 Apr 2015 18:09:27 +0100
parents     (none)
children    (none)

import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext
from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy


class Model():
    def __init__(self, m, c):
        self.mean = m
        self.cov = c


class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        K: Pre-regulated number of clusters.
        data: Input data. A list of GMMs.
        self.centroids: A K-length list of cluster centroids.
        '''
        self.data = data
        self.w = 0.8
        self.bias = 500
        rand_points = np.random.choice(len(self.data), K)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

    def fit(self):
        '''Cluster the input observations into K clusters.

        The algorithm stops when no prominent difference is found in the average
        distance between each observation GMM and the cluster it is assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is assigned to.
            If a sample is not assigned to any cluster, its index will be NaN.
        '''
        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        # delta_stop = 1e-4
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter

        while iter_count > 0:
            # assign each GMM to the closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                self.labels[i] = self.centroids.index(cc)
                self.data[i].label = self.centroids.index(cc)
                # centroid re-estimation
                self.update_centroid(cc, m)

            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()

            # Terminate the loop when 1. the change in the mean distance between each
            # sample and its assigned centroid (overall_distance) falls below the
            # threshold (delta_stop); 2. the number of data points in each cluster
            # exceeds the given threshold (minPts).
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
            # if ((diff < delta_stop) and not np.isnan(prev_overall_distance)):
            # if diff < delta_stop and :
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            # if np.isinf(prev_overall_distance):
            #     prev_overall_distance = 0.0  # Otherwise whenever the next distance value is not nan, condition is met
            iter_count -= 1
            if iter_count == 0:
                print 'Stop loop after max number of iterations reached, distance:', overall_distance

        # Split and merge the formed clusters based on the mean distances between each
        # centroid and its member data points (within_cluster_std). Also to be handled
        # iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)
        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Label singular data as noise.\n', labels
        return labels

    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model in the k model clusters.'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        # if prn and any([np.isinf(x) for x in dist]):
        #     print "inf encountered in find best match. argmin returns:", np.argmin(dist)
        #     print dist
        if all([np.isinf(x) for x in dist]):
            # print "All distances are inf..."
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update cluster models given a GMM from the observation sequence.'''
        # print "c pre-update:", c
        C = len(x.components)
        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                # dist_matrix[i, j] = skl_gmm(x.components[i].means, c.components[j].means, x.components[i].covars, c.components[j].covars, 1, 1)
                dist_matrix[j, i] = dist_matrix[i, j]
        # print dist_matrix
        min_idx = np.argmin(dist_matrix, axis=0)
        # print min_idx
        # sum_weights = sum(c.weights)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low. Skipping update for this component."
                continue
            try:
                # effective sample sizes for the existing and the observed component
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                print "Warning: sample size estimates too small..", n0, n1
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weights update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        for i in xrange(C):
            c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        # print "c post-update:", c
        return c

    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean / covariance of two pooled Gaussian components.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        # 1) Calculate the pooled mean estimate:
        mz_est_b = (n1 * m1 + n2 * m2) / nz
        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)
        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T
        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)
        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)
        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz
        return (mz_est_b, cov_z_est)

    def evaluate(self):
        '''Find stopping criteria using the Bregman loss function.
        Stopping condition is satisfied when the distance between each sample and its
        assigned centroid no longer decreases notably.
        '''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            # print 'c-m', d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = np.array(distances) / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # Init new centroids
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2)]
        new_label_list = [new_label, new_label + 1]
        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign the new label to the original gmm data
                pos = new_centroids.index(cc)
                self.data[old_members[data.index(m)]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1


class FeatureObj():
    __slots__ = ['key', 'audio', 'timestamps', 'features']


def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Fit a GMM to each sliding window of the feature matrix.'''
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list


def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric=lambda x, y: x.skl_distance_full(y))
    # replace undefined distances so downstream clustering does not break
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix


# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
") # # return op.parse_args() # # options, args = parse_args() # # def main(): # # feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')] # feature_list.sort() # fobj_list = [] # # for feature in feature_list: # data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0) # dim = data.shape[1] - 1 # if dim == 1 : # fo = FeatureObj() # fo.audio = feature[:feature.find('_vamp')] # fo.key = splitext(feature.strip(fo.audio + '_'))[0] # fo.timestamps = data[:, 0] # the first column is the timestamps # fo.features = data[:, 1] # fobj_list.append(fo) # # else : # for col in xrange(dim): # fo = FeatureObj() # fo.audio = feature[:feature.find('_vamp')] # fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' %col # fo.timestamps = data[:, 0] # the first column records the timestamps # fo.features = data[:, col+1][:,np.newaxis] # fobj_list.append(fo) # # timestamps = fobj_list[0].timestamps # features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list)) # print "Loading %d features:\n", len(fobj_list) # # # find the feature with the fewer number of frames # n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min() # n_features = len(fobj_list) # # feature_matrix = np.zeros((n_frames, n_features)) # print 'feature_matrix', feature_matrix.shape # # # take the arrays from the feature objects and add them to a matrix # for i,f in enumerate(fobj_list) : # feature_matrix[:,i] = f.features[:n_frames,0] # # # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant # feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005) # feature_matrix[np.isnan(feature_matrix)] = 0.0 # # winlength = 5 # stepsize = 2 # # gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize) # print 'number of GMMs:', len(gmm_list) # skl_matrix = pairwiseSKL(gmm_list) # # # k-means clustering of GMMs # KmeansClustering = Kmeans(gmm_list, K=5) # labels = KmeansClustering.fit() # # f1 = np.array(zip(timestamps[:len(labels)], labels)) # # np.savetxt(join(options.OUTPUT, 'kmeans')+'.csv', f1, delimiter=',') # # if __name__ == '__main__': # main()