import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy


class Model():
    def __init__(self, m, c):
        self.mean = m
        self.cov = c


class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        K: Preset number of clusters.
        data: Input data. A list of GMMs.
        self.centroids: A K-length list of cluster centroids.
        '''

        self.data = data
        self.w = 0.8
        self.bias = 500
        rand_points = np.random.choice(len(self.data), K)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

    def fit(self):
        '''
        Cluster the input observations into K clusters.
        The algorithm stops when no prominent difference is found in the average distance between each observation GMM
        and the cluster it is assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is assigned to.
            If a sample is not assigned to any cluster, its index will be NaN.
        '''

        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        # delta_stop = 1e-4
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter
        while iter_count > 0:
            # assign each GMM to the closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                self.labels[i] = self.centroids.index(cc)
                self.data[i].label = self.centroids.index(cc)
                # centroid re-estimation
                self.update_centroid(cc, m)
            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()
            # Terminate the loop when: 1. the change in the mean distance of each sample to its assigned centroid (overall_distance)
            # goes below the threshold (delta_stop); 2. the number of data points in each cluster exceeds the given threshold (minPts).
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
            # if ((diff < delta_stop) and not np.isnan(prev_overall_distance)):
            # if diff < delta_stop and :
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            # if np.isinf(prev_overall_distance):
            #     prev_overall_distance = 0.0  # Otherwise whenever the next distance value is not nan, condition is met
            iter_count -= 1

        if iter_count == 0:
            print 'Stop loop after max number of iterations reached, distance:', overall_distance

        # Split and merge the formed clusters based on the distances between each centroid and its member data points (within_cluster_std).
        # Also to be handled iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])

        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)

        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Label singular data as noise.\n', labels

        return labels

    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model among the k cluster centroids.'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        # if prn and any([np.isinf(x) for x in dist]):
        #     print "inf encountered in find best match. argmin returns:", np.argmin(dist)
        #     print dist
        if all([np.isinf(x) for x in dist]):
            # print "All distances are inf..."
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update a cluster centroid model given a GMM from the observation sequence.'''
        # print "c pre-update:", c
        C = len(x.components)

        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                # dist_matrix[i, j] = skl_gmm(x.components[i].means, c.components[j].means, x.components[i].covars, c.components[j].covars, 1, 1)
                dist_matrix[j, i] = dist_matrix[i, j]
        # print dist_matrix
        min_idx = np.argmin(dist_matrix, axis=0)
        # print min_idx
        # sum_weights = sum(c.weights)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low. Skipping update for this component."
                continue
            try:
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                print "Warning: sample size estimates too small:", n0, n1
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weights update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        for i in xrange(C):
            c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        # print "c post-update:", c
        return c

    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean and covariance of the merged component.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        mz_est_b = (n1 * m1 + n2 * m2) / nz

        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)

        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T

        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)

        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)

        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz

        return (mz_est_b, cov_z_est)

    def evaluate(self):
        '''Find the stopping criteria using the Bregman loss function.
        The stopping condition is satisfied when the mean distance between each sample and its assigned centroid no longer decreases noticeably.'''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            # print 'c-m', d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = np.array(distances) / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # Init new centroids
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2)]
        new_label_list = [new_label, new_label + 1]

        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign new label to the original gmm data
                pos = new_centroids.index(cc)
                self.data[old_members[data.index(m)]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1
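
# update_component() above merges a centroid component with its closest observed
# component by pooling first and second moments, weighted by the estimated sample
# sizes n1 and n2. The standalone helper below is a minimal numpy sketch of that
# pooling idea, written for plain (mean, covariance, count) inputs; its name and
# the simplified biased-moment formulas are illustrative assumptions, not the
# module's exact update expressions.
def merge_gaussian_moments_sketch(m1, c1, n1, m2, c2, n2):
    '''Pool two Gaussian estimates obtained from n1 and n2 samples (illustrative sketch).'''
    n = float(n1 + n2)
    # pooled mean: sample-size weighted average of the component means
    mean = (n1 * m1 + n2 * m2) / n
    # second moments E[x x^T] of each component, then their pooled average
    s1 = c1 + np.outer(m1, m1)
    s2 = c2 + np.outer(m2, m2)
    second = (n1 * s1 + n2 * s2) / n
    # pooled covariance recovered from the pooled second moment and pooled mean
    cov = second - np.outer(mean, mean)
    return mean, cov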


class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']


def getGMMs(feature, gmmWindow=10, stepsize=1):
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list


def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric=lambda x, y: x.skl_distance_full(y))
    # replace undefined distances with a large constant
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
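
# The pairwise distances above delegate to GmmDistance.skl_distance_full from
# GmmMetrics, whose implementation is not shown here. For reference, the helper
# below is a minimal numpy sketch of the symmetrised KL divergence between two
# full-covariance Gaussians, the quantity typically computed per component pair.
# Its name and the plain (mean, covariance) array inputs are illustrative
# assumptions, not part of the original module's API.
def skl_gaussians_sketch(m0, c0, m1, c1):
    '''Symmetrised KL divergence between N(m0, c0) and N(m1, c1) (illustrative sketch).'''
    def kl(ma, ca, mb, cb):
        d = ma.shape[0]
        cb_inv = np.linalg.inv(cb)
        diff = mb - ma
        return 0.5 * (np.trace(cb_inv.dot(ca))
                      + diff.dot(cb_inv).dot(diff)
                      - d
                      + np.log(np.linalg.det(cb) / np.linalg.det(ca)))
    return kl(m0, c0, m1, c1) + kl(m1, c1, m0, c0)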

# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col + 1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:\n" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     # k-means clustering of GMMs
#     KmeansClustering = Kmeans(gmm_list, K=5)
#     labels = KmeansClustering.fit()
#
#     f1 = np.array(zip(timestamps[:len(labels)], labels))
#
#     np.savetxt(join(options.OUTPUT, 'kmeans') + '.csv', f1, delimiter=',')
#
# if __name__ == '__main__':
#     main()
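
# End-to-end usage is sketched in the commented-out main() above. The helper
# below condenses that pipeline into a small, self-contained example on
# synthetic data. It is a hedged sketch only (nothing in the module calls it),
# and it assumes, as getGMMs() does, that GmmDistance accepts a 2-D
# (frames x dimensions) feature array.
def example_clustering_run():
    '''Cluster windowed GMMs built from random features (illustrative sketch).'''
    # synthetic non-negative features: 200 frames, 4 feature dimensions
    feature_matrix = np.abs(np.random.randn(200, 4))
    # normalise as in main(): scale each column and guard against division by zero
    feature_matrix = feature_matrix / (feature_matrix.max(0) + 0.0005)
    # fit one GMM per overlapping window of frames
    gmm_list = getGMMs(feature_matrix, gmmWindow=5, stepsize=2)
    print 'number of GMMs:', len(gmm_list)
    # cluster the windowed GMMs with K-means and return one label per window
    clusterer = Kmeans(gmm_list, K=5)
    return clusterer.fit()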