import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy


class Model():
    def __init__(self, m, c):
        self.mean = m
        self.cov = c


class Kmeans(object):

    def __init__(self, data, K=10, initial_centroids=None):
        '''
        K: Preset number of clusters.
        data: Input data. A list of GMMs.
        self.centroids: A K-length list of cluster centroids.
        '''

        self.data = data
        self.w = 0.8
        self.bias = 500
        rand_points = np.random.choice(len(self.data), K)
        self.centroids = [self.data[i] for i in rand_points]
        # use precomputed centroids if initialised externally
        if initial_centroids:
            self.centroids = [self.data[i] for i in initial_centroids]
        self.labels = np.empty(len(self.data))
        self.labels.fill(np.nan)
        self.n_clusters = K
        self.minPts = 10
        self.maxStd = 1

    def fit(self):
        '''
        Cluster the input observations into K clusters.
        The algorithm stops when no prominent difference is found in the average distance between each observation GMM
        and the cluster it is assigned to.

        Returns
        -------
        labels : 1d np array
            Index of the cluster each sample in the input observations is assigned to.
            If a sample is not assigned to any cluster, its index will be NaN.
        '''

        prev_overall_distance = 0.0
        prev_diff = np.inf
        stop_condition = False
        # delta_stop = 1e-4
        delta_stop = 1
        max_iter = 10
        iter_count = max_iter
        while iter_count > 0:
            # assign each GMM to the closest centroid
            for i, m in enumerate(self.data):
                cc = self.find_closest_centroid(self.centroids, m)
                if cc is None:
                    print "Warning (fit): No matching cluster can be found for data point %i" % i
                    continue
                self.labels[i] = self.centroids.index(cc)
                self.data[i].label = self.centroids.index(cc)
                # centroid re-estimation
                self.update_centroid(cc, m)
            distance_array, within_cluster_std = self.evaluate()
            overall_distance = distance_array.mean()
            min_distance = distance_array.min()
            # Terminate the loop when: 1. the change in the mean distance of each sample to its assigned centroid (overall_distance)
            # goes below the threshold (delta_stop); 2. the number of data points in each cluster exceeds the given threshold (minPts).
            diff = prev_overall_distance - overall_distance
            unique_labels = np.unique(self.labels)
            data_count = [(self.labels == element).sum() for element in unique_labels]
            print iter_count, overall_distance, diff
            if 0 < diff < delta_stop:
            # if ((diff < delta_stop) and not np.isnan(prev_overall_distance)):
            # if diff < delta_stop and :
                stop_condition = True
                break
            prev_overall_distance = overall_distance
            # if np.isinf(prev_overall_distance):
            #     prev_overall_distance = 0.0  # Otherwise whenever the next distance value is not nan, condition is met
            iter_count -= 1

        if iter_count == 0:
            print 'Stop loop after max number of iterations reached, distance:', overall_distance

        # Split and merge the formed clusters based on the distances between each centroid and its member data points (within_cluster_std).
        # Also to be handled iteratively.
        new_label = self.n_clusters
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])

        for cluster_id, count_std in enumerate(within_cluster_std):
            print cluster_id, count_std
            if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
                print 'Splitting the %ith cluster into the %ith and the %ith.' % (cluster_id, new_label, new_label + 1)
                self.split_cluster(label_list, cluster_id, new_label)
                new_label += 2

        # Label singular data points as noise
        label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
        labels = copy(label_list)

        for i, value in enumerate(label_list):
            if (label_list == value).sum() == 1:
                labels[i] = -1
        print 'Label singular data as noise.\n', labels

        return labels

    def find_closest_centroid(self, centroids, x, prn=False):
        '''Find the best matching model among the k cluster centroids.'''
        dist = []
        for c in centroids:
            dist.append(c.skl_distance_full(x))
        # if prn and any([np.isinf(x) for x in dist]):
        #     print "inf encountered in find best match. argmin returns:", np.argmin(dist)
        #     print dist
        if all([np.isinf(x) for x in dist]):
            # print "All distances are inf..."
            return None
        c_ix = np.argmin(dist)
        return centroids[c_ix]

    def update_centroid(self, c, x):
        '''Update a cluster centroid model given a GMM from the observation sequence.'''
        # print "c pre-update:", c
        C = len(x.components)

        dist_matrix = np.zeros((C, C))
        for i in xrange(C):
            for j in xrange(i, C):
                dist_matrix[i, j] = skl_full(x.components[i], c.components[j])
                # dist_matrix[i, j] = skl_gmm(x.components[i].means, c.components[j].means, x.components[i].covars, c.components[j].covars, 1, 1)
                dist_matrix[j, i] = dist_matrix[i, j]
        # print dist_matrix
        min_idx = np.argmin(dist_matrix, axis=0)
        # print min_idx
        # sum_weights = sum(c.weights)
        sum_weights = 0.0
        for i in xrange(C):
            pi0 = c.components[i].weight
            pi1 = x.components[min_idx[i]].weight
            if pi1 < 0.001:
                print "Warning: observed component weight very low. Skipping update for this component."
                continue
            try:
                n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
                n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
            except:
                print "Warning: could not estimate sample size"
                continue
            if n0 < 2 or n1 < 2:
                print "Warning: sample size estimates too small:", n0, n1
            c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
            # weights update
            w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
            c.components[i].weight = w
            sum_weights += w
        for i in xrange(C):
            c.components[i].weight = c.components[i].weight / sum_weights
        c.update()
        # print "c post-update:", c
        return c

    def update_component(self, c1, c2, n1, n2):
        '''Return the updated mean and covariance of the merged component.'''
        nz = float(n1 + n2)
        m1, m2 = c1.means, c2.means
        mz_est_b = (n1 * m1 + n2 * m2) / nz

        k1 = (n1 - 1.0) / float(n1)
        k2 = (n2 - 1.0) / float(n2)
        kz = (nz - 1.0) / float(nz)

        m1p = k1 * c1.covars + m1[:, np.newaxis] * m1[:, np.newaxis].T
        m2p = k2 * c2.covars + m2[:, np.newaxis] * m2[:, np.newaxis].T

        mzp_est = (n1 * m1p + n2 * m2p) / (nz - 1.0)

        # 2) Calculate the variate mean estimates:
        mz_est = (n1 * m1 + n2 * m2) / (nz - 1.0)

        cov_z_est = mzp_est - (mz_est[:, np.newaxis] * mz_est[:, np.newaxis].T) * kz

        return (mz_est_b, cov_z_est)

    def evaluate(self):
        '''Find the stopping criteria using the Bregman loss function.
        The stopping condition is satisfied when the mean distance between each sample and its assigned centroid no longer decreases noticeably.'''
        matches = 0
        distances = np.zeros(self.n_clusters)
        within_cluster_std = []
        distance_list = [[] for i in xrange(self.n_clusters)]
        for j, m in enumerate(self.data):
            cc = self.find_closest_centroid(self.centroids, m, prn=True)
            if cc is None:
                print "Warning (eval): No matching cluster can be found for data point %i" % j
                continue
            ix = self.centroids.index(cc)
            d = cc.skl_distance_full(m)
            if np.isinf(d):
                print j, m
                print "centroid index, distance:", ix, d
            # print 'c-m', d
            distances[ix] += d
            distance_list[ix].append(d)
            if d < 0.001:
                matches += 1
        print matches
        for i in xrange(self.n_clusters):
            within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
        distances = np.array(distances) / self.n_clusters
        print 'within_cluster_std', within_cluster_std
        return distances, within_cluster_std

    def split_cluster(self, label_list, cluster_id, new_label):
        '''Split a cluster to form two new clusters.'''
        old_members = np.where(label_list == cluster_id)[0]
        data = [self.data[i] for i in old_members]
        # Init new centroids
        new_centroids = [self.data[i] for i in np.random.choice(old_members, 2)]
        new_label_list = [new_label, new_label + 1]

        iter_count = 1
        while iter_count > 0:
            for i, m in enumerate(data):
                cc = self.find_closest_centroid(new_centroids, m)
                if cc is None:
                    print "Warning (split): No matching cluster can be found for data point %i" % i
                    continue
                # assign new label to the original gmm data
                pos = new_centroids.index(cc)
                self.data[old_members[data.index(m)]].label = new_label_list[pos]
                # centroid re-estimation
                cc = self.update_centroid(cc, m)
                new_centroids[pos] = cc
            iter_count -= 1
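
# update_component() above merges a centroid component with its closest observed
# component by pooling first and second moments, weighted by the estimated sample
# sizes n1 and n2. The standalone helper below is a minimal numpy sketch of that
# pooling idea, written for plain (mean, covariance, count) inputs; its name and
# the simplified biased-moment formulas are illustrative assumptions, not the
# module's exact update expressions.
def merge_gaussian_moments_sketch(m1, c1, n1, m2, c2, n2):
    '''Pool two Gaussian estimates obtained from n1 and n2 samples (illustrative sketch).'''
    n = float(n1 + n2)
    # pooled mean: sample-size weighted average of the component means
    mean = (n1 * m1 + n2 * m2) / n
    # second moments E[x x^T] of each component, then their pooled average
    s1 = c1 + np.outer(m1, m1)
    s2 = c2 + np.outer(m2, m2)
    second = (n1 * s1 + n2 * s2) / n
    # pooled covariance recovered from the pooled second moment and pooled mean
    cov = second - np.outer(mean, mean)
    return mean, cov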


class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']


def getGMMs(feature, gmmWindow=10, stepsize=1):
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list


def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric=lambda x, y: x.skl_distance_full(y))
    # replace undefined distances with a large constant
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
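
# The pairwise distances above delegate to GmmDistance.skl_distance_full from
# GmmMetrics, whose implementation is not shown here. For reference, the helper
# below is a minimal numpy sketch of the symmetrised KL divergence between two
# full-covariance Gaussians, the quantity typically computed per component pair.
# Its name and the plain (mean, covariance) array inputs are illustrative
# assumptions, not part of the original module's API.
def skl_gaussians_sketch(m0, c0, m1, c1):
    '''Symmetrised KL divergence between N(m0, c0) and N(m1, c1) (illustrative sketch).'''
    def kl(ma, ca, mb, cb):
        d = ma.shape[0]
        cb_inv = np.linalg.inv(cb)
        diff = mb - ma
        return 0.5 * (np.trace(cb_inv.dot(ca))
                      + diff.dot(cb_inv).dot(diff)
                      - d
                      + np.log(np.linalg.det(cb) / np.linalg.det(ca)))
    return kl(m0, c0, m1, c1) + kl(m1, c1, m0, c0)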

# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col + 1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:\n" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     # k-means clustering of GMMs
#     KmeansClustering = Kmeans(gmm_list, K=5)
#     labels = KmeansClustering.fit()
#
#     f1 = np.array(zip(timestamps[:len(labels)], labels))
#
#     np.savetxt(join(options.OUTPUT, 'kmeans') + '.csv', f1, delimiter=',')
#
# if __name__ == '__main__':
#     main()
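
# End-to-end usage is sketched in the commented-out main() above. The helper
# below condenses that pipeline into a small, self-contained example on
# synthetic data. It is a hedged sketch only (nothing in the module calls it),
# and it assumes, as getGMMs() does, that GmmDistance accepts a 2-D
# (frames x dimensions) feature array.
def example_clustering_run():
    '''Cluster windowed GMMs built from random features (illustrative sketch).'''
    # synthetic non-negative features: 200 frames, 4 feature dimensions
    feature_matrix = np.abs(np.random.randn(200, 4))
    # normalise as in main(): scale each column and guard against division by zero
    feature_matrix = feature_matrix / (feature_matrix.max(0) + 0.0005)
    # fit one GMM per overlapping window of frames
    gmm_list = getGMMs(feature_matrix, gmmWindow=5, stepsize=2)
    print 'number of GMMs:', len(gmm_list)
    # cluster the windowed GMMs with K-means and return one label per window
    clusterer = Kmeans(gmm_list, K=5)
    return clusterer.fit()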