view utils/kmeans.py @ 0:26838b1f560f

initial commit of a segmenter project
author mi tian
date Thu, 02 Apr 2015 18:09:27 +0100
import sys, os, optparse
import numpy as np
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

class Model(object):
	'''Simple container for a Gaussian model's mean and covariance.'''
	def __init__(self, m, c):
		self.mean = m
		self.cov = c

class Kmeans(object):
	
	def __init__(self, data, K=10, initial_centroids=None):
		'''
		data: Input data. A list of GMMs.
		K: Pre-specified number of clusters.
		initial_centroids: Optional indices into data used as the initial centroids.
		self.centroids: A K-length list of cluster centroids.
		'''
		
		self.data = data
		self.w = 0.8		# weight given to the existing centroid when re-estimating components
		self.bias = 500		# nominal sample size used to derive per-component update counts
		# Initialise centroids with K distinct data points chosen at random
		rand_points = np.random.choice(len(self.data), K, replace=False)
		self.centroids = [self.data[i] for i in rand_points]
		# use precomputed centroids if initialised externally
		if initial_centroids:	
			self.centroids = [self.data[i] for i in initial_centroids]
		self.labels = np.empty(len(self.data))
		self.labels.fill(np.nan)
		self.n_clusters = K
		self.minPts = 10	# a cluster is only split if it has at least 2 * minPts members
		self.maxStd = 1		# split a cluster whose within-cluster distance std exceeds this
		
	def fit(self):
		'''
		Cluster the input observations into K clusters.
		The algorithm stops when no prominent difference remains in the average distance
		between each observation GMM and the cluster it is assigned to.
		
		Returns
		-------
		labels : 1d np array
			Index of the cluster each sample in the input observations is assigned to.
			If a sample is not assigned to any cluster, its index will be NaN.
			Singular data points are labelled -1 (noise).
		'''
		
		prev_overall_distance = 0.0
		stop_condition = False
		delta_stop = 1
		max_iter = 10
		iter_count = max_iter
		while iter_count > 0:
			# assign each GMM to closest centroid
			for i, m in enumerate(self.data):
				cc = self.find_closest_centroid(self.centroids, m)
				if cc is None :
					print "Warning (fit): No matching cluster can be found for data point %i" %i
					continue
				self.labels[i] = self.centroids.index(cc)
				self.data[i].label = self.centroids.index(cc)
				# centroid re-estimation
				self.update_centroid(cc, m)
			distance_array, within_cluster_std = self.evaluate()
			overall_distance = distance_array.mean()
			min_distance = distance_array.min()
			# Terminate the loop when the change in the mean distance between each sample
			# and its assigned centroid (overall_distance) drops below delta_stop.
			# Per-cluster member counts (data_count) are computed here but not yet used
			# as a second criterion (minPts).
			diff = prev_overall_distance - overall_distance
			unique_labels = np.unique(self.labels)
			data_count = [(self.labels==element).sum() for element in unique_labels]
			print iter_count, overall_distance, diff
			if 0 < diff < delta_stop:	
				stop_condition = True
				break
			prev_overall_distance = overall_distance
			# if np.isinf(prev_overall_distance):
			#	prev_overall_distance = 0.0 # Otherwise whenever the next distance value is not nan, condition is met
			iter_count -= 1
			
		if iter_count == 0:
			print 'Stop loop after max number of iterations reached, distance:', overall_distance
		
		# Split (and eventually merge) the formed clusters based on the spread of the distances
		# between each centroid and its member data points (within_cluster_std).
		# Also to be handled iteratively.
		new_label = self.n_clusters
		label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
		
		for cluster_id, count_std in enumerate(within_cluster_std):
			print cluster_id, count_std
			if count_std[1] > self.maxStd and count_std[0] >= 2 * self.minPts:
				print 'Splitting cluster %i into clusters %i and %i.' %(cluster_id, new_label, new_label+1)
				self.split_cluster(label_list, cluster_id, new_label)
				new_label += 2	
				
		# Label singular data points as noise
		label_list = np.array([self.data[i].label for i in xrange(len(self.data))])
		labels = copy(label_list)
		
		for i, value in enumerate(label_list):
			if (label_list==value).sum() == 1:
				labels[i] = -1
		print 'Label singular data as noise.\n', labels
	
		return labels
		
	def find_closest_centroid(self, centroids, x, prn=False):
		'''Find the best matching model in k model clusters.'''
		dist = []
		for c in centroids:
			dist.append(c.skl_distance_full(x))
		# if prn and any([np.isinf(x) for x in dist]) :
		#	print "inf encountered in find best match. argmin returns:", np.argmin(dist)
		#	print dist
		if all([np.isinf(x) for x in dist]) :
			# print "All distances are inf..."
			return None
		c_ix = np.argmin(dist)		
		return centroids[c_ix]
		
	def update_centroid(self, c, x):
		'''Update cluster models given a GMM from the observation sequence.'''
		# print "c pre-update:", c
		C = len(x.components) 

		# Pairwise distances between the components of the observation x (rows) and the
		# components of the centroid c (columns). x and c are different GMMs, so the full
		# matrix is computed rather than mirroring one triangle.
		dist_matrix = np.zeros((C, C))
		for i in xrange(C):
			for j in xrange(C):
				dist_matrix[i,j] = skl_full(x.components[i], c.components[j])
		# For each centroid component, pick the index of the closest observation component.
		min_idx = np.argmin(dist_matrix, axis=0)
		# sum_weights = sum(c.weights)
		sum_weights = 0.0
		for i in xrange(C):
			pi0 = c.components[i].weight
			pi1 = x.components[min_idx[i]].weight
			if pi1 < 0.001 :
				print "Warning: observed component weight very low. skipping update for this component."
				continue
			try :
				n0 = int(self.w * self.bias * pi0 / (pi0 + pi1))
				n1 = int((1 - self.w) * self.bias * pi1 / (pi0 + pi1))
			except ZeroDivisionError :
				print "Warning: could not estimate sample size"
				continue
			if n0 < 2 or n1 < 2 :
				print "Warning: sample size estimates too small.. ", n0, n1
			c.components[i].means, c.components[i].covars = self.update_component(c.components[i], x.components[min_idx[i]], n0, n1)
			# weights update
			w = (c.components[i].weight * n0 + x.components[min_idx[i]].weight * n1) / (n0 + n1)
			c.components[i].weight = w
			sum_weights += w
		for i in xrange(C):
			c.components[i].weight = c.components[i].weight / sum_weights
		c.update()
		# print "c post-update:", c		
		return c
	
	def update_component(self, c1, c2, n1, n2) :
		'''Return the merged mean and covariance of two Gaussian components,
		treating c1 and c2 as if they were estimated from n1 and n2 samples respectively.'''
		nz = float(n1+n2)
		m1,m2 = c1.means, c2.means
		# Sample-size-weighted mean of the two components
		mz_est_b = (n1 * m1 + n2 * m2) / nz

		k1 = (n1-1.0)/float(n1)
		k2 = (n2-1.0)/float(n2)
		kz = (nz-1.0)/float(nz)

		# Second-moment (E[x x^T]) estimates of each component and their pooled estimate
		m1p = k1*c1.covars + m1[:,np.newaxis]*m1[:,np.newaxis].T
		m2p = k2*c2.covars + m2[:,np.newaxis]*m2[:,np.newaxis].T

		mzp_est = (n1 * m1p + n2 * m2p) / (nz-1.0)

		# Mean estimate normalised by (nz-1) to match the pooled second moments above
		mz_est = (n1 * m1 + n2 * m2) / (nz-1.0)

		cov_z_est = mzp_est - (mz_est[:,np.newaxis]*mz_est[:,np.newaxis].T) * kz
		
		return (mz_est_b,cov_z_est)
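
	# Sanity check (hypothetical usage): merging a component with itself using equal
	# sample sizes should return the same mean and an almost unchanged covariance, e.g.
	#	m, S = km.update_component(comp, comp, 100, 100)
	# where `km` is a Kmeans instance and `comp` any object with .means and .covars.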
		
	def evaluate(self):
		'''Compute the per-cluster distances used for the stopping criterion (a Bregman loss).
		The stopping condition is satisfied when the change in the mean distance between
		samples and their assigned centroids drops below a threshold (see fit).'''
		matches = 0
		distances = np.zeros(self.n_clusters)
		within_cluster_std = []
		distance_list = [[] for i in xrange(self.n_clusters)]
		for j,m in enumerate(self.data):
			cc = self.find_closest_centroid(self.centroids, m, prn=True)
			if cc is None :
				print "Warning (eval): No matching cluster can be found for data point %i" %j
				continue 
			ix = self.centroids.index(cc)
			d = cc.skl_distance_full(m)
			if np.isinf(d) :
				print j, m
				print "centroid index, distance:",ix,d
			# print 'c-m', d
			distances[ix] += d
			distance_list[ix].append(d)
			if d < 0.001 :
				matches += 1
		print 'matches (distance < 0.001):', matches
		for i in xrange(self.n_clusters):
			within_cluster_std.append((len(distance_list[i]), np.std(distance_list[i])))
		distances = np.array(distances) / self.n_clusters
		print 'within_cluster_std', within_cluster_std
		return distances, within_cluster_std
	
	def split_cluster(self, label_list, cluster_id, new_label):
		'''Split clusters to form new clusters.'''
		old_members = np.where(label_list==cluster_id)[0] 
		data = [self.data[i] for i in old_members]
		# Init two new centroids from distinct members of the old cluster
		new_centroids = [self.data[i] for i in np.random.choice(old_members, 2, replace=False)]
		new_label_list = [new_label, new_label+1]
		
		iter_count = 1
		while iter_count > 0:
			for i, m in enumerate(data):
				cc = self.find_closest_centroid(new_centroids, m)
				if cc is None :
					print "Warning (split_cluster): No matching cluster can be found for data point %i" %i
					continue
				# assign new label to the original gmm data
				pos = new_centroids.index(cc)
				self.data[old_members[i]].label = new_label_list[pos] 
				# centroid re-estimation
				cc = self.update_centroid(cc, m)
				new_centroids[pos] = cc
			iter_count -= 1
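
# Minimal usage sketch (hypothetical; assumes `gmms` is a list of GmmDistance objects
# such as those returned by getGMMs below):
#
#	km = Kmeans(gmms, K=5)
#	labels = km.fit()	# one cluster index per input GMM; -1 marks singular (noise) points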
		

class FeatureObj() :
	__slots__ = ['key','audio','timestamps','features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
	gmm_list = []
	steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
	for i in xrange(steps):
		gmm_list.append(GmmDistance(feature[i*stepsize:(i*stepsize+gmmWindow), :]))
	return gmm_list
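
# Example of the windowing arithmetic (hypothetical shapes): a feature matrix with
# 100 frames, gmmWindow=10 and stepsize=2 gives int((100 - 10 + 2) / 2) = 46 GMMs,
# each fitted on a 10-frame slice of the feature matrix.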

def pairwiseSKL(gmm_list):
	'''Compute pairwise symmetrised KL divergence of a list of GMMs.'''
	n_GMMs = len(gmm_list)
	distance_matrix = np.zeros((n_GMMs, n_GMMs))
	for i in xrange(n_GMMs):
		for j in xrange(i, n_GMMs):
			distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
			distance_matrix[j][i] = distance_matrix[i][j]
	# X = np.array(gmm_list)
	# distance_matrix = pairwise_distances(X, metric = lambda x, y:  x.skl_distance_full(y) )
	distance_matrix[np.isnan(distance_matrix)] = 10.0
	distance_matrix[np.isinf(distance_matrix)] = 10.0
	return distance_matrix
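
# Example (hypothetical): D = pairwiseSKL(getGMMs(feature_matrix)) returns a symmetric
# matrix where D[i, j] is the symmetrised KL divergence between windowed GMMs i and j,
# with NaN/inf entries capped at 10.0.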


# def parse_args():
# 	# define parser
# 	op = optparse.OptionParser()
# 	# IO options
# 	op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from..." )
# 	op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
# 
# 	return op.parse_args()
# 
# options, args = parse_args()
# 
# def main():
# 	
# 	feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
# 	feature_list.sort()
# 	fobj_list = []
# 
# 	for feature in feature_list:
# 		data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0)
# 		dim = data.shape[1] - 1
# 		if dim == 1 :
# 			fo = FeatureObj()
# 			fo.audio = feature[:feature.find('_vamp')]
# 			fo.key = splitext(feature.strip(fo.audio + '_'))[0]
# 			fo.timestamps = data[:, 0] # the first column is the timestamps
# 			fo.features = data[:, 1]
# 			fobj_list.append(fo)
# 
# 		else :
# 			for col in xrange(dim):
# 				fo = FeatureObj()
# 				fo.audio = feature[:feature.find('_vamp')]
# 				fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' %col
# 				fo.timestamps = data[:, 0] # the first column records the timestamps
# 				fo.features = data[:, col+1][:,np.newaxis]
# 				fobj_list.append(fo)
# 
# 	timestamps = fobj_list[0].timestamps
# 	features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list))
# 	print "Loading %d features:\n", len(fobj_list)
# 
# 	# find the feature with the fewer number of frames
# 	n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
# 	n_features = len(fobj_list)
# 
# 	feature_matrix = np.zeros((n_frames, n_features))
# 	print 'feature_matrix', feature_matrix.shape
# 
# 	# take the arrays from the feature objects and add them to a matrix
# 	for i,f in enumerate(fobj_list) :
# 		feature_matrix[:,i] = f.features[:n_frames,0]
# 	
# 	# normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant
# 	feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005)
# 	feature_matrix[np.isnan(feature_matrix)] = 0.0
# 	
# 	winlength = 5
# 	stepsize = 2
# 	
# 	gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
# 	print 'number of GMMs:', len(gmm_list)
# 	skl_matrix = pairwiseSKL(gmm_list)
# 	
# 	# k-means clustering of GMMs 
# 	KmeansClustering = Kmeans(gmm_list, K=5)
# 	labels = KmeansClustering.fit()
# 	
# 	f1 = np.array(zip(timestamps[:len(labels)], labels))
# 	
# 	np.savetxt(join(options.OUTPUT, 'kmeans')+'.csv', f1, delimiter=',')
# 	
# if __name__ == '__main__':
# 	main()