# utils/RankClustering.py @ 18:b4bf37f94e92
# "prepared to add another annotation" -- mitian, Wed, 09 Dec 2015 16:27:10 +0000
import sys,os,optparse
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext
import numpy as np
from numpy import log, power, pi, exp, transpose, zeros, ones, dot, argmin, inf, zeros_like, argsort, finfo
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

from MutualInfo import print_progress
from ComputationCache import Meta, with_cache, with_pickle_dump
from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance

class Node(object):
	
	def __init__(self, model, distances):
		self.model = model
		self.distances = distances
		self.neighborhood = []		
	
	def __str__(self):
		return 'Node(%i):%i, %3.2f' % (self.sorted_distances[0], self.neighborhood_size, self.average_div)
		
	def rankDistances(self):
		'''Store the indices that would sort the distance array, nearest first.'''
		self.sorted_distances = np.argsort(self.distances)
	
	def findNeighborhood(self, eps):
		'''Collect all nodes whose divergence from this node is within
		eps times the distance to its nearest neighbour.'''
		# sorted_distances[0] is the node itself (distance 0), so the
		# nearest-neighbour distance sits at rank 1.
		d = self.distances[self.sorted_distances[1]]
		b = eps * d
		if np.isinf(b):
			return None

		for i in self.sorted_distances:
			if self.distances[i] <= b:
				self.neighborhood.append(i)

		# Brute-force hack: exclude nodes whose neighbourhood covers more than
		# 80% of all frames. A proper solution would use the delta-asymmetric
		# KL divergence to identify near singularities.
		if len(self.neighborhood) > 0.8 * len(self.sorted_distances):
			self.neighborhood = []
		
	def getStatistics(self):
		self.neighborhood_size = len(self.neighborhood)
		# Mean divergence over the neighbourhood excluding the node itself;
		# guard against an empty slice, whose mean would be NaN and break ranking.
		if self.neighborhood_size > 1:
			self.average_div = np.mean(self.distances[self.neighborhood[1:]])
		else:
			self.average_div = inf
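
# Minimal usage sketch for Node (illustrative only; the toy distance row
# below is hypothetical, not part of the original module):
#
#	row = np.array([0.0, 1.0, 1.2, 9.0])   # divergences from node 0
#	node = Node(model=None, distances=row)
#	node.rankDistances()                    # sorted_distances -> [0, 1, 2, 3]
#	node.findNeighborhood(eps=1.5)          # bound = 1.5 * 1.0 keeps 0, 1, 2
#	node.getStatistics()                    # size 3, average_div = mean(1.0, 1.2)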
			
class rClustering(object):
	
	meta = Meta()
	
	@classmethod	
	def set_cache_filename(cls, filename, cache = True, cache_location=""):
		cls.meta.cache = cache
		cls.meta.cache_file_base = filename	
		cls.meta.cache_location = cache_location
	
	def __init__(self, eps, thresh=15, k=5, rank='max_neighbors', centering=True):
		self.eps = eps
		self.thresh = thresh
		self.rank = rank
		self.node_list = []
		self.methods = {'max_neighbors':'neighborhood_size', 'min_avg_div':'average_div'}
		self.classification = []
		self.centering = centering
		self.k = k
		
	def fit(self, data):
		'''Rank nodes, find eps-neighbourhoods and assign each node to its
		nearest hub.

		Arguments:
		data: input list of GMMs.'''
		D = self.getPairwiseDistances(data)
		if self.centering:
			D[D>=finfo(np.float32).max] = inf	
			print "hubness before centering:", self.hubness_measure(D)		
			D = self.center_divergence_matrix(D)
			print "hubness after centering:", self.hubness_measure(D)
		for i, model in enumerate(data):
			self.node_list.append(Node(model, D[i]))
		for node in self.node_list:
			node.rankDistances()
			node.findNeighborhood(self.eps)
			node.getStatistics()
		
		print "finding centroids"
		
		hub_candidates = self.findCentroids()
		print "\n\n hubs:",len(hub_candidates)
		if len(hub_candidates) > self.k:
			hub_candidates = hub_candidates[:self.k]
		self.classification = []
		
		for i, node in enumerate(self.node_list):
			dists = zeros(len(hub_candidates))
			for k,hub in enumerate(hub_candidates):
				j = self.node_list.index(hub)
				dists[k] = D[i,j]
			di = argmin(dists)
			sel_hub = hub_candidates[di]
			self.classification.append(self.node_list.index(sel_hub))
			
		return self
		
	def classify(self):
		return self.classification
	
	def findCentroids(self):
		# Rank nodes by the selected criterion (neighbourhood size or average
		# divergence), best candidates first.
		sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]), reverse=True)
		hub_candidates = []
		while sorted_nodelist:
			hub = sorted_nodelist.pop(0)
			hub_candidates.append(hub)
			# Remove the new hub's neighbours from the pool so each node is
			# claimed by at most one hub.
			for n in hub.neighborhood:
				if self.node_list[n] in sorted_nodelist:
					sorted_nodelist.remove(self.node_list[n])
		return hub_candidates
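
	# Greedy selection sketch (illustrative, not from the original comments):
	# if the top-ranked node A has neighbourhood {A, B, C}, A becomes a hub
	# and B, C leave the pool; the best remaining node then seeds the next
	# hub, and so on until the pool is empty.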
	
			
	def getNodeRank(self):
		average_div = []
		neighborhood_size = []
		node_rank = []
		sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]))
		
		for node in self.node_list:
			neighborhood_size.append(node.neighborhood_size)
			average_div.append(node.average_div)
			node_rank.append(sorted_nodelist.index(node))
		
		return neighborhood_size, average_div, node_rank
		
	def test(self):
		for n in self.node_list:
			print n	
			
	# @with_cache(meta)
	def getPairwiseDistances(self, gmm_list):
		'''Calculate the symmetric pairwise divergence matrix of the input GMMs.'''
		n_GMMs = len(gmm_list)
		distance_matrix = np.zeros((n_GMMs, n_GMMs))
		print "Creating divergence matrix."
		for i in xrange(n_GMMs):
			print_progress(i, "gmm node processed")
			# The symmetrised KL divergence is symmetric, so only the upper
			# triangle needs computing.
			for j in xrange(i, n_GMMs):
				distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
				distance_matrix[j][i] = distance_matrix[i][j]
		np.fill_diagonal(distance_matrix, 0.0)
		# Replace infinities with a finite sentinel; fit() maps these back
		# to inf before centering.
		distance_matrix[np.isinf(distance_matrix)] = np.finfo(np.float64).max

		return distance_matrix
	
	def hubness_measure(self, D, r=0.45):
		n = D.shape[0]
		Dmean = zeros(n)
		Dstd = zeros(n)
		for i in xrange(n):
			# Per-node statistics over finite, non-zero divergences only.
			# (The original list-based masks short-circuited via `and` and
			# silently dropped the non-zero condition.)
			mask = (D[i] > 0) & (D[i] != inf)
			Dmean[i] = np.mean(D[i][mask])
			Dstd[i] = np.std(D[i][mask])

		# Neighbourhood radius as a fraction of the global mean divergence.
		m = Dmean.mean()
		r = r * m

		# Count each node's neighbours within radius r.
		N = zeros(n)
		for i in xrange(n):
			mask = (D[i] > 0) & (D[i] < r)
			N[i] = len(D[i][mask])
		sortindex = argsort(N)
		# Compare the mean divergence of the five strongest hubs against the
		# five strongest anti-hubs, normalised by the global mean.
		hubs_mean = np.mean(Dmean[sortindex][-5:])
		anti_hubs_mean = np.mean(Dmean[sortindex][:5])
		return (anti_hubs_mean - hubs_mean) / Dmean.mean()
	
	def center_divergence_matrix(self, D):
		n = D.shape[0]
		Dmean = zeros(n)
		Dstd = zeros(n)
		for i in xrange(n):
			# Same finite, non-zero mask fix as in hubness_measure.
			mask = (D[i] > 0) & (D[i] != inf)
			Dmean[i] = np.mean(D[i][mask])
			Dstd[i] = np.std(D[i][mask])
		# Symmetric z-score centering: average the divergence standardised
		# against each of the two nodes' local statistics.
		B = zeros_like(D)
		for i in xrange(n):
			for j in xrange(i, n):
				d = D[i, j]
				B[i, j] = B[j, i] = 0.5 * ((d - Dmean[i]) / Dstd[i] + (d - Dmean[j]) / Dstd[j])
		# Shift all entries to be non-negative, then restore a zero diagonal.
		B += abs(np.min(B))
		np.fill_diagonal(B, 0.0)
		return B
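
# Minimal usage sketch for rClustering (hedged; 'gmm_list' stands in for any
# list of GmmDistance models exposing skl_distance_full):
#
#	rc = rClustering(eps=3, k=5, rank='max_neighbors').fit(gmm_list)
#	labels = rc.classify()   # one hub index per input model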

class FeatureObj(object):
	__slots__ = ['key', 'audio', 'timestamps', 'features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
	'''Fit one GMM per sliding window of frames over the feature matrix.'''
	gmm_list = []
	steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
	for i in xrange(steps):
		gmm_list.append(GmmDistance(feature[i*stepsize:(i*stepsize+gmmWindow), :]))
	return gmm_list
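
# Window-count check (worked example, values chosen for illustration): with
# 100 frames, gmmWindow=10 and stepsize=2, steps = int((100 - 10 + 2) / 2) = 46,
# and the last model covers frames 90:100.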

def pairwiseSKL(gmm_list):
	'''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
	n_GMMs = len(gmm_list)
	distance_matrix = np.zeros((n_GMMs, n_GMMs))
	for i in xrange(n_GMMs):
		for j in xrange(i, n_GMMs):
			distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
			distance_matrix[j][i] = distance_matrix[i][j]
	# Cap undefined or unbounded divergences at an arbitrary finite value.
	distance_matrix[np.isnan(distance_matrix)] = 10.0
	distance_matrix[np.isinf(distance_matrix)] = 10.0
	# The original function omitted this return, so callers received None.
	return distance_matrix
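
# Hedged usage sketch mirroring the commented-out main() below:
#
#	gmms = getGMMs(feature_matrix, gmmWindow=5, stepsize=2)
#	skl = pairwiseSKL(gmms)   # symmetric (n, n) divergence matrix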


# def parse_args():
# 	# define parser
# 	op = optparse.OptionParser()
# 	# IO options
# 	op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from..." )
# 	op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_results', type="str", help="Writing clustering results to... ")
# 
# 	return op.parse_args()
# 
# options, args = parse_args()
# 
# def main():
# 	
# 	feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
# 	feature_list.sort()
# 	fobj_list = []
# 
# 	for feature in feature_list:
# 		data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0)
# 		dim = data.shape[1] - 1
# 		if dim == 1 :
# 			fo = FeatureObj()
# 			fo.audio = feature[:feature.find('_vamp')]
# 			fo.key = splitext(feature)[0].replace(fo.audio + '_', '', 1)  # str.strip() stripped characters, not the prefix
# 			fo.timestamps = data[:, 0] # the first column is the timestamps
# 			fo.features = data[:, 1]
# 			fobj_list.append(fo)
# 
# 		else :
# 			for col in xrange(dim):
# 				fo = FeatureObj()
# 				fo.audio = feature[:feature.find('_vamp')]
# 				fo.key = splitext(feature)[0].replace(fo.audio + '_', '', 1) + '_' + '%d' % col
# 				fo.timestamps = data[:, 0] # the first column records the timestamps
# 				fo.features = data[:, col+1][:,np.newaxis]
# 				fobj_list.append(fo)
# 
# 	timestamps = fobj_list[0].timestamps
# 	features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list))
# 	print "Loading %d features:\n", len(fobj_list)
# 
# 	# find the feature with the fewest frames
# 	n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
# 	n_features = len(fobj_list)
# 
# 	feature_matrix = np.zeros((n_frames, n_features))
# 	print 'feature_matrix', feature_matrix.shape
# 
# 	# take the arrays from the feature objects and add them to a matrix
# 	for i,f in enumerate(fobj_list) :
# 		feature_matrix[:,i] = f.features[:n_frames,0]
# 	
# 	# normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant
# 	feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005)
# 	feature_matrix[np.isnan(feature_matrix)] = 0.0
# 	
# 	winlength = 5
# 	stepsize = 2
# 	
# 	gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
# 	print 'number of GMMs:', len(gmm_list)
# 	skl_matrix = pairwiseSKL(gmm_list)
# 	
# 	rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list)
# 	rc.test()
# 	classification = rc.classification
# 	neighborhood_size, average_div, node_rank = rc.getNodeRank()
# 
# 	f1 = np.array(zip(timestamps[:len(classification)], classification))
# 	f2 = np.array(zip(timestamps[:len(neighborhood_size)], neighborhood_size))
# 	f3 = np.array(zip(timestamps[:len(average_div)], average_div))
# 	f4 = np.array(zip(timestamps[:len(node_rank)], node_rank))
# 	
# 	np.savetxt(join(options.OUTPUT, 'classification')+'.csv', f1, delimiter=',')
# 	np.savetxt(join(options.OUTPUT, 'neighborhood_size')+'.csv', f2, delimiter=',')
# 	np.savetxt(join(options.OUTPUT, 'average_div')+'.csv', f3, delimiter=',')
# 	np.savetxt(join(options.OUTPUT, 'node_rank')+'.csv', f4, delimiter=',')
# 	
# if __name__ == '__main__':
# 	main()