Mercurial > hg > segmentation
view utils/RankClustering.py @ 13:cc8ceb270e79
add some gmm ipynbs
author | mitian |
---|---|
date | Fri, 05 Jun 2015 18:02:05 +0100 |
parents | 26838b1f560f |
children | b4bf37f94e92 |
line wrap: on
line source
import sys,os,optparse from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext import numpy as np from numpy import log, power, pi, exp, transpose, zeros, log, ones, dot, argmin, inf, zeros_like, argsort, finfo from sklearn.mixture import GMM from sklearn.metrics.pairwise import pairwise_distances from copy import copy from MutualInfo import print_progress from ComputationCache import Meta, with_cache, with_pickle_dump from gmmdist import skl_full, skl_gmm from GmmMetrics import GmmDistance class Node(object): def __init__(self, model, distances): self.model = model self.distances = distances self.neighborhood = [] def __str__(self): # return 'Node(%i):%s' %(self.sorted_distances[0], self.neighborhood) return 'Node(%i):%i, %3.2f' %(self.sorted_distances[0], self.neighborhood_size, self.average_div) def rankDistances(self): '''Return the index of the sorted array.''' self.sorted_distances = np.argsort(self.distances) def findNeighborhood(self, eps): '''''' d = self.distances[self.sorted_distances][1] b = eps * d if np.isinf(b) : return None for i in self.sorted_distances: if self.distances[i] <= b : self.neighborhood.append(i) # Brute force hack; we exclude nodes whose neighbourhood is larger than 80% of all frames # Proper solution: use the delta assymmetric KL divergence to identify near singularities if len(self.neighborhood) > 0.7 * len(self.sorted_distances) : self.neighborhood = [] def getStatistics(self): self.neighborhood_size = len(self.neighborhood) self.average_div = np.mean(self.distances[self.neighborhood[1:]]) class rClustering(object): meta = Meta() @classmethod def set_cache_filename(cls, filename, cache = True, cache_location=""): cls.meta.cache = cache cls.meta.cache_file_base = filename cls.meta.cache_location = cache_location def __init__(self, eps, thresh=15, k=5, rank='max_neighbors', centering=True): self.eps = eps self.thresh = thresh self.rank = rank self.node_list = [] self.methods = {'max_neighbors':'neighborhood_size', 'min_avg_div':'average_div'} self.classification = [] self.centering = centering self.k = k def fit(self, data): ''' Arguments: data: Input list of GMMs.''' D = self.getPairwiseDistances(data) if self.centering: D[D>=finfo(np.float32).max] = inf print "hubness before centering:", self.hubness_measure(D) D = self.center_divergence_matrix(D) print "hubness after centering:", self.hubness_measure(D) for i, model in enumerate(data): self.node_list.append(Node(model, D[i])) for node in self.node_list: node.rankDistances() node.findNeighborhood(self.eps) node.getStatistics() print "finding centroids" hub_candidates = self.findCentroids() print "\n\n hubs:",len(hub_candidates) if len(hub_candidates) > self.k: hub_candidates = hub_candidates[:self.k] self.classification = [] for i, node in enumerate(self.node_list): dists = zeros(len(hub_candidates)) for k,hub in enumerate(hub_candidates): j = self.node_list.index(hub) dists[k] = D[i,j] di = argmin(dists) sel_hub = hub_candidates[di] self.classification.append(self.node_list.index(sel_hub)) return self def classify(self): return self.classification def findCentroids(self): # sorted_nodelist = sorted(self.node_list, key = lambda x: x.neighborhood_size) sorted_nodelist = sorted(self.node_list, key = lambda x: x.__getattribute__(self.methods[self.rank]), reverse=True) hub_candidates = [] while True: hub = sorted_nodelist.pop(0) hub_candidates.append(hub) [sorted_nodelist.remove(self.node_list[n]) for n in hub.neighborhood if self.node_list[n] in sorted_nodelist] # print "hub.neighborhood",len(hub.neighborhood) # for n in hub.neighborhood : # if self.node_list[n] in sorted_nodelist : # sorted_nodelist.remove(self.node_list[n]) # print "removed", n, "from node list" # print len(hub_candidates),len(sorted_nodelist) if not sorted_nodelist: break # print 'hub_candidates', hub_candidates return hub_candidates def getNodeRank(self): average_div = [] neighborhood_size = [] node_rank = [] sorted_nodelist = sorted(self.node_list, key = lambda x: x.__getattribute__(self.methods[self.rank])) for node in self.node_list: neighborhood_size.append(node.neighborhood_size) average_div.append(node.average_div) node_rank.append(sorted_nodelist.index(node)) return neighborhood_size, average_div, node_rank def test(self): for n in self.node_list: print n # @with_cache(meta) def getPairwiseDistances(self, gmm_list): '''Calculate the pairwise distances of the input GMM data''' n_GMMs = len(gmm_list) distance_matrix = np.zeros((n_GMMs, n_GMMs)) print "Creating divergence matrix." for i in xrange(n_GMMs): print_progress(i,"gmm node processed") for j in xrange(i, n_GMMs): distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j]) distance_matrix[j][i] = distance_matrix[i][j] np.fill_diagonal(distance_matrix, 0.0) distance_matrix[np.isinf(distance_matrix)] = np.finfo(np.float64).max return distance_matrix def hubness_measure(self, D,r=0.45): n = D.shape[0] Dmean = zeros(n) Dstd = zeros(n) for i in xrange(n) : Dmean[i] = np.mean(D[i][ [D[i]>0] and [D[i]!=inf] ]) Dstd[i] = np.std(D[i][ [D[i]>0] and [D[i]!=inf] ]) m = Dmean.mean() r = r*m #print m,r #print Dmean.min(), Dmean.max() N = zeros(n) for i in xrange(n) : s = [D[i] > 0] and [D[i] < r] N[i] = len(D[i][s]) sortindex = argsort(N) hubs_mean = np.mean(Dmean[sortindex][-5:][::-1,...]) anti_hubs_mean = np.mean(Dmean[sortindex][:5]) return (anti_hubs_mean - hubs_mean) / Dmean.mean() def center_divergence_matrix(self, D) : n = D.shape[0] Dmean = zeros(n) Dstd = zeros(n) for i in xrange(D.shape[0]) : Dmean[i] = np.mean(D[i][ [D[i]>0] and [D[i]!=inf] ]) Dstd[i] = np.std(D[i][ [D[i]>0] and [D[i]!=inf] ]) B = zeros_like(D) for i in xrange(n) : for j in xrange(n) : d = D[i,j] B[i,j] = B[j,i] = 0.5 * ((d - Dmean[i]) / Dstd[i] + (d - Dmean[j]) / Dstd[j]) B += abs(np.min(B)) np.fill_diagonal(B,0.0) return B class FeatureObj() : __slots__ = ['key','audio','timestamps','features'] def getGMMs(feature, gmmWindow=10, stepsize=1): gmm_list = [] steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize) for i in xrange(steps): gmm_list.append(GmmDistance(feature[i*stepsize:(i*stepsize+gmmWindow), :])) return gmm_list def pairwiseSKL(gmm_list): '''Compute pairwise symmetrised KL divergence of a list of GMMs.''' n_GMMs = len(gmm_list) distance_matrix = np.zeros((n_GMMs, n_GMMs)) for i in xrange(n_GMMs): for j in xrange(i, n_GMMs): distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j]) distance_matrix[j][i] = distance_matrix[i][j] # X = np.array(gmm_list) # distance_matrix = pairwise_distances(X, metric = lambda x, y: x.skl_distance_full(y) ) distance_matrix[np.isnan(distance_matrix)] = 10.0 distance_matrix[np.isinf(distance_matrix)] = 10.0 # def parse_args(): # # define parser # op = optparse.OptionParser() # # IO options # op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from..." ) # op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ") # # return op.parse_args() # # options, args = parse_args() # # def main(): # # feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')] # feature_list.sort() # fobj_list = [] # # for feature in feature_list: # data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0) # dim = data.shape[1] - 1 # if dim == 1 : # fo = FeatureObj() # fo.audio = feature[:feature.find('_vamp')] # fo.key = splitext(feature.strip(fo.audio + '_'))[0] # fo.timestamps = data[:, 0] # the first column is the timestamps # fo.features = data[:, 1] # fobj_list.append(fo) # # else : # for col in xrange(dim): # fo = FeatureObj() # fo.audio = feature[:feature.find('_vamp')] # fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' %col # fo.timestamps = data[:, 0] # the first column records the timestamps # fo.features = data[:, col+1][:,np.newaxis] # fobj_list.append(fo) # # timestamps = fobj_list[0].timestamps # features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list)) # print "Loading %d features:\n", len(fobj_list) # # # find the feature with the fewer number of frames # n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min() # n_features = len(fobj_list) # # feature_matrix = np.zeros((n_frames, n_features)) # print 'feature_matrix', feature_matrix.shape # # # take the arrays from the feature objects and add them to a matrix # for i,f in enumerate(fobj_list) : # feature_matrix[:,i] = f.features[:n_frames,0] # # # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant # feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005) # feature_matrix[np.isnan(feature_matrix)] = 0.0 # # winlength = 5 # stepsize = 2 # # gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize) # print 'number of GMMs:', len(gmm_list) # skl_matrix = pairwiseSKL(gmm_list) # # rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list) # rc.test() # classification = rc.classification # neighborhood_size, average_div, node_rank = rc.getNodeRank() # # f1 = np.array(zip(timestamps[:len(classification)], labelclassifications)) # f2 = np.array(zip(timestamps[:len(neighborhood_size)], neighborhood_size)) # f3 = np.array(zip(timestamps[:len(average_div)], average_div)) # f4 = np.array(zip(timestamps[:len(node_rank)], node_rank)) # # np.savetxt(join(options.OUTPUT, 'classification')+'.csv', f1, delimiter=',') # np.savetxt(join(options.OUTPUT, 'neighborhood_size')+'.csv', f2, delimiter=',') # np.savetxt(join(options.OUTPUT, 'average_div')+'.csv', f3, delimiter=',') # np.savetxt(join(options.OUTPUT, 'node_rank')+'.csv', f4, delimiter=',') # # if __name__ == '__main__': # main()