annotate utils/RankClustering.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents b4bf37f94e92
children
rev   line source
mi@0 1 import sys,os,optparse
mi@0 2 from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext
mi@0 3 import numpy as np
mi@0 4 from numpy import log, power, pi, exp, transpose, zeros, log, ones, dot, argmin, inf, zeros_like, argsort, finfo
mi@0 5 from sklearn.mixture import GMM
mi@0 6 from sklearn.metrics.pairwise import pairwise_distances
mi@0 7 from copy import copy
mi@0 8
mi@0 9 from MutualInfo import print_progress
mi@0 10 from ComputationCache import Meta, with_cache, with_pickle_dump
mi@0 11 from gmmdist import skl_full, skl_gmm
mi@0 12 from GmmMetrics import GmmDistance
mi@0 13
class Node(object):
    """One point in the divergence graph: a GMM plus its distances to all other nodes."""

    def __init__(self, model, distances):
        self.model = model              # the GMM this node represents
        self.distances = distances      # 1-d array of divergences to every node (incl. self)
        self.neighborhood = []          # indices inside the eps-ball; set by findNeighborhood()

    def __str__(self):
        # Assumes rankDistances()/findNeighborhood()/getStatistics() have run.
        return 'Node(%i):%i, %3.2f' % (self.sorted_distances[0], self.neighborhood_size, self.average_div)

    def rankDistances(self):
        '''Store the indices that sort self.distances in ascending order.'''
        self.sorted_distances = np.argsort(self.distances)

    def findNeighborhood(self, eps):
        '''Collect indices of all nodes within eps times the nearest-neighbour distance.'''
        # Bug fix: reset the list so repeated calls do not accumulate stale members.
        self.neighborhood = []
        # sorted_distances[0] is this node itself (distance 0), so index [1]
        # is the true nearest neighbour; the ball radius scales from it.
        d = self.distances[self.sorted_distances][1]
        b = eps * d
        if np.isinf(b):
            return None

        for i in self.sorted_distances:
            if self.distances[i] <= b:
                self.neighborhood.append(i)

        # Brute force hack; we exclude nodes whose neighbourhood is larger than 80% of all frames
        # Proper solution: use the delta assymmetric KL divergence to identify near singularities
        if len(self.neighborhood) > 0.8 * len(self.sorted_distances):
            self.neighborhood = []

    def getStatistics(self):
        '''Cache neighbourhood size and the mean divergence to the neighbours (self excluded).'''
        self.neighborhood_size = len(self.neighborhood)
        if self.neighborhood_size > 1:
            # neighborhood[0] is the node itself; average over the rest.
            self.average_div = np.mean(self.distances[self.neighborhood[1:]])
        else:
            # Empty slice: keep the nan the original produced, but without
            # triggering numpy's empty-mean RuntimeWarning.
            self.average_div = np.nan
mi@0 48
mi@0 49 class rClustering(object):
mi@0 50
mitian@13 51 meta = Meta()
mitian@13 52
mitian@13 53 @classmethod
mitian@13 54 def set_cache_filename(cls, filename, cache = True, cache_location=""):
mitian@13 55 cls.meta.cache = cache
mitian@13 56 cls.meta.cache_file_base = filename
mitian@13 57 cls.meta.cache_location = cache_location
mi@0 58
mi@0 59 def __init__(self, eps, thresh=15, k=5, rank='max_neighbors', centering=True):
mi@0 60 self.eps = eps
mi@0 61 self.thresh = thresh
mi@0 62 self.rank = rank
mi@0 63 self.node_list = []
mi@0 64 self.methods = {'max_neighbors':'neighborhood_size', 'min_avg_div':'average_div'}
mi@0 65 self.classification = []
mi@0 66 self.centering = centering
mi@0 67 self.k = k
mi@0 68
mi@0 69 def fit(self, data):
mi@0 70 '''
mi@0 71 Arguments:
mi@0 72 data: Input list of GMMs.'''
mi@0 73 D = self.getPairwiseDistances(data)
mi@0 74 if self.centering:
mi@0 75 D[D>=finfo(np.float32).max] = inf
mi@0 76 print "hubness before centering:", self.hubness_measure(D)
mi@0 77 D = self.center_divergence_matrix(D)
mi@0 78 print "hubness after centering:", self.hubness_measure(D)
mi@0 79 for i, model in enumerate(data):
mi@0 80 self.node_list.append(Node(model, D[i]))
mi@0 81 for node in self.node_list:
mi@0 82 node.rankDistances()
mi@0 83 node.findNeighborhood(self.eps)
mi@0 84 node.getStatistics()
mi@0 85
mi@0 86 print "finding centroids"
mi@0 87
mi@0 88 hub_candidates = self.findCentroids()
mi@0 89 print "\n\n hubs:",len(hub_candidates)
mi@0 90 if len(hub_candidates) > self.k:
mi@0 91 hub_candidates = hub_candidates[:self.k]
mi@0 92 self.classification = []
mi@0 93
mi@0 94 for i, node in enumerate(self.node_list):
mi@0 95 dists = zeros(len(hub_candidates))
mi@0 96 for k,hub in enumerate(hub_candidates):
mi@0 97 j = self.node_list.index(hub)
mi@0 98 dists[k] = D[i,j]
mi@0 99 di = argmin(dists)
mi@0 100 sel_hub = hub_candidates[di]
mi@0 101 self.classification.append(self.node_list.index(sel_hub))
mi@0 102
mi@0 103 return self
mi@0 104
mi@0 105 def classify(self):
mi@0 106 return self.classification
mi@0 107
mi@0 108 def findCentroids(self):
mi@0 109 # sorted_nodelist = sorted(self.node_list, key = lambda x: x.neighborhood_size)
mi@0 110 sorted_nodelist = sorted(self.node_list, key = lambda x: x.__getattribute__(self.methods[self.rank]), reverse=True)
mi@0 111 hub_candidates = []
mi@0 112 while True:
mi@0 113 hub = sorted_nodelist.pop(0)
mi@0 114 hub_candidates.append(hub)
mi@0 115 [sorted_nodelist.remove(self.node_list[n]) for n in hub.neighborhood if self.node_list[n] in sorted_nodelist]
mi@0 116 # print "hub.neighborhood",len(hub.neighborhood)
mi@0 117 # for n in hub.neighborhood :
mi@0 118 # if self.node_list[n] in sorted_nodelist :
mi@0 119 # sorted_nodelist.remove(self.node_list[n])
mi@0 120 # print "removed", n, "from node list"
mi@0 121 # print len(hub_candidates),len(sorted_nodelist)
mi@0 122 if not sorted_nodelist: break
mi@0 123 # print 'hub_candidates', hub_candidates
mi@0 124 return hub_candidates
mi@0 125
mi@0 126
mi@0 127 def getNodeRank(self):
mi@0 128 average_div = []
mi@0 129 neighborhood_size = []
mi@0 130 node_rank = []
mi@0 131 sorted_nodelist = sorted(self.node_list, key = lambda x: x.__getattribute__(self.methods[self.rank]))
mi@0 132
mi@0 133 for node in self.node_list:
mi@0 134 neighborhood_size.append(node.neighborhood_size)
mi@0 135 average_div.append(node.average_div)
mi@0 136 node_rank.append(sorted_nodelist.index(node))
mi@0 137
mi@0 138 return neighborhood_size, average_div, node_rank
mi@0 139
mi@0 140 def test(self):
mi@0 141 for n in self.node_list:
mi@0 142 print n
mi@0 143
mi@0 144 # @with_cache(meta)
mi@0 145 def getPairwiseDistances(self, gmm_list):
mi@0 146 '''Calculate the pairwise distances of the input GMM data'''
mi@0 147 n_GMMs = len(gmm_list)
mi@0 148 distance_matrix = np.zeros((n_GMMs, n_GMMs))
mi@0 149 print "Creating divergence matrix."
mi@0 150 for i in xrange(n_GMMs):
mi@0 151 print_progress(i,"gmm node processed")
mi@0 152 for j in xrange(i, n_GMMs):
mi@0 153 distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
mi@0 154 distance_matrix[j][i] = distance_matrix[i][j]
mi@0 155 np.fill_diagonal(distance_matrix, 0.0)
mi@0 156 distance_matrix[np.isinf(distance_matrix)] = np.finfo(np.float64).max
mi@0 157
mi@0 158 return distance_matrix
mi@0 159
mi@0 160 def hubness_measure(self, D,r=0.45):
mi@0 161 n = D.shape[0]
mi@0 162 Dmean = zeros(n)
mi@0 163 Dstd = zeros(n)
mi@0 164 for i in xrange(n) :
mi@0 165 Dmean[i] = np.mean(D[i][ [D[i]>0] and [D[i]!=inf] ])
mi@0 166 Dstd[i] = np.std(D[i][ [D[i]>0] and [D[i]!=inf] ])
mi@0 167
mi@0 168 m = Dmean.mean()
mi@0 169 r = r*m
mi@0 170 #print m,r
mi@0 171 #print Dmean.min(), Dmean.max()
mi@0 172
mi@0 173 N = zeros(n)
mi@0 174 for i in xrange(n) :
mi@0 175 s = [D[i] > 0] and [D[i] < r]
mi@0 176 N[i] = len(D[i][s])
mi@0 177 sortindex = argsort(N)
mi@0 178 hubs_mean = np.mean(Dmean[sortindex][-5:][::-1,...])
mi@0 179 anti_hubs_mean = np.mean(Dmean[sortindex][:5])
mi@0 180 return (anti_hubs_mean - hubs_mean) / Dmean.mean()
mi@0 181
mi@0 182 def center_divergence_matrix(self, D) :
mi@0 183 n = D.shape[0]
mi@0 184 Dmean = zeros(n)
mi@0 185 Dstd = zeros(n)
mi@0 186 for i in xrange(D.shape[0]) :
mi@0 187 Dmean[i] = np.mean(D[i][ [D[i]>0] and [D[i]!=inf] ])
mi@0 188 Dstd[i] = np.std(D[i][ [D[i]>0] and [D[i]!=inf] ])
mi@0 189 B = zeros_like(D)
mi@0 190 for i in xrange(n) :
mi@0 191 for j in xrange(n) :
mi@0 192 d = D[i,j]
mi@0 193 B[i,j] = B[j,i] = 0.5 * ((d - Dmean[i]) / Dstd[i] + (d - Dmean[j]) / Dstd[j])
mi@0 194 B += abs(np.min(B))
mi@0 195 np.fill_diagonal(B,0.0)
mi@0 196 return B
mi@0 197
class FeatureObj(object):
    """Lightweight container for one extracted feature stream.

    Bug fix: the original declared `class FeatureObj():`, which under
    Python 2 is an old-style class on which `__slots__` has no effect;
    deriving from `object` makes the slots declaration actually restrict
    instances to these four attributes.
    """
    __slots__ = ['key', 'audio', 'timestamps', 'features']
mi@0 200
def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Slide a fixed-length window over the feature matrix and fit one model per window.

    Arguments:
        feature: 2-d array of frames x dimensions.
        gmmWindow: number of frames per model.
        stepsize: hop (in frames) between consecutive windows.
    Returns a list of GmmDistance models, one per window position.'''
    n_windows = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    models = []
    for w in xrange(n_windows):
        start = w * stepsize
        models.append(GmmDistance(feature[start:start + gmmWindow, :]))
    return models
mi@0 207
def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence matrix of a list of GMMs.

    Arguments:
        gmm_list: list of models exposing skl_distance_full(other).
    Returns:
        (n, n) symmetric numpy array of divergences; nan/inf entries are
        clamped to 10.0 so downstream maths stays finite.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    # The metric is symmetric, so only the upper triangle is computed.
    for i in range(n_GMMs):
        for j in range(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # Replace undefined/unbounded divergences with a large but finite constant.
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0

    # Bug fix: the original built the matrix but never returned it, so every
    # caller (e.g. the commented-out main) received None.
    return distance_matrix
mi@0 220
mi@0 221
mi@0 222 # def parse_args():
mi@0 223 # # define parser
mi@0 224 # op = optparse.OptionParser()
mi@0 225 # # IO options
mi@0 226 # op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from..." )
mi@0 227 # op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_resutls', type="str", help="Writing clustering results to... ")
mi@0 228 #
mi@0 229 # return op.parse_args()
mi@0 230 #
mi@0 231 # options, args = parse_args()
mi@0 232 #
mi@0 233 # def main():
mi@0 234 #
mi@0 235 # feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
mi@0 236 # feature_list.sort()
mi@0 237 # fobj_list = []
mi@0 238 #
mi@0 239 # for feature in feature_list:
mi@0 240 # data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0)
mi@0 241 # dim = data.shape[1] - 1
mi@0 242 # if dim == 1 :
mi@0 243 # fo = FeatureObj()
mi@0 244 # fo.audio = feature[:feature.find('_vamp')]
mi@0 245 # fo.key = splitext(feature.strip(fo.audio + '_'))[0]
mi@0 246 # fo.timestamps = data[:, 0] # the first column is the timestamps
mi@0 247 # fo.features = data[:, 1]
mi@0 248 # fobj_list.append(fo)
mi@0 249 #
mi@0 250 # else :
mi@0 251 # for col in xrange(dim):
mi@0 252 # fo = FeatureObj()
mi@0 253 # fo.audio = feature[:feature.find('_vamp')]
mi@0 254 # fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' %col
mi@0 255 # fo.timestamps = data[:, 0] # the first column records the timestamps
mi@0 256 # fo.features = data[:, col+1][:,np.newaxis]
mi@0 257 # fobj_list.append(fo)
mi@0 258 #
mi@0 259 # timestamps = fobj_list[0].timestamps
mi@0 260 # features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list))
mi@0 261 # print "Loading %d features:\n", len(fobj_list)
mi@0 262 #
mi@0 263 # # find the feature with the fewer number of frames
mi@0 264 # n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
mi@0 265 # n_features = len(fobj_list)
mi@0 266 #
mi@0 267 # feature_matrix = np.zeros((n_frames, n_features))
mi@0 268 # print 'feature_matrix', feature_matrix.shape
mi@0 269 #
mi@0 270 # # take the arrays from the feature objects and add them to a matrix
mi@0 271 # for i,f in enumerate(fobj_list) :
mi@0 272 # feature_matrix[:,i] = f.features[:n_frames,0]
mi@0 273 #
mi@0 274 # # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant
mi@0 275 # feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005)
mi@0 276 # feature_matrix[np.isnan(feature_matrix)] = 0.0
mi@0 277 #
mi@0 278 # winlength = 5
mi@0 279 # stepsize = 2
mi@0 280 #
mi@0 281 # gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
mi@0 282 # print 'number of GMMs:', len(gmm_list)
mi@0 283 # skl_matrix = pairwiseSKL(gmm_list)
mi@0 284 #
mi@0 285 # rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list)
mi@0 286 # rc.test()
mi@0 287 # classification = rc.classification
mi@0 288 # neighborhood_size, average_div, node_rank = rc.getNodeRank()
mi@0 289 #
mi@0 290 # f1 = np.array(zip(timestamps[:len(classification)], classification))
mi@0 291 # f2 = np.array(zip(timestamps[:len(neighborhood_size)], neighborhood_size))
mi@0 292 # f3 = np.array(zip(timestamps[:len(average_div)], average_div))
mi@0 293 # f4 = np.array(zip(timestamps[:len(node_rank)], node_rank))
mi@0 294 #
mi@0 295 # np.savetxt(join(options.OUTPUT, 'classification')+'.csv', f1, delimiter=',')
mi@0 296 # np.savetxt(join(options.OUTPUT, 'neighborhood_size')+'.csv', f2, delimiter=',')
mi@0 297 # np.savetxt(join(options.OUTPUT, 'average_div')+'.csv', f3, delimiter=',')
mi@0 298 # np.savetxt(join(options.OUTPUT, 'node_rank')+'.csv', f4, delimiter=',')
mi@0 299 #
mi@0 300 # if __name__ == '__main__':
mi@0 301 # main()
mi@0 302