import sys, os, optparse
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext
import numpy as np
from numpy import log, power, pi, exp, transpose, zeros, ones, dot, argmin, inf, zeros_like, argsort, finfo
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

from MutualInfo import print_progress
from ComputationCache import Meta, with_cache, with_pickle_dump
from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance

class Node(object):

    def __init__(self, model, distances):
        self.model = model
        self.distances = distances
        self.neighborhood = []

    def __str__(self):
        # Assumes rankDistances(), findNeighborhood() and getStatistics() have been called.
        return 'Node(%i):%i, %3.2f' % (self.sorted_distances[0], self.neighborhood_size, self.average_div)

    def rankDistances(self):
        '''Store the indices that sort the distance array in ascending order.'''
        self.sorted_distances = np.argsort(self.distances)

    def findNeighborhood(self, eps):
        '''Collect all nodes whose divergence is within eps times the nearest-neighbour distance.'''
        d = self.distances[self.sorted_distances][1]  # nearest neighbour; index 0 is the node itself
        b = eps * d
        if np.isinf(b):
            return None

        for i in self.sorted_distances:
            if self.distances[i] <= b:
                self.neighborhood.append(i)

        # Brute-force hack: exclude nodes whose neighbourhood is larger than 80% of all frames.
        # Proper solution: use the delta asymmetric KL divergence to identify near singularities.
        if len(self.neighborhood) > 0.8 * len(self.sorted_distances):
            self.neighborhood = []

    def getStatistics(self):
        self.neighborhood_size = len(self.neighborhood)
        # Skip neighborhood[0], which is the node itself.
        self.average_div = np.mean(self.distances[self.neighborhood[1:]])

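# Illustrative sketch (not called anywhere): the intended Node lifecycle, assuming a
# plain Euclidean distance row stands in for one row of the GMM divergence matrix.
def _node_example():
    distances = np.array([0.0, 1.0, 1.2, 5.0, 6.0])  # divergences from this node to all nodes
    node = Node(model=None, distances=distances)
    node.rankDistances()           # sorted_distances = indices in ascending order of distance
    node.findNeighborhood(eps=2)   # keep nodes within 2 * nearest-neighbour distance
    node.getStatistics()
    print node                     # e.g. 'Node(0):3, 1.10'
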
class rClustering(object):

    meta = Meta()

    @classmethod
    def set_cache_filename(cls, filename, cache=True, cache_location=""):
        cls.meta.cache = cache
        cls.meta.cache_file_base = filename
        cls.meta.cache_location = cache_location

    def __init__(self, eps, thresh=15, k=5, rank='max_neighbors', centering=True):
        self.eps = eps
        self.thresh = thresh
        self.rank = rank
        self.node_list = []
        self.methods = {'max_neighbors': 'neighborhood_size', 'min_avg_div': 'average_div'}
        self.classification = []
        self.centering = centering
        self.k = k

    def fit(self, data):
        '''
        Arguments:
        data: Input list of GMMs.'''
        D = self.getPairwiseDistances(data)
        if self.centering:
            D[D >= finfo(np.float32).max] = inf
            print "hubness before centering:", self.hubness_measure(D)
            D = self.center_divergence_matrix(D)
            print "hubness after centering:", self.hubness_measure(D)
        for i, model in enumerate(data):
            self.node_list.append(Node(model, D[i]))
        for node in self.node_list:
            node.rankDistances()
            node.findNeighborhood(self.eps)
            node.getStatistics()

        print "finding centroids"

        hub_candidates = self.findCentroids()
        print "\n\n hubs:", len(hub_candidates)
        if len(hub_candidates) > self.k:
            hub_candidates = hub_candidates[:self.k]
        self.classification = []

        # Assign each node to its nearest hub.
        for i, node in enumerate(self.node_list):
            dists = zeros(len(hub_candidates))
            for k, hub in enumerate(hub_candidates):
                j = self.node_list.index(hub)
                dists[k] = D[i, j]
            di = argmin(dists)
            sel_hub = hub_candidates[di]
            self.classification.append(self.node_list.index(sel_hub))

        return self

    def classify(self):
        return self.classification

    def findCentroids(self):
        sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]), reverse=True)
        hub_candidates = []
        while True:
            hub = sorted_nodelist.pop(0)
            hub_candidates.append(hub)
            # Remove the hub's neighbourhood from the candidate pool.
            for n in hub.neighborhood:
                if self.node_list[n] in sorted_nodelist:
                    sorted_nodelist.remove(self.node_list[n])
            if not sorted_nodelist:
                break
        return hub_candidates

    def getNodeRank(self):
        average_div = []
        neighborhood_size = []
        node_rank = []
        sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]))

        for node in self.node_list:
            neighborhood_size.append(node.neighborhood_size)
            average_div.append(node.average_div)
            node_rank.append(sorted_nodelist.index(node))

        return neighborhood_size, average_div, node_rank

    def test(self):
        for n in self.node_list:
            print n

    # @with_cache(meta)
    def getPairwiseDistances(self, gmm_list):
        '''Calculate the pairwise distances of the input GMM data.'''
        n_GMMs = len(gmm_list)
        distance_matrix = np.zeros((n_GMMs, n_GMMs))
        print "Creating divergence matrix."
        for i in xrange(n_GMMs):
            print_progress(i, "gmm node processed")
            for j in xrange(i, n_GMMs):
                distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
                distance_matrix[j][i] = distance_matrix[i][j]
        np.fill_diagonal(distance_matrix, 0.0)
        distance_matrix[np.isinf(distance_matrix)] = np.finfo(np.float64).max

        return distance_matrix

    def hubness_measure(self, D, r=0.45):
        n = D.shape[0]
        Dmean = zeros(n)
        Dstd = zeros(n)
        for i in xrange(n):
            # Mask out self-distances (zeros) and infinite divergences.
            finite = (D[i] > 0) & (D[i] != inf)
            Dmean[i] = np.mean(D[i][finite])
            Dstd[i] = np.std(D[i][finite])

        m = Dmean.mean()
        r = r * m

        # Count, for each node, how many other nodes lie within radius r, then compare
        # the mean divergence of the 5 strongest hubs with that of the 5 strongest anti-hubs.
        N = zeros(n)
        for i in xrange(n):
            s = (D[i] > 0) & (D[i] < r)
            N[i] = len(D[i][s])
        sortindex = argsort(N)
        hubs_mean = np.mean(Dmean[sortindex][-5:])
        anti_hubs_mean = np.mean(Dmean[sortindex][:5])
        return (anti_hubs_mean - hubs_mean) / Dmean.mean()

    def center_divergence_matrix(self, D):
        n = D.shape[0]
        Dmean = zeros(n)
        Dstd = zeros(n)
        for i in xrange(n):
            # Mask out self-distances (zeros) and infinite divergences.
            finite = (D[i] > 0) & (D[i] != inf)
            Dmean[i] = np.mean(D[i][finite])
            Dstd[i] = np.std(D[i][finite])
        B = zeros_like(D)
        for i in xrange(n):
            for j in xrange(n):
                d = D[i, j]
                B[i, j] = B[j, i] = 0.5 * ((d - Dmean[i]) / Dstd[i] + (d - Dmean[j]) / Dstd[j])
        # Shift to non-negative values and restore a zero diagonal.
        B += abs(np.min(B))
        np.fill_diagonal(B, 0.0)
        return B

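# Illustrative sketch (not called anywhere): running rClustering end to end.
# _StubModel is a hypothetical stand-in whose skl_distance_full() returns a squared
# Euclidean distance between means, so the example does not depend on fitted GMMs.
def _rclustering_example():
    class _StubModel(object):
        def __init__(self, mean):
            self.mean = np.asarray(mean, dtype=float)

        def skl_distance_full(self, other):
            return float(np.sum((self.mean - other.mean) ** 2))

    models = [_StubModel([0, 0]), _StubModel([0, 1]),
              _StubModel([10, 10]), _StubModel([10, 11])]
    rc = rClustering(eps=3, k=2, rank='min_avg_div', centering=False).fit(models)
    print rc.classify()  # one hub index per input model, e.g. [0, 0, 2, 2]
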
class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']

def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Build a GMM (GmmDistance) over each sliding window of the feature matrix.'''
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list

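# Quick check of the windowing arithmetic above (illustrative only): a feature
# matrix with 100 frames, a 10-frame window and a step of 2 yields 46 windows,
# and the last window stays inside the matrix.
def _window_count_example():
    n_frames, gmmWindow, stepsize = 100, 10, 2
    steps = int((n_frames - gmmWindow + stepsize) / stepsize)
    assert steps == 46
    last_window_end = (steps - 1) * stepsize + gmmWindow
    assert last_window_end <= n_frames
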
def pairwiseSKL(gmm_list):
    '''Compute pairwise symmetrised KL divergences of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric = lambda x, y: x.skl_distance_full(y) )
    # Clamp undefined and infinite divergences to a large finite value.
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix

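# Illustrative sketch: pairwiseSKL accepts any objects exposing skl_distance_full()
# (a hypothetical Euclidean stand-in here) and returns a symmetric matrix with a
# zero diagonal.
def _pairwise_skl_example():
    class _StubModel(object):
        def __init__(self, mean):
            self.mean = np.asarray(mean, dtype=float)

        def skl_distance_full(self, other):
            return float(np.sum((self.mean - other.mean) ** 2))

    M = pairwiseSKL([_StubModel([0.0]), _StubModel([1.0]), _StubModel([3.0])])
    assert np.allclose(M, M.T) and np.all(np.diag(M) == 0)
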
# def parse_args():
#     # define parser
#     op = optparse.OptionParser()
#     # IO options
#     op.add_option('-i', '--input', action="store", dest="INPUT", default='/Volumes/c4dm-scratch/mi/seg/qupujicheng/features', type="str", help="Loading features from...")
#     op.add_option('-o', '--out', action="store", dest="OUTPUT", default='test/clustering_results', type="str", help="Writing clustering results to...")
#
#     return op.parse_args()
#
# options, args = parse_args()
#
# def main():
#
#     feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')]
#     feature_list.sort()
#     fobj_list = []
#
#     for feature in feature_list:
#         data = np.genfromtxt(join(options.INPUT, feature), delimiter=',', filling_values=0.0)
#         dim = data.shape[1] - 1
#         if dim == 1:
#             fo = FeatureObj()
#             fo.audio = feature[:feature.find('_vamp')]
#             fo.key = splitext(feature.strip(fo.audio + '_'))[0]
#             fo.timestamps = data[:, 0]  # the first column is the timestamps
#             fo.features = data[:, 1]
#             fobj_list.append(fo)
#
#         else:
#             for col in xrange(dim):
#                 fo = FeatureObj()
#                 fo.audio = feature[:feature.find('_vamp')]
#                 fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' % col
#                 fo.timestamps = data[:, 0]  # the first column records the timestamps
#                 fo.features = data[:, col+1][:, np.newaxis]
#                 fobj_list.append(fo)
#
#     timestamps = fobj_list[0].timestamps
#     features = map(lambda x: "%i:%s" % (x[0], x[1].key), enumerate(fobj_list))
#     print "Loading %d features:" % len(fobj_list)
#
#     # find the feature with the fewest frames
#     n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min()
#     n_features = len(fobj_list)
#
#     feature_matrix = np.zeros((n_frames, n_features))
#     print 'feature_matrix', feature_matrix.shape
#
#     # take the arrays from the feature objects and add them to a matrix
#     for i, f in enumerate(fobj_list):
#         feature_matrix[:, i] = f.features[:n_frames, 0]
#
#     # normalise the feature matrix, remove negative values, and add a small
#     # constant to the denominator for numerical stability
#     feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0)) + 0.0005)
#     feature_matrix[np.isnan(feature_matrix)] = 0.0
#
#     winlength = 5
#     stepsize = 2
#
#     gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize)
#     print 'number of GMMs:', len(gmm_list)
#     skl_matrix = pairwiseSKL(gmm_list)
#
#     rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list)
#     rc.test()
#     classification = rc.classification
#     neighborhood_size, average_div, node_rank = rc.getNodeRank()
#
#     f1 = np.array(zip(timestamps[:len(classification)], classification))
#     f2 = np.array(zip(timestamps[:len(neighborhood_size)], neighborhood_size))
#     f3 = np.array(zip(timestamps[:len(average_div)], average_div))
#     f4 = np.array(zip(timestamps[:len(node_rank)], node_rank))
#
#     np.savetxt(join(options.OUTPUT, 'classification') + '.csv', f1, delimiter=',')
#     np.savetxt(join(options.OUTPUT, 'neighborhood_size') + '.csv', f2, delimiter=',')
#     np.savetxt(join(options.OUTPUT, 'average_div') + '.csv', f3, delimiter=',')
#     np.savetxt(join(options.OUTPUT, 'node_rank') + '.csv', f4, delimiter=',')
#
# if __name__ == '__main__':
#     main()
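
# Hypothetical entry point for the illustrative sketches above; the original
# main() remains commented out.
if __name__ == '__main__':
    _node_example()
    _window_count_example()
    _pairwise_skl_example()
    _rclustering_example()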