comparison experiment-reverb/code/supervised_training_pca-bak.py @ 0:246d5546657c

initial commit, needs cleanup
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 14 Dec 2016 13:15:48 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:246d5546657c
1 #!/usr/bin/python2
2 # -*- coding: utf-8 -*-
3 """
4 Created on Thu Apr 23 11:53:17 2015
5
6 @author: mmxgn
7 """
8
9 # This file does the cluster estimation and the removal of outliers
10
11 from sys import argv, exit
12 from essentia.standard import YamlInput, YamlOutput
13 from essentia import Pool
14 from pca import *
15
16 from numpy import *
17 from sklearn import cluster
18 from sklearn.metrics import pairwise_distances
19 from sklearn.cluster import KMeans, MiniBatchKMeans
20 import matplotlib.pyplot as plt
21 from sklearn.mixture import GMM
22 from sklearn.naive_bayes import GaussianNB, MultinomialNB
23 from scipy.signal import decimate
24 #from hmmlearn import hmm
25 from sklearn import hmm
26 #from adpcm import adm, adm_reconstruct
27
28
def mse(A, B):
    """Return the mean squared error between two equally-shaped sequences.

    A, B: array-likes of identical shape; converted to numpy arrays.
    """
    # PEP 8 (E731): a named function instead of a lambda assignment.
    return ((array(A) - array(B)) ** 2).mean()
30
31
32
33
def adm_reconstruct(codeword, h, dmin=.01, dmax=.28):
    """Reconstruct a signal from its adaptive-delta-modulation code.

    codeword: (1, N) array of symbols in {-1, 0, 1} as produced by adm().
    h: starting level (the first sample of the original signal).
    dmin, dmax: smallest / largest step of the adaptive quantizer.

    Returns a (1, N) array holding the reconstructed staircase signal.
    """
    n_samples = codeword.shape[1]
    x = zeros((1, n_samples))
    x[0] = h

    level = h
    step_up = dmin    # positive step; doubles while the code keeps rising
    step_down = dmin  # negative step; doubles while the code keeps falling

    for i in range(n_samples):
        symbol = codeword[0, i]
        if symbol == 1:
            # Rising: step up, then grow the up-step (clamped at dmax).
            step_down = dmin
            level += step_up
            step_up = min(step_up * 2, dmax)
        elif symbol == -1:
            # Falling: step down, then grow the down-step (clamped at dmax).
            step_up = dmin
            level -= step_down
            step_down = min(step_down * 2, dmax)
        elif symbol == 0:
            # Signal within tolerance: reset both steps to the minimum.
            step_up = dmin
            step_down = dmin
        x[0, i] = level

    return x
62
def adm(x, dmin=.01, dmax=.28, tol=0.0001):
    """Adaptive delta modulation (ADM) encoder.

    Adapted from adeltamod.m by Gandhar Desai (gdesai),
    BITS Pilani Goa Campus, 28 Sept 2013.

    x: 1-D sequence of samples to encode.
    dmin, dmax: smallest / largest quantizer step.
    tol: dead-band; differences below it emit a 0 symbol.

    Returns (staircase, code, first_value):
      staircase -- (1, len(x)) array of the encoder's tracking signal,
      code      -- (1, len(x)) array of symbols in {-1, 0, 1},
      first_value -- x[0], needed by adm_reconstruct() as the start level.
    """
    n = len(x)
    staircase = zeros((1, n))
    code = zeros((1, n))

    level = x[0]
    step_up = dmin    # doubles on consecutive rises, clamped at dmax
    step_down = dmin  # doubles on consecutive falls, clamped at dmax

    for i in range(n):
        err = x[i] - level
        if abs(err) < tol:
            # Close enough: emit 0 and reset the step adaptation.
            bit = 0
            step_up = dmin
            step_down = dmin
        elif err >= 0:
            # Signal above the track: step up, grow the up-step.
            bit = 1
            step_down = dmin
            level += step_up
            step_up = min(step_up * 2, dmax)
        else:
            # Signal below the track: step down, grow the down-step.
            bit = -1
            step_up = dmin
            level -= step_down
            step_down = min(step_down * 2, dmax)

        staircase[0, i] = level
        code[0, i] = bit

    return staircase, code, x[0]
117
118 if __name__=="__main__":
119 if len(argv) != 2:
120 print "[EE] Wrong number of arguments"
121 print "[II] Correct syntax is:"
122 print "[II] \t%s <training_file>"
123 print "[II] where <training_file> is a .yaml file containing the"
124 print "[II] features of the dataset (try output2_stage/fulltraining-last.yaml)"
125 exit(-1)
126
127
128 infile = argv[1]
129
130 features_pool = YamlInput(filename = infile)()
131
132
133
134 feature_captions = features_pool.descriptorNames()
135 parameter_captions = []
136
137
138 for c in features_pool.descriptorNames():
139 if c.split('.')[0] == 'parameter':
140 parameter_captions.append(c)
141 if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter':
142 feature_captions.remove(c)
143
144
145
146 close('all')
147
148 print "[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0])
149 print "[II] %d Features Available: " % len(feature_captions)
150
151
152
153 print str(feature_captions).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7]
154
155 nfeatures_in = len(feature_captions)
156 nparameters_in = len(parameter_captions)
157 features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))
158
159 parameters_vector = zeros((nparameters_in, len(features_pool[parameter_captions[0]])))
160
161
162 for i in range(0, nfeatures_in):
163 features_vector[i, :] = features_pool[feature_captions[i]].T
164 for i in range(0, nparameters_in):
165 parameters_vector[i, :] = features_pool[parameter_captions[0]].T
166
167 print "[II] %d parameters used:" % len(parameter_captions)
168 print str(parameter_captions).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7].replace('parameter.','')
169
170 print "[II] Marking silent parts"
171
172 silent_parts = zeros((1, len(features_pool[feature_captions[i]].T)))
173
174 rms = features_vector[feature_captions.index('rms'), :]
175
176 # Implementing Hysteresis Gate -- High threshold is halfway between
177 # the mean and the max and Low is halfway between the mean dn the min
178
179 rms_threshold_mean = mean(rms)
180
181 rms_threshold_max = max(rms)
182 rms_threshold_min = min(rms)
183
184 rms_threshold_high = 0.1 * rms_threshold_mean
185 rms_threshold_low = 0.01 * rms_threshold_mean
186
187 for n in range(1, len(rms)):
188 prev = rms[n-1]
189 curr = rms[n]
190
191 if prev >= rms_threshold_high:
192 if curr < rms_threshold_low:
193 silent_parts[0,n] = 1
194 else:
195 silent_parts[0,n] = 0
196 elif prev <= rms_threshold_low:
197 if curr > rms_threshold_high:
198 silent_parts[0,n] = 0
199 else:
200 silent_parts[0,n] = 1
201 else:
202 silent_parts[0,n] = silent_parts[0,n-1]
203
204
205 if silent_parts[0,1] == 1:
206 silent_parts[0, 0] = 1
207
208
209 # plot(rms)
210 # plot(silent_parts.T)
211 # plot(ones((len(rms), 1))*rms_threshold_high)
212 # plot(ones((len(rms), 1))*rms_threshold_low)
213
214 active_index = invert(silent_parts.flatten().astype(bool))
215
216 # Keep only active parts
217
218 # Uncomment this
219 #features_vector = features_vector[:, active_index]
220 # parameters_vector = parameters_vector[:, active_index]
221
222 moments_vector = zeros((features_vector.shape[0], 2))
223
224 print "[II] Storing moments vector"
225 for i in range(0, features_vector.shape[0]):
226 mean_ = mean(features_vector[i,:])
227 std_ = std(features_vector[i,:], ddof=1)
228 moments_vector[i,0] = mean_
229 moments_vector[i,1] = std_
230
231 features_vector[i,:] = (features_vector[i,:] - mean_)/std_
232
233
234
235 print "[II] Extracting PCA configuration "
236
237 kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)
238
239 print "[II] Optimal number of PCs to keep: %d" % q
240
241 feature_captions_array = array(feature_captions)
242
243 features_to_keep = features_vector
244
245 features_to_keep = list(feature_captions_array[featurelist])
246 print "[II] Decided to keep %d features:" % len(features_to_keep)
247 print str(features_to_keep).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7]
248
249
250
251 # Keep the desired features
252 #Uncomment this
253 #features_kept_data = features_vector[featurelist,:]
254 features_lept_data = features_vector
255
256 # Generate the parameter clusters using k-means
257
258 # Uncomment this
259 features_vector = (kernel.T*features_kept_data)[0:q,:]
260
261
262
263 # parameters_k_means = KMeans(n_clusters = parameters_k, init='k-means++', max_iter=300, tol=0.0000001, verbose = 1)
264 parameters_k_means = KMeans(init='k-means++', n_init=10, max_iter=300, tol=0.0000001, verbose = 0)
265 # # parameters_k_means = MiniBatchKMeans(init='k-means++', max_iter=300, tol=0.00001, verbose = 1)
266 #
267 # # Quantize the differences of the parameters instead of the parameters themselves
268 # parameters_vector_diff = concatenate((zeros((shape(parameters_vector)[0],1)),diff(parameters_vector, axis=1)),axis=1)
269 # features_vector_diff = concatenate((zeros((shape(features_vector)[0],1)),diff(features_vector,axis=1)),axis=1)
270 #
271 # # Delete this afterwards
272 # # features_vector = features_vector_diff
273 # parameters_k_means.fit(parameters_vector_diff.T)
274 parameters_k_means.fit(parameters_vector.T)
275 #
276 parameters_k_means_centers = parameters_k_means.cluster_centers_
277 parameters_k_means_labels = parameters_k_means.labels_
278 #
279 parameters_vector_estimated = zeros(shape(parameters_vector))
280 #
281 for n in range(0, len(parameters_vector_estimated[0])):
282 parameters_vector_estimated[:,n] = parameters_k_means_centers[parameters_k_means_labels[n]]
283 #
284 ## plot(parameters_vector[0])
285 # # plot(parameters_vector_estimated[0])
286 #
287 # # PROvLIMA EDW
288 print "[II] Parameters MSE for %d clusters: %.3f" % (len(parameters_k_means.cluster_centers_), mse(parameters_vector, parameters_vector_estimated))
289
290 #
291
292
293
294
295
296 # print "[II] Clustering features."
297 #
298 # n_clusters = 100
299 # features_clustering = GMM(n_components = n_clusters, covariance_type='diag')
300 #
301 # features_clustering.fit( features_vector.T, y=parameters_k_means_labels)
302 #
303 # features_clustering_means = features_clustering.means_
304 # features_clustering_labels = features_clustering.predict(features_vector.T)
305 # features_clustering_sigmas = features_clustering.covars_
306 #
307 # features_vector_estimated = zeros(shape(features_vector))
308 #
309 #
310 # for n in range(0, len(features_vector_estimated[0])):
311 # features_vector_estimated[:,n] = features_clustering_means[features_clustering_labels[n]]
312 #
313 # # for n in range(0,features_vector.shape[0]):
314 # # hist(features_vector[1]-features_vector_estimated[1], 256)
315 # std(features_vector[1]-features_vector_estimated[1], ddof=1)
316 # mean(features_vector[1]-features_vector_estimated[1])
317 #
318 # print "[II] Features MSE for %d clusters: %.3f" % (n_clusters, mse(features_vector, features_vector_estimated))
319 #
320 #
321 # print "[II] Testing Gaussian Naive Bayes Classifier"
322 #
323 # gnb = GaussianNB()
324 # gnb.fit(features_vector.T, parameters_k_means_labels)
325 # parameter_labels_estimated = gnb.predict(features_vector.T)
326 #
327 # print "[II] Raw Parameters - Gaussian Naive Bayes classifier testing ratio: %.4f" % (float(sum((parameter_labels_estimated == parameters_k_means_labels).astype(float)))/float(len(parameters_k_means_labels)))
328 #
329 #
330 # print "[II] Trying ADM-coded parameters"
331 # UpsamplingFactor = 100
332 # print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor
333 #
334 #
335 # # Upsampled features and parameters
336 # features_vector_upsampled = repeat(features_vector,UpsamplingFactor, axis=1)
337 # feature_labels_upsampled = repeat(features_clustering_labels,UpsamplingFactor, axis=0)
338 # parameters_vector_upsampled = repeat(parameters_vector,UpsamplingFactor, axis=1)
339 #
340 # parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled)))
341 # parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled)))
342 # parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0],1)))
343 #
344 # # Reconstructed parameters
345 #
346 # parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled)))
347 #
348 # dmin = 0.001
349 # dmax = 0.28
350 # tol = 0.001
351 # for i in range(0, parameters_vector_upsampled.shape[0]):
352 # out, code, h = adm(parameters_vector_upsampled[i,:],dmin=dmin,dmax=dmax,tol=tol)
353 # parameters_vector_upsampled_adm[i,:] = out
354 # parameters_vector_upsampled_code[i,:] = code
355 # parameters_vector_upsampled_firstval[i, 0] = h
356 #
357 #
358 # # Reconstruct-ADM
359 # parameters_vector_upsampled_reconstructed[i,:] = adm_reconstruct(parameters_vector_upsampled_code[i,:],parameters_vector_upsampled_firstval[i,0], dmin=dmin,dmax=dmax)
360 #
361 #
362 # # plot(parameters_vector_upsampled_adm.T, 'r--')
363 #
364 #
365 # # plot(parameters_vector_upsampled.T)
366 # # plot(parameters_vector_upsampled_reconstructed.T, 'g.')
367 #
368 #
369 #
370 # parameters_vector_reconstructed = zeros(shape(parameters_vector))
371 # for n in range(0, parameters_vector.shape[1]):
372 # parameters_vector_reconstructed[:,n] = parameters_vector_upsampled_reconstructed[:,n*UpsamplingFactor]
373 #
374 #
375 # mse_adm = mse(parameters_vector_reconstructed, parameters_vector)
376 #
377 # print "[II] Expected ADM reconstruction MSE: %.4f" % mse_adm
378 #
379 # # figure()
380 # #plot(parameters_vector.T)
381 # # plot(parameters_vector_reconstructed.T)
382 #
383 # print "[II] Building HMM transition, emission matrices and priors"
384 #
385 # transmat = zeros((3,3))
386 # startprob = zeros((3,))
387 # emissionmat = zeros((3, n_clusters))
388 #
389 #
390 # state_labels = parameters_vector_upsampled_code + 1
391 # stateseq = state_labels.T
392 #
393 # for n in range(0,shape(parameters_vector_upsampled_code)[1]):
394 # if n>0:
395 # transmat[state_labels[0,n-1],state_labels[0,n]] += 1
396 # startprob[state_labels[0,n]] +=1
397 # emissionmat[state_labels[0,n],feature_labels_upsampled[n]] += 1
398 #
399 #
400 # for n in range(0, transmat.shape[0]):
401 # transmat[n,:]/=sum(transmat[n,:])
402 # emissionmat[n,:]/=sum(emissionmat[n,:])
403 #
404 #
405 # transmat = matrix(transmat)
406 # emissionmat = matrix(emissionmat)
407 # # Prior
408 # startprob = startprob/sum(startprob)
409 # startprob = ravel(startprob)
410 #
411 # # Vocabulary
412 #
413 # model = hmm.GMMHMM(n_mix=n_clusters, n_components=3, covariance_type="diag")
414 # model.means_ = features_clustering.means_
415 # model.covars_ = features_clustering.covars_
416 #
417 # features_vector_array = array(features_vector)
418 # features_vector_upsampled_array=array(features_vector_upsampled)
419 #
420 # model.fit([features_vector_array.T])
421 # stateseq_estimated = model.predict(features_vector_upsampled_array.T)
422 #
423 # parameters_vector_upsampled_reconstructed_decoded = matrix(zeros(shape(parameters_vector_upsampled)))
424 #
425 #
426 # plot(stateseq_estimated)
427 # plot(stateseq)
428 #
429 # code_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
430 # code_estimated[0,:] = stateseq_estimated - 1
431 #
432 #
433 #
434 # parameters_vector_upsampled_reconstructed_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
435 #
436 # for i in range(0, parameters_vector_upsampled.shape[0]):
437 # parameters_vector_upsampled_reconstructed_estimated[i,:] = adm_reconstruct(code_estimated,parameters_vector_upsampled_firstval[i,0], dmin=dmin,dmax=dmax)
438 # figure()
439 # plot(parameters_vector_upsampled_reconstructed_estimated.T)
440 # plot(parameters_vector_upsampled.T)