experiment-reverb/code/train.py @ 0:246d5546657c

initial commit, needs cleanup
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 14 Dec 2016 13:15:48 +0000
#!/usr/bin/python2
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 11:53:17 2015

@author: mmxgn
"""

# This file conditions the features (silence gating, standardization, PCA),
# ADM-codes the parameter tracks, and compares classifiers (Naive Bayes,
# HMM, SVM) that predict the parameter states from the features

from sys import argv, exit
from essentia.standard import YamlInput, YamlOutput
from essentia import Pool
from pca import *

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans, MiniBatchKMeans
from matplotlib.pyplot import *
#from sklearn.mixture import GMM
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import scipy.signal  # provides scipy.signal.gaussian() used in smooth_matrix_1D
from scipy.signal import decimate
from sklearn import cross_validation

#from hmmlearn import hmm
from hmmlearn.hmm import GMM
from hmmlearn import hmm

from sklearn import svm
#from adpcm import adm, adm_reconstruct


mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()


def smooth_matrix_1D(X):
    """Smooth each row of X with a normalized 51-tap Gaussian window
    (sigma = 4), zero-padding 50 columns on each side so the edges are
    not smeared into the data."""
    window = scipy.signal.gaussian(51, 4)
    window = window / sum(window)
    intermx = zeros((X.shape[0], X.shape[1] + 100))
    intermx[:, 50:-50] = X

    for m in range(0, X.shape[0]):
        intermx[m, :] = convolve(ravel(intermx[m, :]), window, 'same')

    return intermx[:, 50:-50]

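# Illustrative sketch (demo only, not called by the script): smoothing keeps
# the matrix shape, e.g.
#   noisy = random.rand(2, 300)
#   smooth_matrix_1D(noisy).shape   # -> (2, 300), rows low-pass filtered
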
def adm_reconstruct(codeword, h, dmin=.01, dmax=.28):
    """Reconstruct a signal from an ADM codeword and its first value h."""
    x = zeros((1, codeword.shape[1]))

    delta1 = dmin
    delta2 = dmin
    Sum = h

    for i in range(0, codeword.shape[1]):
        if codeword[0, i] == 0:
            delta1 = dmin
            delta2 = dmin
        elif codeword[0, i] == 1:
            delta2 = dmin
            Sum += delta1
            delta1 *= 2
            if delta1 > dmax:
                delta1 = dmax
        elif codeword[0, i] == -1:
            delta1 = dmin
            Sum -= delta2
            delta2 *= 2
            if delta2 > dmax:
                delta2 = dmax
        x[0, i] = Sum
    return x

def adm(x, dmin=.01, dmax=.28, tol=0.0001):
    """Adaptive delta modulation coder.

    Returns the staircase approximation, the codeword vector (+1/0/-1 per
    sample), and the first sample value, which is needed for reconstruction.

    Adapted from adeltamod.m (Adaptive Delta Modulator) by Gandhar Desai
    (gdesai), BITS Pilani Goa Campus, 28 Sept 2013.
    """
    xsig = x
    Lx = len(x)

    ADMout = zeros((1, Lx))
    codevec = zeros((1, Lx))

    Sum = x[0]
    delta1 = dmin
    delta2 = dmin
    mult1 = 2
    mult2 = 2
    for i in range(0, Lx):
        # Within tolerance of the running estimate: emit 0 and reset deltas
        if (abs(xsig[i] - Sum) < tol):
            bit = 0
            delta2 = dmin
            delta1 = dmin
        # Below the signal: emit +1, step up, and grow the up-step
        elif (xsig[i] >= Sum):
            bit = 1
            delta2 = dmin
            Sum += delta1
            delta1 *= mult1
            if delta1 > dmax:
                delta1 = dmax
        # Above the signal: emit -1, step down, and grow the down-step
        else:
            bit = -1
            delta1 = dmin
            Sum -= delta2
            delta2 *= mult2
            if delta2 > dmax:
                delta2 = dmax

        ADMout[0, i] = Sum
        codevec[0, i] = bit

    return ADMout, codevec, x[0]

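# Illustrative sketch (not part of the original pipeline): round-trip a
# smooth parameter track through adm()/adm_reconstruct() and report the
# tracking error. The signal and settings here are made up for the demo.
def _adm_roundtrip_demo():
    t = linspace(0, 1, 500)
    x = 0.5 + 0.3 * sin(2 * pi * t)  # slowly varying test signal
    admout, code, firstval = adm(x, dmin=0.001, dmax=0.28, tol=0.001)
    x_hat = adm_reconstruct(code, firstval, dmin=0.001, dmax=0.28)
    print "[II] ADM round-trip MSE: %.6f" % mse(x, ravel(x_hat))
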
if __name__ == "__main__":
    if len(argv) != 2:
        print "[EE] Wrong number of arguments"
        print "[II] Correct syntax is:"
        print "[II] \t%s <training_file>" % argv[0]
        print "[II] where <training_file> is a .yaml file containing the"
        print "[II] features of the dataset (try output2_stage/fulltraining-last.yaml)"
        exit(-1)

    n_clusters = 4
    UpsamplingFactor = 1
    dmin = 0.001
    dmax = 0.28
    tol = 0.001

    infile = argv[1]

    features_pool = YamlInput(filename=infile)()

    feature_captions = features_pool.descriptorNames()
    parameter_captions = []

    for c in features_pool.descriptorNames():
        if c.split('.')[0] == 'parameter':
            parameter_captions.append(c)
        if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter':
            feature_captions.remove(c)

    # close('all')

    print "[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0])
    print "[II] %d Features Available: " % len(feature_captions)

    print str(feature_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]

    nfeatures_in = len(feature_captions)
    nparameters_in = len(parameter_captions)
    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))
    parameters_vector = zeros((nparameters_in, len(features_pool[parameter_captions[0]])))

    for i in range(0, nfeatures_in):
        features_vector[i, :] = features_pool[feature_captions[i]].T
    for i in range(0, nparameters_in):
        parameters_vector[i, :] = features_pool[parameter_captions[i]].T

    print "[II] %d parameters used:" % len(parameter_captions)
    print str(parameter_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7].replace('parameter.', '')

    print "[II] Marking silent parts"

    silent_parts = zeros((1, features_vector.shape[1]))

    rms = features_vector[feature_captions.index('rms'), :]

    # Hysteresis gate on the RMS envelope: both thresholds are fractions of
    # the mean RMS value (high = 0.1 * mean, low = 0.01 * mean)

    rms_threshold_mean = mean(rms)

    rms_threshold_max = max(rms)
    rms_threshold_min = min(rms)

    rms_threshold_high = 0.1 * rms_threshold_mean
    rms_threshold_low = 0.01 * rms_threshold_mean

    for n in range(1, len(rms)):
        prev = rms[n - 1]
        curr = rms[n]

        if prev >= rms_threshold_high:
            if curr < rms_threshold_low:
                silent_parts[0, n] = 1
            else:
                silent_parts[0, n] = 0
        elif prev <= rms_threshold_low:
            if curr > rms_threshold_high:
                silent_parts[0, n] = 0
            else:
                silent_parts[0, n] = 1
        else:
            silent_parts[0, n] = silent_parts[0, n - 1]

    # The first frame inherits the state of the second
    if silent_parts[0, 1] == 1:
        silent_parts[0, 0] = 1

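    # Illustrative sketch (assumption: refactoring for clarity, not used by
    # the pipeline): the same hysteresis gate as a reusable function that
    # returns a boolean mask of silent frames.
    def hysteresis_silence_mask(env, low, high):
        mask = zeros(len(env), dtype=bool)
        for n in range(1, len(env)):
            if env[n - 1] >= high:
                mask[n] = env[n] < low
            elif env[n - 1] <= low:
                mask[n] = not (env[n] > high)
            else:
                mask[n] = mask[n - 1]
        if len(env) > 1:
            mask[0] = mask[1]
        return mask
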
    active_index = invert(silent_parts.flatten().astype(bool))

    # Keep only the active (non-silent) parts

    features_vector = features_vector[:, active_index]

    moments_vector = zeros((features_vector.shape[0], 2))

    print "[II] Storing moments vector"
    for i in range(0, features_vector.shape[0]):
        mean_ = mean(features_vector[i, :])
        std_ = std(features_vector[i, :], ddof=1)
        moments_vector[i, 0] = mean_
        moments_vector[i, 1] = std_

        features_vector[i, :] = (features_vector[i, :] - mean_) / std_

    features_vector_original = features_vector

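    # Illustrative sketch (assumption: helper for interpretation only, not
    # used below): map standardized rows back to their original scale with
    # the stored moments.
    def unstandardize(X, moments):
        Y = array(X, dtype=float).copy()
        for i in range(0, Y.shape[0]):
            Y[i, :] = Y[i, :] * moments[i, 1] + moments[i, 0]
        return Y
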
    print "[II] Extracting PCA configuration "

    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    print "[II] Optimal number of PCs to keep: %d" % q

    feature_captions_array = array(feature_captions)

    features_to_keep = list(feature_captions_array[featurelist])
    print "[II] Decided to keep %d features:" % len(features_to_keep)
    print str(features_to_keep).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]

    features_kept_data = features_vector[featurelist, :]

    # Project the kept features onto the first q principal components
    features_vector = (kernel.T * features_kept_data)[0:q, :]

    parameters_k_means = KMeans(init='k-means++', n_init=10, max_iter=300, tol=0.0000001, verbose=0)

    print "[II] Trying ADM-coded parameters"
    print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor

    # Upsampled features and parameters
    features_vector_upsampled = smooth_matrix_1D(repeat(features_vector, UpsamplingFactor, axis=1))

    # feature_labels_upsampled = repeat(features_clustering_labels, UpsamplingFactor, axis=0)
    parameters_vector_upsampled = repeat(parameters_vector, UpsamplingFactor, axis=1)

    # parameters_vector_upsampled = smooth_matrix_1D(parameters_vector_upsampled)

    parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled)))
    parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled)))
    parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0], 1)))

    # Reconstructed parameters
    parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled)))

    def adm_matrix(X, dmin=0.001, dmax=0.28, tol=0.001):
        """Apply adm() to each row of X."""
        out = matrix(zeros(shape(X)))
        code = matrix(zeros(shape(X)))
        firstval = matrix(zeros((X.shape[0], 1)))

        for i in range(0, X.shape[0]):
            out[i, :], code[i, :], firstval[i, 0] = adm(X[i, :], dmin=dmin, dmax=dmax, tol=tol)

        return out, code, firstval

    def adm_matrix_reconstruct(code, firstval, dmin=0.001, dmax=0.28):
        """Apply adm_reconstruct() to each row of a codeword matrix."""
        X = matrix(zeros(shape(code)))
        for i in range(0, code.shape[0]):
            X[i, :] = adm_reconstruct(code[i, :], firstval[i, 0], dmin=dmin, dmax=dmax)

        return X

    parameters_vector_upsampled_adm, parameters_vector_upsampled_code, parameters_vector_upsampled_firstval = adm_matrix(parameters_vector_upsampled, dmin, dmax, tol)

    def diff_and_pad(X):
        """First difference along axis 1, zero-padded on the left so the
        output has the same width as X."""
        return concatenate((
            zeros((shape(X)[0], 1)),
            diff(X, axis=1)),
            axis=1)

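    # For example (demo only): diff_and_pad(matrix([[1., 2., 4.]])) gives
    # matrix([[0., 1., 2.]]) -- a zero first column, then successive differences.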

    print "[II] Clustering features."

    features_clustering = GMM(n_components=n_clusters, covariance_type='diag')

    features_clustering.fit(features_vector_upsampled.T)

    features_clustering_means = features_clustering.means_
    features_clustering_labels = features_clustering.predict(features_vector_upsampled.T)
    features_clustering_sigmas = features_clustering.covars_

    # Quantize each frame to its cluster mean to measure how much structure
    # the clustering captures
    features_vector_upsampled_estimated = zeros(shape(features_vector_upsampled))

    for n in range(0, len(features_vector_upsampled_estimated[0])):
        features_vector_upsampled_estimated[:, n] = features_clustering_means[features_clustering_labels[n]]

    print "[II] Features MSE for %d clusters: %.3f" % (n_clusters, mse(features_vector_upsampled, features_vector_upsampled_estimated))


    def happy_curve_classification(data, classes, estimator, Nd=1):
        """Classification accuracy as a function of the training-set size."""
        print "[II] Generating Happy Curve "
        from copy import deepcopy
        estimator_fulldata = deepcopy(estimator)
        estimator_fulldata.fit(data, classes)
        labels = estimator_fulldata.predict(data)

        # 1. Split data in two, training and testing
        Ntr = int(round(data.shape[0] / 2))  # Training dataset size
        Nts = data.shape[0] - Ntr            # Testing dataset size

        ratios = []

        for n in range(Nd, Ntr):
            train, test, trainl, testl = cross_validation.train_test_split(data, classes, test_size=0.5, random_state=0)
            train = train[0:n, :]
            trainl = trainl[0:n]
            estimator_ = deepcopy(estimator)
            estimator_.fit(train, trainl)
            labels = estimator_.predict(test)

            ratio = sum(array(testl == labels).astype(float)) / len(labels)
            ratios.append(ratio)

        return ratios

    def cross_validate_classification(data, classes, estimator):
        print "[II] Crossvalidating... "
        from copy import deepcopy
        estimator_fulldata = deepcopy(estimator)
        estimator_fulldata.fit(data, classes)

        percents = arange(0.1, 0.9, 0.1)
        MSEs = []
        labels = estimator_fulldata.predict(data)

        print "[II] for full training-testing: %.2f" % (sum(array(classes == labels).astype(float)) / len(labels))

        for p in percents:
            train, test, trainlabels, testlabels = cross_validation.train_test_split(data, classes, test_size=p, random_state=0)
            estimator_ = deepcopy(estimator)
            estimator_.fit(train, trainlabels)
            labels = estimator_.predict(test)
            print "[II] for training(%.2f)-testing(%.2f): %.2f" % ((1 - p), p, sum(array(testlabels == labels).astype(float)) / len(labels))

        return MSEs

    def cross_validate_clustering(data, estimator):
        print "[II] Crossvalidating... "
        estimator_fulldata = estimator
        estimator_fulldata.fit(data)

        means = estimator_fulldata.means_

        percents = arange(0.1, 0.6, 0.1)
        MSEs = []
        reconstructed = zeros(shape(data))
        labels = estimator.predict(data)
        for n in range(0, len(reconstructed)):
            reconstructed[n, :] = means[labels[n]]

        MSEs.append(mse(data, reconstructed))
        for p in percents:
            train, test = cross_validation.train_test_split(data, test_size=p, random_state=0)
            train = matrix(train)
            test = matrix(test)

            estimator.fit(train)
            means = estimator.means_
            labels = estimator.predict(test)
            reconstructed = zeros(shape(test))
            for n in range(0, len(reconstructed)):
                reconstructed[n, :] = means[labels[n]]

            m = mse(test, reconstructed)

            print "[II] MSE for clustering crossvalidated data %.2f-%.2f: %.5f" % ((1 - p), p, m)
            MSEs.append(m)

        print "[II] Crossvalidation complete"

        return MSEs

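    # Illustrative usage (assumption: not run by default, since it refits the
    # estimator in place):
    # cross_validate_clustering(features_vector_upsampled.T, features_clustering)
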
    # Construct the parameters alphabet: each distinct column vector of the
    # parameter code matrix becomes one symbol.

    def vector_to_states(X):
        """
        Input: an MxN matrix with N samples of M variables.
        Output: the state sequence `state_seq', a codeword dictionary
        `parameters_alphabet', and its inverse `parameters_alphabet_inv'.
        """

        parameters_alphabet = {}
        n = 0

        for i in range(0, X.shape[1]):
            vec = tuple(ravel(X[:, i]))
            if vec not in parameters_alphabet:
                parameters_alphabet[vec] = n
                n += 1

        parameters_alphabet_inv = dict([(parameters_alphabet[m], m) for m in parameters_alphabet])

        state_seq = array([parameters_alphabet[tuple(ravel(X[:, m]))] for m in range(0, X.shape[1])])

        return state_seq, parameters_alphabet, parameters_alphabet_inv

    def states_to_vector(predicted, parameters_alphabet_inv):
        estimated = matrix(zeros((len(parameters_alphabet_inv[0]), len(predicted))))
        for i in range(0, len(predicted)):
            estimated[:, i] = matrix(parameters_alphabet_inv[predicted[i]]).T

        return estimated

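    # Illustrative round trip (demo only): vector_to_states() and
    # states_to_vector() invert each other, e.g.
    #   seq, alpha, alpha_inv = vector_to_states(matrix([[0, 1, 1], [1, 0, 1]]))
    #   states_to_vector(seq, alpha_inv)   # == matrix([[0, 1, 1], [1, 0, 1]])
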
    state_seq, parameters_alphabet, parameters_alphabet_inv = vector_to_states(parameters_vector_upsampled_code)

    parameters_change_variable = matrix(diff_and_pad(parameters_vector_upsampled) != 0).astype(int)

    changes_state_seq, changes_parameters_alphabet, changes_parameters_alphabet_inv = vector_to_states(parameters_change_variable)

    # This is an HMM that just codes the changes:
    # it has only two states, "change" and "stay the same".

    # Alternative: classify the ADM code directly instead of the raw values:
    # parameters_vector_upsampled = parameters_vector_upsampled_code
    # parameters_state_orig, parameter_state_alphabet_orig, parameter_state_alphabet_inv_orig = vector_to_states(parameters_vector)

    parameters_state, parameter_state_alphabet, parameter_state_alphabet_inv = vector_to_states(parameters_vector_upsampled)

    print "[II] Testing Gaussian Naive Bayes Classifier"
    gnb = GaussianNB()
    gnb.fit(features_vector_upsampled.T, parameters_state)

    parameters_state_estimated = gnb.predict(features_vector_upsampled.T)

    output = states_to_vector(parameters_state_estimated, parameter_state_alphabet_inv)

    figure()
    subplot(211)
    plot(parameters_vector_upsampled.T)
    title('Parameter value upsampled by a factor of %d' % UpsamplingFactor)
    ylabel('value')
    xlabel('frame #')
    subplot(212)
    #plot(smooth_matrix_1D(output).T)
    plot(output.T)
    ylabel('value')
    xlabel('frame #')
    #cross_validate_classification(features_vector_upsampled.T, parameters_state, gnb)
    # hc = happy_curve_classification(features_vector_upsampled.T, parameters_state, gnb)

    # figure()
    # plot(hc)
    # figure()

    print "[II] Trying HMM classification"

    # In order to do classification with HMMs, we need to:
    # 1. Split the parameters into classes
    # 2. Train one model per class
    # 3. Feed our data to all the models
    # 4. Check which model scores best and assign its class to the window

    class HmmClassifier:
        """One GaussianHMM per parameter state; windows of N frames are
        assigned to the state whose model scores them highest."""

        def __init__(self, N=2, n_components=1):
            self.n_components = n_components
            self.chain_size = N
            self.hmms_ = []
            self.N = N

        def fit(self, X, states):
            self.n_states = len(unique(states))

            for n in range(0, self.n_states):
                hmm_ = hmm.GaussianHMM(n_components=self.n_components, covariance_type='full')

                # Get the training data for this class
                vector = X[states == n, :]

                # Fit the HMM
                hmm_.fit([vector])

                # And append it to the list
                self.hmms_.append(hmm_)

        def predict(self, X):
            labels = zeros((X.shape[0],))
            N = self.N

            m = 0
            scores = zeros((1, self.n_states))

            # Score each window of N frames under every class model and
            # label the whole window with the best-scoring class
            while m * N < X.shape[0]:
                if (m + 1) * N > X.shape[0]:
                    testdata = X[m * N:, :]
                else:
                    testdata = X[m * N:(m + 1) * N, :]

                for i in range(0, self.n_states):
                    scores[0, i] = self.hmms_[i].score(testdata)

                if (m + 1) * N > X.shape[0]:
                    labels[m * N:] = argmax(scores)
                else:
                    labels[m * N:(m + 1) * N] = argmax(scores)

                m += 1

            return labels

    N = 2
    n_components = 1

    hmmc = HmmClassifier(N=N, n_components=n_components)
    hmmc.fit(features_vector_upsampled.T, parameters_state)

    # hchmmc = happy_curve_classification(features_vector_upsampled.T, parameters_state, hmmc, Nd=100)
    # cross_validate_classification(features_vector_upsampled.T, parameters_state, hmmc)

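    # Illustrative sketch (demo only, not part of the original run): frame
    # accuracy of the window-based HMM labels against the true states.
    # hmm_labels = hmmc.predict(features_vector_upsampled.T)
    # print "[II] HMM frame accuracy: %.2f" % mean((hmm_labels == parameters_state).astype(float))
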
    labels = hmmc.predict(features_vector_upsampled.T)
    estimated = states_to_vector(labels, parameter_state_alphabet_inv)
    plot(estimated.T, 'r--')

    title('Estimated parameter values')
    legend(['Naive Bayes Classifier', 'HMM chain size %d (%.1fms)' % (N, float(N) / UpsamplingFactor * 23.0)])

    ylabel('value')
    xlabel('frame #')

    close('all')

    plot(features_clustering_labels / float(max(features_clustering_labels)))
    plot(parameters_vector_upsampled.T / max(ravel(parameters_vector_upsampled)))

    def plot_clusters(x, labels):
        """Scatter plot of the first two feature dimensions, coloured and
        shaped by class label."""
        figure()
        symbols = ['>', 'x', '.', '<', 'v']
        colors = ['b', 'r', 'g', 'm', 'c']

        for r in range(0, x.shape[0]):
            scatter(x[r, 0], x[r, 1], c=colors[int(labels[r]) % len(colors)], marker=symbols[int(labels[r]) % len(symbols)])

    plot_clusters(features_vector_upsampled.T, parameters_state)

    # SVM

    X = features_vector_upsampled.T
    y = parameters_state

    clf = svm.SVC()
    clf.fit(X, y)

    parameters_state_y = clf.predict(X)
    #plot(parameters)
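    # Illustrative sketch (demo only): training-set accuracy of the SVM,
    # comparable to the Naive Bayes and HMM figures above.
    # print "[II] SVM frame accuracy: %.2f" % mean((parameters_state_y == y).astype(float))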