#!/usr/bin/python2
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 11:53:17 2015

@author: mmxgn
"""

# This file does the cluster estimation and the removal of outliers

from sys import argv, exit

from essentia.standard import YamlInput, YamlOutput
from essentia import Pool
from pca import *

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib.pyplot as plt
#from sklearn.mixture import GMM
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from scipy.signal import decimate
from sklearn import cross_validation

#from hmmlearn import hmm
from hmmlearn.hmm import GMM
from hmmlearn import hmm
#from adpcm import adm, adm_reconstruct


mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()
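# Quick sanity check for the helper above (illustrative only):
#   mse([0, 1], [0, 3]) == ((0 - 0) ** 2 + (1 - 3) ** 2) / 2.0 == 2.0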


def adm_reconstruct(codeword, h, dmin=.01, dmax=.28):
    # Decode an ADM codeword sequence back into a staircase signal, starting
    # from the first sample value h and mirroring the encoder's step-size
    # adaptation.
    x = zeros((1, codeword.shape[1]))

    delta1 = dmin
    delta2 = dmin
    Sum = h

    x[0] = h
    for i in range(0, codeword.shape[1]):
        if codeword[0, i] == 0:
            delta1 = dmin
            delta2 = dmin

        elif codeword[0, i] == 1:
            delta2 = dmin
            Sum += delta1
            delta1 *= 2
            if delta1 > dmax:
                delta1 = dmax

        elif codeword[0, i] == -1:
            delta1 = dmin
            Sum -= delta2
            delta2 *= 2
            if delta2 > dmax:
                delta2 = dmax
        x[0, i] = Sum
    return x

def adm(x, dmin=.01, dmax=.28, tol=0.0001):

    # Adaptive delta modulation adapted from:
    # (adeltamod.m)
    #
    # Adaptive Delta Modulator
    # by Gandhar Desai (gdesai)
    # BITS Pilani Goa Campus
    # Date: 28 Sept, 2013

    xsig = ravel(x)  # accept both 1-D arrays and single-row matrices

    Lx = len(xsig)

    ADMout = zeros((1, Lx))
    codevec = zeros((1, Lx))

    Sum = xsig[0]
    delta1 = dmin
    delta2 = dmin
    mult1 = 2
    mult2 = 2
    for i in range(0, Lx):
        # print abs(xsig[i] - Sum)
        if abs(xsig[i] - Sum) < tol:
            # Within tolerance: emit 0 and reset both step sizes
            bit = 0
            delta2 = dmin
            delta1 = dmin

        elif xsig[i] >= Sum:
            # Signal above the running estimate: step up, grow the up-step
            bit = 1
            delta2 = dmin
            Sum += delta1
            delta1 *= mult1
            if delta1 > dmax:
                delta1 = dmax

        else:
            # Signal below the running estimate: step down, grow the down-step
            bit = -1
            delta1 = dmin
            Sum -= delta2
            delta2 *= mult2
            if delta2 > dmax:
                delta2 = dmax

        ADMout[0, i] = Sum
        codevec[0, i] = bit

    return ADMout, codevec, xsig[0]
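

# Illustrative sketch (not part of the original experiment): round-trip a
# short synthetic ramp through the encoder/decoder pair above. The decoder
# mirrors the encoder's state machine, so reconstructing from the codeword
# and the first sample value should reproduce the encoder output exactly.
def _adm_roundtrip_demo():
    t = linspace(0, 1, 100)
    x = 0.5 * t  # slow ramp, stays well inside the step-size limits
    admout, code, firstval = adm(x, dmin=.01, dmax=.28, tol=0.0001)
    x_hat = adm_reconstruct(code, firstval, dmin=.01, dmax=.28)
    print "[II] ADM roundtrip MSE (should be ~0): %.6f" % mse(admout, x_hat)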

if __name__ == "__main__":
    if len(argv) != 2:
        print "[EE] Wrong number of arguments"
        print "[II] Correct syntax is:"
        print "[II] \t%s <training_file>" % argv[0]
        print "[II] where <training_file> is a .yaml file containing the"
        print "[II] features of the dataset (try output2_stage/fulltraining-last.yaml)"
        exit(-1)

    n_clusters = 4
    UpsamplingFactor = 1
    dmin = 0.001
    dmax = 0.28
    tol = 0.001

    infile = argv[1]

    features_pool = YamlInput(filename=infile)()

    feature_captions = features_pool.descriptorNames()
    parameter_captions = []

    for c in features_pool.descriptorNames():
        if c.split('.')[0] == 'parameter':
            parameter_captions.append(c)
        if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter':
            feature_captions.remove(c)

    plt.close('all')

    print "[II] Loaded training data from %s (%s)" % (infile, features_pool['metadata.date'][0])
    print "[II] %d features available:" % len(feature_captions)

    for c in feature_captions:
        print "[II]\t %s" % c

    nfeatures_in = len(feature_captions)
    nparameters_in = len(parameter_captions)
    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))

    parameters_vector = zeros((nparameters_in, len(features_pool[parameter_captions[0]])))

    for i in range(0, nfeatures_in):
        features_vector[i, :] = features_pool[feature_captions[i]].T
    for i in range(0, nparameters_in):
        parameters_vector[i, :] = features_pool[parameter_captions[i]].T

    print "[II] %d parameters used:" % len(parameter_captions)
    for c in parameter_captions:
        print "[II]\t %s" % c.replace('parameter.', '')

    print "[II] Marking silent parts"

    silent_parts = zeros((1, features_vector.shape[1]))

    rms = features_vector[feature_captions.index('rms'), :]

    # Hysteresis gate on the RMS envelope: the high threshold is 10% and the
    # low threshold 1% of the mean RMS. A frame becomes silent when the RMS
    # drops below the low threshold and stays silent until it rises above the
    # high threshold again.

    rms_threshold_mean = mean(rms)

    rms_threshold_max = max(rms)
    rms_threshold_min = min(rms)

    rms_threshold_high = 0.1 * rms_threshold_mean
    rms_threshold_low = 0.01 * rms_threshold_mean

    for n in range(1, len(rms)):
        prev = rms[n - 1]
        curr = rms[n]

        if prev >= rms_threshold_high:
            if curr < rms_threshold_low:
                silent_parts[0, n] = 1
            else:
                silent_parts[0, n] = 0
        elif prev <= rms_threshold_low:
            if curr > rms_threshold_high:
                silent_parts[0, n] = 0
            else:
                silent_parts[0, n] = 1
        else:
            silent_parts[0, n] = silent_parts[0, n - 1]

    # Propagate the initial state back to the very first frame
    if silent_parts[0, 1] == 1:
        silent_parts[0, 0] = 1


    # plot(rms)
    # plot(silent_parts.T)
    # plot(ones((len(rms), 1)) * rms_threshold_high)
    # plot(ones((len(rms), 1)) * rms_threshold_low)

    active_index = invert(silent_parts.flatten().astype(bool))

    # Keep only the active (non-silent) parts

    features_vector = features_vector[:, active_index]
    # parameters_vector = parameters_vector[:, active_index]

    moments_vector = zeros((features_vector.shape[0], 2))

    print "[II] Storing moments vector"
    # Standardize each feature to zero mean and unit variance, remembering
    # the moments so the transform can be undone later.
    for i in range(0, features_vector.shape[0]):
        mean_ = mean(features_vector[i, :])
        std_ = std(features_vector[i, :], ddof=1)
        moments_vector[i, 0] = mean_
        moments_vector[i, 1] = std_

        features_vector[i, :] = (features_vector[i, :] - mean_) / std_

    features_vector_original = features_vector

    print "[II] Extracting PCA configuration"

    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    print "[II] Optimal number of PCs to keep: %d" % q

    feature_captions_array = array(feature_captions)

    # features_to_keep = features_vector

    features_to_keep = list(feature_captions_array[featurelist])
    print "[II] Decided to keep %d features:" % len(features_to_keep)
    for c in features_to_keep:
        print "[II]\t %s" % c

    # Keep the desired features
    features_kept_data = features_vector[featurelist, :]
    # features_kept_data = features_vector

    # Project the kept features onto the first q principal components
    features_vector = (kernel.T * features_kept_data)[0:q, :]
    # features_vector = log(features_vector + 0.001)
    # features_vector = features_vector_original

    # Generate the parameter clusters using k-means
    # parameters_k_means = KMeans(n_clusters=parameters_k, init='k-means++', max_iter=300, tol=0.0000001, verbose=1)
    parameters_k_means = KMeans(init='k-means++', n_init=10, max_iter=300, tol=0.0000001, verbose=0)
    # parameters_k_means = MiniBatchKMeans(init='k-means++', max_iter=300, tol=0.00001, verbose=1)

    # Quantize the differences of the parameters instead of the parameters themselves
    # parameters_vector_diff = concatenate((zeros((shape(parameters_vector)[0], 1)), diff(parameters_vector, axis=1)), axis=1)
    # features_vector_diff = concatenate((zeros((shape(features_vector)[0], 1)), diff(features_vector, axis=1)), axis=1)
    #
    # Delete this afterwards
    # features_vector = features_vector_diff
    # parameters_k_means.fit(parameters_vector_diff.T)

    print "[II] Trying ADM-coded parameters"
    print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor

    # Upsampled features and parameters
    features_vector_upsampled = repeat(features_vector, UpsamplingFactor, axis=1)
    # feature_labels_upsampled = repeat(features_clustering_labels, UpsamplingFactor, axis=0)
    parameters_vector_upsampled = repeat(parameters_vector, UpsamplingFactor, axis=1)

    parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled)))
    parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled)))
    parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0], 1)))

    # Reconstructed parameters

    parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled)))


    def adm_matrix(X, dmin=0.001, dmax=0.28, tol=0.001):
        # Apply adm() row-wise to a matrix of parameter trajectories.
        out = matrix(zeros(shape(X)))
        code = matrix(zeros(shape(X)))
        firstval = matrix(zeros((X.shape[0], 1)))

        for i in range(0, X.shape[0]):
            out[i, :], code[i, :], firstval[i, 0] = adm(X[i, :], dmin=dmin, dmax=dmax, tol=tol)

        return out, code, firstval

    def adm_matrix_reconstruct(code, firstval, dmin=0.001, dmax=0.28):
        # Apply adm_reconstruct() row-wise, inverting adm_matrix().
        X = matrix(zeros(shape(code)))
        for i in range(0, code.shape[0]):
            X[i, :] = adm_reconstruct(code[i, :], firstval[i, 0], dmin=dmin, dmax=dmax)

        return X
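
    # Illustrative sketch (toy data, not experiment data): the matrix wrappers
    # above apply adm()/adm_reconstruct() row by row, so decoding the codeword
    # matrix with the stored first values reproduces the encoder output.
    def _adm_matrix_demo():
        X = matrix([[0.0, 0.1, 0.2, 0.3],
                    [0.3, 0.2, 0.1, 0.0]])
        out, code, firstval = adm_matrix(X)
        X_hat = adm_matrix_reconstruct(code, firstval)
        print "[II] adm_matrix roundtrip MSE (should be ~0): %.6f" % mse(out, X_hat)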

    parameters_vector_upsampled_adm, parameters_vector_upsampled_code, parameters_vector_upsampled_firstval = adm_matrix(parameters_vector_upsampled, dmin, dmax, tol)

    def diff_and_pad(X):
        # First-order difference along the time axis, zero-padded on the left
        # so the output has the same shape as the input.
        return concatenate((
            zeros((
                shape(X)[0],
                1
            )),
            diff(X, axis=1)),
            axis=1)
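
    # For example (illustrative): diff_and_pad(matrix([[1., 3., 6.]])) returns
    # matrix([[0., 2., 3.]]) -- the first-order difference, zero-padded on the
    # left so the output keeps the input's shape.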

    # features_vector_upsampled = diff_and_pad(diff_and_pad(features_vector_upsampled))
    # features_vector_diff = concatenate((zeros((shape(features_vector)[0], 1)), diff(features_vector, axis=1)), axis=1)

    # Segmentation stuff

    # for i in range(0, parameters_vector_upsampled.shape[0]):
    #     out, code, h = adm(parameters_vector_upsampled[i, :], dmin=dmin, dmax=dmax, tol=tol)
    #     parameters_vector_upsampled_adm[i, :] = out
    #     parameters_vector_upsampled_code[i, :] = code
    #     parameters_vector_upsampled_firstval[i, 0] = h

    # parameters_k_means.fit(parameters_vector.T)
    #
    # parameters_k_means_centers = parameters_k_means.cluster_centers_
    # parameters_k_means_labels = parameters_k_means.labels_
    #
    # parameters_vector_estimated = zeros(shape(parameters_vector))
    #
    # for n in range(0, len(parameters_vector_estimated[0])):
    #     parameters_vector_estimated[:, n] = parameters_k_means_centers[parameters_k_means_labels[n]]
    #
    # plot(parameters_vector[0])
    # plot(parameters_vector_estimated[0])
    #
    # PROBLEM HERE
    # print "[II] Parameters MSE for %d clusters: %.3f" % (len(parameters_k_means.cluster_centers_), mse(parameters_vector, parameters_vector_estimated))
    #
    print "[II] Clustering features."

    features_clustering = GMM(n_components=n_clusters, covariance_type='diag')

    # GMM is unsupervised; it takes no target argument.
    features_clustering.fit(features_vector_upsampled.T)

    features_clustering_means = features_clustering.means_
    features_clustering_labels = features_clustering.predict(features_vector_upsampled.T)
    features_clustering_sigmas = features_clustering.covars_

    features_vector_upsampled_estimated = zeros(shape(features_vector_upsampled))

    # Replace every frame by the mean of the mixture component it was
    # assigned to, then measure the quantization error.
    for n in range(0, len(features_vector_upsampled_estimated[0])):
        features_vector_upsampled_estimated[:, n] = features_clustering_means[features_clustering_labels[n]]

    # for n in range(0, features_vector.shape[0]):
    #     hist(features_vector[1] - features_vector_estimated[1], 256)
    # std(features_vector[1] - features_vector_estimated[1], ddof=1)
    # mean(features_vector[1] - features_vector_estimated[1])

    print "[II] Features MSE for %d clusters: %.3f" % (n_clusters, mse(features_vector_upsampled, features_vector_upsampled_estimated))

    def cross_validate_clustering(data, estimator):
        print "[II] Crossvalidating... "
        estimator_fulldata = estimator
        estimator_fulldata.fit(data)

        # labels = estimator_fulldata.predict(data)
        means = estimator_fulldata.means_
        # print means

        percents = arange(0.1, 0.6, 0.1)
        MSEs = []
        reconstructed = zeros(shape(data))
        labels = estimator.predict(data)
        # Baseline: reconstruct every sample from its cluster mean on the
        # full data set.
        for n in range(0, len(reconstructed)):
            reconstructed[n, :] = means[labels[n]]

        MSEs.append(mse(data, reconstructed))
        # Refit on progressively smaller training splits and measure the
        # reconstruction error on the held-out part.
        for p in percents:
            train, test = cross_validation.train_test_split(data, test_size=p, random_state=0)
            train = matrix(train)
            test = matrix(test)
            # print shape(train)
            # print shape(test)
            estimator.fit(train)
            means = estimator.means_
            labels = estimator.predict(test)
            reconstructed = zeros(shape(test))
            for n in range(0, len(reconstructed)):
                reconstructed[n, :] = means[labels[n]]

            m = mse(test, reconstructed)

            print "[II] MSE for clustering crossvalidated data %.2f-%.2f: %.5f" % ((1 - p), p, m)
            MSEs.append(m)

        print "[II] Crossvalidation complete"

        return MSEs

    # print "[II] Trying Cross Validation"
    # cross_validate_clustering(features_vector_upsampled.T, features_clustering)

    # Construct the parameter alphabet; each symbol is a distinct column
    # vector of the parameter code matrix.

    def vector_to_states(X):
        """
        Input: an MxN matrix with N samples of M variables.
        Output: the state sequence `state_seq', a codeword dictionary
        `parameters_alphabet' mapping column vectors to state indices,
        and its inverse `parameters_alphabet_inv'.
        """

        parameters_alphabet = {}
        n = 0

        for i in range(0, X.shape[1]):
            vec = tuple(ravel(X[:, i]))
            if vec not in parameters_alphabet:
                parameters_alphabet[vec] = n
                n += 1

        parameters_alphabet_inv = dict([(parameters_alphabet[m], m) for m in parameters_alphabet])

        state_seq = array([parameters_alphabet[tuple(ravel(X[:, m]))] for m in range(0, X.shape[1])])

        return state_seq, parameters_alphabet, parameters_alphabet_inv

    def states_to_vector(predicted, parameters_alphabet_inv):
        # Invert vector_to_states(): map a sequence of state indices back to
        # the corresponding column vectors.
        estimated = matrix(zeros((len(parameters_alphabet_inv[0]), len(predicted))))
        for i in range(0, len(predicted)):
            estimated[:, i] = matrix(parameters_alphabet_inv[predicted[i]]).T

        return estimated
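
    # Illustrative sketch (toy codeword matrix): every distinct column becomes
    # one symbol, so the columns [0,1], [1,0], [0,1] map to the state sequence
    # [0, 1, 0], and states_to_vector() inverts the mapping exactly.
    def _states_roundtrip_demo():
        C = matrix([[0., 1., 0.], [1., 0., 1.]])
        seq, alphabet, alphabet_inv = vector_to_states(C)
        C_hat = states_to_vector(seq, alphabet_inv)
        print "[II] state-coding roundtrip MSE (should be 0): %.6f" % mse(C, C_hat)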

    state_seq, parameters_alphabet, parameters_alphabet_inv = vector_to_states(parameters_vector_upsampled_code)

    parameters_change_variable = matrix(diff_and_pad(parameters_vector_upsampled) != 0).astype(int)

    changes_state_seq, changes_parameters_alphabet, changes_parameters_alphabet_inv = vector_to_states(parameters_change_variable)

    # This is an HMM that just codes the changes.
    # Per parameter there are only two states: change, and stay the same.

    print "[II] (changes hmm-chain) Creating emission probability mixtures for every state"

    gmm_list_changes = []
    for n in range(0, len(changes_parameters_alphabet)):
        vectors = features_vector_upsampled[:, changes_state_seq == n]
        gmm = GMM(n_components=n_clusters, covariance_type='diag')
        gmm.fit(vectors.T)
        gmm_list_changes.append(gmm)

    hmm_changes = hmm.GMMHMM(n_components=len(changes_parameters_alphabet), gmms=gmm_list_changes, n_mix=n_clusters)
    hmm_changes.fit([array(features_vector_upsampled).T])

    plt.subplot(211)
    plt.plot(parameters_change_variable.T)
    plt.subplot(212)

    changes_predicted_states = hmm_changes.predict(array(features_vector_upsampled.T))
    predicted_changes_estimated = states_to_vector(changes_predicted_states, changes_parameters_alphabet_inv)

    plt.plot(predicted_changes_estimated.T)

    # End of changes HMM here

    print "[II] Creating emission probability mixtures for every state"
    gmm_list = []
    # One GMM per codeword symbol (for a single parameter the ADM code takes
    # the three values -1, 0 and 1).
    for n in range(0, len(parameters_alphabet)):
        vectors = features_vector_upsampled[:, state_seq == n]
        gmm = GMM(n_components=n_clusters, covariance_type='diag')
        gmm.fit(vectors.T)
        gmm_list.append(gmm)

    hmm1 = hmm.GMMHMM(n_components=len(parameters_alphabet), gmms=gmm_list, n_mix=n_clusters)
    hmm1.fit([array(features_vector_upsampled).T])

    plt.figure()
    plt.subplot(221)
    plt.plot(parameters_vector_upsampled_code.T)

    predicted = hmm1.predict(array(features_vector_upsampled.T))

    code_estimated = states_to_vector(predicted, parameters_alphabet_inv)
    plt.subplot(222)
    plt.plot(code_estimated.T)

    reconstructed_original = adm_matrix_reconstruct(parameters_vector_upsampled_code, parameters_vector_upsampled_firstval)
    plt.subplot(223)
    plt.plot(reconstructed_original.T)
    plt.subplot(224)
    reconstructed_estimated = adm_matrix_reconstruct(code_estimated, parameters_vector_upsampled_firstval)
    plt.plot(reconstructed_estimated.T)

    # scatter(features_vector_upsampled[0, :], features_vector_upsampled_estimated[0, :])
    # scatter(features_vector_upsampled[1, :], features_vector_upsampled_estimated[1, :])
    #
    # xlabel('Original Features on Principal Components')
    # ylabel('Estimated Features on Principal Components')
    # title('Original vs Estimated Features')
    # savefig('original_vs_estimated.png')

    # print "[II] Testing Gaussian Naive Bayes Classifier"
    #
    # gnb = GaussianNB()
    # gnb.fit(features_vector_upsampled.T, parameters_vector_upsampled_code.T)
    # parameters_vector_upsampled_code_estimated = gnb.predict(features_vector_upsampled.T)
    #
    # print "[II] Raw Parameters - Gaussian Naive Bayes classifier testing ratio: %.4f" % (float(sum((parameters_vector_upsampled_code_estimated == parameters_vector_upsampled_code).astype(float))) / float(len(parameters_k_means_labels)))
    #
    # plot(adm_matrix_reconstruct(parameters_vector_upsampled_code_estimated, parameters_vector_upsampled_firstval, dmin, dmax).T)
    #
    # print "[II] Trying ADM-coded parameters"
    # UpsamplingFactor = 100
    # print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor
    #
    # # Upsampled features and parameters
    # features_vector_upsampled = repeat(features_vector, UpsamplingFactor, axis=1)
    # feature_labels_upsampled = repeat(features_clustering_labels, UpsamplingFactor, axis=0)
    # parameters_vector_upsampled = repeat(parameters_vector, UpsamplingFactor, axis=1)
    #
    # parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled)))
    # parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled)))
    # parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0], 1)))
    #
    # # Reconstructed parameters
    # parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled)))
    #
    # dmin = 0.001
    # dmax = 0.28
    # tol = 0.001
    # for i in range(0, parameters_vector_upsampled.shape[0]):
    #     out, code, h = adm(parameters_vector_upsampled[i, :], dmin=dmin, dmax=dmax, tol=tol)
    #     parameters_vector_upsampled_adm[i, :] = out
    #     parameters_vector_upsampled_code[i, :] = code
    #     parameters_vector_upsampled_firstval[i, 0] = h
    #
    #     # Reconstruct-ADM
    #     parameters_vector_upsampled_reconstructed[i, :] = adm_reconstruct(parameters_vector_upsampled_code[i, :], parameters_vector_upsampled_firstval[i, 0], dmin=dmin, dmax=dmax)
    #
    # # plot(parameters_vector_upsampled_adm.T, 'r--')
    # # plot(parameters_vector_upsampled.T)
    # # plot(parameters_vector_upsampled_reconstructed.T, 'g.')
    #
    # parameters_vector_reconstructed = zeros(shape(parameters_vector))
    # for n in range(0, parameters_vector.shape[1]):
    #     parameters_vector_reconstructed[:, n] = parameters_vector_upsampled_reconstructed[:, n * UpsamplingFactor]
    #
    # mse_adm = mse(parameters_vector_reconstructed, parameters_vector)
    #
    # print "[II] Expected ADM reconstruction MSE: %.4f" % mse_adm
    #
    # # figure()
    # # plot(parameters_vector.T)
    # # plot(parameters_vector_reconstructed.T)
    #
    # print "[II] Building HMM transition, emission matrices and priors"
    #
    # transmat = zeros((3, 3))
    # startprob = zeros((3,))
    # emissionmat = zeros((3, n_clusters))
    #
    # state_labels = parameters_vector_upsampled_code + 1
    # stateseq = state_labels.T
    #
    # for n in range(0, shape(parameters_vector_upsampled_code)[1]):
    #     if n > 0:
    #         transmat[state_labels[0, n - 1], state_labels[0, n]] += 1
    #     startprob[state_labels[0, n]] += 1
    #     emissionmat[state_labels[0, n], feature_labels_upsampled[n]] += 1
    #
    # for n in range(0, transmat.shape[0]):
    #     transmat[n, :] /= sum(transmat[n, :])
    #     emissionmat[n, :] /= sum(emissionmat[n, :])
    #
    # transmat = matrix(transmat)
    # emissionmat = matrix(emissionmat)
    # # Prior
    # startprob = startprob / sum(startprob)
    # startprob = ravel(startprob)
    #
    # # Vocabulary
    #
    # model = hmm.GMMHMM(n_mix=n_clusters, n_components=3, covariance_type="diag")
    # model.means_ = features_clustering.means_
    # model.covars_ = features_clustering.covars_
    #
    # features_vector_array = array(features_vector)
    # features_vector_upsampled_array = array(features_vector_upsampled)
    #
    # model.fit([features_vector_array.T])
    # stateseq_estimated = model.predict(features_vector_upsampled_array.T)
    #
    # parameters_vector_upsampled_reconstructed_decoded = matrix(zeros(shape(parameters_vector_upsampled)))
    #
    # plot(stateseq_estimated)
    # plot(stateseq)
    #
    # code_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
    # code_estimated[0, :] = stateseq_estimated - 1
    #
    # parameters_vector_upsampled_reconstructed_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
    #
    # for i in range(0, parameters_vector_upsampled.shape[0]):
    #     parameters_vector_upsampled_reconstructed_estimated[i, :] = adm_reconstruct(code_estimated, parameters_vector_upsampled_firstval[i, 0], dmin=dmin, dmax=dmax)
    # figure()
    # plot(parameters_vector_upsampled_reconstructed_estimated.T)
    # plot(parameters_vector_upsampled.T)