comparison scripts/map_and_average.py @ 9:c4841876a8ff branch-tests

adding notebooks and trying to explain classifier coefficients
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 19:06:40 +0100
parents
children d118b6ca8370
comparison
equal deleted inserted replaced
8:0f3eba42b425 9:c4841876a8ff
1 # -*- coding: utf-8 -*-
2 """
3 Created on Thu Mar 16 02:44:07 2017
4
5 @author: mariapanteli
6 """
7
8 import numpy as np
9 import pickle
10
11 import util_feature_learning
12
# Window (frame) size, in seconds, used when the frame-level features were
# extracted; baked into every input/output file name below.
WIN_SIZE = 8
# Frame-level feature pickles for the train / validation / test splits
# (melodia-based features) — each holds (data, labels, audiolabels).
INPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
# One output per transformation: LDA, PCA, NMF, SSNMF, and 'na' (no
# transform, i.e. the raw averaged features). Order must match write_output.
OUTPUT_FILES = ['/import/c4dm-04/mariap/lda_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/pca_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/nmf_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/ssnmf_data_melodia_'+str(WIN_SIZE)+'.pickle',
'/import/c4dm-04/mariap/na_data_melodia_'+str(WIN_SIZE)+'.pickle']
22
23
def remove_inds(features, labels, audiolabels):
    '''Drop instances whose country label is 'unknown' or 'Unidentified'.

    Parameters
    ----------
    features : np.ndarray, shape (n_instances, n_features)
    labels : np.ndarray of str
        Country label per instance.
    audiolabels : np.ndarray of str
        Recording identifier per instance.

    Returns
    -------
    features, labels, audiolabels with the unwanted rows removed and the
    original row order preserved.
    '''
    # Boolean mask instead of set arithmetic: the original
    # np.array(list(set(...) - ...)) relied on CPython's set iteration
    # order and produced a float64 empty array (an invalid index) when
    # every instance was removed. A mask is deterministic and handles
    # the all-removed case correctly.
    keep_mask = ~((labels == 'unknown') | (labels == 'Unidentified'))
    return features[keep_mask, :], labels[keep_mask], audiolabels[keep_mask]
34
35
def averageframes(features, audiolabels, classlabels):
    '''Collapse frame-level features into one averaged vector per recording.

    Recordings are emitted in order of their first appearance in
    audiolabels. Returns (features, audiolabels, classlabels) as numpy
    arrays with one row/entry per unique recording; each recording's class
    label is taken from its first frame.
    '''
    unique_vals, first_seen = np.unique(audiolabels, return_index=True)
    ordered_recordings = unique_vals[np.argsort(first_seen)]
    avg_rows, rec_labels, rec_classes = [], [], []
    for recording in ordered_recordings:
        frame_idx = np.where(audiolabels == recording)[0]
        avg_rows.append(features[frame_idx, :].mean(axis=0))
        rec_classes.append(classlabels[frame_idx[0]])
        rec_labels.append(recording)
    return np.array(avg_rows), np.array(rec_labels), np.array(rec_classes)
53
54
def load_data_from_pickle(pickle_file=None):
    '''Load frame-level features and labels from a pickle file.

    The pickle must contain a (data, labels, audiolabels) triple. Rows
    with 'unknown'/'Unidentified' country are dropped and NaN entries are
    zeroed so the downstream feature-learning step does not fail.
    '''
    with open(pickle_file, 'rb') as fh:
        data, labels, audiolabels = pickle.load(fh)
    data, labels, audiolabels = remove_inds(data, labels, audiolabels)
    # NaNs would propagate through the transform fitting, so zero them out.
    data[np.isnan(data)] = 0
    return data, labels, audiolabels
65
66
def load_train_val_test_sets():
    '''Load the train, validation and test sets from INPUT_FILES.

    Returns
    -------
    trainset, valset, testset : each a (data, labels, audiolabels) tuple.
    '''
    # Parenthesised single-argument print behaves identically under
    # Python 2 and 3 (the original bare print statement is Python-2-only).
    print(INPUT_FILES[0])
    trainset = load_data_from_pickle(INPUT_FILES[0])
    valset = load_data_from_pickle(INPUT_FILES[1])
    testset = load_data_from_pickle(INPUT_FILES[2])
    return trainset, valset, testset
75
76
def get_feat_inds(n_dim=840):
    '''Return per-descriptor column indices for a feature frame of n_dim dims.

    A frame is the concatenation of rhythm, melody, MFCC and chroma
    descriptors; the split points depend on the total dimensionality.

    Parameters
    ----------
    n_dim : int
        Total frame dimensionality; one of 840, 660, 640, 460.

    Returns
    -------
    feat_labels : list of str
        ['rhy', 'mel', 'mfc', 'chr'].
    feat_inds : list of np.ndarray
        Index range for each descriptor, in the same order.

    Raises
    ------
    ValueError
        If n_dim is not a supported dimensionality. (The original code
        fell through all branches and raised NameError instead.)
    '''
    # Exclusive end column of each descriptor (rhy, mel, mfc, chr) per
    # known frame size.
    boundaries = {
        840: (400, 640, 720, 840),
        640: (200, 440, 520, 640),
        460: (200, 260, 340, 460),
        660: (400, 460, 540, 660),
    }
    if n_dim not in boundaries:
        raise ValueError('unsupported feature dimensionality: %s' % n_dim)
    rhy_end, mel_end, mfc_end, chr_end = boundaries[n_dim]
    feat_inds = [np.arange(rhy_end),
                 np.arange(rhy_end, mel_end),
                 np.arange(mel_end, mfc_end),
                 np.arange(mfc_end, chr_end)]
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    return feat_labels, feat_inds
103
104
def map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''Fit a transform per feature descriptor on the training frames, map
    all (train+val+test) frames through it, and average per recording.

    Parameters
    ----------
    dataset : tuple or None
        (trainset, valset, testset), each a (data, labels, audiolabels)
        tuple; loaded via load_train_val_test_sets() when None.
    n_components : int or None
        Fixed number of components for every transform.
    min_variance : float or None
        If set, takes precedence over n_components: per descriptor, the
        component count is the smallest number whose cumulative PCA
        explained-variance ratio exceeds this threshold.

    Returns
    -------
    data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list :
        One entry per descriptor (rhy, mel, mfc, chr) of recording-averaged
        features for the 'none'/pca/lda/nmf/ssnmf transforms respectively.
    classlabs, audiolabs :
        Per-recording class and audio labels (taken from the last loop
        iteration; row order is identical across descriptors).
    '''
    if dataset is None:
        trainset, valset, testset = load_train_val_test_sets()
    else:
        trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    print traindata.shape, valdata.shape, testdata.shape
    # Frame labels for the concatenated train+val+test data, in that order
    # (must match the concatenation of all_data below).
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for i in range(len(feat_inds)):
        print "mapping " + feat_labels[i]
        inds = feat_inds[i]
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # Fit PCA only to inspect explained variance, pick the smallest
            # component count reaching min_variance, then refit everything.
            # NOTE(review): this reassigns the n_components parameter, but
            # it is recomputed on every iteration while min_variance is set.
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            n_components = np.where(ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()>min_variance)[0][0]+1
            print n_components, len(inds)
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
        elif n_components is not None:
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            # Default: keep the full dimensionality of this descriptor.
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds))
        # Transforms are fit on train only but applied to all frames.
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_data(all_data)
        for key in transformed_data_dict.keys():
            # Collapse frame-level mapped features to one row per recording.
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
147
148
def lda_map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''LDA-focused variant of map_and_average_frames: uses the Transformer's
    fit_lda_data/transform_lda_data methods instead of the full fit_data.

    Parameters
    ----------
    dataset : tuple or None
        (trainset, valset, testset), each a (data, labels, audiolabels)
        tuple; loaded via load_train_val_test_sets() when None.
    n_components : int or None
        Fixed number of components for the transform.
    min_variance : float or None
        If set, takes precedence over n_components: per descriptor, the
        component count is the smallest number whose cumulative PCA
        explained-variance ratio exceeds this threshold.

    Returns
    -------
    Same structure as map_and_average_frames; entries for transforms not
    fit in LDA-only mode are appended unaveraged (the empty-length guard
    below skips them — presumably they are empty lists; verify against
    util_feature_learning.transform_lda_data).
    '''
    if dataset is None:
        trainset, valset, testset = load_train_val_test_sets()
    else:
        trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    print traindata.shape, valdata.shape, testdata.shape
    # Frame labels for the concatenated train+val+test data, in that order.
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for i in range(len(feat_inds)):
        print "mapping " + feat_labels[i]
        inds = feat_inds[i]
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # Fit PCA only to inspect explained variance, pick the smallest
            # component count reaching min_variance, then refit.
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            n_components = np.where(ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()>min_variance)[0][0]+1
            print n_components, len(inds)
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
        elif n_components is not None:
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            # Default: keep the full dimensionality of this descriptor.
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds))
        # Transform is fit on train only but applied to all frames.
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_lda_data(all_data)
        for key in transformed_data_dict.keys():
            if len(transformed_data_dict[key])==0:
                # Transform not fit in LDA-only mode; nothing to average.
                continue
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
193
194
def write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs):
    '''Pickle each transformed dataset, with its labels, to OUTPUT_FILES.

    The output order matches OUTPUT_FILES: lda, pca, nmf, ssnmf, then the
    untransformed ('na') data. Each pickle holds
    [data_list, classlabs, audiolabs].
    '''
    ordered = [ldadata_list, pcadata_list, nmfdata_list, ssnmfdata_list, data_list]
    for path, datalist in zip(OUTPUT_FILES, ordered):
        # 'with' guarantees the handle is closed even on error; the
        # original left five file objects open.
        with open(path, 'wb') as f:
            pickle.dump([datalist, classlabs, audiolabs], f)
201
202
if __name__ == '__main__':
    # Run the LDA-only pipeline first because it is fast; keep the number
    # of components that retains 99% of the PCA variance per descriptor.
    data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs = lda_map_and_average_frames(min_variance=0.99)
    write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs)
    # Full pipeline (adds NMF/SSNMF mappings) is slower; left disabled.
    #data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs = map_and_average_frames(min_variance=0.99)
    #write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs)
210