Mercurial > hg > plosone_underreview
comparison scripts/map_and_average.py @ 9:c4841876a8ff branch-tests
adding notebooks and trying to explain classifier coefficients
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 11 Sep 2017 19:06:40 +0100 |
parents | |
children | d118b6ca8370 |
comparison
equal
deleted
inserted
replaced
8:0f3eba42b425 | 9:c4841876a8ff |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Created on Thu Mar 16 02:44:07 2017 | |
4 | |
5 @author: mariapanteli | |
6 """ | |
7 | |
8 import numpy as np | |
9 import pickle | |
10 | |
11 import util_feature_learning | |
12 | |
# Suffix embedded in every input/output file name below
# (presumably the analysis window size -- confirm against the feature extraction step).
WIN_SIZE = 8
# Pickled train / validation / test sets, in that order.
INPUT_FILES = [
    '/import/c4dm-04/mariap/train_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/val_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/test_data_melodia_%s.pickle' % WIN_SIZE,
]
# Destination files, in the order lda, pca, nmf, ssnmf, untransformed ('na').
OUTPUT_FILES = [
    '/import/c4dm-04/mariap/lda_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/pca_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/nmf_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/ssnmf_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/na_data_melodia_%s.pickle' % WIN_SIZE,
]
22 | |
23 | |
def remove_inds(features, labels, audiolabels):
    '''Remove instances with unknown country.

    An instance is dropped when its label equals 'unknown' or 'Unidentified'.

    Parameters
    ----------
    features : array indexed as features[rows, :]
    labels : numpy array of labels, one per row of features
    audiolabels : numpy array indexed by the same rows as labels

    Returns
    -------
    (features, labels, audiolabels) restricted to the kept rows, in their
    original order. If every row is dropped, empty arrays are returned.
    '''
    unwanted = (labels == 'unknown') | (labels == 'Unidentified')
    drop_inds = np.where(unwanted)[0]
    # setdiff1d yields sorted integer indices, so row order is preserved and an
    # empty result is still a valid (integer) index array.
    keep_inds = np.setdiff1d(np.arange(len(labels)), drop_inds)
    features = features[keep_inds, :]
    labels = labels[keep_inds]
    audiolabels = audiolabels[keep_inds]
    return features, labels, audiolabels
34 | |
35 | |
def averageframes(features, audiolabels, classlabels):
    '''Average frame-based features for each recording.

    Frames sharing the same audio label are averaged into one row. Recordings
    are emitted in order of first appearance in audiolabels, and each one takes
    the class label of its first frame.

    Returns
    -------
    (newfeatures, newaudiolabels, newclasslabels) as numpy arrays.
    '''
    uniq, first_pos = np.unique(audiolabels, return_index=True)
    # np.unique sorts its output; reorder by first occurrence instead
    recordings = uniq[np.argsort(first_pos)]
    frame_rows = [np.where(audiolabels == rec)[0] for rec in recordings]
    newfeatures = np.array([np.mean(features[rows, :], axis=0) for rows in frame_rows])
    newaudiolabels = np.array(list(recordings))
    newclasslabels = np.array([classlabels[rows[0]] for rows in frame_rows])
    return newfeatures, newaudiolabels, newclasslabels
53 | |
54 | |
def load_data_from_pickle(pickle_file=None):
    '''Load frame-based features and labels from a pickle file.

    The pickle is expected to unpack into (data, labels, audiolabels).
    Instances filtered out by remove_inds are dropped and NaN entries of data
    are set to 0.

    NOTE(review): pickle.load executes arbitrary code from the file; only use
    this on trusted files.

    Returns
    -------
    (data, labels, audiolabels) after filtering and NaN replacement.
    '''
    with open(pickle_file, 'rb') as handle:
        contents = pickle.load(handle)
    data, labels, audiolabels = contents
    # remove 'unknown' and 'unidentified' country
    data, labels, audiolabels = remove_inds(data, labels, audiolabels)
    # avoid nan which gives error in feature learning
    nan_mask = np.isnan(data)
    data[nan_mask] = 0
    return data, labels, audiolabels
65 | |
66 | |
def load_train_val_test_sets():
    '''Load the train, validation and test sets listed in INPUT_FILES.

    Prints the path of the training file, then loads the three files in
    train, val, test order.

    Returns
    -------
    (trainset, valset, testset), each as returned by load_data_from_pickle.
    '''
    train_path, val_path, test_path = INPUT_FILES[0], INPUT_FILES[1], INPUT_FILES[2]
    print(train_path)
    trainset = load_data_from_pickle(train_path)
    valset = load_data_from_pickle(val_path)
    testset = load_data_from_pickle(test_path)
    return trainset, valset, testset
75 | |
76 | |
def get_feat_inds(n_dim=840):
    '''Return the column indices of each feature group within a frame.

    A frame is a concatenation of four contiguous groups labelled
    'rhy', 'mel', 'mfc', 'chr' (presumably rhythm, melody, MFCC and chroma).
    The width of each group depends on the total frame dimension.

    Parameters
    ----------
    n_dim : int
        Total number of features per frame; one of 840, 640, 460 or 660.

    Returns
    -------
    (feat_labels, feat_inds) : list of the four labels and a list of four
    numpy index arrays, in the same order.

    Raises
    ------
    ValueError
        If n_dim is not one of the supported frame sizes.
    '''
    # widths of the (rhy, mel, mfc, chr) groups for each supported frame size
    layouts = {
        840: (400, 240, 80, 120),
        640: (200, 240, 80, 120),
        460: (200, 60, 80, 120),
        660: (400, 60, 80, 120),
    }
    if n_dim not in layouts:
        raise ValueError('unsupported number of features per frame: %s '
                         '(expected one of %s)' % (n_dim, sorted(layouts)))
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    feat_inds = []
    start = 0
    for width in layouts[n_dim]:
        feat_inds.append(np.arange(start, start + width))
        start += width
    return feat_labels, feat_inds
103 | |
104 | |
def map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''Transform each feature group and average the frames of each recording.

    For every feature group returned by get_feat_inds, a
    util_feature_learning.Transformer is fitted on the training frames, then
    applied to the train, val and test frames together; the transformed frames
    are averaged per recording with averageframes.

    Parameters
    ----------
    dataset : (trainset, valset, testset) or None
        Each set is (data, labels, audiolabels). When None the sets are loaded
        with load_train_val_test_sets.
    n_components : int or None
        Number of components passed to the transformer. Ignored when
        min_variance is given; defaults to the size of the feature group.
    min_variance : float or None
        When given, n_components is chosen per feature group as the smallest
        number of PCA components whose cumulative explained variance ratio
        exceeds this value.

    Returns
    -------
    (data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list,
    classlabs, audiolabs) -- each list holds one averaged array per feature
    group; classlabs / audiolabs are the per-recording labels.
    '''
    if dataset is None:
        dataset = load_train_val_test_sets()
    trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    data_list, pcadata_list, ldadata_list = [], [], []
    nmfdata_list, ssnmfdata_list = [], []
    for feat_name, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_name)
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # first a PCA-only fit with all components to inspect explained variance
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            n_components = np.where(cum_variance > min_variance)[0][0] + 1
            print("%s %s" % (n_components, len(inds)))
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            n_fit = len(inds) if n_components is None else n_components
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_fit)
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed = ssm_feat.transform_data(all_data)
        for key in list(transformed):
            averaged, audiolabs, classlabs = averageframes(transformed[key], audiolabels, labels)
            transformed[key] = averaged
        data_list.append(transformed['none'])
        pcadata_list.append(transformed['pca'])
        ldadata_list.append(transformed['lda'])
        nmfdata_list.append(transformed['nmf'])
        ssnmfdata_list.append(transformed['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
147 | |
148 | |
def lda_map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''Variant of the map-and-average pipeline using the transformer's LDA methods.

    For every feature group returned by get_feat_inds, a
    util_feature_learning.Transformer is fitted with fit_lda_data on the
    training frames and applied with transform_lda_data to the train, val and
    test frames together. Non-empty transformed outputs are averaged per
    recording with averageframes; empty outputs are passed through unchanged.

    Parameters
    ----------
    dataset : (trainset, valset, testset) or None
        Each set is (data, labels, audiolabels). When None the sets are loaded
        with load_train_val_test_sets.
    n_components : int or None
        Number of components passed to the transformer. Ignored when
        min_variance is given; defaults to the size of the feature group.
    min_variance : float or None
        When given, n_components is chosen per feature group as the smallest
        number of PCA components whose cumulative explained variance ratio
        exceeds this value.

    Returns
    -------
    (data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list,
    classlabs, audiolabs) -- each list holds one entry per feature group;
    classlabs / audiolabs are the per-recording labels.
    '''
    if dataset is None:
        dataset = load_train_val_test_sets()
    trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    data_list, pcadata_list, ldadata_list = [], [], []
    nmfdata_list, ssnmfdata_list = [], []
    for feat_name, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_name)
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # first a PCA-only fit with all components to inspect explained variance
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            n_components = np.where(cum_variance > min_variance)[0][0] + 1
            print("%s %s" % (n_components, len(inds)))
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            n_fit = len(inds) if n_components is None else n_components
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_fit)
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed = ssm_feat.transform_lda_data(all_data)
        for key in list(transformed):
            # outputs that were not computed come back empty; leave them as they are
            if len(transformed[key]) == 0:
                continue
            averaged, audiolabs, classlabs = averageframes(transformed[key], audiolabels, labels)
            transformed[key] = averaged
        data_list.append(transformed['none'])
        pcadata_list.append(transformed['pca'])
        ldadata_list.append(transformed['lda'])
        nmfdata_list.append(transformed['nmf'])
        ssnmfdata_list.append(transformed['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
193 | |
194 | |
def write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs):
    '''Pickle each set of transformed features to its file in OUTPUT_FILES.

    Each file receives the list [features, classlabs, audiolabs]. Files are
    written in OUTPUT_FILES order: lda, pca, nmf, ssnmf, then the
    untransformed data.
    '''
    payloads = [ldadata_list, pcadata_list, nmfdata_list, ssnmfdata_list, data_list]
    for index, payload in enumerate(payloads):
        # the with-block guarantees the file is flushed and closed, even on error
        with open(OUTPUT_FILES[index], 'wb') as handle:
            pickle.dump([payload, classlabs, audiolabs], handle)
201 | |
202 | |
if __name__ == '__main__':
    # first only lda - because it goes fast
    outputs = lda_map_and_average_frames(min_variance=0.99)
    write_output(*outputs)
    # then add nmf,ssnmf: use map_and_average_frames(min_variance=0.99) instead
    # and pass its result to write_output in the same way
210 |