annotate scripts/map_and_average.py @ 9:c4841876a8ff branch-tests

adding notebooks and trying to explain classifier coefficients
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 19:06:40 +0100
parents
children d118b6ca8370
rev   line source
m@9 1 # -*- coding: utf-8 -*-
m@9 2 """
m@9 3 Created on Thu Mar 16 02:44:07 2017
m@9 4
m@9 5 @author: mariapanteli
m@9 6 """
m@9 7
m@9 8 import numpy as np
m@9 9 import pickle
m@9 10
m@9 11 import util_feature_learning
m@9 12
# Window-size tag embedded in every input and output file name below.
WIN_SIZE = 8

# Pickled frame-level datasets in (train, val, test) order;
# load_train_val_test_sets reads them by position.
INPUT_FILES = [
    '/import/c4dm-04/mariap/train_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/val_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/test_data_melodia_%s.pickle' % WIN_SIZE,
]

# Destination pickles in (lda, pca, nmf, ssnmf, na) order; write_output
# addresses them by position ('na' receives the 'none' data list).
OUTPUT_FILES = [
    '/import/c4dm-04/mariap/lda_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/pca_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/nmf_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/ssnmf_data_melodia_%s.pickle' % WIN_SIZE,
    '/import/c4dm-04/mariap/na_data_melodia_%s.pickle' % WIN_SIZE,
]
m@9 22
m@9 23
def remove_inds(features, labels, audiolabels):
    '''Remove instances whose label is 'unknown' or 'Unidentified'.

    The surviving rows keep their original relative order, and the function
    also works when every row is removed (empty arrays are returned).

    Parameters
    ----------
    features : numpy array, indexed as features[rows, :]
    labels : numpy array of labels, one entry (or row) per row of features
    audiolabels : numpy array indexed by the same row positions

    Returns
    -------
    (features, labels, audiolabels) restricted to the kept rows.
    '''
    label_array = np.asarray(labels)
    is_unknown = np.zeros(label_array.shape, dtype=bool)
    for unwanted in ('unknown', 'Unidentified'):
        # in-place OR also copes with a comparison that collapses to a scalar
        is_unknown |= (label_array == unwanted)
    if is_unknown.ndim > 1:
        # column-shaped labels, e.g. (n, 1): drop a row if any entry matches
        is_unknown = is_unknown.any(axis=tuple(range(1, is_unknown.ndim)))
    # sorted integer indices: deterministic order, and an integer dtype even
    # when nothing is kept (an empty float index array cannot index)
    keep_inds = np.flatnonzero(~is_unknown)
    features = features[keep_inds, :]
    labels = labels[keep_inds]
    audiolabels = audiolabels[keep_inds]
    return features, labels, audiolabels
m@9 34
m@9 35
def averageframes(features, audiolabels, classlabels):
    '''Average the frame-based features of each recording.

    Frames are grouped by their entry in audiolabels. Recordings are
    reported in order of first appearance, and each one takes the class
    label of its first frame.

    Returns
    -------
    (newfeatures, newaudiolabels, newclasslabels) with one row/entry per
    recording.
    '''
    # np.unique sorts its output; re-order by first occurrence instead
    unique_labels, first_positions = np.unique(audiolabels, return_index=True)
    recordings = unique_labels[np.argsort(first_positions)]
    # row indices of the frames belonging to each recording
    frame_groups = [np.where(audiolabels == rec)[0] for rec in recordings]
    newfeatures = np.array([np.mean(features[rows, :], axis=0)
                            for rows in frame_groups])
    newaudiolabels = np.array(list(recordings))
    newclasslabels = np.array([classlabels[rows[0]] for rows in frame_groups])
    return newfeatures, newaudiolabels, newclasslabels
m@9 53
m@9 54
def load_data_from_pickle(pickle_file=None):
    '''Load frame-based features and labels from a pickle file.

    The pickle must hold a (data, labels, audiolabels) triple. Instances
    with an 'unknown' or 'Unidentified' label are dropped and NaN feature
    values are set to 0.

    NOTE(review): pickle.load executes arbitrary code from the file; only
    point this at trusted files.

    Returns
    -------
    (data, labels, audiolabels) after filtering and NaN replacement.
    '''
    with open(pickle_file, 'rb') as handle:
        contents = pickle.load(handle)
    data, labels, audiolabels = contents
    # discard instances with 'unknown' or 'Unidentified' country
    data, labels, audiolabels = remove_inds(data, labels, audiolabels)
    # NaN values give an error in feature learning, so zero them out
    nan_mask = np.isnan(data)
    data[nan_mask] = 0
    return data, labels, audiolabels
m@9 65
m@9 66
def load_train_val_test_sets():
    '''Load the train, validation and test sets listed in INPUT_FILES.

    Returns
    -------
    (trainset, valset, testset), each being the (data, labels, audiolabels)
    tuple returned by load_data_from_pickle.
    '''
    # a parenthesised single-argument print gives the same output on
    # Python 2 and Python 3
    print(INPUT_FILES[0])
    trainset = load_data_from_pickle(INPUT_FILES[0])
    valset = load_data_from_pickle(INPUT_FILES[1])
    testset = load_data_from_pickle(INPUT_FILES[2])
    return trainset, valset, testset
m@9 75
m@9 76
def get_feat_inds(n_dim=840):
    '''Return the column indices of each feature group within a frame.

    A frame is the concatenation of four groups in the fixed order
    'rhy', 'mel', 'mfc', 'chr'. The supported frame sizes are 840, 640,
    460 and 660.

    Parameters
    ----------
    n_dim : int, total number of features per frame (default 840)

    Returns
    -------
    feat_labels : list of the four group names
    feat_inds : list of four numpy index arrays, aligned with feat_labels

    Raises
    ------
    ValueError if n_dim is not one of the supported frame sizes.
    '''
    # cumulative boundaries: (rhy start, mel start, mfc start, chr start, end)
    layouts = {
        840: (0, 400, 640, 720, 840),
        640: (0, 200, 440, 520, 640),
        460: (0, 200, 260, 340, 460),
        660: (0, 400, 460, 540, 660),
    }
    if n_dim not in layouts:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError('unsupported number of features %r; expected one of %s'
                         % (n_dim, sorted(layouts)))
    bounds = layouts[n_dim]
    feat_inds = [np.arange(start, stop)
                 for start, stop in zip(bounds[:-1], bounds[1:])]
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    return feat_labels, feat_inds
m@9 103
m@9 104
def map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''Map each feature group with a fitted Transformer and average frames.

    For every feature group returned by get_feat_inds, a
    util_feature_learning.Transformer is fitted on the training frames,
    applied to the train, val and test frames together, and each
    transformed output is averaged per recording with averageframes.

    Parameters
    ----------
    dataset : optional (trainset, valset, testset), each a
        (data, labels, audiolabels) tuple; loaded with
        load_train_val_test_sets when None.
    n_components : optional number of components passed to the fit; used
        only when min_variance is None. Defaults to the group size.
    min_variance : optional threshold; when given, a PCA-only fit is run
        first and the number of components becomes the smallest count whose
        cumulative explained variance ratio exceeds it.

    Returns
    -------
    (data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list,
    classlabs, audiolabs). Each list holds one averaged array per feature
    group, taken from the 'none', 'pca', 'lda', 'nmf' and 'ssnmf' entries
    of the transformer output.
    '''
    if dataset is None:
        dataset = load_train_val_test_sets()
    trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    # %-formatting prints the same text on Python 2 and Python 3
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for feat_label, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_label)
        ssm_feat = util_feature_learning.Transformer()
        # choose the component count in a local so the caller's
        # n_components argument is never overwritten
        if min_variance is not None:
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            # first position whose cumulative ratio exceeds the threshold
            n_comp = np.where(cum_variance > min_variance)[0][0] + 1
            print("%s %s" % (n_comp, len(inds)))
        elif n_components is not None:
            n_comp = n_components
        else:
            n_comp = len(inds)
        ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_comp)
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_data(all_data)
        # snapshot the keys because the values are replaced inside the loop
        for key in list(transformed_data_dict):
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
m@9 147
m@9 148
def lda_map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''LDA variant of the mapping: fit, transform and average per recording.

    Works like map_and_average_frames but calls the Transformer's
    fit_lda_data / transform_lda_data methods, and leaves empty
    transformer outputs untouched instead of averaging them.

    Parameters
    ----------
    dataset : optional (trainset, valset, testset), each a
        (data, labels, audiolabels) tuple; loaded with
        load_train_val_test_sets when None.
    n_components : optional number of components passed to the fit; used
        only when min_variance is None. Defaults to the group size.
    min_variance : optional threshold; when given, a PCA-only fit is run
        first and the number of components becomes the smallest count whose
        cumulative explained variance ratio exceeds it.

    Returns
    -------
    (data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list,
    classlabs, audiolabs). Each list holds one entry per feature group,
    taken from the 'none', 'pca', 'lda', 'nmf' and 'ssnmf' entries of the
    transformer output; entries that were empty are passed through as-is
    (presumably 'nmf' and 'ssnmf' here - confirm against
    util_feature_learning).
    '''
    if dataset is None:
        dataset = load_train_val_test_sets()
    trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    # %-formatting prints the same text on Python 2 and Python 3
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for feat_label, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_label)
        ssm_feat = util_feature_learning.Transformer()
        # choose the component count in a local so the caller's
        # n_components argument is never overwritten
        if min_variance is not None:
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            # first position whose cumulative ratio exceeds the threshold
            n_comp = np.where(cum_variance > min_variance)[0][0] + 1
            print("%s %s" % (n_comp, len(inds)))
        elif n_components is not None:
            n_comp = n_components
        else:
            n_comp = len(inds)
        ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_comp)
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_lda_data(all_data)
        # snapshot the keys because the values are replaced inside the loop
        for key in list(transformed_data_dict):
            # explicit length test: array truthiness would be ambiguous
            if len(transformed_data_dict[key]) == 0:
                continue
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
m@9 193
m@9 194
def write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs):
    '''Pickle each mapped data list together with the class and audio labels.

    Every output file receives a [data, classlabs, audiolabs] list. The
    files are taken from OUTPUT_FILES by position: lda, pca, nmf, ssnmf,
    then the 'none' data (data_list).
    '''
    # order must match the order of the paths in OUTPUT_FILES
    outputs = [ldadata_list, pcadata_list, nmfdata_list, ssnmfdata_list, data_list]
    for position, mapped_data in enumerate(outputs):
        # the with-block closes (and flushes) each file once it is written
        with open(OUTPUT_FILES[position], 'wb') as handle:
            pickle.dump([mapped_data, classlabs, audiolabs], handle)
m@9 201
m@9 202
if __name__ == '__main__':
    # first only lda - because it goes fast; the returned tuple is already
    # in the argument order write_output expects
    lda_results = lda_map_and_average_frames(min_variance=0.99)
    write_output(*lda_results)
    # then add nmf,ssnmf
    #full_results = map_and_average_frames(min_variance=0.99)
    #write_output(*full_results)
m@9 210