Maria@18
|
1 # -*- coding: utf-8 -*-
|
Maria@18
|
2 """
|
Maria@18
|
3 Created on Thu Nov 10 15:10:32 2016
|
Maria@18
|
4
|
Maria@18
|
5 @author: mariapanteli
|
Maria@18
|
6 """
|
Maria@18
|
7 import numpy as np
|
Maria@18
|
8 import pandas as pd
|
m@48
|
9 import pickle
|
Maria@18
|
10 from sklearn import metrics
|
m@62
|
11 from sklearn.model_selection import train_test_split
|
Maria@18
|
12
|
Maria@18
|
13 import map_and_average
|
Maria@18
|
14 import util_feature_learning
|
Maria@18
|
15
|
Maria@18
|
16
|
Maria@18
|
# Pickle files to classify, taken from map_and_average.OUTPUT_FILES.
FILENAMES = map_and_average.OUTPUT_FILES
# Transform names; classify_for_filenames pairs them positionally with the
# files it iterates over (via zip).
TRANSFORM_LABELS = [
    'LDA',
    'PCA',
    'NMF',
    'SSNMF',
    'NA',
]
# Seed handed to train_test_split so the data splits are reproducible.
RANDOM_STATE = 12345
|
Maria@18
|
20
|
Maria@18
|
def load_data_from_pickle(filename):
    """Load the feature blocks and labels stored in a pickle file.

    The pickle must hold a 3-tuple ``(X_list, Y, Yaudio)``. The arrays in
    ``X_list`` are joined column-wise (``axis=1``) into a single matrix.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    X : numpy.ndarray
        Column-wise concatenation of the arrays in ``X_list``.
    Y, Yaudio
        Second and third elements of the pickled tuple, returned unchanged.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(filename, 'rb') as f:
        X_list, Y, Yaudio = pickle.load(f)
    X = np.concatenate(X_list, axis=1)
    return X, Y, Yaudio
|
Maria@18
|
25
|
Maria@18
|
26
|
m@62
|
def feat_inds_from_pickle(filename):
    """Return the feature-group labels and their column indices for a pickle.

    The pickle must hold a 3-tuple whose first element is a list of 2-D
    arrays (one per feature group). The column indices describe where each
    array lands once the list is concatenated along ``axis=1``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    feat_labels : list of str
        ``['rhy', 'mel', 'mfc', 'chr']``, cut to the number of arrays found
        when the pickle holds fewer than four. With more than four arrays
        the extra groups have no label.
    feat_inds : list of numpy.ndarray
        One integer index array per array in the pickled list; consecutive
        arrays cover consecutive, non-overlapping column ranges.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as f:
        X_list, _, _ = pickle.load(f)
    # Take the widths from X_list itself instead of assuming exactly four
    # blocks, so the offsets always agree with the number of blocks.
    len_inds = np.array([block.shape[1] for block in X_list], dtype=int)
    cum_sum = np.concatenate([[0], np.cumsum(len_inds)])
    feat_inds = [np.arange(start, stop)
                 for start, stop in zip(cum_sum[:-1], cum_sum[1:])]
    feat_labels = ['rhy', 'mel', 'mfc', 'chr'][:len(feat_inds)]
    return feat_labels, feat_inds
|
m@62
|
36
|
m@62
|
37
|
m@45
|
def get_train_test_indices(audiolabs):
    """Find which entries of ``audiolabs`` belong to the train and test sets.

    The reference sets come from
    ``map_and_average.load_train_val_test_sets()``; element ``2`` of the
    train and test sets is used as the collection of audio labels.

    Parameters
    ----------
    audiolabs : iterable
        Audio labels to look up, one per sample (must be hashable).

    Returns
    -------
    traininds, testinds : numpy.ndarray of int
        Positions in ``audiolabs`` whose label occurs in the train / test
        audio labels. Empty results are still integer arrays, so they can be
        used for indexing.
    """
    trainset, _, testset = map_and_average.load_train_val_test_sets()
    # Sets give constant-time membership tests; the original scanned the
    # whole label array once per sample.
    train_labels = set(np.unique(trainset[2]).tolist())
    test_labels = set(np.unique(testset[2]).tolist())
    traininds = np.array(
        [i for i, item in enumerate(audiolabs) if item in train_labels],
        dtype=int)
    testinds = np.array(
        [i for i, item in enumerate(audiolabs) if item in test_labels],
        dtype=int)
    return traininds, testinds
|
Maria@18
|
47
|
Maria@18
|
48
|
Maria@18
|
def get_train_test_sets(X, Y, traininds, testinds):
    """Slice features and labels into train and test parts.

    ``X`` is indexed as ``X[inds, :]`` (rows selected, all columns kept) and
    ``Y`` as ``Y[inds]``.

    Returns the 4-tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    train_part = (X[traininds, :], Y[traininds])
    test_part = (X[testinds, :], Y[testinds])
    return train_part + test_part
|
Maria@18
|
55
|
Maria@18
|
56
|
Maria@18
|
def classify_for_filenames(file_list=FILENAMES):
    """Run the classification experiment for every file in ``file_list``.

    Files are paired positionally with ``TRANSFORM_LABELS`` (``zip`` stops at
    the shorter of the two). Each file is loaded, split with a fixed random
    seed and passed to ``classify_each_feature``.

    Parameters
    ----------
    file_list : list of str
        Paths of the pickle files to classify.

    Returns
    -------
    pandas.DataFrame
        The per-file results stacked row-wise with a fresh integer index;
        an empty DataFrame when nothing was processed.
    """
    frames = []
    for filename, transform_label in zip(file_list, TRANSFORM_LABELS):
        # Single-argument call form prints identically on Python 2 and 3.
        print(filename)
        X, Y, _ = load_data_from_pickle(filename)
        # Stratified 60% train split; the remaining 40% is halved into a
        # validation and a test part. Only train and test are used here.
        X_train, X_val_test, Y_train, Y_val_test = train_test_split(
            X, Y, train_size=0.6, random_state=RANDOM_STATE, stratify=Y)
        _, X_test, _, Y_test = train_test_split(
            X_val_test, Y_val_test, train_size=0.5,
            random_state=RANDOM_STATE, stratify=Y_val_test)
        frames.append(classify_each_feature(
            X_train, Y_train, X_test, Y_test, filename,
            transform_label=transform_label))
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, axis=0, ignore_index=True)
|
m@47
|
74
|
m@47
|
75
|
m@62
|
def classify_each_feature(X_train, Y_train, X_test, Y_test, filename, transform_label=" "):
    """Classify with all features, then with each feature group on its own.

    The column ranges of the feature groups are read from ``filename`` via
    ``feat_inds_from_pickle``.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Feature matrices; they are sliced column-wise per feature group.
    Y_train, Y_test
        Matching labels.
    filename : str
        Pickle file the data came from, used to recover the column ranges.
    transform_label : str
        Label forwarded to ``Transformer.classify``.

    Returns
    -------
    pandas.DataFrame
        The all-features result followed by one result per feature group,
        joined side by side (``axis=1``) with renumbered columns. If there
        are no feature groups the all-features result is returned as is.
    """
    _, feat_inds = feat_inds_from_pickle(filename)
    feat_learner = util_feature_learning.Transformer()
    # First the classification with all features together ...
    frames = [feat_learner.classify(X_train, Y_train, X_test, Y_test,
                                    transform_label=transform_label)]
    # ... then one classification per feature group.
    for inds in feat_inds:
        frames.append(feat_learner.classify(
            X_train[:, inds], Y_train, X_test[:, inds], Y_test,
            transform_label=transform_label))
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, axis=1, ignore_index=True)
|
Maria@18
|
90
|
Maria@18
|
91
|
Maria@18
|
def plot_CF(CF, labels=None, figurename=None):
    """Draw a confusion matrix as a greyscale image and optionally save it.

    Parameters
    ----------
    CF : 2-D array
        Confusion matrix passed to ``plt.imshow``.
    labels : sequence of str, optional
        Tick labels for both axes. ``'United States of America'`` is shown
        as ``'United States Amer.'``. The input is not modified. When
        ``None`` the tick labels are left untouched.
    figurename : str, optional
        If given, the figure is saved to this path with a tight bounding box.
    """
    # NOTE(review): `plt` is not imported anywhere in this file; presumably
    # it is matplotlib.pyplot -- confirm and add the import at the top.
    plt.imshow(CF, cmap="Greys")
    if labels is not None:
        # Build a new list so the caller's labels are not changed in place
        # (this also makes plain Python lists work as input).
        tick_labels = ['United States Amer.'
                       if label == 'United States of America' else label
                       for label in labels]
        ticks = range(len(tick_labels))
        plt.xticks(ticks, tick_labels, rotation='vertical', fontsize=4)
        plt.yticks(ticks, tick_labels, fontsize=4)
    if figurename is not None:
        plt.savefig(figurename, bbox_inches='tight')
|
Maria@18
|
99
|
Maria@18
|
100
|
Maria@18
|
def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
    """Classify the test data and compute the resulting confusion matrix.

    A ``util_feature_learning.Transformer`` is created and its
    ``classification_accuracy`` is called with the transformer's
    ``modelLDA`` model. The matrix rows/columns follow the sorted unique
    values of ``Y_test``.

    With ``saveCF`` the labels and the matrix are written to
    ``data/CFlabels.csv`` and ``data/CF.csv``; with ``plots`` the matrix is
    drawn to ``data/conf_matrix.pdf`` via ``plot_CF``.

    Returns the tuple ``(accuracy, CF)``.
    """
    learner = util_feature_learning.Transformer()
    accuracy, predicted = learner.classification_accuracy(
        X_train, Y_train, X_test, Y_test, model=learner.modelLDA)
    # TODO: countries in geographical proximity
    class_labels = np.unique(Y_test)
    conf_mat = metrics.confusion_matrix(Y_test, predicted, labels=class_labels)
    if saveCF:
        outputs = (
            ('data/CFlabels.csv', class_labels, '%s'),
            ('data/CF.csv', conf_mat, '%10.5f'),
        )
        for path, data, fmt in outputs:
            np.savetxt(path, data, fmt=fmt)
    if plots:
        plot_CF(conf_mat, labels=class_labels, figurename='data/conf_matrix.pdf')
    return accuracy, conf_mat
|
m@58
|
113
|
m@58
|
114
|
m@58
|
def confusion_matrix_for_best_classification_result(df_results, output_data=False):
    """Compute the confusion matrix for the best-scoring result.

    The row with the largest value in column ``1`` of ``df_results`` is
    located, mapped back to the file it came from, and that file is loaded,
    split with the fixed random seed and passed to ``confusion_matrix``.

    Parameters
    ----------
    df_results : pandas.DataFrame or 2-D array
        Results as returned by ``classify_for_filenames``.
    output_data : bool
        When true, the labels, the matrix and the plot are written to disk
        by ``confusion_matrix``.

    Returns
    -------
    CF
        The confusion matrix returned by ``confusion_matrix``.
    """
    n_classifiers = 4  # 4 classifiers for each feature learning method
    # np.asarray makes positional indexing valid for a DataFrame as well as
    # for a plain array; `df_results[:, 1]` fails on a DataFrame.
    scores = np.asarray(df_results)[:, 1]
    max_i = int(np.argmax(scores))
    # Results are stacked file by file, so integer division recovers the
    # file index (the remainder would be the classifier index).
    # NOTE(review): assumes every file contributes `n_classifiers`
    # consecutive rows -- confirm against Transformer.classify.
    feat_learning_i = max_i // n_classifiers
    filename = FILENAMES[feat_learning_i]
    # Single-argument call form prints identically on Python 2 and 3.
    print(filename)
    X, Y, _ = load_data_from_pickle(filename)
    # Same stratified 60/20/20 split as in classify_for_filenames; only the
    # train and test parts are used.
    X_train, X_val_test, Y_train, Y_val_test = train_test_split(
        X, Y, train_size=0.6, random_state=RANDOM_STATE, stratify=Y)
    _, X_test, _, Y_test = train_test_split(
        X_val_test, Y_val_test, train_size=0.5,
        random_state=RANDOM_STATE, stratify=Y_val_test)
    write_outputs = bool(output_data)
    _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test,
                             saveCF=write_outputs, plots=write_outputs)
    return CF
|
Maria@18
|
130
|
Maria@18
|
131
|
Maria@18
|
if __name__ == '__main__':
    # Classify every file in FILENAMES, then compute the confusion matrix of
    # the best result; output_data=False means nothing is saved or plotted.
    df_results = classify_for_filenames(FILENAMES)
    CF = confusion_matrix_for_best_classification_result(
        df_results, output_data=False)
|
Maria@18
|
135
|