comparison scripts/classification.py @ 58:d118b6ca8370 branch-tests

some changes in classification with random train/test split
author Maria Panteli <m.x.panteli@gmail.com>
date Thu, 21 Sep 2017 15:24:18 +0100
parents 98cd5317e504
children 4425a4918102 b0e194bfb71d
comparison
equal deleted inserted replaced
57:dd86e49ae8bf 58:d118b6ca8370
12 import map_and_average 12 import map_and_average
13 import util_feature_learning 13 import util_feature_learning
14 14
15 15
16 FILENAMES = map_and_average.OUTPUT_FILES 16 FILENAMES = map_and_average.OUTPUT_FILES
17 17 TRANSFORM_LABELS = ['LDA', 'PCA', 'NMF', 'SSNMF', 'NA']
18 18
19 def load_data_from_pickle(filename): 19 def load_data_from_pickle(filename):
20 X_list, Y, Yaudio = pickle.load(open(filename,'rb')) 20 X_list, Y, Yaudio = pickle.load(open(filename,'rb'))
21 X = np.concatenate(X_list, axis=1) 21 X = np.concatenate(X_list, axis=1)
22 return X, Y, Yaudio 22 return X, Y, Yaudio
42 42
43 43
44 def classify_for_filenames(file_list=FILENAMES): 44 def classify_for_filenames(file_list=FILENAMES):
45 df_results = pd.DataFrame() 45 df_results = pd.DataFrame()
46 feat_learner = util_feature_learning.Transformer() 46 feat_learner = util_feature_learning.Transformer()
47 for filename in file_list: 47 #traininds, testinds = get_train_test_indices(Yaudio)
48 for filename, transform_label in zip(file_list, TRANSFORM_LABELS):
49 print filename
48 X, Y, Yaudio = load_data_from_pickle(filename) 50 X, Y, Yaudio = load_data_from_pickle(filename)
49 traininds, testinds = get_train_test_indices(Yaudio) 51 #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
50 X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) 52 X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y)
51 df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test) 53 X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test)
54 df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label)
55 df_result_feat = classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=transform_label)
56 df_result = pd.concat([df_result, df_result_feat], axis=1, ignore_index=True)
52 df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True) 57 df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True)
53 return df_results 58 return df_results
54 59
55 60
56 def classify_each_feature(X_train, Y_train, X_test, Y_test): 61 def classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=" "):
57 n_dim = X_train.shape[1] 62 n_dim = X_train.shape[1]
58 feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim) 63 feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim)
59 #df_results = pd.DataFrame() 64 #df_results = pd.DataFrame()
60 # first the classification with all features together 65 # first the classification with all features together
61 df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test) 66 df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label)
62 # then append for each feature separately 67 # then append for each feature separately
63 for i in range(len(feat_inds)): 68 for i in range(len(feat_inds)):
64 df_result = feat_learner.classify(X_train[:, feat_inds[i]], Y_train, 69 df_result = feat_learner.classify(X_train[:, feat_inds[i]], Y_train,
65 X_test[:, feat_inds[i]], Y_test) 70 X_test[:, feat_inds[i]], Y_test)
66 df_results = pd.concat([df_results, df_result], axis=1, ignore_index=True) 71 df_results = pd.concat([df_results, df_result], axis=1, ignore_index=True)
85 if saveCF: 90 if saveCF:
86 np.savetxt('data/CFlabels.csv', labels, fmt='%s') 91 np.savetxt('data/CFlabels.csv', labels, fmt='%s')
87 np.savetxt('data/CF.csv', CF, fmt='%10.5f') 92 np.savetxt('data/CF.csv', CF, fmt='%10.5f')
88 if plots: 93 if plots:
89 plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf') 94 plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf')
90 return accuracy, predictions 95 return accuracy, CF
96
97
98 def confusion_matrix_for_best_classification_result(df_results, output_data=False):
99 max_i = np.argmax(df_results[:, 1])
100 feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method
101 filename = FILENAMES[feat_learning_i]
102 print filename
103 X, Y, Yaudio = load_data_from_pickle(filename)
104 #traininds, testinds = get_train_test_indices(Yaudio)
105 #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
106 X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y)
107 X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test)
108 if output_data:
109 _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)
110 else:
111 _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)
112 return CF
91 113
92 114
93 if __name__ == '__main__': 115 if __name__ == '__main__':
94 df_results = classify_for_filenames(file_list=FILENAMES) 116 df_results = classify_for_filenames(file_list=FILENAMES)
95 max_i = np.argmax(df_results[:, 1]) 117 CF = confusion_matrix_for_best_classification_result(df_results, output_data=False)
96 feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method
97 filename = FILENAMES[feat_learning_i]
98 X, Y, Yaudio = load_data_from_pickle(filename)
99 traininds, testinds = get_train_test_indices(Yaudio)
100 X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
101 confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)
102 118