Mercurial > hg > plosone_underreview
comparison scripts/classification.py @ 58:d118b6ca8370 branch-tests
some changes in classification with random train/test split
| author | Maria Panteli <m.x.panteli@gmail.com> |
|---|---|
| date | Thu, 21 Sep 2017 15:24:18 +0100 |
| parents | 98cd5317e504 |
| children | 4425a4918102 b0e194bfb71d |
comparison
equal
deleted
inserted
replaced
| 57:dd86e49ae8bf | 58:d118b6ca8370 |
|---|---|
| 12 import map_and_average | 12 import map_and_average |
| 13 import util_feature_learning | 13 import util_feature_learning |
| 14 | 14 |
| 15 | 15 |
| 16 FILENAMES = map_and_average.OUTPUT_FILES | 16 FILENAMES = map_and_average.OUTPUT_FILES |
| 17 | 17 TRANSFORM_LABELS = ['LDA', 'PCA', 'NMF', 'SSNMF', 'NA'] |
| 18 | 18 |
| 19 def load_data_from_pickle(filename): | 19 def load_data_from_pickle(filename): |
| 20 X_list, Y, Yaudio = pickle.load(open(filename,'rb')) | 20 X_list, Y, Yaudio = pickle.load(open(filename,'rb')) |
| 21 X = np.concatenate(X_list, axis=1) | 21 X = np.concatenate(X_list, axis=1) |
| 22 return X, Y, Yaudio | 22 return X, Y, Yaudio |
| 42 | 42 |
| 43 | 43 |
| 44 def classify_for_filenames(file_list=FILENAMES): | 44 def classify_for_filenames(file_list=FILENAMES): |
| 45 df_results = pd.DataFrame() | 45 df_results = pd.DataFrame() |
| 46 feat_learner = util_feature_learning.Transformer() | 46 feat_learner = util_feature_learning.Transformer() |
| 47 for filename in file_list: | 47 #traininds, testinds = get_train_test_indices(Yaudio) |
| | 48 for filename, transform_label in zip(file_list, TRANSFORM_LABELS): |
| | 49 print filename |
| 48 X, Y, Yaudio = load_data_from_pickle(filename) | 50 X, Y, Yaudio = load_data_from_pickle(filename) |
| 49 traininds, testinds = get_train_test_indices(Yaudio) | 51 #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) |
| 50 X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) | 52 X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y) |
| 51 df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test) | 53 X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test) |
| | 54 df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label) |
| | 55 df_result_feat = classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=transform_label) |
| | 56 df_result = pd.concat([df_result, df_result_feat], axis=1, ignore_index=True) |
| 52 df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True) | 57 df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True) |
| 53 return df_results | 58 return df_results |
| 54 | 59 |
| 55 | 60 |
| 56 def classify_each_feature(X_train, Y_train, X_test, Y_test): | 61 def classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=" "): |
| 57 n_dim = X_train.shape[1] | 62 n_dim = X_train.shape[1] |
| 58 feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim) | 63 feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim) |
| 59 #df_results = pd.DataFrame() | 64 #df_results = pd.DataFrame() |
| 60 # first the classification with all features together | 65 # first the classification with all features together |
| 61 df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test) | 66 df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label) |
| 62 # then append for each feature separately | 67 # then append for each feature separately |
| 63 for i in range(len(feat_inds)): | 68 for i in range(len(feat_inds)): |
| 64 df_result = feat_learner.classify(X_train[:, feat_inds[i]], Y_train, | 69 df_result = feat_learner.classify(X_train[:, feat_inds[i]], Y_train, |
| 65 X_test[:, feat_inds[i]], Y_test) | 70 X_test[:, feat_inds[i]], Y_test) |
| 66 df_results = pd.concat([df_results, df_result], axis=1, ignore_index=True) | 71 df_results = pd.concat([df_results, df_result], axis=1, ignore_index=True) |
| 85 if saveCF: | 90 if saveCF: |
| 86 np.savetxt('data/CFlabels.csv', labels, fmt='%s') | 91 np.savetxt('data/CFlabels.csv', labels, fmt='%s') |
| 87 np.savetxt('data/CF.csv', CF, fmt='%10.5f') | 92 np.savetxt('data/CF.csv', CF, fmt='%10.5f') |
| 88 if plots: | 93 if plots: |
| 89 plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf') | 94 plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf') |
| 90 return accuracy, predictions | 95 return accuracy, CF |
| | 96 |
| | 97 |
| | 98 def confusion_matrix_for_best_classification_result(df_results, output_data=False): |
| | 99 max_i = np.argmax(df_results[:, 1]) |
| | 100 feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method |
| | 101 filename = FILENAMES[feat_learning_i] |
| | 102 print filename |
| | 103 X, Y, Yaudio = load_data_from_pickle(filename) |
| | 104 #traininds, testinds = get_train_test_indices(Yaudio) |
| | 105 #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) |
| | 106 X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y) |
| | 107 X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test) |
| | 108 if output_data: |
| | 109 _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True) |
| | 110 else: |
| | 111 _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False) |
| | 112 return CF |
| 91 | 113 |
| 92 | 114 |
| 93 if __name__ == '__main__': | 115 if __name__ == '__main__': |
| 94 df_results = classify_for_filenames(file_list=FILENAMES) | 116 df_results = classify_for_filenames(file_list=FILENAMES) |
| 95 max_i = np.argmax(df_results[:, 1]) | 117 CF = confusion_matrix_for_best_classification_result(df_results, output_data=False) |
| 96 feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method | |
| 97 filename = FILENAMES[feat_learning_i] | |
| 98 X, Y, Yaudio = load_data_from_pickle(filename) | |
| 99 traininds, testinds = get_train_test_indices(Yaudio) | |
| 100 X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) | |
| 101 confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True) | |
| 102 | 118 |
