# HG changeset patch # User Maria Panteli # Date 1506003858 -3600 # Node ID d118b6ca8370fdcd0fe8fef1a161bf74231c60b3 # Parent dd86e49ae8bff6d4728691cfe5c158a40f8a7cdf some changes in classification with random train/test split diff -r dd86e49ae8bf -r d118b6ca8370 scripts/classification.py --- a/scripts/classification.py Tue Sep 19 21:28:06 2017 +0100 +++ b/scripts/classification.py Thu Sep 21 15:24:18 2017 +0100 @@ -14,7 +14,7 @@ FILENAMES = map_and_average.OUTPUT_FILES - +TRANSFORM_LABELS = ['LDA', 'PCA', 'NMF', 'SSNMF', 'NA'] def load_data_from_pickle(filename): X_list, Y, Yaudio = pickle.load(open(filename,'rb')) @@ -44,21 +44,26 @@ def classify_for_filenames(file_list=FILENAMES): df_results = pd.DataFrame() feat_learner = util_feature_learning.Transformer() - for filename in file_list: + #traininds, testinds = get_train_test_indices(Yaudio) + for filename, transform_label in zip(file_list, TRANSFORM_LABELS): + print filename X, Y, Yaudio = load_data_from_pickle(filename) - traininds, testinds = get_train_test_indices(Yaudio) - X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) - df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test) + #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) + X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y) + X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test) + df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label) + df_result_feat = classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=transform_label) + df_result = pd.concat([df_result, df_result_feat], axis=1, ignore_index=True) df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True) return df_results -def classify_each_feature(X_train, Y_train, X_test, Y_test): +def classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=" "): n_dim = X_train.shape[1] feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim) #df_results = pd.DataFrame() # first the classification with all features together - df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test) + df_results = feat_learner.classify(X_train, Y_train, X_test, Y_test, transform_label=transform_label) # then append for each feature separately for i in range(len(feat_inds)): df_result = feat_learner.classify(X_train[:, feat_inds[i]], Y_train, @@ -87,16 +92,27 @@ np.savetxt('data/CF.csv', CF, fmt='%10.5f') if plots: plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf') - return accuracy, predictions + return accuracy, CF + + +def confusion_matrix_for_best_classification_result(df_results, output_data=False): + max_i = np.argmax(df_results[:, 1]) + feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method + filename = FILENAMES[feat_learning_i] + print filename + X, Y, Yaudio = load_data_from_pickle(filename) + #traininds, testinds = get_train_test_indices(Yaudio) + #X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) + X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=12345, stratify=Y) + X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=12345, stratify=Y_val_test) + if output_data: + _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True) + else: + _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False) + return CF if __name__ == '__main__': df_results = classify_for_filenames(file_list=FILENAMES) - max_i = np.argmax(df_results[:, 1]) - feat_learning_i = max_i % 4 # 4 classifiers for each feature learning method - filename = FILENAMES[feat_learning_i] - X, Y, Yaudio = load_data_from_pickle(filename) - traininds, testinds = get_train_test_indices(Yaudio) - X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds) - confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True) + CF = confusion_matrix_for_best_classification_result(df_results, output_data=False) diff -r dd86e49ae8bf -r d118b6ca8370 scripts/map_and_average.py --- a/scripts/map_and_average.py Tue Sep 19 21:28:06 2017 +0100 +++ b/scripts/map_and_average.py Thu Sep 21 15:24:18 2017 +0100 @@ -67,7 +67,6 @@ def load_train_val_test_sets(): '''load train, val, test sets ''' - print INPUT_FILES[0] trainset = load_data_from_pickle(INPUT_FILES[0]) valset = load_data_from_pickle(INPUT_FILES[1]) testset = load_data_from_pickle(INPUT_FILES[2]) diff -r dd86e49ae8bf -r d118b6ca8370 tests/test_classification.py --- a/tests/test_classification.py Tue Sep 19 21:28:06 2017 +0100 +++ b/tests/test_classification.py Thu Sep 21 15:24:18 2017 +0100 @@ -19,7 +19,7 @@ X[-50:, :] = X[-50:, :] + 10 Y = np.concatenate([np.repeat('a', 50), np.repeat('b', 50)]) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.6, random_state=1, stratify=Y) - accuracy, predictions = classification.confusion_matrix(X_train, Y_train, X_test, Y_test) + accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test) # expect perfect accuracy for this 'easy' dataset assert accuracy == 1.0