annotate scripts/classification.py @ 58:d118b6ca8370 branch-tests

some changes in classification with random train/test split
author Maria Panteli <m.x.panteli@gmail.com>
date Thu, 21 Sep 2017 15:24:18 +0100
parents 98cd5317e504
children 4425a4918102 b0e194bfb71d
rev   line source
Maria@18 1 # -*- coding: utf-8 -*-
Maria@18 2 """
Maria@18 3 Created on Thu Nov 10 15:10:32 2016
Maria@18 4
Maria@18 5 @author: mariapanteli
Maria@18 6 """
Maria@18 7 import numpy as np
Maria@18 8 import pandas as pd
m@48 9 import pickle
Maria@18 10 from sklearn import metrics
Maria@18 11
Maria@18 12 import map_and_average
Maria@18 13 import util_feature_learning
Maria@18 14
Maria@18 15
# Paths handed to load_data_from_pickle, taken from map_and_average.OUTPUT_FILES.
FILENAMES = map_and_average.OUTPUT_FILES
# Labels paired positionally (via zip) with the file list in classify_for_filenames.
# NOTE(review): presumably one label per output file, in matching order -- confirm
# against map_and_average.OUTPUT_FILES.
TRANSFORM_LABELS = ['LDA', 'PCA', 'NMF', 'SSNMF', 'NA']
Maria@18 18
def load_data_from_pickle(filename):
    """Load a pickled dataset and join its feature blocks column-wise.

    The pickle must hold a 3-item sequence ``(X_list, Y, Yaudio)``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    X : np.ndarray
        The arrays of ``X_list`` concatenated along axis 1.
    Y, Yaudio
        Second and third items of the pickled sequence, returned unchanged.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file handle, which a bare open() leaked.
    with open(filename, 'rb') as pickle_file:
        X_list, Y, Yaudio = pickle.load(pickle_file)
    X = np.concatenate(X_list, axis=1)
    return X, Y, Yaudio
Maria@18 23
Maria@18 24
m@45 25 def get_train_test_indices(audiolabs):
Maria@18 26 trainset, valset, testset = map_and_average.load_train_val_test_sets()
Maria@18 27 trainaudiolabels, testaudiolabels = trainset[2], testset[2]
Maria@18 28 # train, test indices
Maria@18 29 aa_train = np.unique(trainaudiolabels)
Maria@18 30 aa_test = np.unique(testaudiolabels)
Maria@18 31 traininds = np.array([i for i, item in enumerate(audiolabs) if item in aa_train])
Maria@18 32 testinds = np.array([i for i, item in enumerate(audiolabs) if item in aa_test])
Maria@18 33 return traininds, testinds
Maria@18 34
Maria@18 35
def get_train_test_sets(X, Y, traininds, testinds):
    """Select the train and test portions of ``X`` and ``Y`` by index.

    Rows of ``X`` and entries of ``Y`` are picked with ``traininds`` and
    ``testinds``; the result is the 4-tuple
    ``(X_train, Y_train, X_test, Y_test)``.
    """
    train_part = (X[traininds, :], Y[traininds])
    test_part = (X[testinds, :], Y[testinds])
    return train_part + test_part
Maria@18 42
Maria@18 43
def classify_for_filenames(file_list=FILENAMES):
    """Run the classification for every data file and stack the results.

    Each file in ``file_list`` is paired positionally with an entry of
    ``TRANSFORM_LABELS``. Its data is split 60/20/20 into train, validation
    and test parts (stratified on ``Y``, fixed random seed); only the train
    and test parts are used here.

    Parameters
    ----------
    file_list : list of str
        Pickle files readable by ``load_data_from_pickle``.

    Returns
    -------
    pd.DataFrame
        For each file, the output of ``Transformer.classify`` on all features
        joined column-wise with the output of ``classify_each_feature``; the
        per-file frames are stacked row-wise. Empty if nothing was processed.
    """
    # Imported here because the module header does not bring it into scope.
    from sklearn.model_selection import train_test_split

    feat_learner = util_feature_learning.Transformer()
    per_file_results = []
    for filename, transform_label in zip(file_list, TRANSFORM_LABELS):
        print(filename)
        X, Y, _Yaudio = load_data_from_pickle(filename)
        # First split: 60% train, 40% held out.
        X_train, X_val_test, Y_train, Y_val_test = train_test_split(
            X, Y, train_size=0.6, random_state=12345, stratify=Y)
        # Second split: held-out part halved into validation and test.
        _X_val, X_test, _Y_val, Y_test = train_test_split(
            X_val_test, Y_val_test, train_size=0.5, random_state=12345,
            stratify=Y_val_test)
        df_result = feat_learner.classify(
            X_train, Y_train, X_test, Y_test, transform_label=transform_label)
        df_result_feat = classify_each_feature(
            X_train, Y_train, X_test, Y_test, transform_label=transform_label)
        per_file_results.append(
            pd.concat([df_result, df_result_feat], axis=1, ignore_index=True))
    if not per_file_results:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concat at the end instead of re-copying a growing frame every pass.
    return pd.concat(per_file_results, axis=0, ignore_index=True)
m@47 59
m@47 60
def classify_each_feature(X_train, Y_train, X_test, Y_test, transform_label=" ",
                          feat_learner=None):
    """Classify with all features together, then with each feature group alone.

    Feature groups are the column-index sets returned by
    ``map_and_average.get_feat_inds`` for the number of columns in ``X_train``.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test
        Train and test data and labels.
    transform_label : str
        Passed to ``classify`` for the all-features run only; the per-group
        runs use ``classify``'s default.
    feat_learner : object, optional
        Object providing ``classify``. A new
        ``util_feature_learning.Transformer`` is created when omitted.

    Returns
    -------
    pd.DataFrame
        The all-features result followed, column-wise, by one result per
        feature group (column labels reset).
    """
    if feat_learner is None:
        # The name was previously never bound in this function's scope.
        feat_learner = util_feature_learning.Transformer()
    n_dim = X_train.shape[1]
    _feat_labels, feat_inds = map_and_average.get_feat_inds(n_dim=n_dim)
    # First the classification with all features together...
    frames = [feat_learner.classify(X_train, Y_train, X_test, Y_test,
                                    transform_label=transform_label)]
    # ...then one classification per feature group.
    for inds in feat_inds:
        frames.append(feat_learner.classify(X_train[:, inds], Y_train,
                                            X_test[:, inds], Y_test))
    return pd.concat(frames, axis=1, ignore_index=True)
Maria@18 73
Maria@18 74
def plot_CF(CF, labels=None, figurename=None):
    """Draw a confusion matrix as a greyscale image.

    Parameters
    ----------
    CF : array-like
        Matrix passed to ``plt.imshow``.
    labels : sequence, optional
        Tick labels for both axes. 'United States of America' is shortened to
        'United States Amer.' on the plot. The caller's sequence is left
        unmodified. When omitted, no tick labels are set.
    figurename : str, optional
        If given, the figure is saved to this path.
    """
    # NOTE(review): `plt` is not imported in the visible part of this module;
    # presumably matplotlib.pyplot -- confirm and import it at the top of the file.
    plt.imshow(CF, cmap="Greys")
    if labels is not None:
        # Build a new list rather than assigning into the caller's array.
        tick_labels = ['United States Amer.' if label == 'United States of America'
                       else label for label in labels]
        positions = range(len(tick_labels))
        plt.xticks(positions, tick_labels, rotation='vertical', fontsize=4)
        plt.yticks(positions, tick_labels, fontsize=4)
    if figurename is not None:
        plt.savefig(figurename, bbox_inches='tight')
Maria@18 82
Maria@18 83
def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
    """Classify the test data and build the confusion matrix of the predictions.

    A new ``util_feature_learning.Transformer`` runs ``classification_accuracy``
    with its ``modelLDA`` model. Rows and columns of the matrix follow the
    sorted unique values of ``Y_test``.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test
        Train and test data and labels.
    saveCF : bool
        If true, write the labels to 'data/CFlabels.csv' and the matrix to
        'data/CF.csv'.
    plots : bool
        If true, plot the matrix and save it to 'data/conf_matrix.pdf'.

    Returns
    -------
    accuracy
        First value returned by ``classification_accuracy``.
    CF : np.ndarray
        The confusion matrix.
    """
    learner = util_feature_learning.Transformer()
    accuracy, predictions = learner.classification_accuracy(
        X_train, Y_train, X_test, Y_test, model=learner.modelLDA)
    class_labels = np.unique(Y_test)  # TODO: countries in geographical proximity
    CF = metrics.confusion_matrix(Y_test, predictions, labels=class_labels)
    if saveCF:
        outputs = (('data/CFlabels.csv', class_labels, '%s'),
                   ('data/CF.csv', CF, '%10.5f'))
        for path, values, fmt in outputs:
            np.savetxt(path, values, fmt=fmt)
    if plots:
        plot_CF(CF, labels=class_labels, figurename='data/conf_matrix.pdf')
    return accuracy, CF
m@58 96
m@58 97
def confusion_matrix_for_best_classification_result(df_results, output_data=False):
    """Build the confusion matrix for the best-scoring classification result.

    The row with the largest value in positional column 1 of ``df_results`` is
    located and mapped to an entry of ``FILENAMES``. That file is reloaded,
    split with the same seed and proportions as in ``classify_for_filenames``,
    and passed to ``confusion_matrix``.

    Parameters
    ----------
    df_results : pd.DataFrame or np.ndarray
        Classification results; column 1 is taken as the score to maximise.
    output_data : bool
        If true, ``confusion_matrix`` also saves the matrix and its plot.

    Returns
    -------
    CF : np.ndarray
        The confusion matrix.
    """
    # Imported here because the module header does not bring it into scope.
    from sklearn.model_selection import train_test_split

    # DataFrames need .iloc for positional access (`df[:, 1]` is not valid);
    # plain arrays keep working through ordinary indexing.
    if hasattr(df_results, 'iloc'):
        scores = df_results.iloc[:, 1]
    else:
        scores = df_results[:, 1]
    max_i = int(np.argmax(np.asarray(scores)))
    feat_learning_i = max_i % 4  # 4 classifiers for each feature learning method
    # NOTE(review): if rows are grouped per file (4 consecutive rows each), the
    # file index would be `max_i // 4`, and `% 4` can never reach a 5th file --
    # confirm against the row layout produced by Transformer.classify.
    filename = FILENAMES[feat_learning_i]
    print(filename)
    X, Y, _Yaudio = load_data_from_pickle(filename)
    X_train, X_val_test, Y_train, Y_val_test = train_test_split(
        X, Y, train_size=0.6, random_state=12345, stratify=Y)
    _X_val, X_test, _Y_val, Y_test = train_test_split(
        X_val_test, Y_val_test, train_size=0.5, random_state=12345,
        stratify=Y_val_test)
    # Saving and plotting are switched on or off together.
    write_outputs = bool(output_data)
    _, CF = confusion_matrix(X_train, Y_train, X_test, Y_test,
                             saveCF=write_outputs, plots=write_outputs)
    return CF
Maria@18 113
Maria@18 114
if __name__ == "__main__":
    # Classify every configured file, then compute the confusion matrix for
    # the best result without writing any output files.
    df_results = classify_for_filenames(FILENAMES)
    CF = confusion_matrix_for_best_classification_result(df_results, output_data=False)
Maria@18 118