# -*- coding: utf-8 -*-
"""
Created on Thu Nov 10 15:10:32 2016

@author: mariapanteli

Classify the feature sets listed in ``map_and_average.OUTPUT_FILES`` and
produce a confusion matrix for the best-scoring one.
"""
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics

import map_and_average
import util_feature_learning


# Pickle files to evaluate; each one is read with load_data_from_pickle().
FILENAMES = map_and_average.OUTPUT_FILES


def load_data_from_pickle(filename):
    """Load a pickled ``(X_list, Y, Yaudio)`` triple and join the features.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    X : np.ndarray
        The arrays of ``X_list`` concatenated along axis 1.
    Y, Yaudio
        The second and third pickled items, returned unchanged.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as f:
        X_list, Y, Yaudio = pickle.load(f)
    X = np.concatenate(X_list, axis=1)
    return X, Y, Yaudio


def get_train_test_indices(audiolabs=None):
    """Return the positions in `audiolabs` that belong to the train/test sets.

    Parameters
    ----------
    audiolabs : array-like
        Audio label of every sample (e.g. the ``Yaudio`` returned by
        `load_data_from_pickle`). Required; the ``None`` default only exists
        so that a missing argument gives a clear error message.

    Returns
    -------
    traininds, testinds : np.ndarray of int
        Indices of the samples whose audio label occurs in the train set
        and in the test set respectively, as given by
        ``map_and_average.load_train_val_test_sets()``.

    Raises
    ------
    ValueError
        If `audiolabs` is not provided.
    """
    if audiolabs is None:
        raise ValueError("audiolabs is required, e.g. the Yaudio array "
                         "returned by load_data_from_pickle()")
    trainset, _valset, testset = map_and_average.load_train_val_test_sets()
    # Item 2 of each set holds its audio labels.
    aa_train = np.unique(trainset[2])
    aa_test = np.unique(testset[2])
    audiolabs = np.asarray(audiolabs)
    # Vectorised membership test; always yields integer indices, even
    # when nothing matches.
    traininds = np.flatnonzero(np.isin(audiolabs, aa_train))
    testinds = np.flatnonzero(np.isin(audiolabs, aa_test))
    return traininds, testinds


def get_train_test_sets(X, Y, traininds, testinds):
    """Split `X` (rows) and `Y` into train and test parts by index.

    Parameters
    ----------
    X : np.ndarray
        2-D array; rows are selected with `traininds` / `testinds`.
    Y : np.ndarray
        Array indexed along its first axis with the same indices.
    traininds, testinds : array-like of int
        Row indices of the train and test samples.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
    """
    X_train = X[traininds, :]
    Y_train = Y[traininds]
    X_test = X[testinds, :]
    Y_test = Y[testinds]
    return X_train, Y_train, X_test, Y_test


def classify_for_filenames(file_list=FILENAMES):
    """Run the classifiers on every file and stack the results.

    Parameters
    ----------
    file_list : sequence of str
        Pickle files to evaluate; defaults to `FILENAMES`.

    Returns
    -------
    pd.DataFrame
        The outputs of ``Transformer.classify`` for each file, concatenated
        row-wise in the order of `file_list` with a fresh index. Empty if
        `file_list` is empty.
    """
    feat_learner = util_feature_learning.Transformer()
    results = []
    for filename in file_list:
        X, Y, Yaudio = load_data_from_pickle(filename)
        traininds, testinds = get_train_test_indices(Yaudio)
        X_train, Y_train, X_test, Y_test = get_train_test_sets(
            X, Y, traininds, testinds)
        results.append(feat_learner.classify(X_train, Y_train, X_test, Y_test))
    if not results:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame on every iteration.
    return pd.concat(results, axis=0, ignore_index=True)


def plot_CF(CF, labels=None, figurename=None):
    """Draw confusion matrix `CF` as a greyscale image.

    Parameters
    ----------
    CF : np.ndarray
        Confusion matrix to display.
    labels : sequence, optional
        Tick labels for both axes. Defaults to ``0 .. len(CF) - 1``.
        The caller's sequence is not modified.
    figurename : str, optional
        If given, the figure is saved to this path.
    """
    # Imported here so the module can be used without matplotlib when no
    # plot is requested.
    import matplotlib.pyplot as plt

    if labels is None:
        labels = list(range(len(CF)))
    else:
        # Shorten the longest country name so the tick labels stay compact;
        # build a new list rather than editing the caller's array in place.
        labels = ['United States Amer.' if lab == 'United States of America'
                  else lab for lab in labels]
    plt.imshow(CF, cmap="Greys")
    plt.xticks(range(len(labels)), labels, rotation='vertical', fontsize=4)
    plt.yticks(range(len(labels)), labels, fontsize=4)
    if figurename is not None:
        plt.savefig(figurename, bbox_inches='tight')


def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
    """Classify the test set and compute its confusion matrix.

    Predictions come from ``util_feature_learning.classification_accuracy``
    called with ``model=util_feature_learning.modelLDA``. The matrix rows and
    columns follow the sorted unique values of `Y_test`.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test
        Train/test features and labels, e.g. from `get_train_test_sets`.
    saveCF : bool
        If True, write the labels to ``data/CFlabels.csv`` and the matrix to
        ``data/CF.csv``.
    plots : bool
        If True, plot the matrix and save it to ``data/conf_matrix.pdf``.

    Returns
    -------
    accuracy, predictions
        As returned by ``classification_accuracy``.
    """
    # NOTE(review): this instance is never used below; kept in case the
    # constructor has side effects -- confirm and remove.
    feat_learner = util_feature_learning.Transformer()
    accuracy, predictions = util_feature_learning.classification_accuracy(
        X_train, Y_train, X_test, Y_test, model=util_feature_learning.modelLDA)
    labels = np.unique(Y_test)  # TODO: countries in geographical proximity
    CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
    if saveCF:
        np.savetxt('data/CFlabels.csv', labels, fmt='%s')
        np.savetxt('data/CF.csv', CF, fmt='%10.5f')
    if plots:
        plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf')
    return accuracy, predictions


def main():
    """Find the best-scoring file and save/plot its confusion matrix."""
    df_results = classify_for_filenames(file_list=FILENAMES)
    if df_results.empty:
        return
    # Column 1 is taken to hold the score being maximised, as in the
    # original script -- TODO confirm against Transformer.classify.
    max_i = int(np.argmax(df_results.iloc[:, 1].values))
    # Results are stacked file by file, so with an equal number of rows per
    # file (one per classifier) the file index is the integer quotient,
    # not the remainder.
    rows_per_file = max(len(df_results) // len(FILENAMES), 1)
    feat_learning_i = min(max_i // rows_per_file, len(FILENAMES) - 1)
    filename = FILENAMES[feat_learning_i]
    X, Y, Yaudio = load_data_from_pickle(filename)
    traininds, testinds = get_train_test_indices(Yaudio)
    X_train, Y_train, X_test, Y_test = get_train_test_sets(
        X, Y, traininds, testinds)
    confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)


if __name__ == '__main__':
    main()