# -*- coding: utf-8 -*-
"""
Classification experiments on learned feature representations.

For every pickle file listed in ``map_and_average.OUTPUT_FILES`` the data are
split into train/validation/test partitions, classified with the models
provided by ``util_feature_learning.Transformer`` and, optionally, summarised
as a confusion matrix.

Created on Thu Nov 10 15:10:32 2016

@author: mariapanteli
"""
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import map_and_average
import util_feature_learning


FILENAMES = map_and_average.OUTPUT_FILES
# Label attached to the results of each file; paired with FILENAMES by position.
TRANSFORM_LABELS = ['LDA', 'PCA', 'NMF', 'SSNMF', 'NA']
# Fixed seed so that every call reproduces the same train/val/test partition.
RANDOM_STATE = 12345

# Maps the `classifier` argument of confusion_matrix() to the name of the
# corresponding model attribute on util_feature_learning.Transformer.
_CLASSIFIER_MODEL_ATTRS = {
    'LDA': 'modelLDA',
    'KNN': 'modelKNN',
    'SVM': 'modelSVM',
    'RF': 'modelRF',
}


def _load_pickle(filename):
    """Return the ``(X_list, Y, Yaudio)`` tuple stored in `filename`.

    NOTE: ``pickle`` executes arbitrary code on load; only use this with
    files produced by this project.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def _split_train_val_test(X, Y):
    """Split `X`, `Y` into stratified 60/20/20 train/validation/test parts.

    The split is seeded with RANDOM_STATE, so repeated calls on the same data
    return the same partition.

    Returns
    -------
    X_train, Y_train, X_val, Y_val, X_test, Y_test
    """
    X_train, X_val_test, Y_train, Y_val_test = train_test_split(
        X, Y, train_size=0.6, random_state=RANDOM_STATE, stratify=Y)
    # Halve the remaining 40% into validation and test.
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_val_test, Y_val_test, train_size=0.5, random_state=RANDOM_STATE,
        stratify=Y_val_test)
    return X_train, Y_train, X_val, Y_val, X_test, Y_test


def load_data_from_pickle(filename):
    """Load a pickled dataset and join its feature blocks column-wise.

    Parameters
    ----------
    filename : str
        Path to a pickle holding a ``(X_list, Y, Yaudio)`` tuple.

    Returns
    -------
    X : np.ndarray
        The arrays of ``X_list`` concatenated along axis 1.
    Y, Yaudio
        The second and third pickled objects, returned unchanged.
    """
    X_list, Y, Yaudio = _load_pickle(filename)
    X = np.concatenate(X_list, axis=1)
    return X, Y, Yaudio


def feat_inds_from_pickle(filename):
    """Return the column indices each feature block occupies in ``X``.

    The indices refer to the matrix built by `load_data_from_pickle`, i.e.
    the blocks of ``X_list`` concatenated along axis 1.

    Parameters
    ----------
    filename : str
        Path to a pickle holding a ``(X_list, Y, Yaudio)`` tuple.

    Returns
    -------
    feat_labels : list of str
        The fixed labels ``['rhy', 'mel', 'mfc', 'chr']``.
        NOTE(review): presumably one label per block of ``X_list`` in this
        order - confirm against the code that writes the pickle.
    feat_inds : list of np.ndarray
        One integer index array per block in ``X_list``.
    """
    X_list, _, _ = _load_pickle(filename)
    # Derive the widths from X_list itself rather than assuming four blocks.
    block_widths = [block.shape[1] for block in X_list]
    boundaries = np.concatenate([[0], np.cumsum(block_widths)]).astype(int)
    feat_inds = [np.arange(start, stop)
                 for start, stop in zip(boundaries[:-1], boundaries[1:])]
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    return feat_labels, feat_inds


def get_train_test_indices(audiolabs):
    """Find which entries of `audiolabs` belong to the stored train/test sets.

    The reference sets come from
    ``map_and_average.load_train_val_test_sets()``; element 2 of the train
    and test sets is used as the collection of labels to match against
    (presumably audio file labels - confirm in ``map_and_average``).

    Parameters
    ----------
    audiolabs : array-like
        Labels to look up.

    Returns
    -------
    traininds, testinds : np.ndarray of int
        Positions in `audiolabs` whose label occurs in the train and test
        reference labels respectively.
    """
    trainset, _, testset = map_and_average.load_train_val_test_sets()
    train_labels = np.unique(trainset[2])
    test_labels = np.unique(testset[2])
    # Vectorised membership test; always yields integer index arrays, even
    # when nothing matches.
    traininds = np.flatnonzero(np.isin(audiolabs, train_labels))
    testinds = np.flatnonzero(np.isin(audiolabs, test_labels))
    return traininds, testinds


def get_train_test_sets(X, Y, traininds, testinds):
    """Select the train and test rows of `X` and `Y`.

    Parameters
    ----------
    X : np.ndarray
        2-D data matrix, indexed by row.
    Y : np.ndarray
        Labels, indexed like the rows of `X`.
    traininds, testinds : array-like of int
        Row indices of the train and test partitions.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
    """
    return X[traininds, :], Y[traininds], X[testinds, :], Y[testinds]


def classify_for_filenames(file_list=FILENAMES):
    """Run the per-feature classification for every file in `file_list`.

    Files are paired positionally with TRANSFORM_LABELS; if the two differ in
    length the extra entries of the longer one are ignored.

    Parameters
    ----------
    file_list : sequence of str
        Paths to pickles readable by `load_data_from_pickle`.

    Returns
    -------
    pd.DataFrame
        The results of `classify_each_feature` for each file, stacked
        row-wise with a fresh index. Empty if no file was processed.
    """
    per_file_results = []
    for filename, transform_label in zip(file_list, TRANSFORM_LABELS):
        X, Y, _ = load_data_from_pickle(filename)
        X_train, Y_train, _, _, X_test, Y_test = _split_train_val_test(X, Y)
        per_file_results.append(
            classify_each_feature(X_train, Y_train, X_test, Y_test, filename,
                                  transform_label=transform_label))
    if not per_file_results:
        return pd.DataFrame()
    return pd.concat(per_file_results, axis=0, ignore_index=True)


def classify_each_feature(X_train, Y_train, X_test, Y_test, filename, transform_label=" "):
    """Classify with all features together and with each feature block alone.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test : np.ndarray
        Train/test data and labels; the columns must follow the layout of
        the pickle at `filename`.
    filename : str
        Pickle used to recover the column ranges of the feature blocks.
    transform_label : str
        Passed through to ``Transformer.classify``.

    Returns
    -------
    pd.DataFrame
        The result frame for all features, followed by the third column
        (``iloc[:, 2]``) of the result frame for each feature block. Column
        labels are renumbered 0..n-1.
    """
    _, feat_inds = feat_inds_from_pickle(filename)
    feat_learner = util_feature_learning.Transformer()
    # First the classification with all features together ...
    frames = [feat_learner.classify(X_train, Y_train, X_test, Y_test,
                                    transform_label=transform_label)]
    # ... then one extra column for each feature block on its own.
    for inds in feat_inds:
        df_result = feat_learner.classify(X_train[:, inds], Y_train,
                                          X_test[:, inds], Y_test,
                                          transform_label=transform_label)
        frames.append(df_result.iloc[:, 2])
    return pd.concat(frames, axis=1, ignore_index=True)


def plot_CF(CF, labels=None, figurename=None):
    """Draw a confusion matrix as a greyscale image on the current figure.

    Parameters
    ----------
    CF : array-like
        2-D confusion matrix.
    labels : array-like of str, optional
        Tick labels for both axes. 'United States of America' is shortened
        to 'United States Amer.' on the plot only; the caller's array is not
        modified. When None, the default numeric ticks are kept.
    figurename : str, optional
        If given, the figure is saved to this path.
    """
    plt.imshow(CF, cmap="Greys")
    if labels is not None:
        # Work on a copy so the abbreviation does not leak back to the caller.
        tick_labels = np.array(labels)
        tick_labels[tick_labels == 'United States of America'] = 'United States Amer.'
        positions = range(len(tick_labels))
        plt.xticks(positions, tick_labels, rotation='vertical', fontsize=8)
        plt.yticks(positions, tick_labels, fontsize=8)
    plt.colorbar()
    if figurename is not None:
        plt.savefig(figurename, bbox_inches='tight')


def confusion_matrix(X_train, Y_train, X_test, Y_test, classifier='LDA'):
    """Train the chosen classifier and compute its test confusion matrix.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test : np.ndarray
        Train/test data and labels.
    classifier : {'LDA', 'KNN', 'SVM', 'RF'}
        Selects which model of ``util_feature_learning.Transformer`` is used.

    Returns
    -------
    accuracy
        First value returned by ``Transformer.classification_accuracy``.
    CF : np.ndarray
        Confusion matrix of `Y_test` against the predictions.
    labels : np.ndarray
        Sorted unique values of `Y_test`, giving the row/column order of `CF`.

    Raises
    ------
    ValueError
        If `classifier` is not one of the supported names.
    """
    if classifier not in _CLASSIFIER_MODEL_ATTRS:
        raise ValueError("Unknown classifier {!r}; expected one of {}".format(
            classifier, sorted(_CLASSIFIER_MODEL_ATTRS)))
    feat_learner = util_feature_learning.Transformer()
    model = getattr(feat_learner, _CLASSIFIER_MODEL_ATTRS[classifier])
    accuracy, predictions = feat_learner.classification_accuracy(
        X_train, Y_train, X_test, Y_test, model=model)
    labels = np.unique(Y_test)  # TODO: countries in geographical proximity
    CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
    return accuracy, CF, labels


def confusion_matrix_for_dataset(df_results, filename, classifier='LDA', output_data=False):
    """Compute (and optionally export) the confusion matrix for one dataset.

    The data are split exactly as in `classify_for_filenames`, so the test
    partition is the same one used there.

    Parameters
    ----------
    df_results : pd.DataFrame
        Not used by this function; kept for interface compatibility.
    filename : str
        Pickle readable by `load_data_from_pickle`.
    classifier : {'LDA', 'KNN', 'SVM', 'RF'}
        Forwarded to `confusion_matrix`.
    output_data : bool
        If True, write the labels and matrix to ``../data/CFlabels.csv`` and
        ``../data/CF.csv`` and save the plot to ``../data/conf_matrix.pdf``.

    Returns
    -------
    accuracy, CF, labels
        As returned by `confusion_matrix`.
    """
    X, Y, _ = load_data_from_pickle(filename)
    X_train, Y_train, _, _, X_test, Y_test = _split_train_val_test(X, Y)
    accuracy, CF, labels = confusion_matrix(X_train, Y_train, X_test, Y_test,
                                            classifier=classifier)
    if output_data:
        np.savetxt('../data/CFlabels.csv', labels, fmt='%s')
        np.savetxt('../data/CF.csv', CF, fmt='%10.5f')
        plot_CF(CF, labels=labels, figurename='../data/conf_matrix.pdf')
    return accuracy, CF, labels


if __name__ == '__main__':
    df_results = classify_for_filenames(file_list=FILENAMES)
    # NOTE(review): this previously called an undefined
    # `confusion_matrix_for_best_classification_result`. Until the file with
    # the best score is selected from df_results, use the first dataset.
    accuracy, CF, labels = confusion_matrix_for_dataset(
        df_results, FILENAMES[0], output_data=False)