changeset 18:ed109218dd4b branch-tests
rename result scripts and more tests
| author | Maria Panteli |
|---|---|
| date | Tue, 12 Sep 2017 23:18:19 +0100 |
| parents | 2e487b9c0a7b |
| children | 0bba6f63f4fd |
| files | notebooks/correlation_samples_outliers.ipynb notebooks/sensitivity_experiment.ipynb notebooks/test_hubness.ipynb scripts/classification.py scripts/outliers.py tests/test_load_dataset.py tests/test_map_and_average.py tests/test_outliers.py |
| diffstat | 8 files changed, 335 insertions(+), 12 deletions(-) |
--- a/notebooks/correlation_samples_outliers.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -20,7 +20,7 @@
     "%autoreload 2\n",
     "\n",
     "sys.path.append('../')\n",
-    "import scripts.results as results\n",
+    "import scripts.outliers as outliers\n",
     "import scripts.utils_spatial as utils_spatial"
    ]
   },
@@ -87,7 +87,7 @@
    ],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
     "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
     "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
     "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
@@ -196,7 +196,7 @@
    ],
    "source": [
     "# global outliers\n",
-    "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)\n",
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
     "df_global['N'] = np.zeros(len(df_global))\n",
     "df_global['OutliersN'] = np.zeros(len(df_global))\n",
     "for i, country in enumerate(df_global['Country']):\n",
--- a/notebooks/sensitivity_experiment.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/sensitivity_experiment.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -27,8 +27,8 @@
     "sys.path.append('../')\n",
     "import scripts.load_dataset as load_dataset\n",
     "import scripts.map_and_average as mapper\n",
-    "import scripts.results_classification as results_class\n",
-    "import scripts.results as results"
+    "import scripts.classification as classification\n",
+    "import scripts.outliers as outliers"
    ]
   },
  {
@@ -74,14 +74,14 @@
     "    \n",
     "    # classification and confusion\n",
     "    print \"classifying...\"\n",
-    "    traininds, testinds = results_class.get_train_test_indices()\n",
-    "    X_train, Y_train, X_test, Y_test = results_class.get_train_test_sets(X, Y, traininds, testinds)\n",
-    "    accuracy, _ = results_class.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
+    "    traininds, testinds = classification.get_train_test_indices(Yaudio)\n",
+    "    X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
+    "    accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
     "    print accuracy\n",
     "    \n",
     "    # outliers\n",
     "    print \"detecting outliers...\"\n",
-    "    ddf = results.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
+    "    ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
     "    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)\n",
     "    print_most_least_outliers_topN(df_global, N=10)\n",
     "    \n",
--- a/notebooks/test_hubness.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -19,7 +19,7 @@
     "%autoreload 2\n",
     "\n",
     "sys.path.append('../')\n",
-    "import scripts.results as results\n",
+    "import scripts.outliers as outliers\n",
     "import scripts.utils_spatial as utils_spatial"
    ]
   },
@@ -68,14 +68,14 @@
    ],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
     "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
     "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
     "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
     "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
     "\n",
     "# global outliers\n",
-    "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
    ]
   },
  {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/classification.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 10 15:10:32 2016
+
+@author: mariapanteli
+"""
+import pickle
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn import metrics
+
+import map_and_average
+import util_feature_learning
+
+
+FILENAMES = map_and_average.OUTPUT_FILES
+
+
+def load_data_from_pickle(filename):
+    X_list, Y, Yaudio = pickle.load(open(filename, 'rb'))
+    X = np.concatenate(X_list, axis=1)
+    return X, Y, Yaudio
+
+
+def get_train_test_indices(audiolabs):
+    trainset, valset, testset = map_and_average.load_train_val_test_sets()
+    trainaudiolabels, testaudiolabels = trainset[2], testset[2]
+    # train, test indices: frames follow their recording's split assignment
+    aa_train = np.unique(trainaudiolabels)
+    aa_test = np.unique(testaudiolabels)
+    traininds = np.array([i for i, item in enumerate(audiolabs) if item in aa_train])
+    testinds = np.array([i for i, item in enumerate(audiolabs) if item in aa_test])
+    return traininds, testinds
+
+
+def get_train_test_sets(X, Y, traininds, testinds):
+    X_train = X[traininds, :]
+    Y_train = Y[traininds]
+    X_test = X[testinds, :]
+    Y_test = Y[testinds]
+    return X_train, Y_train, X_test, Y_test
+
+
+def classify_for_filenames(file_list=FILENAMES):
+    df_results = pd.DataFrame()
+    feat_learner = util_feature_learning.Transformer()
+    for filename in file_list:
+        X, Y, Yaudio = load_data_from_pickle(filename)
+        traininds, testinds = get_train_test_indices(Yaudio)
+        X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
+        df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test)
+        df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True)
+    return df_results
+
+
+def plot_CF(CF, labels=None, figurename=None):
+    labels[labels == 'United States of America'] = 'United States Amer.'
+    plt.imshow(CF, cmap="Greys")
+    plt.xticks(range(len(labels)), labels, rotation='vertical', fontsize=4)
+    plt.yticks(range(len(labels)), labels, fontsize=4)
+    if figurename is not None:
+        plt.savefig(figurename, bbox_inches='tight')
+
+
+def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
+    feat_learner = util_feature_learning.Transformer()
+    accuracy, predictions = feat_learner.classification_accuracy(X_train, Y_train,
+                                X_test, Y_test, model=util_feature_learning.modelLDA)
+    labels = np.unique(Y_test)  # TODO: countries in geographical proximity
+    CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
+    if saveCF:
+        np.savetxt('data/CFlabels.csv', labels, fmt='%s')
+        np.savetxt('data/CF.csv', CF, fmt='%10.5f')
+    if plots:
+        plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf')
+    return accuracy, predictions
+
+
+if __name__ == '__main__':
+    df_results = classify_for_filenames(file_list=FILENAMES)
+    max_i = np.argmax(df_results.iloc[:, 1].values)
+    feat_learning_i = max_i % 4  # 4 classifiers for each feature learning method
+    filename = FILENAMES[feat_learning_i]
+    X, Y, Yaudio = load_data_from_pickle(filename)
+    traininds, testinds = get_train_test_indices(Yaudio)
+    X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
+    confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)
+
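The split in `get_train_test_indices` keys membership on recording identifiers, so all frames of a recording land on the same side of the train/test boundary. A minimal self-contained sketch of that idea (the toy recording IDs below are hypothetical, not data from this repository):

```python
import numpy as np

# Toy stand-ins: six frames drawn from three recordings.
frame_audio_ids = np.array(['rec1', 'rec1', 'rec2', 'rec2', 'rec2', 'rec3'])
train_recordings = {'rec1', 'rec3'}  # recordings assigned to the train set
test_recordings = {'rec2'}           # recordings assigned to the test set

# A frame goes wherever its recording went.
traininds = np.array([i for i, a in enumerate(frame_audio_ids) if a in train_recordings])
testinds = np.array([i for i, a in enumerate(frame_audio_ids) if a in test_recordings])

# No recording contributes frames to both sides of the split.
assert set(frame_audio_ids[traininds]).isdisjoint(frame_audio_ids[testinds])
```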
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/outliers.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 12 20:49:48 2016
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+from collections import Counter
+from sklearn.cluster import KMeans
+
+import utils
+import utils_spatial
+
+
+def country_outlier_df(counts, labels, out_file=None, normalize=False):
+    if len(counts.keys()) < len(np.unique(labels)):
+        # countries without outliers get an explicit zero count
+        for label in np.unique(labels):
+            if label not in counts:
+                counts.update({label: 0})
+    if normalize:
+        counts = normalize_outlier_counts(counts, Counter(labels))
+    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+    df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df
+
+
+def normalize_outlier_counts(outlier_counts, country_counts):
+    '''Normalize a dictionary of outlier counts per country by
+    the total number of recordings per country
+    '''
+    for key in outlier_counts.keys():
+        # dictionaries should have the same keys
+        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return outlier_counts
+
+
+def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
+    global_counts = Counter(Y[y_pred])
+    df = country_outlier_df(global_counts, Y, normalize=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df, threshold, MD
+
+
+def print_most_least_outliers_topN(df, N=10):
+    sort_inds = df['Outliers'].argsort()  # ascending order
+    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    print "most outliers "
+    print df_most
+    print "least outliers "
+    print df_least
+
+
+def load_metadata(Yaudio, metadata_file):
+    df = pd.read_csv(metadata_file)
+    df_audio = pd.DataFrame({'Audio': Yaudio})
+    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r'])  # in the order of Yaudio
+    return ddf
+
+
+def clusters_metadata(df, cl_pred, out_file=None):
+    def get_top_N_counts(labels, N=3):
+        ulab, ucount = np.unique(labels, return_counts=True)
+        inds = np.argsort(ucount)
+        return zip(ulab[inds[-N:]], ucount[inds[-N:]])
+    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
+    styles_description = []
+    uniq_cl = np.unique(cl_pred)
+    for ccl in uniq_cl:
+        inds = np.where(cl_pred == ccl)[0]
+        styles_description.append(get_top_N_counts(info[inds], N=3))
+    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
+    print df_styles.to_latex()
+    if out_file is not None:
+        df_styles.to_csv(out_file, index=False)
+
+
+if __name__ == '__main__':
+    # load LDA-transformed frames
+    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle', 'rb'))
+    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    Xrhy, Xmel, Xmfc, Xchr = X_list
+    X = np.concatenate(X_list, axis=1)
+
+    # global outliers
+    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_global, N=10)
+
+    # spatial outliers with respect to neighboring countries
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0], ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    print_most_least_outliers_topN(df_local, N=10)
+
+    # outliers per feature type
+    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+    for i in range(len(feat)):
+        XX = feat[i]
+        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+        print_most_least_outliers_topN(df_feat, N=5)
+
+    # how many styles are there
+    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    bestncl = 13
+
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    ddf['Clusters'] = cl_pred
+    clusters_metadata(ddf, cl_pred)
+
+    # how similar are the cultures and which ones seem to be global outliers
+    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+
+    # Moran on Mahalanobis distances
+    data = cluster_freq.get_values()
+    data_countries = cluster_freq.index
+    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    y = np.sqrt(MD)
+    utils_spatial.print_Moran_outliers(y, w, data_countries)
+    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
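`utils.get_outliers_Mahal` is not included in this changeset. Judging from the `chi2thr` argument and the `(threshold, y_pred, MD)` values used in `get_outliers_df`, it presumably applies the standard chi-square cutoff to squared Mahalanobis distances; a sketch under that assumption (the function name and internals here are guesses, only the return shape is taken from the calls above):

```python
import numpy as np
from scipy import stats

def get_outliers_mahal_sketch(X, chi2thr=0.999):
    # Squared Mahalanobis distance of each row from the sample mean.
    diff = X - X.mean(axis=0)
    VI = np.linalg.pinv(np.cov(X, rowvar=False))  # pseudo-inverse for numerical safety
    MD = np.einsum('ij,jk,ik->i', diff, VI, diff)
    # Under multivariate normality, MD follows a chi-square with n_features dofs.
    threshold = stats.chi2.ppf(chi2thr, df=X.shape[1])
    y_pred = MD > threshold  # boolean outlier mask, as used in Y[y_pred] above
    return threshold, y_pred, MD
```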
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_load_dataset.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.load_dataset as load_dataset
+
+
+def test_get_train_val_test_idx():
+    X = np.arange(10)
+    Y = np.concatenate([np.ones(5), np.zeros(5)])
+    train, val, test = load_dataset.get_train_val_test_idx(X, Y, seed=1)
+    assert len(train[0]) == 6 and len(val[0]) == 2 and len(test[0]) == 2
+
+
+def test_get_train_val_test_idx_stratify():
+    X = np.arange(10)
+    Y = np.concatenate([np.ones(5), np.zeros(5)])
+    train, val, test = load_dataset.get_train_val_test_idx(X, Y, seed=1)
+    assert np.array_equal(np.unique(train[1]), np.unique(val[1]))
+
+
+def test_subset_labels():
+    Y = np.concatenate([np.ones(5), 2*np.ones(10), 3*np.ones(100)])
+    subset_idx = load_dataset.subset_labels(Y, seed=1)
+    subset_idx = np.sort(subset_idx)
+    subset_idx_true = np.arange(5, 115)
+    assert np.array_equal(subset_idx, subset_idx_true)
+
+    
\ No newline at end of file
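`load_dataset.get_train_val_test_idx` itself is outside this diff; the assertions imply a stratified 60/20/20 index split returning `(indices, labels)` pairs. A hypothetical equivalent consistent with those sizes (an illustration, not the repository's implementation):

```python
import numpy as np
from sklearn.model_selection import train_test_split

def get_train_val_test_idx_sketch(X, Y, seed=1):
    idx = np.arange(len(X))
    # 60% train, then split the remaining 40% evenly into val/test,
    # stratifying on the class labels Y each time.
    idx_train, idx_rest = train_test_split(idx, train_size=0.6,
                                           stratify=Y, random_state=seed)
    idx_val, idx_test = train_test_split(idx_rest, train_size=0.5,
                                         stratify=Y[idx_rest], random_state=seed)
    return (idx_train, Y[idx_train]), (idx_val, Y[idx_val]), (idx_test, Y[idx_test])
```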
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_map_and_average.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.map_and_average as map_and_average
+
+
+def test_remove_inds():
+    labels = np.array(['a', 'a', 'b', 'unknown'])
+    features = np.array([[0, 1], [0, 2], [0, 3], [0, 4]])
+    audiolabels = np.array(['a', 'b', 'c', 'd'])
+    features, labels, audiolabels = map_and_average.remove_inds(features, labels, audiolabels)
+    assert len(features) == 3 and len(labels) == 3 and len(audiolabels) == 3
+
+
+def test_remove_inds_values():
+    labels = np.array(['a', 'a', 'b', 'unknown'])
+    features = np.array([[0, 1], [0, 2], [0, 3], [0, 4]])
+    audiolabels = np.array(['a', 'b', 'c', 'd'])
+    features, labels, audiolabels = map_and_average.remove_inds(features, labels, audiolabels)
+    features_true = np.array([[0, 1], [0, 2], [0, 3]])
+    assert np.array_equal(features, features_true)
+
+
+def test_averageframes():
+    classlabels = np.array(['a', 'a', 'b', 'b', 'b'])
+    features = np.array([[0, 1], [0, 2], [0, 1], [1, 1], [2, 1]])
+    audiolabels = np.array(['a', 'a', 'b', 'b', 'b'])
+    feat, audio, labels = map_and_average.averageframes(features, audiolabels, classlabels)
+    feat_true = np.array([[0, 1.5], [1, 1]])  # mean of the frames per audio label
+    assert np.array_equal(feat, feat_true)
\ No newline at end of file
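For reference, the behavior `test_averageframes` encodes, one averaged feature vector per unique audio label, can be reproduced in a few lines of numpy (a sketch, not the repository's `averageframes`, which also returns the matching class labels):

```python
import numpy as np

def average_frames_sketch(features, audiolabels):
    uniq = np.unique(audiolabels)
    # Mean feature vector over all frames that share an audio label.
    feat = np.array([features[audiolabels == u].mean(axis=0) for u in uniq])
    return feat, uniq

features = np.array([[0, 1], [0, 2], [0, 1], [1, 1], [2, 1]], dtype=float)
audiolabels = np.array(['a', 'a', 'b', 'b', 'b'])
feat, audio = average_frames_sketch(features, audiolabels)
# feat -> [[0.0, 1.5], [1.0, 1.0]]
```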
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_outliers.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+import pandas as pd
+
+import scripts.outliers as outliers
+
+
+def test_country_outlier_df():
+    counts = {'a': 2, 'b': 3}
+    labels = np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b'])
+    df = outliers.country_outlier_df(counts, labels, normalize=True)
+    assert np.array_equal(df['Outliers'].get_values(), np.array([0.5, 1.0]))
+
+
+def test_normalize_outlier_counts():
+    outlier_counts = {'a': 2, 'b': 3}
+    country_counts = {'a': 4, 'b': 3}
+    outlier_counts = outliers.normalize_outlier_counts(outlier_counts, country_counts)
+    outlier_counts_true = {'a': .5, 'b': 1.}
+    assert outlier_counts == outlier_counts_true
+
+
+def test_get_outliers_df():
+    assert True
\ No newline at end of file
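The new tests import from the `scripts` package, so they are presumably meant to be run from the repository root. A small example of invoking them programmatically, equivalent to `python -m pytest tests/` (standard pytest usage, not something specified in this changeset):

```python
# Run the new test modules; assumes the working directory is the repo root
# so that `import scripts.outliers` inside the tests resolves.
import pytest
raise SystemExit(pytest.main(['tests/']))
```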