changeset 18:ed109218dd4b branch-tests

rename result scripts and add more tests
author Maria Panteli
date Tue, 12 Sep 2017 23:18:19 +0100
parents 2e487b9c0a7b
children 0bba6f63f4fd
files notebooks/correlation_samples_outliers.ipynb notebooks/sensitivity_experiment.ipynb notebooks/test_hubness.ipynb scripts/classification.py scripts/outliers.py tests/test_load_dataset.py tests/test_map_and_average.py tests/test_outliers.py
diffstat 8 files changed, 335 insertions(+), 12 deletions(-)
--- a/notebooks/correlation_samples_outliers.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -20,7 +20,7 @@
     "%autoreload 2\n",
     "\n",
     "sys.path.append('../')\n",
-    "import scripts.results as results\n",
+    "import scripts.outliers as outliers\n",
     "import scripts.utils_spatial as utils_spatial"
    ]
   },
@@ -87,7 +87,7 @@
    ],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
     "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
     "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
     "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
@@ -196,7 +196,7 @@
    ],
    "source": [
     "# global outliers\n",
-    "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)\n",
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
     "df_global['N'] = np.zeros(len(df_global))\n",
     "df_global['OutliersN'] = np.zeros(len(df_global))\n",
     "for i, country in enumerate(df_global['Country']):\n",
--- a/notebooks/sensitivity_experiment.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/sensitivity_experiment.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -27,8 +27,8 @@
     "sys.path.append('../')\n",
     "import scripts.load_dataset as load_dataset\n",
     "import scripts.map_and_average as mapper\n",
-    "import scripts.results_classification as results_class\n",
-    "import scripts.results as results"
+    "import scripts.classification\n",
+    "import scripts.outliers as outliers"
    ]
   },
   {
@@ -74,14 +74,14 @@
     "    \n",
     "    # classification and confusion\n",
     "    print \"classifying...\"\n",
-    "    traininds, testinds = results_class.get_train_test_indices()\n",
-    "    X_train, Y_train, X_test, Y_test = results_class.get_train_test_sets(X, Y, traininds, testinds)\n",
-    "    accuracy, _ = results_class.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
+    "    traininds, testinds = classification.get_train_test_indices()\n",
+    "    X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
+    "    accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
     "    print accuracy\n",
     "    \n",
     "    # outliers\n",
     "    print \"detecting outliers...\"\n",
-    "    ddf = results.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
+    "    ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
     "    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)\n",
     "    print_most_least_outliers_topN(df_global, N=10)\n",
     "    \n",
--- a/notebooks/test_hubness.ipynb	Tue Sep 12 19:11:43 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Tue Sep 12 23:18:19 2017 +0100
@@ -19,7 +19,7 @@
     "%autoreload 2\n",
     "\n",
     "sys.path.append('../')\n",
-    "import scripts.results as results\n",
+    "import scripts.outliers as outliers\n",
     "import scripts.utils_spatial as utils_spatial"
    ]
   },
@@ -68,14 +68,14 @@
    ],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
     "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
     "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
     "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
     "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
     "\n",
     "# global outliers\n",
-    "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
    ]
   },
   {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/classification.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 10 15:10:32 2016
+
+@author: mariapanteli
+"""
+import numpy as np
+import pandas as pd
+import pickle
+from sklearn import metrics
+import matplotlib.pyplot as plt
+
+import map_and_average
+import util_feature_learning
+
+
+FILENAMES = map_and_average.OUTPUT_FILES
+
+
+def load_data_from_pickle(filename):
+    X_list, Y, Yaudio = pickle.load(open(filename, 'rb'))
+    X = np.concatenate(X_list, axis=1)
+    return X, Y, Yaudio
+
+
+def get_train_test_indices():
+    trainset, valset, testset = map_and_average.load_train_val_test_sets()
+    trainaudiolabels, testaudiolabels = trainset[2], testset[2]
+    # audio labels of the full dataset, in the same order as X and Y
+    # (assumed here to be the third output of the mapped pickle files)
+    _, _, audiolabs = load_data_from_pickle(FILENAMES[0])
+    # train, test indices: keep frames whose recording is in each set
+    aa_train = np.unique(trainaudiolabels)
+    aa_test = np.unique(testaudiolabels)
+    traininds = np.array([i for i, item in enumerate(audiolabs) if item in aa_train])
+    testinds = np.array([i for i, item in enumerate(audiolabs) if item in aa_test])
+    return traininds, testinds
+
+
+def get_train_test_sets(X, Y, traininds, testinds):
+    X_train = X[traininds, :]
+    Y_train = Y[traininds]
+    X_test = X[testinds, :]
+    Y_test = Y[testinds]
+    return X_train, Y_train, X_test, Y_test
+
+
+def classify_for_filenames(file_list=FILENAMES):
+    df_results = pd.DataFrame()
+    feat_learner = util_feature_learning.Transformer()
+    for filename in file_list:
+        X, Y, Yaudio = load_data_from_pickle(filename)
+        traininds, testinds = get_train_test_indices()
+        X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
+        df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test)
+        df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True)
+    return df_results    
+
+
+def plot_CF(CF, labels=None, figurename=None):
+    labels[labels=='United States of America'] = 'United States Amer.'
+    plt.imshow(CF, cmap="Greys")
+    plt.xticks(range(len(labels)), labels, rotation='vertical', fontsize=4)
+    plt.yticks(range(len(labels)), labels, fontsize=4)
+    if figurename is not None:
+        plt.savefig(figurename, bbox_inches='tight')
+
+
+def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
+    feat_learner = util_feature_learning.Transformer()
+    # classification_accuracy and modelLDA are assumed to live on the
+    # Transformer instance (feat_learner was otherwise unused)
+    accuracy, predictions = feat_learner.classification_accuracy(X_train, Y_train,
+                        X_test, Y_test, model=feat_learner.modelLDA)
+    labels = np.unique(Y_test)  # TODO: countries in geographical proximity
+    CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
+    if saveCF:
+        np.savetxt('data/CFlabels.csv', labels, fmt='%s')
+        np.savetxt('data/CF.csv', CF, fmt='%10.5f')
+    if plots:
+        plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf')
+    return accuracy, predictions
+
+
+if __name__ == '__main__':
+    df_results = classify_for_filenames(file_list=FILENAMES)
+    max_i = np.argmax(df_results.iloc[:, 1].values)
+    feat_learning_i = max_i // 4  # rows come in groups of 4 classifiers per feature learning method
+    filename = FILENAMES[feat_learning_i]
+    X, Y, Yaudio = load_data_from_pickle(filename)
+    traininds, testinds = get_train_test_indices()
+    X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
+    confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)
+
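
Note: scripts/classification.py splits train/test at the recording level, so no recording contributes frames to both sides of the split. A minimal self-contained sketch of that indexing idea (toy names, not part of the changeset):

    import numpy as np

    # two frames per recording; audiolabs aligns frames to recordings
    audiolabs = np.array(['rec1', 'rec1', 'rec2', 'rec2', 'rec3', 'rec3'])
    aa_train = np.array(['rec1', 'rec2'])   # recordings in the train set
    aa_test = np.array(['rec3'])            # recordings in the test set

    traininds = np.array([i for i, a in enumerate(audiolabs) if a in aa_train])
    testinds = np.array([i for i, a in enumerate(audiolabs) if a in aa_test])

    X = np.arange(12).reshape(6, 2)         # frame-level features
    X_train, X_test = X[traininds, :], X[testinds, :]
    # no recording appears on both sides of the split
    assert set(audiolabs[traininds]).isdisjoint(audiolabs[testinds])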
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/outliers.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 12 20:49:48 2016
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+from collections import Counter
+from sklearn.cluster import KMeans
+
+import utils
+import utils_spatial
+
+
+def country_outlier_df(counts, labels, out_file=None, normalize=False):
+    if len(counts.keys()) < len(np.unique(labels)):
+        for label in np.unique(labels):
+            if label not in counts:
+                counts.update({label:0})
+    if normalize:
+        counts = normalize_outlier_counts(counts, Counter(labels))
+    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    if out_file is not None:   
+        df.to_csv(out_file, index=False)
+    return df
+
+
+def normalize_outlier_counts(outlier_counts, country_counts):
+    '''Normalize a dictionary of outlier counts per country by 
+        the total number of recordings per country
+    '''
+    for key in outlier_counts.keys():
+        # dictionaries should have the same keys
+        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return outlier_counts
+
+
+def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
+    global_counts = Counter(Y[y_pred])
+    df = country_outlier_df(global_counts, Y, normalize=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df, threshold, MD
+
+
+def print_most_least_outliers_topN(df, N=10):
+    sort_inds = df['Outliers'].argsort()  # ascending order
+    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    print "most outliers " 
+    print df_most
+    print "least outliers " 
+    print df_least
+    
+
+def load_metadata(Yaudio, metadata_file):
+    df = pd.read_csv(metadata_file)
+    df_audio = pd.DataFrame({'Audio':Yaudio})
+    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio
+    return ddf
+
+
+def clusters_metadata(df, cl_pred, out_file=None):
+    def get_top_N_counts(labels, N=3):
+        ulab, ucount = np.unique(labels, return_counts=True)
+        inds = np.argsort(ucount)
+        return zip(ulab[inds[-N:]],ucount[inds[-N:]])
+    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
+    styles_description = []
+    uniq_cl = np.unique(cl_pred)
+    for ccl in uniq_cl:
+        inds = np.where(cl_pred==ccl)[0]
+        styles_description.append(get_top_N_counts(info[inds], N=3))
+    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
+    print df_styles.to_latex()
+    if out_file is not None:
+        df_styles.to_csv(out_file, index=False)
+
+
+if __name__ == '__main__':
+    # load LDA-transformed frames
+    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
+    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    Xrhy, Xmel, Xmfc, Xchr = X_list
+    X = np.concatenate(X_list, axis=1)
+
+    # global outliers
+    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_global, N=10)
+
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    print_most_least_outliers_topN(df_local, N=10)
+
+    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+    for i in range(len(feat)):
+        XX = feat[i]
+        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+        print "outliers by feature:", feat_labels[i]
+        print_most_least_outliers_topN(df_feat, N=5)
+
+    # how many styles are there
+    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    bestncl = 13
+
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    ddf['Clusters'] = cl_pred
+    clusters_metadata(ddf, cl_pred)
+
+    # how similar are the cultures and which ones seem to be global outliers
+    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+
+    # Moran on Mahalanobis distances
+    data = cluster_freq.get_values()
+    data_countries = cluster_freq.index
+    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    y = np.sqrt(MD)
+    utils_spatial.print_Moran_outliers(y, w, data_countries)
+    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
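
Note: get_outliers_df defers to utils.get_outliers_Mahal, which is not part of this changeset. A minimal sketch of the rule it presumably implements, assuming squared Mahalanobis distances thresholded at a chi-square quantile (consistent with the np.sqrt(MD) taken above):

    import numpy as np
    from scipy import stats

    def get_outliers_mahal_sketch(X, chi2thr=0.999):
        # squared Mahalanobis distance of each row to the sample mean
        diff = X - X.mean(axis=0)
        inv_cov = np.linalg.pinv(np.cov(X, rowvar=False))
        MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
        # under a Gaussian assumption MD ~ chi2(n_features), so the
        # chi2thr quantile gives the outlier threshold
        threshold = stats.chi2.ppf(chi2thr, df=X.shape[1])
        y_pred = MD > threshold  # boolean outlier mask
        return threshold, y_pred, MD

With such a mask, Counter(Y[y_pred]) counts outliers per country and country_outlier_df(..., normalize=True) turns those counts into per-country outlier rates, as in the __main__ block above.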
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_load_dataset.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.load_dataset as load_dataset
+
+
+def test_get_train_val_test_idx():
+    X = np.arange(10)
+    Y = np.concatenate([np.ones(5), np.zeros(5)])
+    train, val, test = load_dataset.get_train_val_test_idx(X, Y, seed=1)
+    assert len(train[0]) == 6 and len(val[0]) == 2 and len(test[0]) == 2
+
+
+def test_get_train_val_test_idx_stratify():
+    X = np.arange(10)
+    Y = np.concatenate([np.ones(5), np.zeros(5)])
+    train, val, test = load_dataset.get_train_val_test_idx(X, Y, seed=1)
+    assert np.array_equal(np.unique(train[1]), np.unique(val[1]))
+
+
+def test_subset_labels():
+    Y = np.concatenate([np.ones(5), 2*np.ones(10), 3*np.ones(100)])
+    subset_idx = load_dataset.subset_labels(Y, seed=1)
+    subset_idx = np.sort(subset_idx)
+    subset_idx_true = np.arange(5, 115)
+    assert np.array_equal(subset_idx, subset_idx_true)
+
+    
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_map_and_average.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.map_and_average as map_and_average
+
+
+def test_remove_inds():
+    labels = np.array(['a', 'a', 'b', 'unknown'])
+    features = np.array([[0, 1], [0,2], [0, 3], [0, 4]])
+    audiolabels = np.array(['a', 'b', 'c', 'd'])
+    features, labels, audiolabels = map_and_average.remove_inds(features, labels, audiolabels)
+    assert len(features) == 3 and len(labels) == 3 and len(audiolabels) == 3
+
+
+def test_remove_inds_values():
+    labels = np.array(['a', 'a', 'b', 'unknown'])
+    features = np.array([[0, 1], [0,2], [0, 3], [0, 4]])
+    audiolabels = np.array(['a', 'b', 'c', 'd'])
+    features, labels, audiolabels = map_and_average.remove_inds(features, labels, audiolabels)
+    features_true = np.array([[0, 1], [0,2], [0, 3]])
+    assert np.array_equal(features, features_true)
+
+
+def test_averageframes():
+    classlabels = np.array(['a', 'a', 'b', 'b', 'b'])
+    features = np.array([[0, 1], [0,2], [0, 1], [1, 1], [2, 1]])
+    audiolabels = np.array(['a', 'a', 'b', 'b', 'b'])
+    feat, audio, labels = map_and_average.averageframes(features, audiolabels, classlabels)
+    feat_true = np.array([[0, 1.5], [1, 1]])  # mean of frames per recording
+    assert np.array_equal(feat, feat_true)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_outliers.py	Tue Sep 12 23:18:19 2017 +0100
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+import pandas as pd
+
+import scripts.outliers as outliers
+
+
+def test_country_outlier_df():
+    counts = {'a':2, 'b':3}
+    labels = np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b'])
+    df = outliers.country_outlier_df(counts, labels, normalize=True)
+    assert np.array_equal(df['Outliers'].get_values(), np.array([0.5, 1.0]))
+
+
+def test_normalize_outlier_counts():
+    outlier_counts = {'a':2, 'b':3}
+    country_counts = {'a':4, 'b':3}
+    outlier_counts = outliers.normalize_outlier_counts(outlier_counts, country_counts)
+    outlier_counts_true = {'a':.5, 'b':1.}
+    assert outlier_counts == outlier_counts_true
+
+
+def test_get_outliers_df():
+    assert True
\ No newline at end of file
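
Note: test_get_outliers_df is still a placeholder. One way it could be filled in, sketched with synthetic data (it assumes only the return signature visible in scripts/outliers.py: a per-country dataframe plus a threshold and per-recording distances):

    import numpy as np
    import scripts.outliers as outliers

    def test_get_outliers_df_covers_all_countries():
        rng = np.random.RandomState(0)
        X = rng.randn(40, 3)                     # synthetic features
        Y = np.repeat(np.array(['a', 'b']), 20)  # two fake countries
        df, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
        assert set(df['Country']) == set(['a', 'b'])
        assert len(MD) == len(Y)

The test suite can be run from the repository root with python -m pytest tests, so that the scripts package resolves on the import path.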