diff scripts/results.py @ 4:e50c63cf96be branch-tests

rearranging folders
author Maria Panteli
date Mon, 11 Sep 2017 11:51:50 +0100
parents
children 0f3eba42b425
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/results.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 12 20:49:48 2016
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+from collections import Counter
+from sklearn.cluster import KMeans
+
+import utils
+import utils_spatial
+
+
+def country_outlier_df(counts, labels, out_file=None, normalize=False):
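+    '''Aggregate outlier counts per country into a DataFrame,
+        optionally normalized by the number of recordings per country.
+    '''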
+    if len(counts) < len(np.unique(labels)):
+        for label in np.unique(labels):
+            if label not in counts:
+                counts[label] = 0
+    if normalize:
+        counts = normalize_outlier_counts(counts, Counter(labels))
+    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df
+
+
+def normalize_outlier_counts(outlier_counts, country_counts):
+    '''Normalize a dictionary of outlier counts per country by 
+        the total number of recordings per country
+    '''
+    for key in outlier_counts:
+        # both dictionaries are keyed by country
+        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return outlier_counts
+
+
+def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
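+    '''Detect outliers by Mahalanobis distance at the given chi-square
+        threshold and return the normalized counts per country.
+    '''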
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
+    global_counts = Counter(Y[y_pred])
+    df = country_outlier_df(global_counts, Y, normalize=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df, threshold, MD
+
+
+def print_most_least_outliers_topN(df, N=10):
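+    '''Print the N countries with the most and the fewest outliers.'''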
+    sort_inds = np.argsort(df['Outliers'].values)  # ascending order
+    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    print "most outliers " 
+    print df_most
+    print "least outliers " 
+    print df_least
+    
+
+def load_metadata(Yaudio, metadata_file):
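+    '''Load metadata and align rows with the order of recordings in Yaudio.'''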
+    df = pd.read_csv(metadata_file)
+    df_audio = pd.DataFrame({'Audio':Yaudio})
+    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio
+    return ddf
+
+
+def clusters_metadata(df, cl_pred, out_file=None):
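+    '''Describe each cluster by its most frequent countries.'''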
+    def get_top_N_counts(labels, N=3):
+        ulab, ucount = np.unique(labels, return_counts=True)
+        inds = np.argsort(ucount)
+        return list(zip(ulab[inds[-N:]], ucount[inds[-N:]]))
+    info = df['Country'].astype(str).values
+    styles_description = []
+    uniq_cl = np.unique(cl_pred)
+    for ccl in uniq_cl:
+        inds = np.where(cl_pred==ccl)[0]
+        styles_description.append(get_top_N_counts(info[inds], N=3))
+    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
+    print(df_styles.to_latex())
+    if out_file is not None:
+        df_styles.to_csv(out_file, index=False)
+
+
+# load LDA-transformed frames
+with open('data/lda_data_8.pickle', 'rb') as f:
+    X_list, Y, Yaudio = pickle.load(f)
+ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+Xrhy, Xmel, Xmfc, Xchr = X_list
+X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+
+# global outliers
+df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+print_most_least_outliers_topN(df_global, N=10)
+
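+# local outliers: compare each country with its spatial neighbors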
+spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+spatial_counts = Counter(dict(spatial_outliers))
+df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+print_most_least_outliers_topN(df_local, N=10)
+
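+# repeat the outlier analysis for each feature set separately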
+feat = [Xrhy, Xmel, Xmfc, Xchr]
+feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+tabs_feat = []
+for XX, label in zip(feat, feat_labels):
+    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+    tabs_feat.append(df_feat)  # keep the per-feature tables
+    print(label)
+    print_most_least_outliers_topN(df_feat, N=5)
+
+# how many styles are there (optimal cluster number via silhouette score)
+#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+bestncl = 13
+
+# get cluster predictions and metadata for each cluster
+cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+centroids = cluster_model.cluster_centers_
+cl_pred = cluster_model.predict(X)
+ddf['Clusters'] = cl_pred
+clusters_metadata(ddf, cl_pred)
+
+# how similar are the cultures and which ones seem to be global outliers
+cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+
+# Moran's I spatial autocorrelation on the Mahalanobis distances
+data = cluster_freq.values
+data_countries = cluster_freq.index
+#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+y = np.sqrt(MD)
+utils_spatial.print_Moran_outliers(y, w, data_countries)
+utils_spatial.plot_Moran_scatterplot(y, w, data_countries)