annotate scripts/outliers.py @ 68:a6606b255ad7 branch-tests

?
author Maria Panteli
date Thu, 21 Sep 2017 22:02:42 +0100
parents dbcd5b2a4efa
children 9b10b688c2ac 861fe1b57672
rev   line source
Maria@18 1 # -*- coding: utf-8 -*-
Maria@18 2 """
Maria@18 3 Created on Tue Jul 12 20:49:48 2016
Maria@18 4
Maria@18 5 @author: mariapanteli
Maria@18 6 """
Maria@18 7
Maria@18 8 import numpy as np
Maria@18 9 import pandas as pd
Maria@18 10 import pickle
Maria@18 11 from collections import Counter
Maria@18 12 from sklearn.cluster import KMeans
Maria@18 13
Maria@18 14 import utils
Maria@18 15 import utils_spatial
Maria@18 16
Maria@18 17
def country_outlier_df(counts, labels, normalize=False):
    """Build a per-country outlier summary DataFrame.

    Parameters
    ----------
    counts : dict or collections.Counter
        Number of outlier recordings per country label.
    labels : array-like
        Country label for every recording in the dataset.
    normalize : bool
        If True, the 'Outliers' column holds counts divided by the number
        of recordings per country; otherwise it holds the raw counts.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Country', 'Outliers', 'N_Country' (recordings per
        country), 'N_Outliers' (raw outlier count per country).
    """
    # Work on a copy so the caller's dict is not mutated.
    counts = dict(counts)
    # Make sure every country appears, even with zero outliers.
    for label in np.unique(labels):
        if label not in counts:  # dict.has_key() was Py2-only; 'in' works everywhere
            counts[label] = 0
    if normalize:
        norm_counts = normalize_outlier_counts(counts, Counter(labels))
        df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
    else:
        df = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)
    # append number of recordings and number of outliers per country
    df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
    df_n_country.rename(columns={'index': 'Country', 0: 'N_Country'}, inplace=True)
    df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df_n_outliers.rename(columns={'index': 'Country', 0: 'N_Outliers'}, inplace=True)
    df = pd.merge(df, df_n_country, on='Country', how='left')
    df = pd.merge(df, df_n_outliers, on='Country', how='left')
    return df
Maria@18 37
Maria@18 38
def normalize_outlier_counts(outlier_counts, country_counts):
    '''Divide each country's outlier count by its total recording count.

    Both dictionaries are expected to share the same set of keys; the
    result maps each country to the fraction of its recordings that are
    outliers.
    '''
    return {country: float(outlier_counts[country]) / float(country_counts[country])
            for country in outlier_counts}
Maria@18 48
Maria@18 49
def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    '''Detect global outliers via Mahalanobis distance and summarise them
    per country.

    Returns the normalised per-country outlier DataFrame together with
    the distance threshold and the Mahalanobis distances. If out_file is
    given, the DataFrame is also written there as CSV.
    '''
    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    # country label of every flagged recording -> outlier count per country
    df = country_outlier_df(Counter(Y[y_pred]), Y, normalize=True)
    if out_file is not None:
        df.to_csv(out_file, index=False)
    return df, threshold, MD
Maria@18 57
Maria@18 58
def print_most_least_outliers_topN(df, N=10):
    """Print the N countries with the most and the fewest outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Outliers' column (as built by country_outlier_df).
    N : int
        Number of countries to show at each extreme.
    """
    sort_inds = df['Outliers'].argsort()  # ascending positional order
    df_most = df.iloc[sort_inds[::-1][:N]]
    df_least = df.iloc[sort_inds[:N]]
    # print() with a single argument behaves identically under Python 2
    # and Python 3; the old print statements were Py2-only syntax.
    print("most outliers ")
    print(df_most)
    print("least outliers ")
    print(df_least)
Maria@18 69
Maria@18 70
def load_metadata(Yaudio, metadata_file):
    '''Read the metadata CSV and align its rows with the audio list.

    Merging on the 'Audio' column keeps the rows in the order of Yaudio;
    any clashing column names from the CSV receive an '_r' suffix.
    '''
    metadata = pd.read_csv(metadata_file)
    audio_order = pd.DataFrame({'Audio': Yaudio})
    return pd.merge(audio_order, metadata, on='Audio', suffixes=['', '_r'])
Maria@18 76
Maria@18 77
def print_clusters_metadata(df, cl_pred, out_file=None):
    """Summarise the country make-up of each cluster.

    For every cluster id in cl_pred, print (as a LaTeX table) the three
    most frequent countries with their counts; optionally also write the
    table to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata with a 'Country' column, one row per recording.
    cl_pred : array-like
        Cluster id for each recording (same length/order as df).
    out_file : str or None
        If given, the summary table is also written to this CSV path.
    """
    def get_top_N_counts(labels, N=3):
        # Top-N (label, count) pairs, least frequent of the N first.
        ulab, ucount = np.unique(labels, return_counts=True)
        inds = np.argsort(ucount)
        # list(...) so the rows are concrete tuples under Python 3,
        # where zip() is a lazy iterator (Py2 zip already returned a list).
        return list(zip(ulab[inds[-N:]], ucount[inds[-N:]]))
    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
    styles_description = []
    uniq_cl = np.unique(cl_pred)
    for ccl in uniq_cl:
        inds = np.where(cl_pred == ccl)[0]
        styles_description.append(get_top_N_counts(info[inds], N=3))
    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
    print(df_styles.to_latex())
    if out_file is not None:
        df_styles.to_csv(out_file, index=False)
Maria@18 93
Maria@18 94
if __name__ == '__main__':
    # load LDA-transformed frames (context manager closes the file handle)
    with open('data/lda_data_8.pickle', 'rb') as fh:
        X_list, Y, Yaudio = pickle.load(fh)
    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
    X = np.concatenate(X_list, axis=1)

    # global outliers
    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
    print_most_least_outliers_topN(df_global, N=10)

    # spatial (local) outliers: each country vs. its geographic neighbours
    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
    spatial_counts = Counter(dict([(ll[0], ll[1]) for ll in spatial_outliers]))
    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
    print_most_least_outliers_topN(df_local, N=10)

    # outliers per individual feature set
    # NOTE(review): the old code referenced undefined names Xrhy/Xmel/Xmfc/Xchr
    # here (NameError); X_list presumably holds exactly those per-feature
    # LDA frames in this order -- confirm against the pickle's producer.
    feat = X_list
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    for XX in feat:
        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
        print_most_least_outliers_topN(df_feat, N=5)

    # how many styles are there
    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
    bestncl = 13  # pre-computed by the silhouette search above

    # get cluster predictions and metadata for each cluster
    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
    centroids = cluster_model.cluster_centers_
    cl_pred = cluster_model.predict(X)
    ddf['Clusters'] = cl_pred
    print_clusters_metadata(ddf, cl_pred)

    # how similar are the cultures and which ones seem to be global outliers
    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)

    # Moran spatial autocorrelation on Mahalanobis distances
    data = cluster_freq.get_values()
    data_countries = cluster_freq.index
    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
    y = np.sqrt(MD)
    utils_spatial.print_Moran_outliers(y, w, data_countries)
    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)