plosone_underreview: comparison of scripts/results.py @ 4:e50c63cf96be (branch-tests)

changeset description: rearranging folders
author:   Maria Panteli
date:     Mon, 11 Sep 2017 11:51:50 +0100
parents:  (none)
children: 0f3eba42b425
comparing 3:230a0cf17de0 with 4:e50c63cf96be
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 12 20:49:48 2016

@author: mariapanteli
"""

import numpy as np
import pandas as pd
import pickle
from collections import Counter
from sklearn.cluster import KMeans

import utils
import utils_spatial

def country_outlier_df(counts, labels, out_file=None, normalize=False):
    """Turn a dict of outlier counts per country into a DataFrame,
    filling in a zero count for countries with no outliers."""
    if len(counts) < len(np.unique(labels)):
        for label in np.unique(labels):
            if label not in counts:
                counts.update({label: 0})
    if normalize is True:
        counts = normalize_outlier_counts(counts, Counter(labels))
    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
    df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)
    if out_file is not None:
        df.to_csv(out_file, index=False)
    return df

def normalize_outlier_counts(outlier_counts, country_counts):
    """Normalize a dictionary of outlier counts per country by
    the total number of recordings per country.
    """
    for key in outlier_counts.keys():
        # the two dictionaries should have the same keys
        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
    return outlier_counts
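
# A minimal usage sketch with made-up numbers (not part of the pipeline):
# 2 outliers among 10 recordings for a country gives a rate of 0.2, i.e.
#   normalize_outlier_counts({'France': 2}, {'France': 10}) == {'France': 0.2}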


def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    """Detect global outliers by Mahalanobis distance and count them per country."""
    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    global_counts = Counter(Y[y_pred])  # y_pred marks the outlier recordings
    df = country_outlier_df(global_counts, Y, normalize=True)
    if out_file is not None:
        df.to_csv(out_file, index=False)
    return df, threshold, MD
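
# utils.get_outliers_Mahal is defined elsewhere in this repository; below is a
# minimal sketch of the chi-squared Mahalanobis criterion it is assumed to
# implement (the function name is hypothetical, for illustration only):
def _mahal_outliers_sketch(X, chi2thr=0.999):
    from scipy.stats import chi2
    mean = X.mean(axis=0)
    cov_inv = np.linalg.pinv(np.cov(X, rowvar=False))
    diff = X - mean
    MD = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)  # squared Mahalanobis distances
    threshold = chi2.ppf(chi2thr, df=X.shape[1])  # cutoff under a Gaussian model
    return threshold, MD > threshold, MD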


def print_most_least_outliers_topN(df, N=10):
    """Print the N countries with the most and the fewest outliers."""
    sort_inds = df['Outliers'].values.argsort()  # ascending order
    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
    print("most outliers")
    print(df_most)
    print("least outliers")
    print(df_least)


def load_metadata(Yaudio, metadata_file):
    """Load the metadata CSV and align its rows with the order of Yaudio."""
    df = pd.read_csv(metadata_file)
    df_audio = pd.DataFrame({'Audio': Yaudio})
    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r'])  # in the order of Yaudio
    return ddf


def clusters_metadata(df, cl_pred, out_file=None):
    """Describe each cluster by its three most frequent countries."""
    def get_top_N_counts(labels, N=3):
        ulab, ucount = np.unique(labels, return_counts=True)
        inds = np.argsort(ucount)
        return list(zip(ulab[inds[-N:]], ucount[inds[-N:]]))
    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
    styles_description = []
    uniq_cl = np.unique(cl_pred)
    for ccl in uniq_cl:
        inds = np.where(cl_pred == ccl)[0]
        styles_description.append(get_top_N_counts(info[inds], N=3))
    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
    print(df_styles.to_latex())
    if out_file is not None:
        df_styles.to_csv(out_file, index=False)


# load LDA-transformed frames
X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle', 'rb'))
ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
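# Assumption for readers: w is a PySAL-style spatial weights object over the
# countries in the dataset, and w_dict its plain-dict form mapping each country
# to the list of its neighbouring countries.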
Xrhy, Xmel, Xmfc, Xchr = X_list  # rhythmic, melodic, timbral (MFCC), harmonic (chroma) features
X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)

# global outliers
df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
print_most_least_outliers_topN(df_global, N=10)
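
# Local outliers come next; utils.get_local_outliers_from_neighbors_dict is
# assumed to repeat the Mahalanobis test within each spatial neighbourhood,
# pooling each country's recordings with its neighbours' (PCA-reduced when
# do_pca=True) and flagging those beyond the chi-squared cutoff.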
spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
spatial_counts = Counter({ll[0]: ll[1] for ll in spatial_outliers})
df_local = country_outlier_df(spatial_counts, Y, normalize=True)
print_most_least_outliers_topN(df_local, N=10)

# outliers per feature type
feat = [Xrhy, Xmel, Xmfc, Xchr]
feat_labels = ['rhy', 'mel', 'mfc', 'chr']
for XX, label in zip(feat, feat_labels):
    print(label)
    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
    print_most_least_outliers_topN(df_feat, N=5)

# how many styles are there? (the silhouette search is commented out and its
# result, 13 clusters, is hardcoded)
#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
bestncl = 13
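
# utils.best_n_clusters_silhouette is defined elsewhere in the repo; below is a
# minimal sketch of the silhouette search it is assumed to perform (the name is
# hypothetical, for illustration only):
def _best_n_clusters_sketch(X, min_ncl=5, max_ncl=50, metric='cosine'):
    from sklearn.metrics import silhouette_score
    scores = {}
    for ncl in range(min_ncl, max_ncl + 1):
        labels = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        scores[ncl] = silhouette_score(X, labels, metric=metric)
    best = max(scores, key=scores.get)  # highest average silhouette wins
    return best, scores[best]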

# get cluster predictions and metadata for each cluster
cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
centroids = cluster_model.cluster_centers_
cl_pred = cluster_model.predict(X)
ddf['Clusters'] = cl_pred
clusters_metadata(ddf, cl_pred)

# how similar are the cultures, and which ones seem to be global outliers?
cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
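# cluster_freq is assumed to be a DataFrame indexed by country, holding each
# country's frequency distribution over the bestncl style clusters (consistent
# with the .values/.index usage below).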

# Moran's I on Mahalanobis distances
data = cluster_freq.values
data_countries = cluster_freq.index
#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
y = np.sqrt(MD)  # MD holds squared Mahalanobis distances
utils_spatial.print_Moran_outliers(y, w, data_countries)
utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
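
# The Moran statistics are assumed to come from PySAL's (pre-2.0) API; a sketch
# of the calls the two utils_spatial helpers are presumed to wrap:
#   import pysal
#   mi = pysal.Moran(y, w)        # global Moran's I of the distances
#   lm = pysal.Moran_Local(y, w)  # local Moran's I, flags spatial outliers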