Maria@18
|
1 # -*- coding: utf-8 -*-
|
Maria@18
|
2 """
|
Maria@18
|
3 Created on Tue Jul 12 20:49:48 2016
|
Maria@18
|
4
|
Maria@18
|
5 @author: mariapanteli
|
Maria@18
|
6 """
|
Maria@18
|
7
|
Maria@18
|
8 import numpy as np
|
Maria@18
|
9 import pandas as pd
|
Maria@18
|
10 import pickle
|
Maria@18
|
11 from collections import Counter
|
Maria@18
|
12 from sklearn.cluster import KMeans
|
Maria@18
|
13
|
Maria@18
|
14 import utils
|
Maria@18
|
15 import utils_spatial
|
Maria@18
|
16
|
Maria@18
|
17
|
m@75
|
def country_outlier_df(counts, labels, normalize=False, out_file=None):
    """Build a per-country outlier summary DataFrame.

    Parameters
    ----------
    counts : dict-like
        Mapping country -> number of outlier recordings. NOTE: mutated in
        place — countries present in `labels` but missing from `counts`
        are inserted with a count of 0.
    labels : array-like
        Country label for every recording in the dataset.
    normalize : bool
        If True, report outlier counts as a fraction of each country's
        total number of recordings.
    out_file : str or None
        Optional path; when given, the resulting frame is written as CSV.

    Returns
    -------
    pandas.DataFrame
        Columns: Country, Outliers (raw or normalized), N_Country
        (recordings per country), N_Outliers (raw outlier count).
    """
    # make sure every country appearing in `labels` has an entry in `counts`
    if len(counts.keys()) < len(np.unique(labels)):
        for label in np.unique(labels):
            # `in` replaces dict.has_key (deprecated in py2.7, removed in py3)
            if label not in counts:
                counts.update({label: 0})
    if normalize:
        norm_counts = normalize_outlier_counts(counts, Counter(labels))
        df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
    else:
        df = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)
    # append number of recordings and number of outliers per country
    df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
    df_n_country.rename(columns={'index': 'Country', 0: 'N_Country'}, inplace=True)
    df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df_n_outliers.rename(columns={'index': 'Country', 0: 'N_Outliers'}, inplace=True)
    df = pd.merge(df, df_n_country, on='Country', how='left')
    df = pd.merge(df, df_n_outliers, on='Country', how='left')
    if out_file is not None:
        df.to_csv(out_file, index=False)
    return df
|
Maria@18
|
39
|
Maria@18
|
40
|
Maria@18
|
def normalize_outlier_counts(outlier_counts, country_counts):
    '''Normalize a dictionary of outlier counts per country by
    the total number of recordings per country
    '''
    # both dicts are expected to share the same key set
    return {country: float(outlier_counts[country]) / float(country_counts[country])
            for country in outlier_counts}
|
Maria@18
|
50
|
Maria@18
|
51
|
Maria@18
|
def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    """Detect global outliers via Mahalanobis distance and summarise them
    per country; returns the summary frame, the distance threshold and
    the Mahalanobis distances themselves.
    """
    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    # count how many flagged recordings fall in each country
    outlier_countries = Counter(Y[y_pred])
    df = country_outlier_df(outlier_countries, Y, normalize=True, out_file=out_file)
    return df, threshold, MD
|
Maria@18
|
57
|
Maria@18
|
58
|
Maria@18
|
def print_most_least_outliers_topN(df, N=10):
    """Print the N countries with the most and the fewest outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Outliers' column; rows are countries.
    N : int
        Number of rows to show from each end of the ranking.
    """
    sort_inds = df['Outliers'].argsort()  # ascending order
    df_most = df.iloc[sort_inds[::-1][:N]]
    df_least = df.iloc[sort_inds[:N]]
    # parenthesized single-argument prints behave identically in py2 and py3
    print("most outliers ")
    print(df_most)
    print("least outliers ")
    print(df_least)
|
Maria@18
|
69
|
Maria@18
|
70
|
Maria@18
|
def load_metadata(Yaudio, metadata_file):
    """Read recording metadata from CSV and align its rows to the order
    of the audio identifiers in `Yaudio`.
    """
    metadata = pd.read_csv(metadata_file)
    audio_order = pd.DataFrame({'Audio': Yaudio})
    # left frame fixes the row order; '_r' suffix disambiguates column clashes
    aligned = pd.merge(audio_order, metadata, on='Audio', suffixes=['', '_r'])
    return aligned
|
Maria@18
|
76
|
Maria@18
|
77
|
m@20
|
def print_clusters_metadata(df, cl_pred, out_file=None):
    """Print (and optionally save) the top-3 countries of each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per recording; must contain a 'Country' column.
    cl_pred : array-like
        Cluster id per recording, aligned with the rows of df.
    out_file : str or None
        Optional CSV path for the per-cluster summary table.
    """
    def get_top_N_counts(labels, N=3):
        # top-N (label, count) pairs, least frequent of the N first;
        # list(...) keeps this working on py3 where zip is lazy
        ulab, ucount = np.unique(labels, return_counts=True)
        inds = np.argsort(ucount)
        return list(zip(ulab[inds[-N:]], ucount[inds[-N:]]))
    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
    styles_description = []
    uniq_cl = np.unique(cl_pred)
    for ccl in uniq_cl:
        inds = np.where(cl_pred == ccl)[0]
        styles_description.append(get_top_N_counts(info[inds], N=3))
    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
    print(df_styles.to_latex())
    if out_file is not None:
        df_styles.to_csv(out_file, index=False)
|
Maria@18
|
93
|
Maria@18
|
94
|
m@65
|
def load_data(pickle_file, metadata_file):
    """Load LDA-transformed features, aligned metadata and spatial neighbors.

    Parameters
    ----------
    pickle_file : str
        Pickle containing (X_list, Y, Yaudio).
    metadata_file : str
        CSV of recording metadata (see load_metadata).

    Returns
    -------
    [X_list, Y, Yaudio], metadata frame with regions appended,
    and a country -> neighbors dict.
    """
    # context manager closes the file even if unpickling fails
    with open(pickle_file, 'rb') as f:
        X_list, Y, Yaudio = pickle.load(f)
    ddf = load_metadata(Yaudio, metadata_file=metadata_file)
    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
    ddf = utils_spatial.append_regions(ddf)
    return [X_list, Y, Yaudio], ddf, w_dict
|
m@65
|
102
|
m@65
|
103
|
m@75
|
def get_local_outliers_df(X, Y, w_dict, out_file=None):
    """Summarise, per country, recordings that are outliers with respect
    to their spatial neighborhood.
    """
    neighbor_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
    # each entry is a (country, count) pair
    counts = Counter({entry[0]: entry[1] for entry in neighbor_outliers})
    df_local = country_outlier_df(counts, Y, normalize=True, out_file=out_file)
    return df_local
|
m@65
|
109
|
m@65
|
110
|
m@77
|
def get_country_clusters(X, bestncl=None, min_ncl=5, max_ncl=50):
    """Cluster recordings with K-means; when no cluster count is given,
    pick one by silhouette score over [min_ncl, max_ncl].
    Returns (centroids, cluster assignments).
    """
    if bestncl is None:
        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=min_ncl, max_ncl=max_ncl, metric="cosine")
    # get cluster predictions and metadata for each cluster;
    # fixed random_state keeps the assignment reproducible
    model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
    assignments = model.predict(X)
    return model.cluster_centers_, assignments
|
m@65
|
119
|
m@65
|
120
|
Maria@18
|
121 if __name__ == '__main__':
|
Maria@18
|
122 # load LDA-transformed frames
|
m@65
|
123 dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv')
|
m@65
|
124 X_list, Y, Yaudio = dataset
|
Maria@18
|
125 X = np.concatenate(X_list, axis=1)
|
Maria@18
|
126
|
Maria@18
|
127 # global outliers
|
Maria@18
|
128 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
|
Maria@18
|
129 print_most_least_outliers_topN(df_global, N=10)
|
Maria@18
|
130
|
m@65
|
131 # local outliers
|
m@65
|
132 df_local = get_local_outliers_df(X, Y, w_dict)
|
Maria@18
|
133 print_most_least_outliers_topN(df_local, N=10)
|
Maria@18
|
134
|
m@65
|
135 # outliers for features
|
m@65
|
136 feat = X_list
|
Maria@18
|
137 feat_labels = ['rhy', 'mel', 'mfc', 'chr']
|
Maria@18
|
138 tabs_feat = []
|
Maria@18
|
139 for i in range(len(feat)):
|
Maria@18
|
140 XX = feat[i]
|
Maria@18
|
141 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
|
Maria@18
|
142 print_most_least_outliers_topN(df_feat, N=5)
|
Maria@18
|
143
|
m@65
|
144 ## how many styles are there
|
m@65
|
145 ##bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
|
m@99
|
146 centroids, cl_pred = get_country_clusters(X, bestncl=10)
|
Maria@18
|
147 ddf['Clusters'] = cl_pred
|
m@20
|
148 print_clusters_metadata(ddf, cl_pred)
|
Maria@18
|
149
|
Maria@18
|
150 # how similar are the cultures and which ones seem to be global outliers
|
Maria@18
|
151 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
|