comparison scripts/results.py @ 8:0f3eba42b425 branch-tests

added notebooks and utils
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 18:23:14 +0100
parents e50c63cf96be
children 98718fdd8326
comparison
equal deleted inserted replaced
7:46b2c713cc73 8:0f3eba42b425
80 print df_styles.to_latex() 80 print df_styles.to_latex()
81 if out_file is not None: 81 if out_file is not None:
82 df_styles.to_csv(out_file, index=False) 82 df_styles.to_csv(out_file, index=False)
83 83
84 84
85 # load LDA-transformed frames 85 if __name__ == '__main__':
86 X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) 86 # load LDA-transformed frames
87 ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') 87 X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
88 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) 88 ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
89 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) 89 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
90 Xrhy, Xmel, Xmfc, Xchr = X_list 90 w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
91 X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1) 91 Xrhy, Xmel, Xmfc, Xchr = X_list
92 X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
92 93
93 # global outliers 94 # global outliers
94 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) 95 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
95 print_most_least_outliers_topN(df_global, N=10) 96 print_most_least_outliers_topN(df_global, N=10)
96 97
97 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) 98 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
98 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) 99 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
99 df_local = country_outlier_df(spatial_counts, Y, normalize=True) 100 df_local = country_outlier_df(spatial_counts, Y, normalize=True)
100 print_most_least_outliers_topN(df_local, N=10) 101 print_most_least_outliers_topN(df_local, N=10)
101 102
102 feat = [Xrhy, Xmel, Xmfc, Xchr] 103 feat = [Xrhy, Xmel, Xmfc, Xchr]
103 feat_labels = ['rhy', 'mel', 'mfc', 'chr'] 104 feat_labels = ['rhy', 'mel', 'mfc', 'chr']
104 tabs_feat = [] 105 tabs_feat = []
105 for i in range(len(feat)): 106 for i in range(len(feat)):
106 XX = feat[i] 107 XX = feat[i]
107 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) 108 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
108 print_most_least_outliers_topN(df_feat, N=5) 109 print_most_least_outliers_topN(df_feat, N=5)
109 110
110 # how many styles are there 111 # how many styles are there
111 #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") 112 #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
112 bestncl = 13 113 bestncl = 13
113 114
114 # get cluster predictions and metadata for each cluster 115 # get cluster predictions and metadata for each cluster
115 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) 116 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
116 centroids = cluster_model.cluster_centers_ 117 centroids = cluster_model.cluster_centers_
117 cl_pred = cluster_model.predict(X) 118 cl_pred = cluster_model.predict(X)
118 ddf['Clusters'] = cl_pred 119 ddf['Clusters'] = cl_pred
119 clusters_metadata(ddf, cl_pred) 120 clusters_metadata(ddf, cl_pred)
120 121
121 # how similar are the cultures and which ones seem to be global outliers 122 # how similar are the cultures and which ones seem to be global outliers
122 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) 123 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
123 124
124 # Moran on Mahalanobis distances 125 # Moran on Mahalanobis distances
125 data = cluster_freq.get_values() 126 data = cluster_freq.get_values()
126 data_countries = cluster_freq.index 127 data_countries = cluster_freq.index
127 #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) 128 #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
128 threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) 129 threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
129 y = np.sqrt(MD) 130 y = np.sqrt(MD)
130 utils_spatial.print_Moran_outliers(y, w, data_countries) 131 utils_spatial.print_Moran_outliers(y, w, data_countries)
131 utils_spatial.plot_Moran_scatterplot(y, w, data_countries) 132 utils_spatial.plot_Moran_scatterplot(y, w, data_countries)