Mercurial > hg > plosone_underreview
comparison scripts/results.py @ 8:0f3eba42b425 branch-tests
added notebooks and utils
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 11 Sep 2017 18:23:14 +0100 |
parents | e50c63cf96be |
children | 98718fdd8326 |
comparison
legend: equal | deleted | inserted | replaced
7:46b2c713cc73 | 8:0f3eba42b425 |
---|---|
80 print df_styles.to_latex() | 80 print df_styles.to_latex() |
81 if out_file is not None: | 81 if out_file is not None: |
82 df_styles.to_csv(out_file, index=False) | 82 df_styles.to_csv(out_file, index=False) |
83 | 83 |
84 | 84 |
85 # load LDA-transformed frames | 85 if __name__ == '__main__': |
86 X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) | 86 # load LDA-transformed frames |
87 ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') | 87 X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) |
88 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) | 88 ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') |
89 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) | 89 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) |
90 Xrhy, Xmel, Xmfc, Xchr = X_list | 90 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) |
91 X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1) | 91 Xrhy, Xmel, Xmfc, Xchr = X_list |
 | 92 X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1) |
92 | 93 |
93 # global outliers | 94 # global outliers |
94 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) | 95 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) |
95 print_most_least_outliers_topN(df_global, N=10) | 96 print_most_least_outliers_topN(df_global, N=10) |
96 | 97 |
97 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) | 98 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) |
98 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) | 99 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) |
99 df_local = country_outlier_df(spatial_counts, Y, normalize=True) | 100 df_local = country_outlier_df(spatial_counts, Y, normalize=True) |
100 print_most_least_outliers_topN(df_local, N=10) | 101 print_most_least_outliers_topN(df_local, N=10) |
101 | 102 |
102 feat = [Xrhy, Xmel, Xmfc, Xchr] | 103 feat = [Xrhy, Xmel, Xmfc, Xchr] |
103 feat_labels = ['rhy', 'mel', 'mfc', 'chr'] | 104 feat_labels = ['rhy', 'mel', 'mfc', 'chr'] |
104 tabs_feat = [] | 105 tabs_feat = [] |
105 for i in range(len(feat)): | 106 for i in range(len(feat)): |
106 XX = feat[i] | 107 XX = feat[i] |
107 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) | 108 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) |
108 print_most_least_outliers_topN(df_feat, N=5) | 109 print_most_least_outliers_topN(df_feat, N=5) |
109 | 110 |
110 # how many styles are there | 111 # how many styles are there |
111 #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") | 112 #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") |
112 bestncl = 13 | 113 bestncl = 13 |
113 | 114 |
114 # get cluster predictions and metadata for each cluster | 115 # get cluster predictions and metadata for each cluster |
115 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) | 116 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) |
116 centroids = cluster_model.cluster_centers_ | 117 centroids = cluster_model.cluster_centers_ |
117 cl_pred = cluster_model.predict(X) | 118 cl_pred = cluster_model.predict(X) |
118 ddf['Clusters'] = cl_pred | 119 ddf['Clusters'] = cl_pred |
119 clusters_metadata(ddf, cl_pred) | 120 clusters_metadata(ddf, cl_pred) |
120 | 121 |
121 # how similar are the cultures and which ones seem to be global outliers | 122 # how similar are the cultures and which ones seem to be global outliers |
122 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) | 123 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) |
123 | 124 |
124 # Moran on Mahalanobis distances | 125 # Moran on Mahalanobis distances |
125 data = cluster_freq.get_values() | 126 data = cluster_freq.get_values() |
126 data_countries = cluster_freq.index | 127 data_countries = cluster_freq.index |
127 #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) | 128 #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) |
128 threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) | 129 threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) |
129 y = np.sqrt(MD) | 130 y = np.sqrt(MD) |
130 utils_spatial.print_Moran_outliers(y, w, data_countries) | 131 utils_spatial.print_Moran_outliers(y, w, data_countries) |
131 utils_spatial.plot_Moran_scatterplot(y, w, data_countries) | 132 utils_spatial.plot_Moran_scatterplot(y, w, data_countries) |