plosone_underreview: comparison of scripts/outliers.py @ 65:9b10b688c2ac (branch-tests)
author | mpanteli <m.x.panteli@gmail.com>
date | Thu, 21 Sep 2017 20:11:43 +0100
parents | dbcd5b2a4efa
children | bfb9ed45c417
64:e83ecc296669 | 65:9b10b688c2ac |
---|---|
90 print df_styles.to_latex() | 90 print df_styles.to_latex() |
91 if out_file is not None: | 91 if out_file is not None: |
92 df_styles.to_csv(out_file, index=False) | 92 df_styles.to_csv(out_file, index=False) |
93 | 93 |
94 | 94 |
95 def load_data(pickle_file, metadata_file): | |
96 X_list, Y, Yaudio = pickle.load(open(pickle_file,'rb')) | |
97 ddf = load_metadata(Yaudio, metadata_file=metadata_file) | |
98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) | |
99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) | |
100 return [X_list, Y, Yaudio], ddf, w_dict | |
101 | |
102 | |
103 def get_local_outliers_df(X, Y, w_dict): | |
104 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) | |
105 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) | |
106 df_local = country_outlier_df(spatial_counts, Y, normalize=True) | |
107 return df_local | |
108 | |
109 | |
110 def get_country_clusters(X, bestncl=None): | |
111 if bestncl is None: | |
112 bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") | |
113 # get cluster predictions and metadata for each cluster | |
114 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) | |
115 centroids = cluster_model.cluster_centers_ | |
116 cl_pred = cluster_model.predict(X) | |
117 return centroids, cl_pred | |
118 | |
119 | |
95 if __name__ == '__main__': | 120 if __name__ == '__main__': |
96 # load LDA-transformed frames | 121 # load LDA-transformed frames |
97 X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) | 122 dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv') |
98 ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') | 123 X_list, Y, Yaudio = dataset |
99 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) | |
100 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) | |
101 X = np.concatenate(X_list, axis=1) | 124 X = np.concatenate(X_list, axis=1) |
102 | 125 |
103 # global outliers | 126 # global outliers |
104 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) | 127 df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) |
105 print_most_least_outliers_topN(df_global, N=10) | 128 print_most_least_outliers_topN(df_global, N=10) |
106 | 129 |
107 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) | 130 # local outliers |
108 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) | 131 df_local = get_local_outliers_df(X, Y, w_dict) |
109 df_local = country_outlier_df(spatial_counts, Y, normalize=True) | |
110 print_most_least_outliers_topN(df_local, N=10) | 132 print_most_least_outliers_topN(df_local, N=10) |
111 | 133 |
112 feat = [Xrhy, Xmel, Xmfc, Xchr] | 134 # outliers for features |
135 feat = X_list | |
113 feat_labels = ['rhy', 'mel', 'mfc', 'chr'] | 136 feat_labels = ['rhy', 'mel', 'mfc', 'chr'] |
114 tabs_feat = [] | 137 tabs_feat = [] |
115 for i in range(len(feat)): | 138 for i in range(len(feat)): |
116 XX = feat[i] | 139 XX = feat[i] |
117 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) | 140 df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) |
118 print_most_least_outliers_topN(df_feat, N=5) | 141 print_most_least_outliers_topN(df_feat, N=5) |
119 | 142 |
120 # how many styles are there | 143 ## how many styles are there |
121 #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") | 144 ##bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") |
122 bestncl = 13 | 145 #bestncl = 13 |
123 | 146 ## get cluster predictions and metadata for each cluster |
124 # get cluster predictions and metadata for each cluster | 147 #cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) |
125 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) | 148 #centroids = cluster_model.cluster_centers_ |
126 centroids = cluster_model.cluster_centers_ | 149 #cl_pred = cluster_model.predict(X) |
127 cl_pred = cluster_model.predict(X) | 150 centroids, cl_pred = get_country_clusters(X, bestncl=13) |
128 ddf['Clusters'] = cl_pred | 151 ddf['Clusters'] = cl_pred |
129 print_clusters_metadata(ddf, cl_pred) | 152 print_clusters_metadata(ddf, cl_pred) |
130 | 153 |
131 # how similar are the cultures and which ones seem to be global outliers | 154 # how similar are the cultures and which ones seem to be global outliers |
132 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) | 155 cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) |
133 | 156 |
134 # Moran on Mahalanobis distances | 157 ## Moran on Mahalanobis distances |
135 data = cluster_freq.get_values() | 158 #data = cluster_freq.get_values() |
136 data_countries = cluster_freq.index | 159 #data_countries = cluster_freq.index |
137 #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) | 160 ##threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) |
138 threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) | 161 #threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) |
139 y = np.sqrt(MD) | 162 #y = np.sqrt(MD) |
140 utils_spatial.print_Moran_outliers(y, w, data_countries) | 163 #utils_spatial.print_Moran_outliers(y, w, data_countries) |
141 utils_spatial.plot_Moran_scatterplot(y, w, data_countries) | 164 #utils_spatial.plot_Moran_scatterplot(y, w, data_countries) |
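
For readers skimming the changeset: `get_outliers_df`, `get_local_outliers_df`, and the commented-out `utils.get_outliers` call all appear to use the same underlying idea of flagging observations whose (squared) Mahalanobis distance `MD` exceeds a chi-square quantile, here `chi2thr=0.999`. The snippet below is a minimal sketch of that technique built on standard SciPy and scikit-learn calls; `mahalanobis_outliers` is an illustrative name, not the project's actual `utils` implementation, whose details are not part of this diff.

```python
import numpy as np
from scipy.stats import chi2
from sklearn.covariance import EmpiricalCovariance

def mahalanobis_outliers(X, chi2thr=0.999):
    """Illustrative stand-in for utils.get_outliers: flag rows whose squared
    Mahalanobis distance exceeds the chi-square quantile `chi2thr`."""
    cov = EmpiricalCovariance().fit(X)            # estimate mean and covariance of the data
    MD = cov.mahalanobis(X)                       # squared Mahalanobis distance per row
    threshold = chi2.ppf(chi2thr, df=X.shape[1])  # cutoff: chi^2 quantile with d.o.f. = n features
    y_pred = MD > threshold                       # boolean outlier mask
    return threshold, y_pred, MD

# mirrors the call pattern used in the script:
# threshold, y_pred, MD = mahalanobis_outliers(X, chi2thr=0.999)
```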
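Similarly, the new `get_country_clusters` helper defers the choice of cluster count to `utils.best_n_clusters_silhouette` when `bestncl` is not given. As a rough sketch of what such a selection routine typically does (again, the actual `utils` code is not shown in this diff and may differ), one can scan a range of K-Means sizes and keep the one with the highest average silhouette score:

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_n_clusters_by_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine"):
    """Hypothetical sketch of silhouette-based selection of the cluster count;
    the project's utils.best_n_clusters_silhouette may differ in details."""
    best_ncl, best_silh = min_ncl, -1.0
    for ncl in range(min_ncl, max_ncl + 1):
        labels = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        silh = silhouette_score(X, labels, metric=metric)  # mean silhouette over all samples
        if silh > best_silh:
            best_ncl, best_silh = ncl, silh
    return best_ncl, best_silh
```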