Mercurial > hg > plosone_underreview
diff scripts/results.py @ 8:0f3eba42b425 branch-tests
added notebooks and utils
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 11 Sep 2017 18:23:14 +0100 |
parents | e50c63cf96be |
children | 98718fdd8326 |
line wrap: on
line diff
```diff
--- a/scripts/results.py Mon Sep 11 14:53:13 2017 +0100
+++ b/scripts/results.py Mon Sep 11 18:23:14 2017 +0100
@@ -82,50 +82,51 @@
     df_styles.to_csv(out_file, index=False)
 
 
-# load LDA-transformed frames
-X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-Xrhy, Xmel, Xmfc, Xchr = X_list
-X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+if __name__ == '__main__':
+    # load LDA-transformed frames
+    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
+    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    Xrhy, Xmel, Xmfc, Xchr = X_list
+    X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
 
-# global outliers
-df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
-print_most_least_outliers_topN(df_global, N=10)
+    # global outliers
+    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_global, N=10)
 
-spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-df_local = country_outlier_df(spatial_counts, Y, normalize=True)
-print_most_least_outliers_topN(df_local, N=10)
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    print_most_least_outliers_topN(df_local, N=10)
 
-feat = [Xrhy, Xmel, Xmfc, Xchr]
-feat_labels = ['rhy', 'mel', 'mfc', 'chr']
-tabs_feat = []
-for i in range(len(feat)):
-    XX = feat[i]
-    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
-    print_most_least_outliers_topN(df_feat, N=5)
+    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+    tabs_feat = []
+    for i in range(len(feat)):
+        XX = feat[i]
+        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+        print_most_least_outliers_topN(df_feat, N=5)
 
-# how many styles are there
-#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-bestncl = 13
+    # how many styles are there
+    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    bestncl = 13
 
-# get cluster predictions and metadata for each cluster
-cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-centroids = cluster_model.cluster_centers_
-cl_pred = cluster_model.predict(X)
-ddf['Clusters'] = cl_pred
-clusters_metadata(ddf, cl_pred)
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    ddf['Clusters'] = cl_pred
+    clusters_metadata(ddf, cl_pred)
 
-# how similar are the cultures and which ones seem to be global outliers
-cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+    # how similar are the cultures and which ones seem to be global outliers
+    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
 
-# Moran on Mahalanobis distances
-data = cluster_freq.get_values()
-data_countries = cluster_freq.index
-#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-y = np.sqrt(MD)
-utils_spatial.print_Moran_outliers(y, w, data_countries)
-utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
+    # Moran on Mahalanobis distances
+    data = cluster_freq.get_values()
+    data_countries = cluster_freq.index
+    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    y = np.sqrt(MD)
+    utils_spatial.print_Moran_outliers(y, w, data_countries)
+    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
```