diff scripts/results.py @ 8:0f3eba42b425 branch-tests

added notebooks and utils
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 18:23:14 +0100
parents e50c63cf96be
children 98718fdd8326
line wrap: on
line diff
--- a/scripts/results.py	Mon Sep 11 14:53:13 2017 +0100
+++ b/scripts/results.py	Mon Sep 11 18:23:14 2017 +0100
@@ -82,50 +82,51 @@
         df_styles.to_csv(out_file, index=False)
 
 
-# load LDA-transformed frames
-X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-Xrhy, Xmel, Xmfc, Xchr = X_list
-X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+if __name__ == '__main__':
+    # load LDA-transformed frames
+    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
+    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    Xrhy, Xmel, Xmfc, Xchr = X_list
+    X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
 
-# global outliers
-df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
-print_most_least_outliers_topN(df_global, N=10)
+    # global outliers
+    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_global, N=10)
 
-spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-df_local = country_outlier_df(spatial_counts, Y, normalize=True)
-print_most_least_outliers_topN(df_local, N=10)
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    print_most_least_outliers_topN(df_local, N=10)
 
-feat = [Xrhy, Xmel, Xmfc, Xchr]
-feat_labels = ['rhy', 'mel', 'mfc', 'chr']
-tabs_feat = []
-for i in range(len(feat)):
-    XX = feat[i]
-    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
-    print_most_least_outliers_topN(df_feat, N=5)
+    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+    tabs_feat = []
+    for i in range(len(feat)):
+        XX = feat[i]
+        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+        print_most_least_outliers_topN(df_feat, N=5)
 
-# how many styles are there
-#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-bestncl = 13
+    # how many styles are there
+    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    bestncl = 13
 
-# get cluster predictions and metadata for each cluster
-cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-centroids = cluster_model.cluster_centers_
-cl_pred = cluster_model.predict(X)
-ddf['Clusters'] = cl_pred
-clusters_metadata(ddf, cl_pred)
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    ddf['Clusters'] = cl_pred
+    clusters_metadata(ddf, cl_pred)
 
-# how similar are the cultures and which ones seem to be global outliers
-cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+    # how similar are the cultures and which ones seem to be global outliers
+    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
 
-# Moran on Mahalanobis distances
-data = cluster_freq.get_values()
-data_countries = cluster_freq.index
-#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-y = np.sqrt(MD)
-utils_spatial.print_Moran_outliers(y, w, data_countries)
-utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
+    # Moran on Mahalanobis distances
+    data = cluster_freq.get_values()
+    data_countries = cluster_freq.index
+    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    y = np.sqrt(MD)
+    utils_spatial.print_Moran_outliers(y, w, data_countries)
+    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)