diff scripts/outliers.py @ 65:9b10b688c2ac branch-tests

results for 30 seconds
author mpanteli <m.x.panteli@gmail.com>
date Thu, 21 Sep 2017 20:11:43 +0100
parents dbcd5b2a4efa
children bfb9ed45c417
line diff
--- a/scripts/outliers.py	Thu Sep 21 17:36:16 2017 +0100
+++ b/scripts/outliers.py	Thu Sep 21 20:11:43 2017 +0100
@@ -92,24 +92,47 @@
         df_styles.to_csv(out_file, index=False)
 
 
+def load_data(pickle_file, metadata_file):
+    X_list, Y, Yaudio = pickle.load(open(pickle_file,'rb'))
+    ddf = load_metadata(Yaudio, metadata_file=metadata_file)
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    return [X_list, Y, Yaudio], ddf, w_dict
+
+
+def get_local_outliers_df(X, Y, w_dict):
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    return df_local
+
+
+def get_country_clusters(X, bestncl=None):
+    if bestncl is None:
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    return centroids, cl_pred
+
+
 if __name__ == '__main__':
     # load LDA-transformed frames
-    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv')
+    X_list, Y, Yaudio = dataset
     X = np.concatenate(X_list, axis=1)
 
     # global outliers
     df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
     print_most_least_outliers_topN(df_global, N=10)
 
-    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    # local outliers
+    df_local = get_local_outliers_df(X, Y, w_dict)
     print_most_least_outliers_topN(df_local, N=10)
 
-    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    # outliers for features
+    feat = X_list
     feat_labels = ['rhy', 'mel', 'mfc', 'chr']
     tabs_feat = []
     for i in range(len(feat)):
@@ -117,25 +140,25 @@
         df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
         print_most_least_outliers_topN(df_feat, N=5)
 
-    # how many styles are there
-    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-    bestncl = 13
-
-    # get cluster predictions and metadata for each cluster
-    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-    centroids = cluster_model.cluster_centers_
-    cl_pred = cluster_model.predict(X)
+    ## how many styles are there
+    ##bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    #bestncl = 13
+    ## get cluster predictions and metadata for each cluster
+    #cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    #centroids = cluster_model.cluster_centers_
+    #cl_pred = cluster_model.predict(X)
+    centroids, cl_pred = get_country_clusters(X, bestncl=13)
     ddf['Clusters'] = cl_pred
     print_clusters_metadata(ddf, cl_pred)
 
     # how similar are the cultures and which ones seem to be global outliers
     cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
 
-    # Moran on Mahalanobis distances
-    data = cluster_freq.get_values()
-    data_countries = cluster_freq.index
-    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-    y = np.sqrt(MD)
-    utils_spatial.print_Moran_outliers(y, w, data_countries)
-    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
+    ## Moran on Mahalanobis distances
+    #data = cluster_freq.get_values()
+    #data_countries = cluster_freq.index
+    ##threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    #threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    #y = np.sqrt(MD)
+    #utils_spatial.print_Moran_outliers(y, w, data_countries)
+    #utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
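The changeset above pulls the data loading, local-outlier, and clustering steps out of the __main__ block of scripts/outliers.py into importable helpers (load_data, get_local_outliers_df, get_country_clusters), presumably so the pipeline can be exercised from the tests on branch-tests. A minimal usage sketch of those helpers follows; it assumes scripts/ is importable as a package and that the pickle and metadata files exist at the paths the script already uses:

    # Minimal sketch: drive the refactored helpers from outside __main__.
    # Assumes scripts/ is on the import path and the data files below exist.
    import numpy as np
    from scripts.outliers import (load_data, get_local_outliers_df,
                                  get_country_clusters,
                                  print_most_least_outliers_topN)

    # LDA-transformed frames, track metadata, and the country-neighbour dict.
    dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv')
    X_list, Y, Yaudio = dataset
    X = np.concatenate(X_list, axis=1)

    # Local (neighbourhood-based) outliers, normalised per country.
    df_local = get_local_outliers_df(X, Y, w_dict)
    print_most_least_outliers_topN(df_local, N=10)

    # K-means country clusters; passing bestncl=None would instead pick the
    # number of clusters by silhouette score rather than fixing it at 13.
    centroids, cl_pred = get_country_clusters(X, bestncl=13)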