diff scripts/outliers.py @ 75:02faad4a996b branch-tests

results and figures
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:28 +0100
parents bfb9ed45c417
children bde45ce0eeab
line wrap: on
line diff
--- a/scripts/outliers.py	Thu Sep 21 20:12:47 2017 +0100
+++ b/scripts/outliers.py	Fri Sep 22 16:30:28 2017 +0100
@@ -15,7 +15,7 @@
 import utils_spatial
 
 
-def country_outlier_df(counts, labels, normalize=False):
+def country_outlier_df(counts, labels, normalize=False, out_file=None):
     if len(counts.keys()) < len(np.unique(labels)):
         for label in np.unique(labels):
             if not counts.has_key(label):
@@ -33,6 +33,8 @@
     df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
     df = pd.merge(df, df_n_country, on='Country', how='left')
     df = pd.merge(df, df_n_outliers, on='Country', how='left')
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
     return df
 
 
@@ -50,9 +52,7 @@
 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
+    df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file)
     return df, threshold, MD
 
 
@@ -100,16 +100,16 @@
     return [X_list, Y, Yaudio], ddf, w_dict
 
 
-def get_local_outliers_df(X, Y, w_dict):
+def get_local_outliers_df(X, Y, w_dict, out_file=None):
     spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
     spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
     return df_local
 
 
-def get_country_clusters(X, bestncl=None):
+def get_country_clusters(X, bestncl=None, max_ncl=50):
     if bestncl is None:
-        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=max_ncl, metric="cosine")
     # get cluster predictions and metadata for each cluster
     cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
     centroids = cluster_model.cluster_centers_