diff scripts/outliers.py @ 75:02faad4a996b branch-tests
description: results and figures
author:      Maria Panteli <m.x.panteli@gmail.com>
date:        Fri, 22 Sep 2017 16:30:28 +0100
parents:     bfb9ed45c417
children:    bde45ce0eeab
--- a/scripts/outliers.py	Thu Sep 21 20:12:47 2017 +0100
+++ b/scripts/outliers.py	Fri Sep 22 16:30:28 2017 +0100
@@ -15,7 +15,7 @@
 import utils_spatial
 
 
-def country_outlier_df(counts, labels, normalize=False):
+def country_outlier_df(counts, labels, normalize=False, out_file=None):
     if len(counts.keys()) < len(np.unique(labels)):
         for label in np.unique(labels):
             if not counts.has_key(label):
@@ -33,6 +33,8 @@
     df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
     df = pd.merge(df, df_n_country, on='Country', how='left')
     df = pd.merge(df, df_n_outliers, on='Country', how='left')
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
     return df
 
 
@@ -50,9 +52,7 @@
 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
+    df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file)
     return df, threshold, MD
 
 
@@ -100,16 +100,16 @@
     return [X_list, Y, Yaudio], ddf, w_dict
 
 
-def get_local_outliers_df(X, Y, w_dict):
+def get_local_outliers_df(X, Y, w_dict, out_file=None):
     spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
     spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
     return df_local
 
 
-def get_country_clusters(X, bestncl=None):
+def get_country_clusters(X, bestncl=None, max_ncl=50):
     if bestncl is None:
-        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=max_ncl, metric="cosine")
     # get cluster predictions and metadata for each cluster
     cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
     centroids = cluster_model.cluster_centers_
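
The net effect of this changeset is that CSV writing moves from each caller into country_outlier_df itself, so the global and local outlier entry points share one output path via the new out_file parameter. A minimal usage sketch of the refactored API, assuming the repository's utils modules are importable; X and Y here are placeholder data, not the paper's dataset:

import numpy as np
import outliers

# Placeholder inputs: one feature row per recording, one country label each.
X = np.random.randn(200, 10)
Y = np.array(['Germany', 'Ghana', 'Japan', 'Peru'] * 50)

# out_file is now threaded through to country_outlier_df, which writes the CSV.
df, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999,
                                             out_file='global_outliers.csv')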
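utils.get_outliers_Mahal is not part of this diff; its name and the chi2thr=0.999 argument point at the standard Mahalanobis-distance test, in which squared distances of roughly Gaussian data follow a chi-square distribution with one degree of freedom per feature. A self-contained sketch of that technique, matching the (threshold, y_pred, MD) return shape used above but not taken from the repository:

import numpy as np
from scipy.stats import chi2

def mahalanobis_outliers(X, chi2thr=0.999):
    # Squared Mahalanobis distance of each row from the sample mean.
    mu = X.mean(axis=0)
    inv_cov = np.linalg.pinv(np.cov(X, rowvar=False))  # pseudo-inverse for stability
    diff = X - mu
    MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
    # Rows beyond the chi-square quantile at chi2thr are flagged as outliers.
    threshold = chi2.ppf(chi2thr, df=X.shape[1])
    y_pred = MD > threshold  # boolean mask, usable as Y[y_pred]
    return threshold, y_pred, MD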
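Likewise, best_n_clusters_silhouette is only referenced here, with a search range and a cosine metric; a common implementation fits KMeans at each candidate cluster count and keeps the count with the highest average silhouette score. A sketch under that assumption (the repository's version may differ):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine"):
    # Try each cluster count and keep the one with the best average silhouette.
    best_k, best_silh = None, -1.0
    for k in range(min_ncl, max_ncl + 1):
        labels = KMeans(n_clusters=k, random_state=50).fit_predict(X)
        silh = silhouette_score(X, labels, metric=metric)
        if silh > best_silh:
            best_k, best_silh = k, silh
    return best_k, best_silh

Exposing max_ncl as a parameter of get_country_clusters makes this search bound configurable per experiment instead of hard-coding 50.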