comparison scripts/outliers.py @ 75:02faad4a996b branch-tests

results and figures
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:28 +0100
parents bfb9ed45c417
children bde45ce0eeab
comparison
equal deleted inserted replaced
67:bfb9ed45c417 75:02faad4a996b
13 13
14 import utils 14 import utils
15 import utils_spatial 15 import utils_spatial
16 16
17 17
18 def country_outlier_df(counts, labels, normalize=False): 18 def country_outlier_df(counts, labels, normalize=False, out_file=None):
19 if len(counts.keys()) < len(np.unique(labels)): 19 if len(counts.keys()) < len(np.unique(labels)):
20 for label in np.unique(labels): 20 for label in np.unique(labels):
21 if not counts.has_key(label): 21 if not counts.has_key(label):
22 counts.update({label:0}) 22 counts.update({label:0})
23 if normalize: 23 if normalize:
31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True) 31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index() 32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True) 33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
34 df = pd.merge(df, df_n_country, on='Country', how='left') 34 df = pd.merge(df, df_n_country, on='Country', how='left')
35 df = pd.merge(df, df_n_outliers, on='Country', how='left') 35 df = pd.merge(df, df_n_outliers, on='Country', how='left')
36 if out_file is not None:
37 df.to_csv(out_file, index=False)
36 return df 38 return df
37 39
38 40
39 def normalize_outlier_counts(outlier_counts, country_counts): 41 def normalize_outlier_counts(outlier_counts, country_counts):
40 '''Normalize a dictionary of outlier counts per country by 42 '''Normalize a dictionary of outlier counts per country by
48 50
49 51
def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    """Detect global outliers via Mahalanobis distance and tally them per country.

    Parameters
    ----------
    X : feature matrix, one row per recording.
    Y : country label for each row of X.
    chi2thr : chi-squared quantile used as the outlier cutoff.
    out_file : optional CSV path; when given, the per-country table is written there
        (handled inside country_outlier_df).

    Returns
    -------
    (df, threshold, MD) : per-country outlier DataFrame, the distance threshold,
        and the Mahalanobis distances for all points.
    """
    cutoff, outlier_mask, mahal_dist = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    # Count how many flagged recordings fall in each country.
    per_country = Counter(Y[outlier_mask])
    outlier_table = country_outlier_df(per_country, Y, normalize=True, out_file=out_file)
    return outlier_table, cutoff, mahal_dist
57 57
58 58
59 def print_most_least_outliers_topN(df, N=10): 59 def print_most_least_outliers_topN(df, N=10):
60 sort_inds = df['Outliers'].argsort() # ascending order 60 sort_inds = df['Outliers'].argsort() # ascending order
98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) 98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) 99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
100 return [X_list, Y, Yaudio], ddf, w_dict 100 return [X_list, Y, Yaudio], ddf, w_dict
101 101
102 102
def get_local_outliers_df(X, Y, w_dict, out_file=None):
    """Detect spatial (neighborhood-local) outliers and tally them per country.

    Parameters
    ----------
    X : feature matrix, one row per recording.
    Y : country label for each row of X.
    w_dict : mapping of each country to its spatial neighbors
        (as built by utils_spatial.from_weights_to_dict).
    out_file : optional CSV path; when given, the per-country table is written
        there (handled inside country_outlier_df).

    Returns
    -------
    df_local : per-country DataFrame of local outlier counts, normalized by
        the number of recordings per country.
    """
    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
    # Each entry is (country, count, ...); keep the first two fields only.
    # Dict comprehension replaces the former dict([(ll[0], ll[1]) ...]) idiom.
    spatial_counts = Counter({ll[0]: ll[1] for ll in spatial_outliers})
    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
    return df_local
108 108
109 109
def get_country_clusters(X, bestncl=None, max_ncl=50, min_ncl=5):
    """Cluster feature vectors with K-means, selecting K by silhouette if needed.

    Parameters
    ----------
    X : feature matrix, one row per sample.
    bestncl : number of clusters; when None it is chosen automatically by
        silhouette score over [min_ncl, max_ncl] with cosine distance.
    max_ncl : upper bound for the automatic cluster search.
    min_ncl : lower bound for the automatic cluster search (new, keeps the
        former hard-coded value of 5 as default).

    Returns
    -------
    (centroids, cl_pred) : cluster centers and the cluster index predicted
        for each row of X.
    """
    if bestncl is None:
        # Average silhouette (second return value) is not needed here.
        bestncl, _ = utils.best_n_clusters_silhouette(X, min_ncl=min_ncl, max_ncl=max_ncl, metric="cosine")
    # Fixed random_state keeps cluster assignments reproducible across runs.
    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
    centroids = cluster_model.cluster_centers_
    cl_pred = cluster_model.predict(X)
    return centroids, cl_pred