Mercurial > hg > plosone_underreview
comparison scripts/outliers.py @ 75:02faad4a996b branch-tests
results and figures
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 22 Sep 2017 16:30:28 +0100 |
parents | bfb9ed45c417 |
children | bde45ce0eeab |
comparison
equal
deleted
inserted
replaced
67:bfb9ed45c417 | 75:02faad4a996b |
---|---|
13 | 13 |
14 import utils | 14 import utils |
15 import utils_spatial | 15 import utils_spatial |
16 | 16 |
17 | 17 |
18 def country_outlier_df(counts, labels, normalize=False): | 18 def country_outlier_df(counts, labels, normalize=False, out_file=None): |
19 if len(counts.keys()) < len(np.unique(labels)): | 19 if len(counts.keys()) < len(np.unique(labels)): |
20 for label in np.unique(labels): | 20 for label in np.unique(labels): |
21 if not counts.has_key(label): | 21 if not counts.has_key(label): |
22 counts.update({label:0}) | 22 counts.update({label:0}) |
23 if normalize: | 23 if normalize: |
31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True) | 31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True) |
32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index() | 32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index() |
33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True) | 33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True) |
34 df = pd.merge(df, df_n_country, on='Country', how='left') | 34 df = pd.merge(df, df_n_country, on='Country', how='left') |
35 df = pd.merge(df, df_n_outliers, on='Country', how='left') | 35 df = pd.merge(df, df_n_outliers, on='Country', how='left') |
36 if out_file is not None: | |
37 df.to_csv(out_file, index=False) | |
36 return df | 38 return df |
37 | 39 |
38 | 40 |
39 def normalize_outlier_counts(outlier_counts, country_counts): | 41 def normalize_outlier_counts(outlier_counts, country_counts): |
40 '''Normalize a dictionary of outlier counts per country by | 42 '''Normalize a dictionary of outlier counts per country by |
48 | 50 |
49 | 51 |
50 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None): | 52 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None): |
51 threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr) | 53 threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr) |
52 global_counts = Counter(Y[y_pred]) | 54 global_counts = Counter(Y[y_pred]) |
53 df = country_outlier_df(global_counts, Y, normalize=True) | 55 df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file) |
54 if out_file is not None: | |
55 df.to_csv(out_file, index=False) | |
56 return df, threshold, MD | 56 return df, threshold, MD |
57 | 57 |
58 | 58 |
59 def print_most_least_outliers_topN(df, N=10): | 59 def print_most_least_outliers_topN(df, N=10): |
60 sort_inds = df['Outliers'].argsort() # ascending order | 60 sort_inds = df['Outliers'].argsort() # ascending order |
98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) | 98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) |
99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) | 99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) |
100 return [X_list, Y, Yaudio], ddf, w_dict | 100 return [X_list, Y, Yaudio], ddf, w_dict |
101 | 101 |
102 | 102 |
103 def get_local_outliers_df(X, Y, w_dict): | 103 def get_local_outliers_df(X, Y, w_dict, out_file=None): |
104 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) | 104 spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) |
105 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) | 105 spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) |
106 df_local = country_outlier_df(spatial_counts, Y, normalize=True) | 106 df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file) |
107 return df_local | 107 return df_local |
108 | 108 |
109 | 109 |
110 def get_country_clusters(X, bestncl=None): | 110 def get_country_clusters(X, bestncl=None, max_ncl=50): |
111 if bestncl is None: | 111 if bestncl is None: |
112 bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") | 112 bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=max_ncl, metric="cosine") |
113 # get cluster predictions and metadata for each cluster | 113 # get cluster predictions and metadata for each cluster |
114 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) | 114 cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) |
115 centroids = cluster_model.cluster_centers_ | 115 centroids = cluster_model.cluster_centers_ |
116 cl_pred = cluster_model.predict(X) | 116 cl_pred = cluster_model.predict(X) |
117 return centroids, cl_pred | 117 return centroids, cl_pred |