comparison scripts/outliers.py @ 75:02faad4a996b branch-tests

results and figures
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:28 +0100
parents bfb9ed45c417
children bde45ce0eeab
comparison
equal deleted inserted replaced
67:bfb9ed45c417 75:02faad4a996b
13 13
14 import utils 14 import utils
15 import utils_spatial 15 import utils_spatial
16 16
17 17
18 def country_outlier_df(counts, labels, normalize=False): 18 def country_outlier_df(counts, labels, normalize=False, out_file=None):
19 if len(counts.keys()) < len(np.unique(labels)): 19 if len(counts.keys()) < len(np.unique(labels)):
20 for label in np.unique(labels): 20 for label in np.unique(labels):
21 if not counts.has_key(label): 21 if not counts.has_key(label):
22 counts.update({label:0}) 22 counts.update({label:0})
23 if normalize: 23 if normalize:
31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True) 31 df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index() 32 df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True) 33 df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
34 df = pd.merge(df, df_n_country, on='Country', how='left') 34 df = pd.merge(df, df_n_country, on='Country', how='left')
35 df = pd.merge(df, df_n_outliers, on='Country', how='left') 35 df = pd.merge(df, df_n_outliers, on='Country', how='left')
36 if out_file is not None:
37 df.to_csv(out_file, index=False)
36 return df 38 return df
37 39
38 40
39 def normalize_outlier_counts(outlier_counts, country_counts): 41 def normalize_outlier_counts(outlier_counts, country_counts):
40 '''Normalize a dictionary of outlier counts per country by 42 '''Normalize a dictionary of outlier counts per country by
48 50
49 51
def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    """Detect global outliers via Mahalanobis distance and tally them per country.

    Parameters
    ----------
    X : feature matrix, one row per recording.
    Y : country label for each row of X.
    chi2thr : chi-squared quantile used as the outlier cutoff.
    out_file : optional CSV path; when given, the per-country table is written there
        (handled inside country_outlier_df).

    Returns
    -------
    (df, threshold, MD) : per-country outlier DataFrame, the distance threshold,
        and the Mahalanobis distances for all points.
    """
    cutoff, outlier_mask, mahal_dist = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    # Count how many flagged recordings fall in each country.
    per_country = Counter(Y[outlier_mask])
    outlier_table = country_outlier_df(per_country, Y, normalize=True, out_file=out_file)
    return outlier_table, cutoff, mahal_dist
57 57
58 58
59 def print_most_least_outliers_topN(df, N=10): 59 def print_most_least_outliers_topN(df, N=10):
60 sort_inds = df['Outliers'].argsort() # ascending order 60 sort_inds = df['Outliers'].argsort() # ascending order
98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) 98 w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries) 99 w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
100 return [X_list, Y, Yaudio], ddf, w_dict 100 return [X_list, Y, Yaudio], ddf, w_dict
101 101
102 102
def get_local_outliers_df(X, Y, w_dict, out_file=None):
    """Detect spatial (neighborhood-local) outliers and tally them per country.

    Parameters
    ----------
    X : feature matrix, one row per recording.
    Y : country label for each row of X.
    w_dict : mapping of each country to its spatial neighbors
        (as built by utils_spatial.from_weights_to_dict).
    out_file : optional CSV path; when given, the per-country table is written
        there (handled inside country_outlier_df).

    Returns
    -------
    df_local : per-country DataFrame of local outlier counts, normalized by
        the number of recordings per country.
    """
    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
    # Each entry is (country, count, ...); keep the first two fields only.
    # Dict comprehension replaces the former dict([(ll[0], ll[1]) ...]) idiom.
    spatial_counts = Counter({ll[0]: ll[1] for ll in spatial_outliers})
    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
    return df_local
108 108
109 109
def get_country_clusters(X, bestncl=None, max_ncl=50, min_ncl=5):
    """Cluster feature vectors with K-means, selecting K by silhouette if needed.

    Parameters
    ----------
    X : feature matrix, one row per sample.
    bestncl : number of clusters; when None it is chosen automatically by
        silhouette score over [min_ncl, max_ncl] with cosine distance.
    max_ncl : upper bound for the automatic cluster search.
    min_ncl : lower bound for the automatic cluster search (new, keeps the
        former hard-coded value of 5 as default).

    Returns
    -------
    (centroids, cl_pred) : cluster centers and the cluster index predicted
        for each row of X.
    """
    if bestncl is None:
        # Average silhouette (second return value) is not needed here.
        bestncl, _ = utils.best_n_clusters_silhouette(X, min_ncl=min_ncl, max_ncl=max_ncl, metric="cosine")
    # Fixed random_state keeps cluster assignments reproducible across runs.
    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
    centroids = cluster_model.cluster_centers_
    cl_pred = cluster_model.predict(X)
    return centroids, cl_pred