plosone_underreview: comparison of scripts/utils.py @ 4:e50c63cf96be (branch-tests)

"rearranging folders"
author | Maria Panteli |
---|---|
date | Mon, 11 Sep 2017 11:51:50 +0100 |
parents | |
children | 5eba53437755 |
comparing revision 3:230a0cf17de0 with revision 4:e50c63cf96be:
# -*- coding: utf-8 -*-
"""
Created on Wed May 17 11:26:01 2017

@author: mariapanteli
"""

from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial.distance import mahalanobis
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.covariance import MinCovDet
from sklearn.decomposition import PCA


def get_outliers(X, chi2thr=0.975):
    """Flag outliers via robust (MCD) Mahalanobis distances.

    Squared Mahalanobis distances of Gaussian data follow a chi-squared
    distribution with n_features degrees of freedom; points beyond the
    chi2thr quantile are flagged as outliers.
    """
    robust_cov = MinCovDet().fit(X)
    MD = robust_cov.mahalanobis(X)  # squared Mahalanobis distances
    degrees_of_freedom = X.shape[1]
    threshold = stats.chi2.ppf(chi2thr, degrees_of_freedom)
    y_pred = MD > threshold
    return threshold, y_pred, MD


def get_outliers_Mahal(X, chi2thr=0.975):
    """Flag outliers via classical Mahalanobis distances.

    Same chi-squared test as get_outliers, but using the empirical
    (non-robust) mean and covariance.
    """
    n_samples = X.shape[0]
    inv_cov = np.linalg.inv(np.cov(X.T))
    col_mean = np.mean(X, axis=0)
    MD = np.zeros(n_samples, dtype='float')
    for i in range(n_samples):
        MD[i] = mahalanobis(X[i, :], col_mean, inv_cov)
    MD = MD ** 2  # square to match the chi-squared statistic
    degrees_of_freedom = X.shape[1]
    threshold = stats.chi2.ppf(chi2thr, degrees_of_freedom)
    y_pred = MD > threshold
    return threshold, y_pred, MD


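# Illustrative note (added; not in the original changeset): for 2-D data the
# 97.5% chi-squared cutoff is stats.chi2.ppf(0.975, 2) ~= 7.38, so in both
# functions above a point whose squared Mahalanobis distance exceeds ~7.38
# would be flagged as an outlier.

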
def pca_data(X, min_variance=None):
    """Rotate data with PCA to avoid singularity in the Mahalanobis/covariance matrix."""
    if min_variance is None:
        n_pc = X.shape[1]
    else:
        # smallest number of components whose cumulative explained variance
        # exceeds min_variance (+1 converts the 0-based index to a count)
        model = PCA(whiten=True).fit(X)
        n_pc = np.where(model.explained_variance_ratio_.cumsum() > min_variance)[0][0] + 1
    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
    return X_pca, n_pc


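# Illustrative note (added): with cumulative explained variance
# [0.6, 0.9, 0.995, 1.0] and min_variance=0.99, the first index exceeding
# 0.99 is 2, so pca_data keeps n_pc = 3 whitened components.

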
def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
    """For each label in Y, detect outliers within its spatial neighborhood.

    w_dict maps each label (country) to the list of its neighboring labels.
    """
    uniq_labels = np.unique(Y)
    spatial_outliers = []
    for uniq_label in uniq_labels:
        countries_neighbors = w_dict[uniq_label]
        if len(countries_neighbors) == 0:
            print(uniq_label, "no neighbors found")
            continue
        inds_neighborhood = []
        for country in countries_neighbors:
            inds = np.where(Y == country)[0]
            inds_neighborhood.append(inds)  # append neighboring countries
        if len(np.concatenate(inds_neighborhood)) == 0:
            print("no neighbors found")
            continue
        inds_neighborhood.append(np.where(Y == uniq_label)[0])  # append query country
        inds_neighborhood = np.concatenate(inds_neighborhood)
        if do_pca:
            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
        else:
            XX = X[inds_neighborhood, :]  # assume X is already PCA-transformed
        print(len(inds_neighborhood))
        if len(inds_neighborhood) < XX.shape[1]:
            print(uniq_label, "neighborhood too small for number of features")
            continue
        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
        counts = Counter(Y[inds_neighborhood[y_pred]])
        spatial_outliers.append([uniq_label, counts[uniq_label], threshold,
                                 y_pred, MD, counts, inds_neighborhood])
    return spatial_outliers


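# Illustrative (hypothetical) neighbor dictionary for the function above;
# keys and values are labels that also appear in Y:
# w_dict = {'France': ['Spain', 'Italy'], 'Spain': ['France', 'Portugal']}

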
def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
    """Select the K-means cluster count in [min_ncl, max_ncl) that
    maximises the average silhouette score."""
    ave_silh = []
    for i in range(min_ncl):
        ave_silh.append(np.nan)  # pad so index == n_clusters; no clustering below min_ncl
    for ncl in range(min_ncl, max_ncl):
        print(ncl)
        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric))  # silhouette avg
    ave_silh = np.array(ave_silh)
    bestncl = np.nanargmax(ave_silh)
    return bestncl, ave_silh


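# Illustrative note (added): silhouette scores lie in [-1, 1] and higher is
# better; thanks to the NaN padding the array index equals the cluster count,
# so e.g. ave_silh = [nan, nan, 0.41, 0.38, ...] yields bestncl = 2.

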
def get_cluster_freq_linear(X, Y, centroids):
    """For each label in Y, get the distribution of clusters by linear encoding."""
    def encode_linear(X, centroids):
        """Linear encoding via the dot product."""
        return np.dot(X, centroids.T)
    encoding = encode_linear(X, centroids)
    encoding_df = pd.DataFrame(data=encoding, index=Y)
    encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
    # z-score each cluster column so frequencies are comparable across clusters
    cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
    cluster_freq.index.name = 'labels'
    return cluster_freq


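# Note on the encoding above: encoding[i, k] is the dot product of sample i
# with centroid k, so each sample is described by its similarity to every
# cluster; the per-label sums are then standardised column-wise.

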
def get_cluster_predictions(X, n_clusters=10):
    """Assign each sample to one of n_clusters K-means clusters."""
    cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
    return cl_pred
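
As a usage illustration (not part of the changeset), here is a minimal sketch of how these utilities could be combined; the synthetic data, labels, import path, and all parameter values below are assumptions:

```python
# Minimal usage sketch (illustrative; data and parameters are assumptions).
import numpy as np

from utils import (pca_data, get_outliers, best_n_clusters_silhouette,
                   get_cluster_predictions, get_cluster_freq_linear)

rng = np.random.RandomState(0)
X = rng.randn(200, 5)                    # 200 samples, 5 features (synthetic)
Y = np.repeat(['A', 'B', 'C', 'D'], 50)  # hypothetical labels

# whiten and reduce, keeping 99% of the variance
X_pca, n_pc = pca_data(X, min_variance=0.99)

# robust global outliers at the 97.5% chi-squared level
threshold, y_pred, MD = get_outliers(X_pca, chi2thr=0.975)
print('flagged', y_pred.sum(), 'outliers above threshold', threshold)

# pick a cluster count, cluster, and summarise cluster frequencies per label
bestncl, _ = best_n_clusters_silhouette(X_pca, min_ncl=2, max_ncl=10)
cl_pred = get_cluster_predictions(X_pca, n_clusters=bestncl)
centroids = np.array([X_pca[cl_pred == k].mean(axis=0) for k in range(bestncl)])
cluster_freq = get_cluster_freq_linear(X_pca, Y, centroids)
print(cluster_freq)
```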