Maria@4
|
1 # -*- coding: utf-8 -*-
|
Maria@4
|
2 """
|
Maria@4
|
3 Created on Wed May 17 11:26:01 2017
|
Maria@4
|
4
|
Maria@4
|
5 @author: mariapanteli
|
Maria@4
|
6 """
|
Maria@4
|
7
|
Maria@4
|
8 import numpy as np
|
Maria@4
|
9 import pandas as pd
|
Maria@4
|
10 from sklearn.cluster import KMeans
|
Maria@4
|
11 from sklearn import metrics
|
Maria@4
|
12 from collections import Counter
|
Maria@4
|
13 from sklearn.decomposition import PCA
|
Maria@4
|
14 from scipy import stats
|
Maria@4
|
15 from scipy.spatial.distance import mahalanobis
|
Maria@4
|
16 from sklearn.covariance import MinCovDet
|
Maria@4
|
17
|
Maria@4
|
18
|
Maria@4
|
def get_outliers_Mahal(X, chi2thr=0.975):
    """Flag rows of X as outliers via their squared Mahalanobis distance.

    Each sample's squared distance from the column mean (using the inverse
    sample covariance) is compared against the chi-square quantile at
    probability `chi2thr` with X.shape[1] degrees of freedom.

    Returns
    -------
    threshold : float
        Chi-square cutoff for the squared distances.
    y_pred : bool array, shape (n_samples,)
        True where the sample exceeds the threshold (outlier).
    MD : float array, shape (n_samples,)
        Squared Mahalanobis distance of every sample.
    """
    centre = np.mean(X, axis=0)
    precision = np.linalg.inv(np.cov(X.T))
    # squared Mahalanobis distance of every row from the column mean
    MD = np.array([mahalanobis(row, centre, precision) for row in X]) ** 2
    dof = X.shape[1]
    threshold = stats.chi2.ppf(chi2thr, dof)
    y_pred = MD > threshold
    return threshold, y_pred, MD
|
Maria@4
|
32
|
Maria@4
|
33
|
Maria@4
|
def pca_data(X, min_variance=None):
    """Whiten X with PCA, optionally truncating to a minimum explained variance.

    Rotating the data avoids a singular covariance matrix in the downstream
    Mahalanobis computation.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
    min_variance : float or None
        If None, keep all n_features components. Otherwise keep the smallest
        number of components whose cumulative explained variance ratio
        exceeds `min_variance`.

    Returns
    -------
    X_pca : array, shape (n_samples, n_pc) — whitened projection.
    n_pc : int — number of components kept.
    """
    if min_variance is None:
        n_pc = X.shape[1]
    else:
        # fit once only to inspect the explained-variance curve
        model = PCA(whiten=True).fit(X)
        cum_var = model.explained_variance_ratio_.cumsum()
        # np.where gives the INDEX of the first component crossing the
        # threshold; the component COUNT is index + 1. (The original used the
        # bare index, under-counting by one and producing n_components=0
        # whenever the first PC alone already reached min_variance.)
        n_pc = int(np.where(cum_var > min_variance)[0][0]) + 1
    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
    return X_pca, n_pc
|
Maria@4
|
44
|
Maria@4
|
45
|
Maria@4
|
def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
    """Detect, per label, the samples that are Mahalanobis outliers within
    that label's spatial neighborhood.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) — feature matrix.
    Y : array, shape (n_samples,) — label (e.g. country) of every sample.
    w_dict : dict — maps each label to a list of neighboring labels.
    chi2thr : float — chi-square probability for the outlier cutoff.
    do_pca : bool — if True, PCA-whiten each neighborhood before the
        Mahalanobis test; otherwise X is assumed to be in PCA space already.

    Returns
    -------
    list of [label, n_outliers_from_label, threshold, y_pred, MD, counts,
    inds_neighborhood] — one entry per label with a usable neighborhood.

    NOTE: fixed Python 2 `print` statements to the Python 3 function form;
    everything else is unchanged.
    """
    uniq_labels = np.unique(Y)
    spatial_outliers = []
    for uniq_label in uniq_labels:
        countries_neighbors = w_dict[uniq_label]
        if len(countries_neighbors) == 0:
            print(uniq_label, " no neighbors found")
            continue
        inds_neighborhood = []
        for country in countries_neighbors:
            inds = np.where(Y == country)[0]
            inds_neighborhood.append(inds)  # append neighboring countries
        if len(np.concatenate(inds_neighborhood)) == 0:
            # neighbors are declared in w_dict but none are present in Y
            print("no neighbors found")
            continue
        inds_neighborhood.append(np.where(Y == uniq_label)[0])  # append query country
        inds_neighborhood = np.concatenate(inds_neighborhood)
        if do_pca:
            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
        else:
            XX = X[inds_neighborhood, :]  # assume X is already in PCA space
        if len(inds_neighborhood) < XX.shape[1]:
            # covariance would be singular with fewer samples than features
            print(uniq_label, " neighborhood too small for number of features")
            continue
        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
        # how many of the flagged outliers belong to each label
        counts = Counter(Y[inds_neighborhood[y_pred]])
        spatial_outliers.append([uniq_label, counts[uniq_label], threshold,
                                 y_pred, MD, counts, inds_neighborhood])
    return spatial_outliers
|
Maria@4
|
75
|
Maria@4
|
76
|
Maria@4
|
def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
    """Pick the KMeans cluster count that maximizes the average silhouette.

    Tries every ncl in [min_ncl, max_ncl) with a fixed random seed.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
    min_ncl, max_ncl : int — range of cluster counts (max exclusive).
    metric : str — distance metric passed to silhouette_score.

    Returns
    -------
    bestncl : int — cluster count with the highest average silhouette.
    ave_silh : array — ave_silh[ncl] is the score for ncl clusters; the
        first min_ncl entries are NaN placeholders so indices align.

    NOTE: fixed the Python 2 `print ncl` statement to the function form.
    """
    # NaN padding so that ave_silh can be indexed directly by ncl
    ave_silh = [np.nan] * min_ncl
    for ncl in range(min_ncl, max_ncl):
        print(ncl)  # progress indicator; clustering can be slow
        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric))
    ave_silh = np.array(ave_silh)
    bestncl = np.nanargmax(ave_silh)
    return bestncl, ave_silh
|
Maria@4
|
88
|
Maria@4
|
89
|
Maria@4
|
90 def get_cluster_freq_linear(X, Y, centroids):
|
Maria@4
|
91 """ for each label in Y get the distribution of clusters by linear encoding
|
Maria@4
|
92 """
|
Maria@4
|
93 def encode_linear(X, centroids):
|
Maria@4
|
94 """Linear encoding via the dot product
|
Maria@4
|
95 """
|
Maria@4
|
96 return np.dot(X, centroids.T)
|
Maria@4
|
97 encoding = encode_linear(X, centroids)
|
Maria@4
|
98 encoding_df = pd.DataFrame(data=encoding, index=Y)
|
Maria@4
|
99 encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
|
Maria@4
|
100 cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
|
Maria@4
|
101 cluster_freq.index.name = 'labels'
|
Maria@4
|
102 return cluster_freq
|
Maria@4
|
103
|
Maria@4
|
104
|
Maria@4
|
def get_cluster_predictions(X, n_clusters=10):
    """Cluster X with KMeans (fixed seed for reproducibility) and return
    the cluster label assigned to each sample.
    """
    model = KMeans(n_clusters=n_clusters, random_state=50)
    return model.fit_predict(X)
|