comparison scripts/utils.py @ 4:e50c63cf96be branch-tests

rearranging folders
author Maria Panteli
date Mon, 11 Sep 2017 11:51:50 +0100
parents
children 5eba53437755
comparison
equal deleted inserted replaced
3:230a0cf17de0 4:e50c63cf96be
1 # -*- coding: utf-8 -*-
2 """
3 Created on Wed May 17 11:26:01 2017
4
5 @author: mariapanteli
6 """
7
8 import numpy as np
9 import pandas as pd
10 from sklearn.cluster import KMeans
11 from sklearn import metrics
12 from collections import Counter
13 from sklearn.decomposition import PCA
14 from scipy import stats
15 from scipy.spatial.distance import mahalanobis
16 from sklearn.covariance import MinCovDet
17
18
19 def get_outliers(X, chi2thr=0.975):
20 robust_cov = MinCovDet().fit(X)
21 MD = robust_cov.mahalanobis(X)
22 chi2 = stats.chi2
23 degrees_of_freedom = X.shape[1]
24 threshold = chi2.ppf(chi2thr, degrees_of_freedom)
25 y_pred = MD>threshold
26 return threshold, y_pred, MD
27
28
29 def get_outliers_Mahal(X, chi2thr=0.975):
30 n_samples = X.shape[0]
31 inv_cov = np.linalg.inv(np.cov(X.T))
32 col_mean = np.mean(X, axis=0)
33 MD = np.zeros(n_samples, dtype='float')
34 for i in range(n_samples):
35 MD[i] = mahalanobis(X[i,:], col_mean, inv_cov)
36 MD = MD ** 2
37 degrees_of_freedom = X.shape[1]
38 chi2 = stats.chi2
39 threshold = chi2.ppf(chi2thr, degrees_of_freedom)
40 y_pred = MD>threshold
41 return threshold, y_pred, MD
42
43
44 def pca_data(X, min_variance=None):
45 # rotate data to avoid singularity in Mahalanobis/covariance matrix
46 model = PCA(whiten=True).fit(X)
47 model.explained_variance_ratio_.sum()
48 if min_variance is None:
49 n_pc = X.shape[1]
50 else:
51 n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0]
52 X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
53 return X_pca, n_pc
54
55
56 def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
57 uniq_labels = np.unique(Y)
58 spatial_outliers = []
59 for uniq_label in uniq_labels:
60 countries_neighbors = w_dict[uniq_label]
61 if len(countries_neighbors)==0:
62 print uniq_label, " no neighbors found"
63 continue
64 inds_neighborhood = []
65 for country in countries_neighbors:
66 inds = np.where(Y==country)[0]
67 inds_neighborhood.append(inds) # append neighboring countries
68 if len(np.concatenate(inds_neighborhood))==0:
69 print "no neighbors found"
70 continue
71 inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country
72 inds_neighborhood = np.concatenate(inds_neighborhood)
73 if do_pca:
74 XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
75 else:
76 XX = X[inds_neighborhood, :] # assume X is already in PCA
77 print len(inds_neighborhood)
78 if len(inds_neighborhood)<XX.shape[1]:
79 print uniq_label, " neighborhood too small for number of features"
80 continue
81 threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
82 counts = Counter(Y[inds_neighborhood[y_pred]])
83 spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
84 return spatial_outliers
85
86
87 def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
88 ave_silh = []
89 for i in range(min_ncl):
90 ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering
91 for ncl in range(min_ncl, max_ncl):
92 print ncl
93 cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
94 ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg
95 ave_silh = np.array(ave_silh)
96 bestncl = np.nanargmax(ave_silh)
97 return bestncl, ave_silh
98
99
100 def get_cluster_freq_linear(X, Y, centroids):
101 """ for each label in Y get the distribution of clusters by linear encoding
102 """
103 def encode_linear(X, centroids):
104 """Linear encoding via the dot product
105 """
106 return np.dot(X, centroids.T)
107 encoding = encode_linear(X, centroids)
108 encoding_df = pd.DataFrame(data=encoding, index=Y)
109 encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
110 cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
111 cluster_freq.index.name = 'labels'
112 return cluster_freq
113
114
115 def get_cluster_predictions(X, n_clusters=10):
116 cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
117 return cl_pred