annotate scripts/utils.py @ 105:edd82eb89b4b branch-tests tip

Merge
author Maria Panteli
date Sun, 15 Oct 2017 13:36:59 +0100
parents 192259977b50
children
rev   line source
Maria@4 1 # -*- coding: utf-8 -*-
Maria@4 2 """
Maria@4 3 Created on Wed May 17 11:26:01 2017
Maria@4 4
Maria@4 5 @author: mariapanteli
Maria@4 6 """
Maria@4 7
Maria@4 8 import numpy as np
Maria@4 9 import pandas as pd
Maria@4 10 from sklearn.cluster import KMeans
Maria@4 11 from sklearn import metrics
Maria@4 12 from collections import Counter
Maria@4 13 from sklearn.decomposition import PCA
Maria@4 14 from scipy import stats
Maria@4 15 from scipy.spatial.distance import mahalanobis
Maria@4 16 from sklearn.covariance import MinCovDet
Maria@4 17
Maria@4 18
def get_outliers_Mahal(X, chi2thr=0.975):
    """Flag outliers by squared Mahalanobis distance to the column mean.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Data matrix; its sample covariance must be invertible
        (callers rotate with PCA first to avoid singularity).
    chi2thr : float
        Chi-square quantile used as the outlier cutoff.

    Returns
    -------
    threshold : float
        ``chi2.ppf(chi2thr, n_features)`` cutoff value.
    y_pred : np.ndarray of bool, shape (n_samples,)
        True where a sample's squared distance exceeds the threshold.
    MD : np.ndarray of float, shape (n_samples,)
        Squared Mahalanobis distance of each sample.
    """
    inv_cov = np.linalg.inv(np.cov(X.T))
    diff = X - np.mean(X, axis=0)
    # (x-mu) @ inv_cov @ (x-mu)^T for all rows at once — identical to the
    # former per-sample scipy `mahalanobis(...)**2` loop, but vectorized.
    MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
    # squared Mahalanobis distances of Gaussian data follow a chi-square
    # distribution with n_features degrees of freedom
    degrees_of_freedom = X.shape[1]
    threshold = stats.chi2.ppf(chi2thr, degrees_of_freedom)
    y_pred = MD > threshold
    return threshold, y_pred, MD
Maria@4 32
Maria@4 33
def pca_data(X, min_variance=None):
    """Project X onto whitened principal components.

    Rotating the data avoids a singular covariance matrix in the later
    Mahalanobis computation.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Data matrix.
    min_variance : float or None
        If given, keep the smallest number of components whose cumulative
        explained variance ratio exceeds this value; otherwise keep all
        n_features components.

    Returns
    -------
    X_pca : np.ndarray, shape (n_samples, n_pc)
        Whitened projection.
    n_pc : int
        Number of components kept.
    """
    if min_variance is None:
        n_pc = X.shape[1]
    else:
        model = PCA(whiten=True).fit(X)
        # np.where(...)[0][0] is the 0-based index of the first component at
        # which the cumulative ratio exceeds min_variance; that component
        # must be kept too, hence +1 (the previous code dropped it and could
        # even request 0 components).
        cum_ratio = model.explained_variance_ratio_.cumsum()
        n_pc = int(np.where(cum_ratio > min_variance)[0][0]) + 1
    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
    return X_pca, n_pc
Maria@4 44
Maria@4 45
def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
    """For each label in Y, detect Mahalanobis outliers within the
    neighborhood formed by that label plus its neighbors from w_dict.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix (assumed already PCA-rotated unless do_pca=True).
    Y : np.ndarray, shape (n_samples,)
        Label (e.g. country) of each sample.
    w_dict : dict
        Maps each label to a list of neighboring labels.
    chi2thr : float
        Chi-square quantile passed to get_outliers_Mahal.
    do_pca : bool
        If True, re-rotate each neighborhood with pca_data (99% variance).

    Returns
    -------
    list of [label, n_outliers_of_label, threshold, y_pred, MD, counts,
    inds_neighborhood] entries, one per label with a usable neighborhood.
    """
    uniq_labels = np.unique(Y)
    spatial_outliers = []
    for uniq_label in uniq_labels:
        countries_neighbors = w_dict[uniq_label]
        if len(countries_neighbors) == 0:
            print(uniq_label, " no neighbors found")
            continue
        inds_neighborhood = []
        for country in countries_neighbors:
            inds = np.where(Y == country)[0]
            inds_neighborhood.append(inds)  # append neighboring countries
        if len(np.concatenate(inds_neighborhood)) == 0:
            # neighbors exist in w_dict but none are present in Y
            print("no neighbors found")
            continue
        inds_neighborhood.append(np.where(Y == uniq_label)[0])  # append query country
        inds_neighborhood = np.concatenate(inds_neighborhood)
        if do_pca:
            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
        else:
            XX = X[inds_neighborhood, :]  # assume X is already in PCA
        if len(inds_neighborhood) < XX.shape[1]:
            # fewer samples than features -> singular covariance matrix
            print(uniq_label, " neighborhood too small for number of features")
            continue
        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
        # how many of the detected outliers belong to each label; the query
        # label's own count is stored separately for convenience
        counts = Counter(Y[inds_neighborhood[y_pred]])
        spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
    return spatial_outliers
Maria@4 75
Maria@4 76
def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
    """Choose the KMeans cluster count that maximizes average silhouette.

    Tries every ncl in [min_ncl, max_ncl) and returns the best one.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Data to cluster.
    min_ncl, max_ncl : int
        Inclusive lower / exclusive upper bound on the cluster counts tried.
    metric : str
        Distance metric for the silhouette score.

    Returns
    -------
    bestncl : int
        Cluster count with the highest average silhouette.
    ave_silh : np.ndarray
        ave_silh[ncl] is the average silhouette for ncl clusters; entries
        below min_ncl are NaN so indices line up with cluster counts.
    """
    ave_silh = []
    for i in range(min_ncl):
        ave_silh.append(np.nan)  # for ncl=0, ncl=1 no clustering
    for ncl in range(min_ncl, max_ncl):
        print(ncl)  # progress indicator; KMeans can be slow for large ncl
        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric))  # silhouette avg
    ave_silh = np.array(ave_silh)
    bestncl = np.nanargmax(ave_silh)
    return bestncl, ave_silh
Maria@4 88
Maria@4 89
Maria@4 90 def get_cluster_freq_linear(X, Y, centroids):
Maria@4 91 """ for each label in Y get the distribution of clusters by linear encoding
Maria@4 92 """
Maria@4 93 def encode_linear(X, centroids):
Maria@4 94 """Linear encoding via the dot product
Maria@4 95 """
Maria@4 96 return np.dot(X, centroids.T)
Maria@4 97 encoding = encode_linear(X, centroids)
Maria@4 98 encoding_df = pd.DataFrame(data=encoding, index=Y)
Maria@4 99 encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
Maria@4 100 cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
Maria@4 101 cluster_freq.index.name = 'labels'
Maria@4 102 return cluster_freq
Maria@4 103
Maria@4 104
def get_cluster_predictions(X, n_clusters=10):
    """Assign every row of X to one of `n_clusters` KMeans clusters.

    A fixed random_state makes repeated calls on the same data return
    identical labelings.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Data to cluster.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    np.ndarray, shape (n_samples,)
        Cluster index of each sample.
    """
    model = KMeans(n_clusters=n_clusters, random_state=50)
    return model.fit_predict(X)