Maria@4: # -*- coding: utf-8 -*- Maria@4: """ Maria@4: Created on Wed May 17 11:26:01 2017 Maria@4: Maria@4: @author: mariapanteli Maria@4: """ Maria@4: Maria@4: import numpy as np Maria@4: import pandas as pd Maria@4: from sklearn.cluster import KMeans Maria@4: from sklearn import metrics Maria@4: from collections import Counter Maria@4: from sklearn.decomposition import PCA Maria@4: from scipy import stats Maria@4: from scipy.spatial.distance import mahalanobis Maria@4: from sklearn.covariance import MinCovDet Maria@4: Maria@4: Maria@4: def get_outliers_Mahal(X, chi2thr=0.975): Maria@4: n_samples = X.shape[0] Maria@4: inv_cov = np.linalg.inv(np.cov(X.T)) Maria@4: col_mean = np.mean(X, axis=0) Maria@4: MD = np.zeros(n_samples, dtype='float') Maria@4: for i in range(n_samples): Maria@4: MD[i] = mahalanobis(X[i,:], col_mean, inv_cov) Maria@4: MD = MD ** 2 Maria@4: degrees_of_freedom = X.shape[1] Maria@4: chi2 = stats.chi2 Maria@4: threshold = chi2.ppf(chi2thr, degrees_of_freedom) Maria@4: y_pred = MD>threshold Maria@4: return threshold, y_pred, MD Maria@4: Maria@4: Maria@4: def pca_data(X, min_variance=None): Maria@4: # rotate data to avoid singularity in Mahalanobis/covariance matrix Maria@4: model = PCA(whiten=True).fit(X) Maria@4: model.explained_variance_ratio_.sum() Maria@4: if min_variance is None: Maria@4: n_pc = X.shape[1] Maria@4: else: Maria@4: n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0] Maria@4: X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X) Maria@4: return X_pca, n_pc Maria@4: Maria@4: Maria@4: def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False): Maria@4: uniq_labels = np.unique(Y) Maria@4: spatial_outliers = [] Maria@4: for uniq_label in uniq_labels: Maria@4: countries_neighbors = w_dict[uniq_label] Maria@4: if len(countries_neighbors)==0: Maria@4: print uniq_label, " no neighbors found" Maria@4: continue Maria@4: inds_neighborhood = [] Maria@4: for country in countries_neighbors: Maria@4: inds = np.where(Y==country)[0] Maria@4: inds_neighborhood.append(inds) # append neighboring countries Maria@4: if len(np.concatenate(inds_neighborhood))==0: Maria@4: print "no neighbors found" Maria@4: continue Maria@4: inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country Maria@4: inds_neighborhood = np.concatenate(inds_neighborhood) Maria@4: if do_pca: Maria@4: XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99) Maria@4: else: Maria@4: XX = X[inds_neighborhood, :] # assume X is already in PCA m@98: #print len(inds_neighborhood) Maria@4: if len(inds_neighborhood)