plosone_underreview: scripts/utils.py comparison

comparison scripts/utils.py @ 4:e50c63cf96be branch-tests

rearranging folders

author	Maria Panteli
date	Mon, 11 Sep 2017 11:51:50 +0100
parents
children	5eba53437755

comparison

equal deleted inserted replaced

-:230a0cf17de0
+:e50c63cf96be
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 17 11:26:01 2017
+@author: mariapanteli
+"""
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from collections import Counter
+from sklearn.decomposition import PCA
+from scipy import stats
+from scipy.spatial.distance import mahalanobis
+from sklearn.covariance import MinCovDet
+def get_outliers(X, chi2thr=0.975):
+robust_cov = MinCovDet().fit(X)
+MD = robust_cov.mahalanobis(X)
+chi2 = stats.chi2
+degrees_of_freedom = X.shape[1]
+threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+y_pred = MD>threshold
+return threshold, y_pred, MD
+def get_outliers_Mahal(X, chi2thr=0.975):
+n_samples = X.shape[0]
+inv_cov = np.linalg.inv(np.cov(X.T))
+col_mean = np.mean(X, axis=0)
+MD = np.zeros(n_samples, dtype='float')
+for i in range(n_samples):
+MD[i] = mahalanobis(X[i,:], col_mean, inv_cov)
+MD = MD ** 2
+degrees_of_freedom = X.shape[1]
+chi2 = stats.chi2
+threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+y_pred = MD>threshold
+return threshold, y_pred, MD
+def pca_data(X, min_variance=None):
+# rotate data to avoid singularity in Mahalanobis/covariance matrix
+model = PCA(whiten=True).fit(X)
+model.explained_variance_ratio_.sum()
+if min_variance is None:
+n_pc = X.shape[1]
+else:
+n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0]
+X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
+return X_pca, n_pc
+def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
+uniq_labels = np.unique(Y)
+spatial_outliers = []
+for uniq_label in uniq_labels:
+countries_neighbors = w_dict[uniq_label]
+if len(countries_neighbors)==0:
+print uniq_label, " no neighbors found"
+continue
+inds_neighborhood = []
+for country in countries_neighbors:
+inds = np.where(Y==country)[0]
+inds_neighborhood.append(inds) # append neighboring countries
+if len(np.concatenate(inds_neighborhood))==0:
+print "no neighbors found"
+continue
+inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country
+inds_neighborhood = np.concatenate(inds_neighborhood)
+if do_pca:
+XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
+else:
+XX = X[inds_neighborhood, :]  # assume X is already in PCA
+print len(inds_neighborhood)
+if len(inds_neighborhood)<XX.shape[1]:
+print uniq_label, " neighborhood too small for number of features"
+continue
+threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
+counts = Counter(Y[inds_neighborhood[y_pred]])
+spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
+return spatial_outliers
+def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
+ave_silh = []
+for i in range(min_ncl):
+ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering
+for ncl in range(min_ncl, max_ncl):
+print ncl
+cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
+ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg
+ave_silh = np.array(ave_silh)
+bestncl = np.nanargmax(ave_silh)
+return bestncl, ave_silh
+def get_cluster_freq_linear(X, Y, centroids):
+""" for each label in Y get the distribution of clusters by linear encoding
+"""
+def encode_linear(X, centroids):
+"""Linear encoding via the dot product
+"""
+return np.dot(X, centroids.T)
+encoding = encode_linear(X, centroids)
+encoding_df = pd.DataFrame(data=encoding, index=Y)
+encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
+cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
+cluster_freq.index.name = 'labels'
+return cluster_freq
+def get_cluster_predictions(X, n_clusters=10):
+cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
+return cl_pred

Mercurial > hg > plosone_underreview

comparison scripts/utils.py @ 4:e50c63cf96be branch-tests