diff scripts/utils.py @ 4:e50c63cf96be branch-tests
rearranging folders
author | Maria Panteli
---|---
date | Mon, 11 Sep 2017 11:51:50 +0100
parents |
children | 5eba53437755
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/utils.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 17 11:26:01 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from collections import Counter
+from sklearn.decomposition import PCA
+from scipy import stats
+from scipy.spatial.distance import mahalanobis
+from sklearn.covariance import MinCovDet
+
+
+def get_outliers(X, chi2thr=0.975):
+    robust_cov = MinCovDet().fit(X)
+    MD = robust_cov.mahalanobis(X)
+    chi2 = stats.chi2
+    degrees_of_freedom = X.shape[1]
+    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+    y_pred = MD>threshold
+    return threshold, y_pred, MD
+
+
+def get_outliers_Mahal(X, chi2thr=0.975):
+    n_samples = X.shape[0]
+    inv_cov = np.linalg.inv(np.cov(X.T))
+    col_mean = np.mean(X, axis=0)
+    MD = np.zeros(n_samples, dtype='float')
+    for i in range(n_samples):
+        MD[i] = mahalanobis(X[i,:], col_mean, inv_cov)
+    MD = MD ** 2
+    degrees_of_freedom = X.shape[1]
+    chi2 = stats.chi2
+    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+    y_pred = MD>threshold
+    return threshold, y_pred, MD
+
+
+def pca_data(X, min_variance=None):
+    # rotate data to avoid singularity in Mahalanobis/covariance matrix
+    model = PCA(whiten=True).fit(X)
+    model.explained_variance_ratio_.sum()
+    if min_variance is None:
+        n_pc = X.shape[1]
+    else:
+        n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0]
+    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
+    return X_pca, n_pc
+
+
+def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
+    uniq_labels = np.unique(Y)
+    spatial_outliers = []
+    for uniq_label in uniq_labels:
+        countries_neighbors = w_dict[uniq_label]
+        if len(countries_neighbors)==0:
+            print uniq_label, " no neighbors found"
+            continue
+        inds_neighborhood = []
+        for country in countries_neighbors:
+            inds = np.where(Y==country)[0]
+            inds_neighborhood.append(inds) # append neighboring countries
+        if len(np.concatenate(inds_neighborhood))==0:
+            print "no neighbors found"
+            continue
+        inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country
+        inds_neighborhood = np.concatenate(inds_neighborhood)
+        if do_pca:
+            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
+        else:
+            XX = X[inds_neighborhood, :] # assume X is already in PCA
+        print len(inds_neighborhood)
+        if len(inds_neighborhood)<XX.shape[1]:
+            print uniq_label, " neighborhood too small for number of features"
+            continue
+        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
+        counts = Counter(Y[inds_neighborhood[y_pred]])
+        spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
+    return spatial_outliers
+
+
+def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
+    ave_silh = []
+    for i in range(min_ncl):
+        ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering
+    for ncl in range(min_ncl, max_ncl):
+        print ncl
+        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
+        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg
+    ave_silh = np.array(ave_silh)
+    bestncl = np.nanargmax(ave_silh)
+    return bestncl, ave_silh
+
+
+def get_cluster_freq_linear(X, Y, centroids):
+    """ for each label in Y get the distribution of clusters by linear encoding
+    """
+    def encode_linear(X, centroids):
+        """Linear encoding via the dot product
+        """
+        return np.dot(X, centroids.T)
+    encoding = encode_linear(X, centroids)
+    encoding_df = pd.DataFrame(data=encoding, index=Y)
+    encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
+    cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum)
+    cluster_freq.index.name = 'labels'
+    return cluster_freq
+
+
+def get_cluster_predictions(X, n_clusters=10):
+    cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
+    return cl_pred