diff scripts/results.py @ 4:e50c63cf96be branch-tests
rearranging folders
author | Maria Panteli |
---|---|
date | Mon, 11 Sep 2017 11:51:50 +0100 |
parents | |
children | 0f3eba42b425 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/results.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 12 20:49:48 2016
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+from collections import Counter
+from sklearn.cluster import KMeans
+
+import utils
+import utils_spatial
+
+
+def country_outlier_df(counts, labels, out_file=None, normalize=False):
+    if len(counts.keys()) < len(np.unique(labels)):
+        for label in np.unique(labels):
+            if not counts.has_key(label):
+                counts.update({label:0})
+    if normalize is True:
+        counts = normalize_outlier_counts(counts, Counter(labels))
+    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df
+
+
+def normalize_outlier_counts(outlier_counts, country_counts):
+    '''Normalize a dictionary of outlier counts per country by
+    the total number of recordings per country
+    '''
+    for key in outlier_counts.keys():
+        # dictionaries should have the same keys
+        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return outlier_counts
+
+
+def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
+    global_counts = Counter(Y[y_pred])
+    df = country_outlier_df(global_counts, Y, normalize=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df, threshold, MD
+
+
+def print_most_least_outliers_topN(df, N=10):
+    sort_inds = df['Outliers'].argsort()  # ascending order
+    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    print "most outliers "
+    print df_most
+    print "least outliers "
+    print df_least
+
+
+def load_metadata(Yaudio, metadata_file):
+    df = pd.read_csv(metadata_file)
+    df_audio = pd.DataFrame({'Audio':Yaudio})
+    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r'])  # in the order of Yaudio
+    return ddf
+
+
+def clusters_metadata(df, cl_pred, out_file=None):
+    def get_top_N_counts(labels, N=3):
+        ulab, ucount = np.unique(labels, return_counts=True)
+        inds = np.argsort(ucount)
+        return zip(ulab[inds[-N:]],ucount[inds[-N:]])
+    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
+    styles_description = []
+    uniq_cl = np.unique(cl_pred)
+    for ccl in uniq_cl:
+        inds = np.where(cl_pred==ccl)[0]
+        styles_description.append(get_top_N_counts(info[inds], N=3))
+    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
+    print df_styles.to_latex()
+    if out_file is not None:
+        df_styles.to_csv(out_file, index=False)
+
+
+# load LDA-transformed frames
+X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
+ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+Xrhy, Xmel, Xmfc, Xchr = X_list
+X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+
+# global outliers
+df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+print_most_least_outliers_topN(df_global, N=10)
+
+spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+print_most_least_outliers_topN(df_local, N=10)
+
+feat = [Xrhy, Xmel, Xmfc, Xchr]
+feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+tabs_feat = []
+for i in range(len(feat)):
+    XX = feat[i]
+    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_feat, N=5)
+
+# how many styles are there
+#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+bestncl = 13
+
+# get cluster predictions and metadata for each cluster
+cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+centroids = cluster_model.cluster_centers_
+cl_pred = cluster_model.predict(X)
+ddf['Clusters'] = cl_pred
+clusters_metadata(ddf, cl_pred)
+
+# how similar are the cultures and which ones seem to be global outliers
+cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+
+# Moran on Mahalanobis distances
+data = cluster_freq.get_values()
+data_countries = cluster_freq.index
+#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+y = np.sqrt(MD)
+utils_spatial.print_Moran_outliers(y, w, data_countries)
+utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
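Note: the helpers in `utils` and `utils_spatial` are not part of this changeset. For orientation, here is a minimal sketch of what `utils.get_outliers_Mahal` plausibly computes, inferred only from how it is called above (`threshold, y_pred, MD = ...` with a chi-squared cut-off); the function name, return order, and the use of a pseudo-inverse here are assumptions, not the repository's actual implementation:

```python
# Hedged sketch of a Mahalanobis-based outlier detector; not the real
# utils.get_outliers_Mahal, whose source is outside this changeset.
import numpy as np
from scipy.stats import chi2

def get_outliers_mahal_sketch(X, chi2thr=0.999):
    """Flag rows of X whose squared Mahalanobis distance from the sample
    mean exceeds the chi2thr quantile of a chi-squared distribution with
    X.shape[1] degrees of freedom."""
    diff = X - X.mean(axis=0)
    inv_cov = np.linalg.pinv(np.cov(X, rowvar=False))   # pseudo-inverse for stability
    MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)  # squared distances, one per row
    threshold = chi2.ppf(chi2thr, df=X.shape[1])
    y_pred = MD > threshold                             # boolean outlier mask
    return threshold, y_pred, MD
```

A boolean mask is what lets `Counter(Y[y_pred])` in `get_outliers_df` count flagged recordings per country, and the later `np.sqrt(MD)` is consistent with `MD` holding squared distances.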
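The cluster count is hard-coded (`bestncl = 13`) with the silhouette search commented out. A sketch of what a helper like `utils.best_n_clusters_silhouette` could look like, matching only the commented-out call signature; the implementation details are assumptions:

```python
# Hedged sketch of silhouette-based model selection for KMeans; the
# actual utils.best_n_clusters_silhouette is not in this diff.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def best_n_clusters_silhouette_sketch(X, min_ncl=5, max_ncl=50, metric='cosine'):
    """Fit KMeans at each candidate cluster count and return the count
    with the highest mean silhouette score, plus all scores."""
    ave_silh = []
    for ncl in range(min_ncl, max_ncl + 1):
        labels = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        ave_silh.append(silhouette_score(X, labels, metric=metric))
    best_ncl = min_ncl + int(np.argmax(ave_silh))
    return best_ncl, ave_silh
```

If the full frame set is too large to score at every candidate count, `silhouette_score` accepts a `sample_size` argument to score a random subset instead.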
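Finally, `utils_spatial.print_Moran_outliers` and `plot_Moran_scatterplot` presumably compute Moran's I on the per-country distances, most likely via PySAL given the weights object `w`. As a self-contained stand-in, global Moran's I from first principles; treating `w` as reducible to a dense weights matrix `W` row-aligned with `y` is an assumption:

```python
# Hedged stand-in for a Moran's I computation; not the repository's
# utils_spatial code. W is assumed dense and aligned with y.
import numpy as np

def morans_i_sketch(y, W):
    """Global Moran's I: I = (n / S0) * (z'Wz) / (z'z), where z are the
    mean-centred values and S0 is the sum of all spatial weights.
    Values near +1 suggest similar values cluster in space; near 0,
    spatial randomness."""
    z = np.asarray(y, dtype=float) - np.mean(y)
    n = len(z)
    s0 = float(W.sum())
    return (n / s0) * z.dot(W).dot(z) / z.dot(z)
```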