# -*- coding: utf-8 -*-
"""
Created on Tue Jul 12 20:49:48 2016

@author: mariapanteli
"""

import numpy as np
import pandas as pd
import pickle
from collections import Counter
from sklearn.cluster import KMeans

import utils
import utils_spatial


def country_outlier_df(counts, labels, normalize=False, out_file=None):
    """Build a per-country outlier summary DataFrame.

    Parameters
    ----------
    counts : dict
        Number of outlier recordings per country label.
    labels : np.ndarray
        Country label of every recording in the dataset.
    normalize : bool
        If True, report outliers as a fraction of recordings per country.
    out_file : str or None
        If given, write the resulting DataFrame to this CSV path.

    Returns
    -------
    pd.DataFrame
        Columns 'Country', 'Outliers', 'N_Country', 'N_Outliers'.
    """
    # Work on a copy so the caller's dict is not mutated as a side effect.
    counts = dict(counts)
    # Make sure every country appears, even with zero outliers.
    # (dict.has_key was removed in Python 3 -- use the 'in' operator.)
    for label in np.unique(labels):
        if label not in counts:
            counts[label] = 0
    if normalize:
        norm_counts = normalize_outlier_counts(counts, Counter(labels))
        df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
    else:
        df = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)
    # append number of recordings and number of outliers per country
    df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
    df_n_country.rename(columns={'index': 'Country', 0: 'N_Country'}, inplace=True)
    df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
    df_n_outliers.rename(columns={'index': 'Country', 0: 'N_Outliers'}, inplace=True)
    df = pd.merge(df, df_n_country, on='Country', how='left')
    df = pd.merge(df, df_n_outliers, on='Country', how='left')
    if out_file is not None:
        df.to_csv(out_file, index=False)
    return df


def normalize_outlier_counts(outlier_counts, country_counts):
    """Normalize a dictionary of outlier counts per country by
    the total number of recordings per country.

    Both dictionaries are expected to have the same keys.
    """
    return {key: float(outlier_counts[key]) / float(country_counts[key])
            for key in outlier_counts}


def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
    """Detect global outliers via Mahalanobis distance and summarize per country.

    Returns the per-country DataFrame, the chi-square threshold, and the
    Mahalanobis distances of all points.
    """
    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
    global_counts = Counter(Y[y_pred])
    df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file)
    return df, threshold, MD


def print_most_least_outliers_topN(df, N=10):
    """Print the N countries with the most and the fewest outliers."""
    # np.argsort on the underlying values gives positional (iloc) order,
    # ascending; Series.argsort is deprecated.
    sort_inds = np.argsort(df['Outliers'].values)
    df_most = df.iloc[sort_inds[::-1][:N]]
    df_least = df.iloc[sort_inds[:N]]
    print("most outliers ")
    print(df_most)
    print("least outliers ")
    print(df_least)


def load_metadata(Yaudio, metadata_file):
    """Load the metadata CSV and align its rows to the order of Yaudio."""
    df = pd.read_csv(metadata_file)
    df_audio = pd.DataFrame({'Audio': Yaudio})
    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r'])  # in the order of Yaudio
    return ddf


def print_clusters_metadata(df, cl_pred, out_file=None):
    """Print (and optionally save as CSV) the 3 most frequent countries
    for each cluster label in cl_pred."""
    def get_top_N_counts(labels, N=3):
        # Return the N most frequent labels with their counts.
        ulab, ucount = np.unique(labels, return_counts=True)
        inds = np.argsort(ucount)
        # list() materializes the pairs; zip is a lazy iterator in Python 3
        # and would break the DataFrame construction below.
        return list(zip(ulab[inds[-N:]], ucount[inds[-N:]]))
    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
    styles_description = []
    uniq_cl = np.unique(cl_pred)
    for ccl in uniq_cl:
        inds = np.where(cl_pred == ccl)[0]
        styles_description.append(get_top_N_counts(info[inds], N=3))
    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
    print(df_styles.to_latex())
    if out_file is not None:
        df_styles.to_csv(out_file, index=False)
def load_data(pickle_file, metadata_file):
    """Load LDA-transformed features, aligned metadata, and spatial weights.

    Returns ([X_list, Y, Yaudio], metadata DataFrame with regions appended,
    neighbors dict keyed by country).
    """
    # Context manager closes the file handle even if unpickling raises.
    with open(pickle_file, 'rb') as f:
        X_list, Y, Yaudio = pickle.load(f)
    ddf = load_metadata(Yaudio, metadata_file=metadata_file)
    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
    ddf = utils_spatial.append_regions(ddf)
    return [X_list, Y, Yaudio], ddf, w_dict


def get_local_outliers_df(X, Y, w_dict, out_file=None):
    """Detect outliers of each country with respect to its spatial neighbors
    and summarize the counts per country."""
    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
    spatial_counts = Counter({ll[0]: ll[1] for ll in spatial_outliers})
    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
    return df_local


def get_country_clusters(X, bestncl=None, min_ncl=5, max_ncl=50):
    """K-means cluster the recordings in X.

    If bestncl is None, the number of clusters is chosen by the silhouette
    criterion over [min_ncl, max_ncl]. Returns (centroids, cluster labels).
    """
    if bestncl is None:
        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=min_ncl, max_ncl=max_ncl, metric="cosine")
    # get cluster predictions and metadata for each cluster
    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
    centroids = cluster_model.cluster_centers_
    cl_pred = cluster_model.predict(X)
    return centroids, cl_pred


if __name__ == '__main__':
    # load LDA-transformed frames
    dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv')
    X_list, Y, Yaudio = dataset
    X = np.concatenate(X_list, axis=1)

    # global outliers
    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
    print_most_least_outliers_topN(df_global, N=10)

    # local outliers
    df_local = get_local_outliers_df(X, Y, w_dict)
    print_most_least_outliers_topN(df_local, N=10)

    # outliers for each feature type separately
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    for XX, feat_label in zip(X_list, feat_labels):
        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
        print_most_least_outliers_topN(df_feat, N=5)

    # how many styles are there
    centroids, cl_pred = get_country_clusters(X, bestncl=10)
    ddf['Clusters'] = cl_pred
    print_clusters_metadata(ddf, cl_pred)

    # how similar are the cultures and which ones seem to be global outliers
    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)