Mercurial > hg > plosone_underreview
diff scripts/outliers.py @ 65:9b10b688c2ac branch-tests
results for 30 seconds
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Thu, 21 Sep 2017 20:11:43 +0100 |
parents | dbcd5b2a4efa |
children | bfb9ed45c417 |
line wrap: on
line diff
```diff
--- a/scripts/outliers.py	Thu Sep 21 17:36:16 2017 +0100
+++ b/scripts/outliers.py	Thu Sep 21 20:11:43 2017 +0100
@@ -92,24 +92,47 @@
     df_styles.to_csv(out_file, index=False)
 
 
+def load_data(pickle_file, metadata_file):
+    X_list, Y, Yaudio = pickle.load(open(pickle_file,'rb'))
+    ddf = load_metadata(Yaudio, metadata_file=metadata_file)
+    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    return [X_list, Y, Yaudio], ddf, w_dict
+
+
+def get_local_outliers_df(X, Y, w_dict):
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    return df_local
+
+
+def get_country_clusters(X, bestncl=None):
+    if bestncl is None:
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    # get cluster predictions and metadata for each cluster
+    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    centroids = cluster_model.cluster_centers_
+    cl_pred = cluster_model.predict(X)
+    return centroids, cl_pred
+
+
 if __name__ == '__main__':
     # load LDA-transformed frames
-    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+    dataset, ddf, w_dict = load_data('data/lda_data_8.pickle', 'data/metadata.csv')
+    X_list, Y, Yaudio = dataset
     X = np.concatenate(X_list, axis=1)
 
     # global outliers
     df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
     print_most_least_outliers_topN(df_global, N=10)
 
-    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    # local outliers
+    df_local = get_local_outliers_df(X, Y, w_dict)
     print_most_least_outliers_topN(df_local, N=10)
 
-    feat = [Xrhy, Xmel, Xmfc, Xchr]
+    # outliers for features
+    feat = X_list
     feat_labels = ['rhy', 'mel', 'mfc', 'chr']
     tabs_feat = []
     for i in range(len(feat)):
@@ -117,25 +140,25 @@
         df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
         print_most_least_outliers_topN(df_feat, N=5)
 
-    # how many styles are there
-    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-    bestncl = 13
-
-    # get cluster predictions and metadata for each cluster
-    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-    centroids = cluster_model.cluster_centers_
-    cl_pred = cluster_model.predict(X)
+    ## how many styles are there
+    ##bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+    #bestncl = 13
+    ## get cluster predictions and metadata for each cluster
+    #cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+    #centroids = cluster_model.cluster_centers_
+    #cl_pred = cluster_model.predict(X)
+    centroids, cl_pred = get_country_clusters(X, bestncl=13)
     ddf['Clusters'] = cl_pred
     print_clusters_metadata(ddf, cl_pred)
 
     # how similar are the cultures and which ones seem to be global outliers
     cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
 
-    # Moran on Mahalanobis distances
-    data = cluster_freq.get_values()
-    data_countries = cluster_freq.index
-    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-    y = np.sqrt(MD)
-    utils_spatial.print_Moran_outliers(y, w, data_countries)
-    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
+    ## Moran on Mahalanobis distances
+    #data = cluster_freq.get_values()
+    #data_countries = cluster_freq.index
+    ##threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+    #threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+    #y = np.sqrt(MD)
+    #utils_spatial.print_Moran_outliers(y, w, data_countries)
+    #utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
```