Mercurial > hg > plosone_underreview
changeset 51:9430abe45e4a branch-tests
print absolute number of outliers with percentage
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Sun, 17 Sep 2017 18:37:37 +0100 |
parents | d3de9ac0d545 |
children | 635028c5be34 |
files | notebooks/correlation_samples_outliers.ipynb scripts/outliers.py |
diffstat | 2 files changed, 150 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/notebooks/correlation_samples_outliers.ipynb Fri Sep 15 17:49:24 2017 +0100 +++ b/notebooks/correlation_samples_outliers.ipynb Sun Sep 17 18:37:37 2017 +0100 @@ -2,11 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "import numpy as np\n", "import pickle\n", @@ -26,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -57,31 +66,7 @@ "Solomon Islands\n", "South Korea\n", "The Bahamas\n", - "Trinidad and Tobago\n", - "most outliers \n", - " Country Outliers\n", - "60 Chad 0.636364\n", - "86 Gambia 0.540000\n", - "17 French Guiana 0.535714\n", - "43 Benin 0.500000\n", - "78 El Salvador 0.484848\n", - "136 Botswana 0.477778\n", - "6 Bolivia 0.457143\n", - "104 Bhutan 0.454545\n", - "14 Liberia 0.450000\n", - "63 Senegal 0.439024\n", - "least outliers \n", - " Country Outliers\n", - "1 Lithuania 0.000000\n", - "120 Kazakhstan 0.000000\n", - "119 Denmark 0.000000\n", - "107 Kiribati 0.000000\n", - "109 Democratic Republic of the Congo 0.042553\n", - "105 Sudan 0.044118\n", - "15 Netherlands 0.044776\n", - "84 Iraq 0.045977\n", - "74 Czech Republic 0.048780\n", - "85 Sierra Leone 0.050000\n" + "Trinidad and Tobago\n" ] } ], @@ -121,6 +106,50 @@ }, { "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "60 Chad 0.636364 11 7\n", + "86 Gambia 0.540000 50 27\n", + "17 French Guiana 0.535714 28 15\n", + "43 Benin 0.500000 26 13\n", + "78 El Salvador 0.484848 33 16\n", + "136 Botswana 0.477778 90 43\n", + "6 Bolivia 0.457143 35 16\n", + "104 Bhutan 0.454545 11 5\n", + "14 Liberia 0.450000 40 18\n", + "63 Senegal 0.439024 41 18\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 NaN\n", + "120 Kazakhstan 0.000000 88 NaN\n", + "119 Denmark 0.000000 16 NaN\n", + "107 Kiribati 0.000000 17 NaN\n", + "109 Democratic Republic of the Congo 0.042553 47 2\n", + "105 Sudan 0.044118 68 3\n", + "15 Netherlands 0.044776 67 3\n", + "84 Iraq 0.045977 87 4\n", + "74 Czech Republic 0.048780 41 2\n", + "85 Sierra Leone 0.050000 100 5\n" + ] + } + ], + "source": [ + "# global outliers\n", + "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", + "outliers.print_most_least_outliers_topN(df_global, N=10)" + ] + }, + { + "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false @@ -195,8 +224,6 @@ } ], "source": [ - "# global outliers\n", - "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", "df_global['N'] = np.zeros(len(df_global))\n", "df_global['OutliersN'] = np.zeros(len(df_global))\n", "for i, country in enumerate(df_global['Country']):\n", @@ -296,6 +323,86 @@ }, { "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "43 Benin 0.461538 26 12\n", + "136 Botswana 0.422222 90 38\n", + "84 Belize 0.418605 43 18\n", + "19 Yemen 0.416667 12 5\n", + "67 Brazil 0.370000 100 37\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "113 Iceland 0 14 NaN\n", + "70 Costa Rica 0 11 NaN\n", + "28 Tajikistan 0 19 NaN\n", + "27 South Korea 0 11 NaN\n", + "107 Kiribati 0 17 NaN\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "96 Uruguay 0.580645 31 18\n", + "117 Zimbabwe 0.533333 15 8\n", + "61 Chad 0.454545 11 5\n", + "69 Guinea 0.454545 11 5\n", + "86 Gambia 0.440000 50 22\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "17 Ivory Coast 0.000000 15 NaN\n", + "107 Kiribati 0.000000 17 NaN\n", + "38 Rwanda 0.000000 17 NaN\n", + "119 Denmark 0.000000 16 NaN\n", + "39 Somalia 0.010309 97 1\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "17 French Guiana 0.714286 28 20\n", + "136 Botswana 0.500000 90 45\n", + "23 Azerbaijan 0.384615 13 5\n", + "40 Laos 0.333333 21 7\n", + "69 Panama 0.333333 12 4\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "94 Iraq 0 87 NaN\n", + "62 Nicaragua 0 21 NaN\n", + "74 Czech Republic 0 41 NaN\n", + "77 Algeria 0 27 NaN\n", + "37 Rwanda 0 17 NaN\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "44 Benin 0.500000 26 13\n", + "22 Pakistan 0.395604 91 36\n", + "53 Indonesia 0.390000 100 39\n", + "61 Chad 0.363636 11 4\n", + "104 Bhutan 0.363636 11 4\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "120 Kazakhstan 0 88 NaN\n", + "100 Antigua and Barbuda 0 42 NaN\n", + "99 Tunisia 0 39 NaN\n", + "81 Belgium 0 16 NaN\n", + "71 Costa Rica 0 11 NaN\n" + ] + } + ], + "source": [ + "feat = [Xrhy, Xmel, Xmfc, Xchr]\n", + "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n", + "tabs_feat = []\n", + "for i in range(len(feat)):\n", + " XX = feat[i]\n", + " df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999)\n", + " outliers.print_most_least_outliers_topN(df_feat, N=5)" + ] + }, + { + "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true @@ -320,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" } }, "nbformat": 4,
--- a/scripts/outliers.py Fri Sep 15 17:49:24 2017 +0100 +++ b/scripts/outliers.py Sun Sep 17 18:37:37 2017 +0100 @@ -15,7 +15,7 @@ import utils_spatial -def country_outlier_df(counts, labels, out_file=None, normalize=False): +def country_outlier_df(counts, labels, normalize=False): if len(counts.keys()) < len(np.unique(labels)): for label in np.unique(labels): if not counts.has_key(label): @@ -24,8 +24,6 @@ counts = normalize_outlier_counts(counts, Counter(labels)) df = pd.DataFrame.from_dict(counts, orient='index').reset_index() df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True) - if out_file is not None: - df.to_csv(out_file, index=False) return df @@ -43,6 +41,13 @@ threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr) global_counts = Counter(Y[y_pred]) df = country_outlier_df(global_counts, Y, normalize=True) + # append number of recordings and number of outliers per country + df_n_country = pd.DataFrame.from_dict(Counter(Y), orient='index').reset_index() + df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True) + df_n_outliers = pd.DataFrame.from_dict(Counter(Y[y_pred]), orient='index').reset_index() + df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True) + df = pd.merge(df, df_n_country, on='Country', how='left') + df = pd.merge(df, df_n_outliers, on='Country', how='left') if out_file is not None: df.to_csv(out_file, index=False) return df, threshold, MD @@ -50,8 +55,10 @@ def print_most_least_outliers_topN(df, N=10): sort_inds = df['Outliers'].argsort() # ascending order - df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]] - df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]] + #df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]] + #df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]] + df_most = df.iloc[sort_inds[::-1][:N]] + df_least = df.iloc[sort_inds[:N]] print "most outliers " print df_most print "least outliers "