changeset 51:9430abe45e4a branch-tests

Print the absolute number of recordings and outliers per country alongside the outlier fraction
author Maria Panteli <m.x.panteli@gmail.com>
date Sun, 17 Sep 2017 18:37:37 +0100
parents d3de9ac0d545
children 635028c5be34
files notebooks/correlation_samples_outliers.ipynb scripts/outliers.py
diffstat 2 files changed, 150 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- a/notebooks/correlation_samples_outliers.ipynb	Fri Sep 15 17:49:24 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb	Sun Sep 17 18:37:37 2017 +0100
@@ -2,11 +2,20 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 11,
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
    "source": [
     "import numpy as np\n",
     "import pickle\n",
@@ -26,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 12,
    "metadata": {
     "collapsed": false
    },
@@ -57,31 +66,7 @@
       "Solomon Islands\n",
       "South Korea\n",
       "The Bahamas\n",
-      "Trinidad and Tobago\n",
-      "most outliers \n",
-      "           Country  Outliers\n",
-      "60            Chad  0.636364\n",
-      "86          Gambia  0.540000\n",
-      "17   French Guiana  0.535714\n",
-      "43           Benin  0.500000\n",
-      "78     El Salvador  0.484848\n",
-      "136       Botswana  0.477778\n",
-      "6          Bolivia  0.457143\n",
-      "104         Bhutan  0.454545\n",
-      "14         Liberia  0.450000\n",
-      "63         Senegal  0.439024\n",
-      "least outliers \n",
-      "                              Country  Outliers\n",
-      "1                           Lithuania  0.000000\n",
-      "120                        Kazakhstan  0.000000\n",
-      "119                           Denmark  0.000000\n",
-      "107                          Kiribati  0.000000\n",
-      "109  Democratic Republic of the Congo  0.042553\n",
-      "105                             Sudan  0.044118\n",
-      "15                        Netherlands  0.044776\n",
-      "84                               Iraq  0.045977\n",
-      "74                     Czech Republic  0.048780\n",
-      "85                       Sierra Leone  0.050000\n"
+      "Trinidad and Tobago\n"
      ]
     }
    ],
@@ -121,6 +106,50 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "most outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "60            Chad  0.636364         11           7\n",
+      "86          Gambia  0.540000         50          27\n",
+      "17   French Guiana  0.535714         28          15\n",
+      "43           Benin  0.500000         26          13\n",
+      "78     El Salvador  0.484848         33          16\n",
+      "136       Botswana  0.477778         90          43\n",
+      "6          Bolivia  0.457143         35          16\n",
+      "104         Bhutan  0.454545         11           5\n",
+      "14         Liberia  0.450000         40          18\n",
+      "63         Senegal  0.439024         41          18\n",
+      "least outliers \n",
+      "                              Country  Outliers  N_Country  N_Outliers\n",
+      "1                           Lithuania  0.000000         47         NaN\n",
+      "120                        Kazakhstan  0.000000         88         NaN\n",
+      "119                           Denmark  0.000000         16         NaN\n",
+      "107                          Kiribati  0.000000         17         NaN\n",
+      "109  Democratic Republic of the Congo  0.042553         47           2\n",
+      "105                             Sudan  0.044118         68           3\n",
+      "15                        Netherlands  0.044776         67           3\n",
+      "84                               Iraq  0.045977         87           4\n",
+      "74                     Czech Republic  0.048780         41           2\n",
+      "85                       Sierra Leone  0.050000        100           5\n"
+     ]
+    }
+   ],
+   "source": [
+    "# global outliers\n",
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
+    "outliers.print_most_least_outliers_topN(df_global, N=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": 30,
    "metadata": {
     "collapsed": false
@@ -195,8 +224,6 @@
     }
    ],
    "source": [
-    "# global outliers\n",
-    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
     "df_global['N'] = np.zeros(len(df_global))\n",
     "df_global['OutliersN'] = np.zeros(len(df_global))\n",
     "for i, country in enumerate(df_global['Country']):\n",
@@ -296,6 +323,86 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "most outliers \n",
+      "      Country  Outliers  N_Country  N_Outliers\n",
+      "43      Benin  0.461538         26          12\n",
+      "136  Botswana  0.422222         90          38\n",
+      "84     Belize  0.418605         43          18\n",
+      "19      Yemen  0.416667         12           5\n",
+      "67     Brazil  0.370000        100          37\n",
+      "least outliers \n",
+      "         Country  Outliers  N_Country  N_Outliers\n",
+      "113      Iceland         0         14         NaN\n",
+      "70    Costa Rica         0         11         NaN\n",
+      "28    Tajikistan         0         19         NaN\n",
+      "27   South Korea         0         11         NaN\n",
+      "107     Kiribati         0         17         NaN\n",
+      "most outliers \n",
+      "      Country  Outliers  N_Country  N_Outliers\n",
+      "96    Uruguay  0.580645         31          18\n",
+      "117  Zimbabwe  0.533333         15           8\n",
+      "61       Chad  0.454545         11           5\n",
+      "69     Guinea  0.454545         11           5\n",
+      "86     Gambia  0.440000         50          22\n",
+      "least outliers \n",
+      "         Country  Outliers  N_Country  N_Outliers\n",
+      "17   Ivory Coast  0.000000         15         NaN\n",
+      "107     Kiribati  0.000000         17         NaN\n",
+      "38        Rwanda  0.000000         17         NaN\n",
+      "119      Denmark  0.000000         16         NaN\n",
+      "39       Somalia  0.010309         97           1\n",
+      "most outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "17   French Guiana  0.714286         28          20\n",
+      "136       Botswana  0.500000         90          45\n",
+      "23      Azerbaijan  0.384615         13           5\n",
+      "40            Laos  0.333333         21           7\n",
+      "69          Panama  0.333333         12           4\n",
+      "least outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "94            Iraq         0         87         NaN\n",
+      "62       Nicaragua         0         21         NaN\n",
+      "74  Czech Republic         0         41         NaN\n",
+      "77         Algeria         0         27         NaN\n",
+      "37          Rwanda         0         17         NaN\n",
+      "most outliers \n",
+      "       Country  Outliers  N_Country  N_Outliers\n",
+      "44       Benin  0.500000         26          13\n",
+      "22    Pakistan  0.395604         91          36\n",
+      "53   Indonesia  0.390000        100          39\n",
+      "61        Chad  0.363636         11           4\n",
+      "104     Bhutan  0.363636         11           4\n",
+      "least outliers \n",
+      "                 Country  Outliers  N_Country  N_Outliers\n",
+      "120           Kazakhstan         0         88         NaN\n",
+      "100  Antigua and Barbuda         0         42         NaN\n",
+      "99               Tunisia         0         39         NaN\n",
+      "81               Belgium         0         16         NaN\n",
+      "71            Costa Rica         0         11         NaN\n"
+     ]
+    }
+   ],
+   "source": [
+    "feat = [Xrhy, Xmel, Xmfc, Xchr]\n",
+    "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
+    "tabs_feat = []\n",
+    "for i in range(len(feat)):\n",
+    "    XX = feat[i]\n",
+    "    df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999)\n",
+    "    outliers.print_most_least_outliers_topN(df_feat, N=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "collapsed": true
@@ -320,7 +427,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.11"
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,
--- a/scripts/outliers.py	Fri Sep 15 17:49:24 2017 +0100
+++ b/scripts/outliers.py	Sun Sep 17 18:37:37 2017 +0100
@@ -15,7 +15,7 @@
 import utils_spatial
 
 
-def country_outlier_df(counts, labels, out_file=None, normalize=False):
+def country_outlier_df(counts, labels, normalize=False):
     if len(counts.keys()) < len(np.unique(labels)):
         for label in np.unique(labels):
             if not counts.has_key(label):
@@ -24,8 +24,6 @@
         counts = normalize_outlier_counts(counts, Counter(labels))
     df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
     df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
-    if out_file is not None:   
-        df.to_csv(out_file, index=False)
     return df
 
 
@@ -43,6 +41,13 @@
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
     df = country_outlier_df(global_counts, Y, normalize=True)
+    # append number of recordings and number of outliers per country 
+    df_n_country = pd.DataFrame.from_dict(Counter(Y), orient='index').reset_index()
+    df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
+    df_n_outliers = pd.DataFrame.from_dict(Counter(Y[y_pred]), orient='index').reset_index()
+    df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
+    df = pd.merge(df, df_n_country, on='Country', how='left')
+    df = pd.merge(df, df_n_outliers, on='Country', how='left')
     if out_file is not None:
         df.to_csv(out_file, index=False)
     return df, threshold, MD
@@ -50,8 +55,10 @@
 
 def print_most_least_outliers_topN(df, N=10):
     sort_inds = df['Outliers'].argsort()  # ascending order
-    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
-    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    #df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    #df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    df_most = df.iloc[sort_inds[::-1][:N]]
+    df_least = df.iloc[sort_inds[:N]]
     print "most outliers " 
     print df_most
     print "least outliers "