Mercurial > hg > plosone_underreview

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from collections import Counter\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "import scripts.classification as classification\n",
    "import scripts.outliers as outliers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample 80% of the dataset, for 10 times"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's sample only 80% of the recordings each time (in a stratified manner) so that the set of recordings considered for each country is changed every time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "results_file = '../data/lda_data_8.pickle'\n",
    "n_iters = 10\n",
    "for n in range(n_iters):\n",
    "    print \"iteration %d\" % n\n",
    "    print results_file\n",
    "    X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n",
    "    # get only 80% of the dataset.. to vary the choice of outliers\n",
    "    X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n",
    "    print X.shape, Y.shape\n",
    "    # outliers\n",
    "    print \"detecting outliers...\"\n",
    "    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
    "    outliers.print_most_least_outliers_topN(df_global, N=10)\n",
    "    \n",
    "    # write output\n",
    "    print \"writing file\"\n",
    "    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_iters = 10\n",
    "ranked_countries = pd.DataFrame()\n",
    "ranked_outliers = pd.DataFrame()\n",
    "for n in range(n_iters):\n",
    "    df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n",
    "    df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()\n",
    "    ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n",
    "    ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)\n",
    "ranked_countries_arr = ranked_countries.get_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Estimate precision at K"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First get the ground truth from a majority vote on the top K=10 positions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# majority voting + precision at K\n",
    "K_vote = 10\n",
    "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Pakistan</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Chad</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Gambia</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Ivory Coast</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Botswana</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Nepal</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Benin</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Senegal</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>French Guiana</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>El Salvador</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Mozambique</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Uganda</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bhutan</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Liberia</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            index   0\n",
       "0        Pakistan  10\n",
       "2            Chad  10\n",
       "5          Gambia  10\n",
       "10    Ivory Coast  10\n",
       "12       Botswana  10\n",
       "6           Nepal   9\n",
       "13          Benin   8\n",
       "8         Senegal   7\n",
       "9   French Guiana   7\n",
       "4     El Salvador   5\n",
       "11     Mozambique   5\n",
       "7          Uganda   4\n",
       "1          Bhutan   3\n",
       "3         Liberia   2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_country_vote = pd.DataFrame.from_dict(country_vote, orient='index').reset_index()\n",
    "df_country_vote.sort_values(0, ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def precision_at_k(array, gr_truth, k):\n",
    "    return len(set(array[:k]) & set(gr_truth[:k])) / float(k)\n",
    "    \n",
    "k = 10\n",
    "ground_truth = df_country_vote['index'].get_values()\n",
    "p_ = []\n",
    "for j in range(ranked_countries_arr.shape[1]):\n",
    "    p_.append(precision_at_k(ranked_countries_arr[:, j], ground_truth, k))\n",
    "p_ = np.array(p_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean 0.67\n",
      "std 0.0640312423743\n"
     ]
    }
   ],
   "source": [
    "print 'mean', np.mean(p_) \n",
    "print 'std', np.std(p_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.6  0.7  0.7  0.6  0.6  0.7  0.8  0.6  0.7  0.7]\n"
     ]
    }
   ],
   "source": [
    "print p_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
author	mpanteli <m.x.panteli@gmail.com>
date	Mon, 02 Oct 2017 18:59:29 +0100
parents
children