Mercurial > hg > plosone_underreview
diff notebooks/sensitivity_experiment_outliers.ipynb @ 94:69521f86d931 branch-tests
notebooks for publication
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 02 Oct 2017 18:59:29 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/notebooks/sensitivity_experiment_outliers.ipynb Mon Oct 02 18:59:29 2017 +0100 @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from collections import Counter\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import sys\n", + "sys.path.append('../')\n", + "import scripts.classification as classification\n", + "import scripts.outliers as outliers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sample 80% of the dataset, for 10 times" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's sample only 80% of the recordings each time (in a stratified manner) so that the set of recordings considered for each country is changed every time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "results_file = '../data/lda_data_8.pickle'\n", + "n_iters = 10\n", + "for n in range(n_iters):\n", + " print \"iteration %d\" % n\n", + " print results_file\n", + " X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n", + " # get only 80% of the dataset.. to vary the choice of outliers\n", + " X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n", + " print X.shape, Y.shape\n", + " # outliers\n", + " print \"detecting outliers...\"\n", + " df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", + " outliers.print_most_least_outliers_topN(df_global, N=10)\n", + " \n", + " # write output\n", + " print \"writing file\"\n", + " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "n_iters = 10\n", + "ranked_countries = pd.DataFrame()\n", + "ranked_outliers = pd.DataFrame()\n", + "for n in range(n_iters):\n", + " df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n", + " df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()\n", + " ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n", + " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)\n", + "ranked_countries_arr = ranked_countries.get_values()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Estimate precision at K" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First get the ground truth from a majority vote on the top K=10 positions." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# majority voting + precision at K\n", + "K_vote = 10\n", + "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>index</th>\n", + " <th>0</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Pakistan</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Chad</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Gambia</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Ivory Coast</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Botswana</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Nepal</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Benin</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Senegal</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>French Guiana</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>El Salvador</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Mozambique</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Uganda</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bhutan</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Liberia</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " index 0\n", + "0 Pakistan 10\n", + "2 Chad 10\n", + "5 Gambia 10\n", + "10 Ivory Coast 10\n", + "12 Botswana 10\n", + "6 Nepal 9\n", + "13 Benin 8\n", + "8 Senegal 7\n", + "9 French Guiana 7\n", + "4 El Salvador 5\n", + "11 Mozambique 5\n", + "7 Uganda 4\n", + "1 Bhutan 3\n", + "3 Liberia 2" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_country_vote = pd.DataFrame.from_dict(country_vote, orient='index').reset_index()\n", + "df_country_vote.sort_values(0, ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def precision_at_k(array, gr_truth, k):\n", + " return len(set(array[:k]) & set(gr_truth[:k])) / float(k)\n", + " \n", + "k = 10\n", + "ground_truth = df_country_vote['index'].get_values()\n", + "p_ = []\n", + "for j in range(ranked_countries_arr.shape[1]):\n", + " p_.append(precision_at_k(ranked_countries_arr[:, j], ground_truth, k))\n", + "p_ = np.array(p_)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean 0.67\n", + "std 0.0640312423743\n" + ] + } + ], + "source": [ + "print 'mean', np.mean(p_) \n", + "print 'std', np.std(p_)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.6 0.7 0.7 0.6 0.6 0.7 0.8 0.6 0.7 0.7]\n" + ] + } + ], + "source": [ + "print p_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}