Mercurial > hg > plosone_underreview
view notebooks/sensitivity_experiment_outliers.ipynb @ 105:edd82eb89b4b branch-tests tip
Merge
author | Maria Panteli |
---|---|
date | Sun, 15 Oct 2017 13:36:59 +0100 |
parents | 69521f86d931 |
children |
line wrap: on
line source
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from collections import Counter\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "import scripts.classification as classification\n",
    "import scripts.outliers as outliers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample 80% of the dataset, 10 times"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's sample only 80% of the recordings each time (in a stratified manner) so that the set of recordings considered for each country is changed every time. Each iteration uses its own fixed random seed so that the samples can be reproduced."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "results_file = '../data/lda_data_8.pickle'\n",
    "n_iters = 10\n",
    "base_seed = 0  # iteration n is sampled with random_state=base_seed + n\n",
    "print(results_file)\n",
    "# the input file is the same for every iteration, so load it only once\n",
    "X_all, Y_all, Yaudio = classification.load_data_from_pickle(results_file)\n",
    "for n in range(n_iters):\n",
    "    print(\"iteration %d\" % n)\n",
    "    # get only 80% of the dataset (stratified on Y).. to vary the choice of outliers\n",
    "    X, _, Y, _ = train_test_split(X_all, Y_all, train_size=0.8, stratify=Y_all,\n",
    "                                  random_state=base_seed + n)\n",
    "    print(\"%s %s\" % (X.shape, Y.shape))\n",
    "    # outliers\n",
    "    print(\"detecting outliers...\")\n",
    "    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
    "    outliers.print_most_least_outliers_topN(df_global, N=10)\n",
    "\n",
    "    # write output\n",
    "    print(\"writing file\")\n",
    "    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_iters = 10\n",
    "countries_per_run = []\n",
    "outliers_per_run = []\n",
    "for n in range(n_iters):\n",
    "    df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n",
    "    # rank the rows of this run by the 'Outliers' column, largest first\n",
    "    df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()\n",
    "    countries_per_run.append(df_global['Country'])\n",
    "    outliers_per_run.append(df_global['Outliers'])\n",
    "# one column per run; concatenate once instead of growing a frame inside the loop\n",
    "ranked_countries = pd.concat(countries_per_run, axis=1)\n",
    "ranked_outliers = pd.concat(outliers_per_run, axis=1)\n",
    "ranked_countries_arr = ranked_countries.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Estimate precision at K"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First get the ground truth from a majority vote on the top K=10 positions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# majority voting + precision at K\n",
    "K_vote = 10\n",
    "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Pakistan</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Chad</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Gambia</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Ivory Coast</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Botswana</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Nepal</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Benin</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Senegal</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>French Guiana</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>El Salvador</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Mozambique</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Uganda</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bhutan</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Liberia</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            index   0\n",
       "0        Pakistan  10\n",
       "2            Chad  10\n",
       "5          Gambia  10\n",
       "10    Ivory Coast  10\n",
       "12       Botswana  10\n",
       "6           Nepal   9\n",
       "13          Benin   8\n",
       "8         Senegal   7\n",
       "9   French Guiana   7\n",
       "4     El Salvador   5\n",
       "11     Mozambique   5\n",
       "7          Uganda   4\n",
       "1          Bhutan   3\n",
       "3         Liberia   2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_country_vote = pd.DataFrame.from_dict(country_vote, orient='index').reset_index()\n",
    "df_country_vote.sort_values(0, ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def precision_at_k(array, gr_truth, k):\n",
    "    \"\"\"Return the overlap between the first k items of two rankings, divided by k.\n",
    "\n",
    "    The first k entries of `array` and of `gr_truth` are compared as sets,\n",
    "    so the order within the top k does not matter.\n",
    "    \"\"\"\n",
    "    return len(set(array[:k]) & set(gr_truth[:k])) / float(k)\n",
    "\n",
    "k = 10\n",
    "# df_country_vote itself is in the Counter's arbitrary order (the sorted view shown\n",
    "# above is not stored), so rank by vote count here before the top k is taken\n",
    "ground_truth = df_country_vote.sort_values(0, ascending=False)['index'].values\n",
    "# one precision value per run (column of ranked_countries_arr)\n",
    "p_ = np.array([precision_at_k(ranked_countries_arr[:, j], ground_truth, k)\n",
    "               for j in range(ranked_countries_arr.shape[1])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('mean %s' % np.mean(p_))\n",
    "print('std %s' % np.std(p_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(p_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}