# %% [markdown]
# # Stability of the country outlier ranking
#
# Outliers are detected on repeated 80% samples of the recordings; the
# per-sample country rankings are then compared against a majority-vote ground
# truth with precision at K.

# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

# Equivalent to the `%matplotlib inline` magic.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# Equivalent to `%load_ext autoreload` followed by `%autoreload 2`.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import sys
sys.path.append('../')  # project-local `scripts` package lives one directory up
import scripts.classification as classification
import scripts.outliers as outliers

# %% [markdown]
# ## Sample 80% of the dataset, 10 times

# %% [markdown]
# Let's sample only 80% of the recordings each time (in a stratified manner) so
# that the set of recordings considered for each country is changed every time.

# %%
results_file = '../data/lda_data_8.pickle'
n_iters = 10


def outliers_csv_path(iteration):
    """Return the path of the CSV holding the outlier table of one iteration.

    Shared by the cell that writes the tables and the cell that reads them
    back, so that both always agree on the file naming.
    """
    return '../data/outliers_' + str(iteration) + '.csv'


# %%
print(results_file)
# The pickle is read once and every iteration subsamples from these arrays.
# NOTE(review): the helper name says this unpickles the file -- only point it
# at trusted data.
X_all, Y_all, Yaudio = classification.load_data_from_pickle(results_file)

for n in range(n_iters):
    print("iteration %d" % n)
    # Keep a stratified 80% of the dataset to vary the choice of outliers.
    # Seeding with the iteration number gives a different sample on every
    # iteration but the same samples on every run of the notebook.
    X, _, Y, _ = train_test_split(
        X_all, Y_all, train_size=0.8, stratify=Y_all, random_state=n)
    print("%s %s" % (X.shape, Y.shape))

    # outliers
    print("detecting outliers...")
    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
    outliers.print_most_least_outliers_topN(df_global, N=10)

    # write output
    print("writing file")
    df_global.to_csv(outliers_csv_path(n), index=False)

# %%
# Rank the countries of every iteration by their 'Outliers' value (descending)
# and put the rankings side by side: one column per iteration, row r holding
# the country / value at rank r.
country_columns = []
outlier_columns = []
for n in range(n_iters):
    df_ranked = (pd.read_csv(outliers_csv_path(n))
                 .sort_values('Outliers', axis=0, ascending=False)
                 .reset_index(drop=True))
    country_columns.append(df_ranked['Country'])
    outlier_columns.append(df_ranked['Outliers'])

# Concatenate once instead of growing the frames inside the loop.
ranked_countries = pd.concat(country_columns, axis=1)
ranked_outliers = pd.concat(outlier_columns, axis=1)
ranked_countries_arr = ranked_countries.values
assert ranked_countries_arr.shape[1] == n_iters

# %% [markdown]
# ## Estimate precision at K

# %% [markdown]
# First get the ground truth from a majority vote on the top K=10 positions.

# %%
# majority voting + precision at K
K_vote = 10
# Number of iterations in which each country appears among the top K_vote rows.
country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())

# %%
# Column 'index' holds the country, column 0 its vote count.  The sorted frame
# is kept (not just displayed) because the ground truth below is read from it;
# mergesort is stable, so tied countries keep their relative order.
df_country_vote = (pd.DataFrame.from_dict(country_vote, orient='index')
                   .reset_index()
                   .sort_values(0, ascending=False, kind='mergesort'))
df_country_vote

# %%
def precision_at_k(array, gr_truth, k):
    """Return the fraction of the top-k ground truth found in the top k of `array`.

    Both sequences are cut to their first `k` entries and compared as sets, so
    the order within the first `k` entries is ignored and duplicates count once.

    Parameters
    ----------
    array : sequence
        Ranked items of one run, best first.
    gr_truth : sequence
        Ground-truth items, best first.
    k : int
        Cut-off rank; must be positive (k == 0 raises ZeroDivisionError).

    Returns
    -------
    float
        Size of the overlap divided by `k`.
    """
    return len(set(array[:k]) & set(gr_truth[:k])) / float(k)


# %%
k = 10
# df_country_vote is sorted by votes, so the first k entries are the k
# most-voted countries.  Reading them from an unsorted frame would instead pick
# k countries in arbitrary Counter order.
ground_truth = df_country_vote['index'].values
p_ = np.array([precision_at_k(ranked_countries_arr[:, j], ground_truth, k)
               for j in range(ranked_countries_arr.shape[1])])

# %%
print('mean %s' % np.mean(p_))
print('std %s' % np.std(p_))

# %%
print(p_)