Mercurial > hg > plosone_underreview
changeset 59:444041185ba9 branch-tests
changes in sensitivity experiment
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Thu, 21 Sep 2017 15:23:05 +0100 |
parents | 98cd5317e504 |
children | 402f43d5b7ad |
files | notebooks/sensitivity_experiment.ipynb |
diffstat | 1 files changed, 284 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/notebooks/sensitivity_experiment.ipynb Tue Sep 19 21:27:09 2017 +0100 +++ b/notebooks/sensitivity_experiment.ipynb Thu Sep 21 15:23:05 2017 +0100 @@ -2,15 +2,15 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 20, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/homes/mp305/anaconda/lib/python2.7/site-packages/librosa/core/audio.py:33: UserWarning: Could not import scikits.samplerate. Falling back to scipy.signal\n", - " warnings.warn('Could not import scikits.samplerate. '\n" + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" ] } ], @@ -4547,7 +4547,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", @@ -4572,7 +4574,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -4788,12 +4790,108 @@ "31 Afghanistan 0.041667 24 1\n", "105 Sudan 0.045455 66 3\n", "120 Kazakhstan 0.045455 88 4\n", + "writing file\n", + "iteration 7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_7.pickle\n", + "0.179777654473\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.636364 88 56\n", + "95 Chad 0.636364 11 7\n", + "86 Gambia 0.511111 45 23\n", + "42 Benin 0.500000 26 13\n", + "14 Liberia 0.500000 40 20\n", + "63 Mozambique 0.500000 34 17\n", + "78 El Salvador 0.424242 33 14\n", + "62 Senegal 0.416667 36 15\n", + "20 Pakistan 0.415730 89 37\n", + "106 Nepal 0.402174 92 37\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "119 Denmark 0.000000 16 0\n", + "113 Iceland 0.000000 14 0\n", + "27 South Korea 0.000000 11 0\n", + "15 Netherlands 0.015152 66 1\n", + "120 Kazakhstan 0.034884 86 3\n", + "30 Afghanistan 0.041667 24 1\n", + "102 Nicaragua 0.050000 20 1\n", + "112 Israel 0.050000 100 5\n", + "28 Tajikistan 0.052632 19 1\n", + "writing file\n", + "iteration 8\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_8.pickle\n", + "0.165005035342\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "95 Chad 0.636364 11 7\n", + "43 Benin 0.576923 26 15\n", + "136 Botswana 0.571429 77 44\n", + "14 Liberia 0.525000 40 21\n", + "86 Gambia 0.488889 45 22\n", + "78 El Salvador 0.484848 33 16\n", + "64 Mozambique 0.470588 34 16\n", + "62 Fiji 0.466667 15 7\n", + "20 Pakistan 0.436782 87 38\n", + "63 Senegal 0.416667 36 15\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "119 Denmark 0.000000 16 0\n", + "113 Iceland 0.000000 14 0\n", + "27 South Korea 0.000000 11 0\n", + "102 Nicaragua 0.000000 20 0\n", + "28 Tajikistan 0.000000 19 0\n", + "15 Netherlands 0.015152 66 1\n", + "89 Croatia 0.032258 31 1\n", + "120 Kazakhstan 0.034884 86 3\n", + "30 Afghanistan 0.041667 24 1\n", + "writing file\n", + "iteration 9\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_9.pickle\n", + "0.168630986212\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "43 Benin 0.576923 26 15\n", + "136 Botswana 0.567901 81 46\n", + "60 Chad 0.545455 11 6\n", + "86 Gambia 0.533333 45 24\n", + "14 Liberia 0.525000 40 21\n", + "65 Uganda 0.482759 87 42\n", + "64 Mozambique 0.470588 34 16\n", + "20 Pakistan 0.465909 88 41\n", + "135 French Guiana 0.464286 28 13\n", + "67 Brazil 0.460000 100 46\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "90 French Polynesia 0.000000 15 0\n", + "102 Nicaragua 0.000000 20 0\n", + "113 Iceland 0.000000 14 0\n", + "119 Denmark 0.000000 16 0\n", + "15 Netherlands 0.015152 66 1\n", + "18 New Zealand 0.029412 34 1\n", + "120 Kazakhstan 0.034884 86 3\n", + "31 Czech Republic 0.048780 41 2\n", + "28 Tajikistan 0.052632 19 1\n", "writing file\n" ] } ], "source": [ - "n_iters = 7\n", + "n_iters = 10\n", "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", "for n in range(n_iters):\n", @@ -4840,22 +4938,40 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, + "execution_count": 45, + "metadata": {}, "outputs": [], "source": [ "ranked_countries = pd.DataFrame()\n", "ranked_outliers = pd.DataFrame()\n", "for n in range(n_iters):\n", " df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n", - " df_global = df_global.sort_values('Outliers', axis=0, ascending=False, inplace=True)\n", + " df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()\n", " ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n", " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)" ] }, { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(133, 10)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranked_outliers.shape" + ] + }, + { "cell_type": "markdown", "metadata": {}, "source": [ @@ -4864,11 +4980,42 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country Country Country Country Country Country \\\n", + "0 Botswana Chad Botswana Botswana Chad Botswana \n", + "1 Ivory Coast Fiji Gambia Ivory Coast Botswana Ivory Coast \n", + "2 Gambia Gambia Ivory Coast Gambia Ivory Coast Pakistan \n", + "3 Benin Benin Fiji Benin Fiji Chad \n", + "4 Fiji Pakistan Benin Fiji Gambia Fiji \n", + "\n", + " Country Country Country Country \n", + "0 Botswana Botswana Chad Benin \n", + "1 Ivory Coast Chad Benin Botswana \n", + "2 Gambia Gambia Botswana Chad \n", + "3 Pakistan Mozambique Liberia Gambia \n", + "4 Fiji Benin Gambia Liberia \n", + " Outliers Outliers Outliers Outliers Outliers Outliers Outliers \\\n", + "0 0.590909 0.545455 0.615385 0.617284 0.727273 0.607143 0.574468 \n", + "1 0.571429 0.533333 0.520833 0.571429 0.630952 0.571429 0.571429 \n", + "2 0.541667 0.520833 0.500000 0.541667 0.571429 0.553191 0.520833 \n", + "3 0.538462 0.500000 0.466667 0.538462 0.533333 0.545455 0.516854 \n", + "4 0.466667 0.500000 0.461538 0.533333 0.520833 0.533333 0.466667 \n", + "\n", + " Outliers Outliers Outliers \n", + "0 0.636364 0.636364 0.576923 \n", + "1 0.636364 0.576923 0.567901 \n", + "2 0.511111 0.571429 0.545455 \n", + "3 0.500000 0.525000 0.533333 \n", + "4 0.500000 0.488889 0.525000 \n" + ] + } + ], "source": [ "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n", "first_zero_idx = np.min(zero_idx)\n", @@ -4888,43 +5035,155 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "KendalltauResult(correlation=0.99999999999999989, pvalue=2.5428927239036995e-67)\n" + "KendalltauResult(correlation=0.11870585554796083, pvalue=0.042684955693776824)\n", + "KendalltauResult(correlation=0.061289587605377081, pvalue=0.29535042403787393)\n", + "KendalltauResult(correlation=0.14057871952608797, pvalue=0.016384498702657929)\n", + "KendalltauResult(correlation=0.043062200956937809, pvalue=0.46219181347134564)\n", + "KendalltauResult(correlation=0.038049669628617007, pvalue=0.51591269004232343)\n", + "KendalltauResult(correlation=0.15516062884483939, pvalue=0.0080680863973824919)\n", + "KendalltauResult(correlation=0.097972203235361141, pvalue=0.094371801845320874)\n", + "KendalltauResult(correlation=0.070403280929596718, pvalue=0.22933906132681292)\n", + "KendalltauResult(correlation=0.087263613579403057, pvalue=0.13624109595088119)\n", + "KendalltauResult(correlation=0.026657552973342449, pvalue=0.64900123852931668)\n", + "KendalltauResult(correlation=0.012531328320802006, pvalue=0.83057867073317604)\n", + "KendalltauResult(correlation=0.15698336750968331, pvalue=0.0073549938316186895)\n", + "KendalltauResult(correlation=0.072226019594440652, pvalue=0.21750692637496993)\n", + "KendalltauResult(correlation=0.064479380268853956, pvalue=0.27093205134080134)\n", + "KendalltauResult(correlation=0.07518796992481204, pvalue=0.19922707586147026)\n", + "KendalltauResult(correlation=0.017088174982911826, pvalue=0.77046791234681555)\n", + "KendalltauResult(correlation=0.098200045568466648, pvalue=0.093608177106345392)\n", + "KendalltauResult(correlation=0.11004784688995217, pvalue=0.060250899787989511)\n", + "KendalltauResult(correlation=0.051720209614946465, pvalue=0.37719896672100306)\n", + "KendalltauResult(correlation=0.099567099567099596, pvalue=0.089129953079656793)\n", + "KendalltauResult(correlation=-0.081795397584871282, pvalue=0.16254238954046385)\n", + "KendalltauResult(correlation=0.089769879243563472, pvalue=0.12534294310051713)\n", + "KendalltauResult(correlation=0.10047846889952156, pvalue=0.086241531926005505)\n", + "KendalltauResult(correlation=0.014809751651856917, pvalue=0.80037548797424396)\n", + "KendalltauResult(correlation=0.021189336978810668, pvalue=0.71751195692767422)\n", + "KendalltauResult(correlation=0.020733652312599684, pvalue=0.7233346465763022)\n", + "KendalltauResult(correlation=-0.057644110275689227, pvalue=0.32501053989276085)\n", + "KendalltauResult(correlation=0.04647983595352017, pvalue=0.42743119135699703)\n", + "KendalltauResult(correlation=-0.02939166097060834, pvalue=0.6157855679677966)\n", + "KendalltauResult(correlation=-0.01754385964912281, pvalue=0.76452558103925983)\n", + "KendalltauResult(correlation=-0.00022784233310549102, pvalue=0.99689609964041026)\n", + "KendalltauResult(correlation=0.053087263613579412, pvalue=0.36471883993264553)\n", + "KendalltauResult(correlation=0.11027568922305765, pvalue=0.059721613251292195)\n", + "KendalltauResult(correlation=0.1319207108680793, pvalue=0.024296399889465414)\n", + "KendalltauResult(correlation=0.11050353155616316, pvalue=0.059196189350124301)\n", + "KendalltauResult(correlation=0.081339712918660295, pvalue=0.16489618845189757)\n", + "KendalltauResult(correlation=0.091136933242196419, pvalue=0.11969173188443738)\n", + "KendalltauResult(correlation=0.010252904989747097, pvalue=0.86103426355600943)\n", + "KendalltauResult(correlation=0.026201868307131469, pvalue=0.6546080905364744)\n", + "KendalltauResult(correlation=0.056049213943950793, pvalue=0.33857618122131272)\n", + "KendalltauResult(correlation=0.075415812257917533, pvalue=0.19786889281527764)\n", + "KendalltauResult(correlation=0.026657552973342449, pvalue=0.64900123852931668)\n", + "KendalltauResult(correlation=0.091136933242196419, pvalue=0.11969173188443738)\n", + "KendalltauResult(correlation=0.1964000911369333, pvalue=0.00079845943724486494)\n", + "KendalltauResult(correlation=0.049441786283891551, pvalue=0.39857590952666144)\n" ] } ], "source": [ "from scipy.stats import kendalltau\n", - "for i in range(len(ranked_countries)-1):\n", - " for j in range(i+1, len(ranked_countries)):\n", + "for i in range(n_iters-1):\n", + " for j in range(i+1, n_iters):\n", " print kendalltau(ranked_countries.iloc[:, i], ranked_countries.iloc[:, j])" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "SpearmanrResult(correlation=1.0, pvalue=0.0)" + "133" ] }, - "execution_count": 34, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "len(ranked_countries)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ "from scipy.stats import spearmanr\n", - "spearmanr(ranked_countries)" + "r, p = spearmanr(ranked_countries)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1.00000000e+00, 1.74432009e-01, 8.97001663e-02,\n", + " 1.99727609e-01, 6.82200753e-02, 5.39272197e-02,\n", + " 2.21325022e-01, 1.33629528e-01, 1.08109487e-01,\n", + " 1.31114761e-01],\n", + " [ 1.74432009e-01, 1.00000000e+00, 4.20573142e-02,\n", + " 2.07251507e-02, 2.28481652e-01, 1.01916936e-01,\n", + " 1.01442548e-01, 1.12532008e-01, 1.89806266e-02,\n", + " 1.48213138e-01],\n", + " [ 8.97001663e-02, 4.20573142e-02, 1.00000000e+00,\n", + " 1.53308985e-01, 7.91412044e-02, 1.41734934e-01,\n", + " -1.14419359e-01, 1.23519450e-01, 1.50641189e-01,\n", + " 3.17074913e-02],\n", + " [ 1.99727609e-01, 2.07251507e-02, 1.53308985e-01,\n", + " 1.00000000e+00, 3.04934657e-02, 3.27786903e-02,\n", + " -7.58255884e-02, 6.98727824e-02, -4.16900460e-02,\n", + " -2.15208986e-02],\n", + " [ 6.82200753e-02, 2.28481652e-01, 7.91412044e-02,\n", + " 3.04934657e-02, 1.00000000e+00, -8.00848798e-04,\n", + " 8.02532110e-02, 1.65796105e-01, 1.91678314e-01,\n", + " 1.62863060e-01],\n", + " [ 5.39272197e-02, 1.01916936e-01, 1.41734934e-01,\n", + " 3.27786903e-02, -8.00848798e-04, 1.00000000e+00,\n", + " 1.17969619e-01, 1.31221881e-01, 2.06996460e-02,\n", + " 3.92160863e-02],\n", + " [ 2.21325022e-01, 1.01442548e-01, -1.14419359e-01,\n", + " -7.58255884e-02, 8.02532110e-02, 1.17969619e-01,\n", + " 1.00000000e+00, 8.75832730e-02, 1.10578345e-01,\n", + " 4.28326583e-02],\n", + " [ 1.33629528e-01, 1.12532008e-01, 1.23519450e-01,\n", + " 6.98727824e-02, 1.65796105e-01, 1.31221881e-01,\n", + " 8.75832730e-02, 1.00000000e+00, 1.31374909e-01,\n", + " 2.78868814e-01],\n", + " [ 1.08109487e-01, 1.89806266e-02, 1.50641189e-01,\n", + " -4.16900460e-02, 1.91678314e-01, 2.06996460e-02,\n", + " 1.10578345e-01, 1.31374909e-01, 1.00000000e+00,\n", + " 7.53103927e-02],\n", + " [ 1.31114761e-01, 1.48213138e-01, 3.17074913e-02,\n", + " -2.15208986e-02, 1.62863060e-01, 3.92160863e-02,\n", + " 4.28326583e-02, 2.78868814e-01, 7.53103927e-02,\n", + " 1.00000000e+00]])" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r" ] }, {