Mercurial > hg > plosone_underreview
comparison notebooks/sensitivity_experiment.ipynb @ 90:e279ccea5f9b branch-tests
results on 30sec
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 02 Oct 2017 15:32:23 +0100 |
parents | 4395037087b6 |
children |
comparison
equal
deleted
inserted
replaced
89:8a2d56880050 | 90:e279ccea5f9b |
---|---|
1 { | 1 { |
2 "cells": [ | 2 "cells": [ |
3 { | 3 { |
4 "cell_type": "code", | 4 "cell_type": "code", |
5 "execution_count": 1, | 5 "execution_count": 16, |
6 "metadata": {}, | 6 "metadata": {}, |
7 "outputs": [ | 7 "outputs": [ |
8 { | 8 { |
9 "name": "stdout", | 9 "name": "stdout", |
10 "output_type": "stream", | 10 "output_type": "stream", |
11 "text": [ | 11 "text": [ |
12 "ERROR! Session/line number was not unique in database. History logging moved to new session 32\n" | 12 "The autoreload extension is already loaded. To reload it, use:\n", |
13 ] | 13 " %reload_ext autoreload\n" |
14 }, | |
15 { | |
16 "name": "stderr", | |
17 "output_type": "stream", | |
18 "text": [ | |
19 "/homes/mp305/anaconda/lib/python2.7/site-packages/librosa/core/audio.py:33: UserWarning: Could not import scikits.samplerate. Falling back to scipy.signal\n", | |
20 " warnings.warn('Could not import scikits.samplerate. '\n" | |
21 ] | 14 ] |
22 } | 15 } |
23 ], | 16 ], |
24 "source": [ | 17 "source": [ |
25 "import numpy as np\n", | 18 "import numpy as np\n", |
40 ] | 33 ] |
41 }, | 34 }, |
42 { | 35 { |
43 "cell_type": "code", | 36 "cell_type": "code", |
44 "execution_count": 2, | 37 "execution_count": 2, |
45 "metadata": {}, | 38 "metadata": { |
39 "collapsed": true | |
40 }, | |
46 "outputs": [], | 41 "outputs": [], |
47 "source": [ | 42 "source": [ |
48 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", | 43 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", |
49 "n_iters = 10" | 44 "n_iters = 10" |
50 ] | 45 ] |
5253 "Let's try without changing the LDA mapping, so just load the original dataset and get outlier countries by selecting 80% of the recordigns (in stratified manner).'" | 5248 "Let's try without changing the LDA mapping, so just load the original dataset and get outlier countries by selecting 80% of the recordigns (in stratified manner).'" |
5254 ] | 5249 ] |
5255 }, | 5250 }, |
5256 { | 5251 { |
5257 "cell_type": "code", | 5252 "cell_type": "code", |
5258 "execution_count": 67, | 5253 "execution_count": 17, |
5259 "metadata": {}, | 5254 "metadata": {}, |
5260 "outputs": [ | 5255 "outputs": [ |
5261 { | 5256 { |
5262 "name": "stdout", | 5257 "name": "stdout", |
5263 "output_type": "stream", | 5258 "output_type": "stream", |
5264 "text": [ | 5259 "text": [ |
5265 "iteration 0\n", | 5260 "iteration 0\n", |
5266 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5261 "../data/lda_data_melodia_8_30sec.pickle\n", |
5267 "(6560, 380) (6560,)\n", | 5262 "(6560, 381) (6560,)\n", |
5268 "detecting outliers...\n", | 5263 "detecting outliers...\n", |
5269 "most outliers \n", | 5264 "most outliers \n", |
5270 " Country Outliers N_Country N_Outliers\n", | 5265 " Country Outliers N_Country N_Outliers\n", |
5271 "95 Chad 0.555556 9 5\n", | 5266 "136 Botswana 0.625000 72 45\n", |
5272 "86 Gambia 0.525000 40 21\n", | 5267 "59 Chad 0.555556 9 5\n", |
5273 "135 French Guiana 0.500000 22 11\n", | 5268 "42 Benin 0.523810 21 11\n", |
5274 "44 Benin 0.476190 21 10\n", | 5269 "31 Ivory Coast 0.500000 12 6\n", |
5275 "15 Liberia 0.468750 32 15\n", | 5270 "20 Pakistan 0.493151 73 36\n", |
5276 "136 Botswana 0.458333 72 33\n", | 5271 "63 Mozambique 0.481481 27 13\n", |
5272 "106 Nepal 0.460526 76 35\n", | |
5273 "17 French Guiana 0.454545 22 10\n", | |
5277 "104 Bhutan 0.444444 9 4\n", | 5274 "104 Bhutan 0.444444 9 4\n", |
5278 "68 Brazil 0.437500 80 35\n", | 5275 "86 Gambia 0.425000 40 17\n", |
5279 "92 Switzerland 0.428571 42 18\n", | |
5280 "78 El Salvador 0.423077 26 11\n", | |
5281 "least outliers \n", | 5276 "least outliers \n", |
5282 " Country Outliers N_Country N_Outliers\n", | 5277 " Country Outliers N_Country N_Outliers\n", |
5283 "1 Lithuania 0.000000 38 0\n", | 5278 "100 Antigua and Barbuda 0.000000 34 0\n", |
5284 "29 Tajikistan 0.000000 15 0\n", | 5279 "28 Tajikistan 0.000000 15 0\n", |
5285 "32 Czech Republic 0.000000 33 0\n", | 5280 "113 Iceland 0.000000 11 0\n", |
5286 "107 Kiribati 0.000000 14 0\n", | 5281 "119 Denmark 0.000000 13 0\n", |
5287 "120 Kazakhstan 0.000000 70 0\n", | 5282 "27 South Korea 0.000000 9 0\n", |
5288 "119 Denmark 0.000000 13 0\n", | 5283 "1 Lithuania 0.000000 38 0\n", |
5289 "0 Canada 0.050000 80 4\n", | 5284 "120 Kazakhstan 0.014286 70 1\n", |
5290 "73 Nigeria 0.051948 77 4\n", | 5285 "15 Netherlands 0.018519 54 1\n", |
5291 "109 Democratic Republic of the Congo 0.052632 38 2\n", | 5286 "74 Czech Republic 0.030303 33 1\n", |
5292 "105 Sudan 0.055556 54 3\n", | 5287 "105 Sudan 0.037037 54 2\n", |
5293 "writing file\n", | 5288 "writing file\n", |
5294 "iteration 1\n", | 5289 "iteration 1\n", |
5295 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5290 "../data/lda_data_melodia_8_30sec.pickle\n", |
5296 "(6560, 380) (6560,)\n", | 5291 "(6560, 381) (6560,)\n", |
5297 "detecting outliers...\n", | 5292 "detecting outliers...\n", |
5298 "most outliers \n", | 5293 "most outliers \n", |
5299 " Country Outliers N_Country N_Outliers\n", | 5294 " Country Outliers N_Country N_Outliers\n", |
5300 "95 Chad 0.666667 9 6\n", | 5295 "31 Ivory Coast 0.666667 12 8\n", |
5301 "17 French Guiana 0.545455 22 12\n", | 5296 "136 Botswana 0.638889 72 46\n", |
5302 "86 Gambia 0.525000 40 21\n", | 5297 "95 Chad 0.555556 9 5\n", |
5303 "44 Benin 0.523810 21 11\n", | 5298 "20 Pakistan 0.479452 73 35\n", |
5304 "6 Bolivia 0.500000 28 14\n", | 5299 "43 Benin 0.476190 21 10\n", |
5305 "78 El Salvador 0.500000 26 13\n", | 5300 "86 Gambia 0.475000 40 19\n", |
5306 "136 Botswana 0.486111 72 35\n", | 5301 "78 El Salvador 0.461538 26 12\n", |
5307 "10 Guatemala 0.465116 43 20\n", | |
5308 "115 Senegal 0.454545 33 15\n", | 5302 "115 Senegal 0.454545 33 15\n", |
5303 "135 French Guiana 0.454545 22 10\n", | |
5309 "104 Bhutan 0.444444 9 4\n", | 5304 "104 Bhutan 0.444444 9 4\n", |
5310 "least outliers \n", | 5305 "least outliers \n", |
5311 " Country Outliers N_Country N_Outliers\n", | 5306 " Country Outliers N_Country N_Outliers\n", |
5312 "120 Kazakhstan 0.000000 70 0\n", | 5307 "1 Lithuania 0.000000 38 0\n", |
5313 "1 Lithuania 0.000000 38 0\n", | 5308 "107 Kiribati 0.000000 14 0\n", |
5314 "107 Kiribati 0.000000 14 0\n", | 5309 "119 Denmark 0.000000 13 0\n", |
5315 "119 Denmark 0.000000 13 0\n", | 5310 "27 South Korea 0.000000 9 0\n", |
5316 "9 Saudi Arabia 0.000000 8 0\n", | 5311 "120 Kazakhstan 0.014286 70 1\n", |
5317 "98 Uzbekistan 0.030303 33 1\n", | 5312 "105 Sudan 0.018519 54 1\n", |
5318 "15 Netherlands 0.037037 54 2\n", | 5313 "74 Czech Republic 0.030303 33 1\n", |
5319 "57 Russia 0.037975 79 3\n", | 5314 "93 Grenada 0.033333 30 1\n", |
5320 "109 Democratic Republic of the Congo 0.052632 38 2\n", | 5315 "15 Netherlands 0.037037 54 2\n", |
5321 "105 Sudan 0.055556 54 3\n", | 5316 "0 Canada 0.037500 80 3\n", |
5322 "writing file\n", | 5317 "writing file\n", |
5323 "iteration 2\n", | 5318 "iteration 2\n", |
5324 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5319 "../data/lda_data_melodia_8_30sec.pickle\n", |
5325 "(6560, 380) (6560,)\n", | 5320 "(6560, 381) (6560,)\n", |
5326 "detecting outliers...\n", | 5321 "detecting outliers...\n", |
5327 "most outliers \n", | 5322 "most outliers \n", |
5328 " Country Outliers N_Country N_Outliers\n", | 5323 " Country Outliers N_Country N_Outliers\n", |
5329 "95 Chad 0.666667 9 6\n", | 5324 "61 Chad 0.666667 9 6\n", |
5330 "104 Bhutan 0.555556 9 5\n", | 5325 "136 Botswana 0.625000 72 45\n", |
5331 "86 Gambia 0.550000 40 22\n", | 5326 "72 Ivory Coast 0.583333 12 7\n", |
5332 "135 French Guiana 0.545455 22 12\n", | 5327 "20 Pakistan 0.534247 73 39\n", |
5333 "78 El Salvador 0.538462 26 14\n", | 5328 "86 Gambia 0.525000 40 21\n", |
5334 "43 Benin 0.523810 21 11\n", | 5329 "44 Benin 0.476190 21 10\n", |
5335 "6 Bolivia 0.500000 28 14\n", | 5330 "78 El Salvador 0.461538 26 12\n", |
5336 "136 Botswana 0.486111 72 35\n", | 5331 "106 Nepal 0.434211 76 33\n", |
5337 "64 Mozambique 0.444444 27 12\n", | 5332 "66 Uganda 0.426471 68 29\n", |
5338 "14 Liberia 0.437500 32 14\n", | 5333 "135 French Guiana 0.409091 22 9\n", |
5339 "least outliers \n", | 5334 "least outliers \n", |
5340 " Country Outliers N_Country N_Outliers\n", | 5335 " Country Outliers N_Country N_Outliers\n", |
5341 "1 Lithuania 0.000000 38 0\n", | 5336 "1 Lithuania 0.000000 38 0\n", |
5342 "107 Kiribati 0.000000 14 0\n", | 5337 "119 Denmark 0.000000 13 0\n", |
5343 "119 Denmark 0.000000 13 0\n", | 5338 "31 Czech Republic 0.000000 33 0\n", |
5344 "120 Kazakhstan 0.000000 70 0\n", | 5339 "30 Afghanistan 0.000000 19 0\n", |
5345 "15 Netherlands 0.018519 54 1\n", | 5340 "27 South Korea 0.000000 9 0\n", |
5346 "105 Sudan 0.037037 54 2\n", | 5341 "102 Nicaragua 0.000000 17 0\n", |
5347 "0 Canada 0.050000 80 4\n", | 5342 "120 Kazakhstan 0.014286 70 1\n", |
5348 "109 Democratic Republic of the Congo 0.052632 38 2\n", | 5343 "15 Netherlands 0.018519 54 1\n", |
5349 "94 Iraq 0.057971 69 4\n", | 5344 "43 Malawi 0.040000 25 1\n", |
5350 "31 Czech Republic 0.060606 33 2\n", | 5345 "0 Canada 0.050000 80 4\n", |
5351 "writing file\n", | 5346 "writing file\n", |
5352 "iteration 3\n", | 5347 "iteration 3\n", |
5353 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5348 "../data/lda_data_melodia_8_30sec.pickle\n", |
5354 "(6560, 380) (6560,)\n", | 5349 "(6560, 381) (6560,)\n", |
5350 "detecting outliers...\n", | |
5351 "most outliers \n", | |
5352 " Country Outliers N_Country N_Outliers\n", | |
5353 "95 Chad 0.666667 9 6\n", | |
5354 "136 Botswana 0.583333 72 42\n", | |
5355 "86 Gambia 0.575000 40 23\n", | |
5356 "63 Mozambique 0.518519 27 14\n", | |
5357 "31 Ivory Coast 0.500000 12 6\n", | |
5358 "42 Benin 0.476190 21 10\n", | |
5359 "106 Nepal 0.473684 76 36\n", | |
5360 "20 Pakistan 0.452055 73 33\n", | |
5361 "64 Uganda 0.426471 68 29\n", | |
5362 "62 Senegal 0.424242 33 14\n", | |
5363 "least outliers \n", | |
5364 " Country Outliers N_Country N_Outliers\n", | |
5365 "1 Lithuania 0.000000 38 0\n", | |
5366 "74 Czech Republic 0.000000 33 0\n", | |
5367 "27 South Korea 0.000000 9 0\n", | |
5368 "119 Denmark 0.000000 13 0\n", | |
5369 "120 Kazakhstan 0.014286 70 1\n", | |
5370 "105 Sudan 0.037037 54 2\n", | |
5371 "15 Netherlands 0.037037 54 2\n", | |
5372 "65 Hungary 0.049180 61 3\n", | |
5373 "0 Canada 0.050000 80 4\n", | |
5374 "44 United States of America 0.051282 78 4\n", | |
5375 "writing file\n", | |
5376 "iteration 4\n", | |
5377 "../data/lda_data_melodia_8_30sec.pickle\n", | |
5378 "(6560, 381) (6560,)\n", | |
5379 "detecting outliers...\n", | |
5380 "most outliers \n", | |
5381 " Country Outliers N_Country N_Outliers\n", | |
5382 "43 Benin 0.619048 21 13\n", | |
5383 "136 Botswana 0.597222 72 43\n", | |
5384 "72 Ivory Coast 0.583333 12 7\n", | |
5385 "95 Chad 0.555556 9 5\n", | |
5386 "86 Gambia 0.525000 40 21\n", | |
5387 "64 Mozambique 0.518519 27 14\n", | |
5388 "20 Pakistan 0.506849 73 37\n", | |
5389 "106 Nepal 0.486842 76 37\n", | |
5390 "65 Uganda 0.470588 68 32\n", | |
5391 "63 Senegal 0.454545 33 15\n", | |
5392 "least outliers \n", | |
5393 " Country Outliers N_Country N_Outliers\n", | |
5394 "120 Kazakhstan 0.000000 70 0\n", | |
5395 "119 Denmark 0.000000 13 0\n", | |
5396 "27 South Korea 0.000000 9 0\n", | |
5397 "1 Lithuania 0.000000 38 0\n", | |
5398 "107 Kiribati 0.000000 14 0\n", | |
5399 "31 Czech Republic 0.030303 33 1\n", | |
5400 "15 Netherlands 0.037037 54 2\n", | |
5401 "0 Canada 0.037500 80 3\n", | |
5402 "50 Finland 0.052632 19 1\n", | |
5403 "30 Afghanistan 0.052632 19 1\n", | |
5404 "writing file\n", | |
5405 "iteration 5\n", | |
5406 "../data/lda_data_melodia_8_30sec.pickle\n", | |
5407 "(6560, 381) (6560,)\n", | |
5355 "detecting outliers...\n", | 5408 "detecting outliers...\n", |
5356 "most outliers \n", | 5409 "most outliers \n", |
5357 " Country Outliers N_Country N_Outliers\n", | 5410 " Country Outliers N_Country N_Outliers\n", |
5358 "60 Chad 0.666667 9 6\n", | 5411 "60 Chad 0.666667 9 6\n", |
5359 "17 French Guiana 0.590909 22 13\n", | 5412 "43 Benin 0.619048 21 13\n", |
5360 "86 Gambia 0.550000 40 22\n", | 5413 "136 Botswana 0.583333 72 42\n", |
5361 "6 Bolivia 0.535714 28 15\n", | 5414 "72 Ivory Coast 0.583333 12 7\n", |
5362 "136 Botswana 0.513889 72 37\n", | 5415 "20 Pakistan 0.479452 73 35\n", |
5363 "64 Mozambique 0.481481 27 13\n", | 5416 "86 Gambia 0.475000 40 19\n", |
5364 "14 Liberia 0.468750 32 15\n", | |
5365 "78 El Salvador 0.461538 26 12\n", | 5417 "78 El Salvador 0.461538 26 12\n", |
5366 "115 Senegal 0.454545 33 15\n", | 5418 "106 Nepal 0.460526 76 35\n", |
5367 "108 Malta 0.437500 16 7\n", | 5419 "63 Senegal 0.454545 33 15\n", |
5420 "17 French Guiana 0.409091 22 9\n", | |
5368 "least outliers \n", | 5421 "least outliers \n", |
5369 " Country Outliers N_Country N_Outliers\n", | 5422 " Country Outliers N_Country N_Outliers\n", |
5370 "120 Kazakhstan 0.000000 70 0\n", | |
5371 "1 Lithuania 0.000000 38 0\n", | 5423 "1 Lithuania 0.000000 38 0\n", |
5372 "30 Afghanistan 0.000000 19 0\n", | 5424 "27 South Korea 0.000000 9 0\n", |
5373 "119 Denmark 0.000000 13 0\n", | 5425 "119 Denmark 0.000000 13 0\n", |
5374 "107 Kiribati 0.000000 14 0\n", | 5426 "9 Saudi Arabia 0.000000 8 0\n", |
5427 "120 Kazakhstan 0.014286 70 1\n", | |
5375 "31 Czech Republic 0.030303 33 1\n", | 5428 "31 Czech Republic 0.030303 33 1\n", |
5376 "98 Uzbekistan 0.030303 33 1\n", | |
5377 "15 Netherlands 0.037037 54 2\n", | 5429 "15 Netherlands 0.037037 54 2\n", |
5378 "105 Sudan 0.037037 54 2\n", | 5430 "105 Sudan 0.037037 54 2\n", |
5379 "84 Iraq 0.042857 70 3\n", | 5431 "0 Canada 0.037500 80 3\n", |
5432 "112 Israel 0.037500 80 3\n", | |
5380 "writing file\n", | 5433 "writing file\n", |
5381 "iteration 4\n", | 5434 "iteration 6\n", |
5382 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5435 "../data/lda_data_melodia_8_30sec.pickle\n", |
5383 "(6560, 380) (6560,)\n", | 5436 "(6560, 381) (6560,)\n", |
5384 "detecting outliers...\n", | 5437 "detecting outliers...\n", |
5385 "most outliers \n", | 5438 "most outliers \n", |
5386 " Country Outliers N_Country N_Outliers\n", | 5439 " Country Outliers N_Country N_Outliers\n", |
5387 "117 Zimbabwe 0.583333 12 7\n", | 5440 "136 Botswana 0.597222 72 43\n", |
5388 "60 Chad 0.555556 9 5\n", | 5441 "72 Ivory Coast 0.583333 12 7\n", |
5389 "86 Gambia 0.550000 40 22\n", | 5442 "106 Nepal 0.500000 76 38\n", |
5390 "43 Benin 0.523810 21 11\n", | 5443 "86 Gambia 0.500000 40 20\n", |
5391 "6 Bolivia 0.500000 28 14\n", | 5444 "115 Senegal 0.484848 33 16\n", |
5392 "135 French Guiana 0.500000 22 11\n", | 5445 "14 Liberia 0.468750 32 15\n", |
5393 "136 Botswana 0.472222 72 34\n", | |
5394 "78 El Salvador 0.461538 26 12\n", | 5446 "78 El Salvador 0.461538 26 12\n", |
5395 "10 Guatemala 0.441860 43 19\n", | 5447 "135 French Guiana 0.454545 22 10\n", |
5396 "14 Liberia 0.437500 32 14\n", | 5448 "20 Pakistan 0.452055 73 33\n", |
5449 "95 Chad 0.444444 9 4\n", | |
5397 "least outliers \n", | 5450 "least outliers \n", |
5398 " Country Outliers N_Country N_Outliers\n", | 5451 " Country Outliers N_Country N_Outliers\n", |
5399 "1 Lithuania 0.000000 38 0\n", | 5452 "113 Iceland 0.000000 11 0\n", |
5400 "107 Kiribati 0.000000 14 0\n", | 5453 "1 Lithuania 0.000000 38 0\n", |
5401 "119 Denmark 0.000000 13 0\n", | 5454 "119 Denmark 0.000000 13 0\n", |
5402 "120 Kazakhstan 0.000000 70 0\n", | 5455 "31 Czech Republic 0.000000 33 0\n", |
5403 "27 South Korea 0.000000 9 0\n", | 5456 "27 South Korea 0.000000 9 0\n", |
5404 "109 Democratic Republic of the Congo 0.026316 38 1\n", | 5457 "15 Netherlands 0.000000 54 0\n", |
5405 "94 Iraq 0.028571 70 2\n", | 5458 "120 Kazakhstan 0.014286 70 1\n", |
5406 "31 Czech Republic 0.030303 33 1\n", | 5459 "30 Afghanistan 0.052632 19 1\n", |
5407 "105 Sudan 0.037037 54 2\n", | 5460 "58 Bulgaria 0.054054 37 2\n", |
5408 "85 Sierra Leone 0.050000 80 4\n", | 5461 "105 Sudan 0.055556 54 3\n", |
5409 "writing file\n", | 5462 "writing file\n", |
5410 "iteration 5\n", | 5463 "iteration 7\n", |
5411 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5464 "../data/lda_data_melodia_8_30sec.pickle\n" |
5412 "(6560, 380) (6560,)\n", | 5465 ] |
5466 }, | |
5467 { | |
5468 "name": "stdout", | |
5469 "output_type": "stream", | |
5470 "text": [ | |
5471 "(6560, 381) (6560,)\n", | |
5472 "detecting outliers...\n", | |
5473 "most outliers \n", | |
5474 " Country Outliers N_Country N_Outliers\n", | |
5475 "72 Ivory Coast 0.666667 12 8\n", | |
5476 "136 Botswana 0.611111 72 44\n", | |
5477 "86 Gambia 0.575000 40 23\n", | |
5478 "95 Chad 0.555556 9 5\n", | |
5479 "44 Benin 0.523810 21 11\n", | |
5480 "64 Senegal 0.484848 33 16\n", | |
5481 "106 Nepal 0.460526 76 35\n", | |
5482 "20 Pakistan 0.452055 73 33\n", | |
5483 "65 Mozambique 0.444444 27 12\n", | |
5484 "66 Uganda 0.441176 68 30\n", | |
5485 "least outliers \n", | |
5486 " Country Outliers N_Country N_Outliers\n", | |
5487 "1 Lithuania 0.000000 38 0\n", | |
5488 "119 Denmark 0.000000 13 0\n", | |
5489 "113 Iceland 0.000000 11 0\n", | |
5490 "27 South Korea 0.000000 9 0\n", | |
5491 "120 Kazakhstan 0.014286 70 1\n", | |
5492 "57 Russia 0.025316 79 2\n", | |
5493 "46 United States of America 0.025641 78 2\n", | |
5494 "31 Czech Republic 0.030303 33 1\n", | |
5495 "15 Netherlands 0.037037 54 2\n", | |
5496 "0 Canada 0.050000 80 4\n", | |
5497 "writing file\n", | |
5498 "iteration 8\n", | |
5499 "../data/lda_data_melodia_8_30sec.pickle\n", | |
5500 "(6560, 381) (6560,)\n", | |
5413 "detecting outliers...\n", | 5501 "detecting outliers...\n", |
5414 "most outliers \n", | 5502 "most outliers \n", |
5415 " Country Outliers N_Country N_Outliers\n", | 5503 " Country Outliers N_Country N_Outliers\n", |
5416 "61 Chad 0.666667 9 6\n", | 5504 "136 Botswana 0.625000 72 45\n", |
5417 "44 Benin 0.619048 21 13\n", | 5505 "72 Ivory Coast 0.583333 12 7\n", |
5418 "104 Bhutan 0.555556 9 5\n", | 5506 "86 Gambia 0.475000 40 19\n", |
5419 "18 French Guiana 0.545455 22 12\n", | 5507 "106 Nepal 0.460526 76 35\n", |
5420 "86 Gambia 0.525000 40 21\n", | 5508 "63 Senegal 0.454545 33 15\n", |
5421 "136 Botswana 0.500000 72 36\n", | 5509 "135 French Guiana 0.454545 22 10\n", |
5422 "117 Zimbabwe 0.500000 12 6\n", | 5510 "20 Pakistan 0.452055 73 33\n", |
5423 "15 Liberia 0.500000 32 16\n", | 5511 "60 Chad 0.444444 9 4\n", |
5424 "64 Senegal 0.484848 33 16\n", | 5512 "64 Mozambique 0.444444 27 12\n", |
5425 "78 El Salvador 0.461538 26 12\n", | 5513 "14 Liberia 0.437500 32 14\n", |
5426 "least outliers \n", | 5514 "least outliers \n", |
5427 " Country Outliers N_Country N_Outliers\n", | 5515 " Country Outliers N_Country N_Outliers\n", |
5428 "1 Lithuania 0.000000 38 0\n", | 5516 "1 Lithuania 0.000000 38 0\n", |
5429 "120 Kazakhstan 0.000000 70 0\n", | 5517 "27 South Korea 0.000000 9 0\n", |
5430 "119 Denmark 0.000000 13 0\n", | 5518 "30 Afghanistan 0.000000 19 0\n", |
5431 "107 Kiribati 0.000000 14 0\n", | 5519 "31 Czech Republic 0.000000 33 0\n", |
5432 "9 Saudi Arabia 0.000000 8 0\n", | 5520 "119 Denmark 0.000000 13 0\n", |
5433 "0 Canada 0.025000 80 2\n", | 5521 "120 Kazakhstan 0.014286 70 1\n", |
5434 "57 Russia 0.050633 79 4\n", | 5522 "15 Netherlands 0.037037 54 2\n", |
5435 "109 Democratic Republic of the Congo 0.052632 38 2\n", | 5523 "105 Sudan 0.037037 54 2\n", |
5436 "51 Finland 0.052632 19 1\n", | 5524 "45 United States of America 0.051282 78 4\n", |
5437 "105 Sudan 0.055556 54 3\n", | 5525 "134 Paraguay 0.055556 18 1\n", |
5438 "writing file\n", | 5526 "writing file\n", |
5439 "iteration 6\n", | 5527 "iteration 9\n", |
5440 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | 5528 "../data/lda_data_melodia_8_30sec.pickle\n", |
5441 "(6560, 380) (6560,)\n", | 5529 "(6560, 381) (6560,)\n", |
5442 "detecting outliers...\n", | 5530 "detecting outliers...\n", |
5443 "most outliers \n", | 5531 "most outliers \n", |
5444 " Country Outliers N_Country N_Outliers\n", | 5532 " Country Outliers N_Country N_Outliers\n", |
5445 "60 Chad 0.666667 9 6\n", | 5533 "31 Ivory Coast 0.666667 12 8\n", |
5534 "136 Botswana 0.611111 72 44\n", | |
5446 "17 French Guiana 0.590909 22 13\n", | 5535 "17 French Guiana 0.590909 22 13\n", |
5447 "117 Zimbabwe 0.583333 12 7\n", | 5536 "59 Chad 0.555556 9 5\n", |
5448 "86 Gambia 0.575000 40 23\n", | |
5449 "78 El Salvador 0.538462 26 14\n", | 5537 "78 El Salvador 0.538462 26 14\n", |
5450 "43 Benin 0.523810 21 11\n", | 5538 "20 Pakistan 0.493151 73 36\n", |
5451 "115 Senegal 0.515152 33 17\n", | 5539 "106 Nepal 0.486842 76 37\n", |
5452 "136 Botswana 0.472222 72 34\n", | 5540 "42 Benin 0.476190 21 10\n", |
5541 "86 Gambia 0.450000 40 18\n", | |
5453 "104 Bhutan 0.444444 9 4\n", | 5542 "104 Bhutan 0.444444 9 4\n", |
5454 "84 Belize 0.441176 34 15\n", | |
5455 "least outliers \n", | 5543 "least outliers \n", |
5456 " Country Outliers N_Country N_Outliers\n", | 5544 " Country Outliers N_Country N_Outliers\n", |
5457 "1 Lithuania 0.000000 38 0\n", | 5545 "1 Lithuania 0.000000 38 0\n", |
5458 "107 Kiribati 0.000000 14 0\n", | 5546 "27 South Korea 0.000000 9 0\n", |
5459 "113 Iceland 0.000000 11 0\n", | 5547 "119 Denmark 0.000000 13 0\n", |
5460 "72 Ivory Coast 0.000000 12 0\n", | 5548 "44 United States of America 0.012821 78 1\n", |
5461 "119 Denmark 0.000000 13 0\n", | 5549 "120 Kazakhstan 0.014286 70 1\n", |
5462 "120 Kazakhstan 0.000000 70 0\n", | 5550 "74 Czech Republic 0.030303 33 1\n", |
5463 "28 Tajikistan 0.000000 15 0\n", | 5551 "18 New Zealand 0.037037 27 1\n", |
5464 "105 Sudan 0.018519 54 1\n", | 5552 "15 Netherlands 0.037037 54 2\n", |
5465 "15 Netherlands 0.018519 54 1\n", | 5553 "105 Sudan 0.037037 54 2\n", |
5466 "109 Democratic Republic of the Congo 0.026316 38 1\n", | 5554 "0 Canada 0.050000 80 4\n", |
5467 "writing file\n", | |
5468 "iteration 7\n", | |
5469 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | |
5470 "(6560, 380) (6560,)\n", | |
5471 "detecting outliers...\n", | |
5472 "most outliers \n", | |
5473 " Country Outliers N_Country N_Outliers\n", | |
5474 "95 Chad 0.555556 9 5\n", | |
5475 "86 Gambia 0.525000 40 21\n", | |
5476 "43 Benin 0.523810 21 11\n", | |
5477 "135 French Guiana 0.500000 22 11\n", | |
5478 "63 Senegal 0.484848 33 16\n", | |
5479 "14 Liberia 0.468750 32 15\n", | |
5480 "52 Indonesia 0.437500 80 35\n", | |
5481 "136 Botswana 0.430556 72 31\n", | |
5482 "6 Bolivia 0.428571 28 12\n", | |
5483 "92 Switzerland 0.428571 42 18\n", | |
5484 "least outliers \n", | |
5485 " Country Outliers N_Country N_Outliers\n", | |
5486 "119 Denmark 0.000000 13 0\n", | |
5487 "1 Lithuania 0.000000 38 0\n", | |
5488 "107 Kiribati 0.000000 14 0\n", | |
5489 "120 Kazakhstan 0.000000 70 0\n", | |
5490 "113 Iceland 0.000000 11 0\n", | |
5491 "94 Iraq 0.028571 70 2\n", | |
5492 "98 Uzbekistan 0.030303 33 1\n", | |
5493 "105 Sudan 0.037037 54 2\n", | |
5494 "85 Sierra Leone 0.037500 80 3\n", | |
5495 "109 Democratic Republic of the Congo 0.052632 38 2\n", | |
5496 "writing file\n", | |
5497 "iteration 8\n", | |
5498 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | |
5499 "(6560, 380) (6560,)\n", | |
5500 "detecting outliers...\n", | |
5501 "most outliers \n", | |
5502 " Country Outliers N_Country N_Outliers\n", | |
5503 "61 Chad 0.666667 9 6\n", | |
5504 "78 El Salvador 0.576923 26 15\n", | |
5505 "44 Benin 0.571429 21 12\n", | |
5506 "104 Bhutan 0.555556 9 5\n", | |
5507 "86 Gambia 0.550000 40 22\n", | |
5508 "17 French Guiana 0.545455 22 12\n", | |
5509 "94 Belize 0.470588 34 16\n", | |
5510 "14 Liberia 0.468750 32 15\n", | |
5511 "92 Switzerland 0.452381 42 19\n", | |
5512 "53 Indonesia 0.450000 80 36\n", | |
5513 "least outliers \n", | |
5514 " Country Outliers N_Country N_Outliers\n", | |
5515 "119 Denmark 0.000000 13 0\n", | |
5516 "1 Lithuania 0.000000 38 0\n", | |
5517 "120 Kazakhstan 0.000000 70 0\n", | |
5518 "107 Kiribati 0.000000 14 0\n", | |
5519 "98 Uzbekistan 0.030303 33 1\n", | |
5520 "105 Sudan 0.037037 54 2\n", | |
5521 "15 Netherlands 0.037037 54 2\n", | |
5522 "85 Sierra Leone 0.037500 80 3\n", | |
5523 "84 Iraq 0.042857 70 3\n", | |
5524 "109 Democratic Republic of the Congo 0.052632 38 2\n", | |
5525 "writing file\n", | |
5526 "iteration 9\n", | |
5527 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", | |
5528 "(6560, 380) (6560,)\n", | |
5529 "detecting outliers...\n", | |
5530 "most outliers \n", | |
5531 " Country Outliers N_Country N_Outliers\n", | |
5532 "95 Chad 0.555556 9 5\n", | |
5533 "104 Bhutan 0.555556 9 5\n", | |
5534 "86 Gambia 0.550000 40 22\n", | |
5535 "78 El Salvador 0.538462 26 14\n", | |
5536 "18 French Guiana 0.500000 22 11\n", | |
5537 "115 Senegal 0.484848 33 16\n", | |
5538 "44 Benin 0.476190 21 10\n", | |
5539 "41 Laos 0.470588 17 8\n", | |
5540 "6 Bolivia 0.464286 28 13\n", | |
5541 "65 Mozambique 0.444444 27 12\n", | |
5542 "least outliers \n", | |
5543 " Country Outliers N_Country N_Outliers\n", | |
5544 "119 Denmark 0.000000 13 0\n", | |
5545 "1 Lithuania 0.000000 38 0\n", | |
5546 "120 Kazakhstan 0.000000 70 0\n", | |
5547 "107 Kiribati 0.000000 14 0\n", | |
5548 "32 Czech Republic 0.000000 33 0\n", | |
5549 "85 Sierra Leone 0.050000 80 4\n", | |
5550 "0 Canada 0.050000 80 4\n", | |
5551 "109 Democratic Republic of the Congo 0.052632 38 2\n", | |
5552 "105 Sudan 0.055556 54 3\n", | |
5553 "16 Netherlands 0.055556 54 3\n", | |
5554 "writing file\n" | 5555 "writing file\n" |
5555 ] | 5556 ] |
5556 } | 5557 } |
5557 ], | 5558 ], |
5558 "source": [ | 5559 "source": [ |
5559 "from sklearn.model_selection import train_test_split\n", | 5560 "from sklearn.model_selection import train_test_split\n", |
5560 "\n", | 5561 "\n", |
5562 "#results_file = mapper.OUTPUT_FILES[0]\n", | |
5563 "results_file = '../data/lda_data_melodia_8_30sec.pickle'\n", | |
5561 "n_iters = 10\n", | 5564 "n_iters = 10\n", |
5562 "for n in range(n_iters):\n", | 5565 "for n in range(n_iters):\n", |
5563 " print \"iteration %d\" % n\n", | 5566 " print \"iteration %d\" % n\n", |
5564 " results_file = mapper.OUTPUT_FILES[0]\n", | |
5565 " print results_file\n", | 5567 " print results_file\n", |
5566 " X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n", | 5568 " X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n", |
5567 " # get only 80% of the dataset.. to vary the choice of outliers\n", | 5569 " # get only 80% of the dataset.. to vary the choice of outliers\n", |
5568 " X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n", | 5570 " X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n", |
5569 " print X.shape, Y.shape\n", | 5571 " print X.shape, Y.shape\n", |
5594 "<br> Sort by outlier percentage in descending order." | 5596 "<br> Sort by outlier percentage in descending order." |
5595 ] | 5597 ] |
5596 }, | 5598 }, |
5597 { | 5599 { |
5598 "cell_type": "code", | 5600 "cell_type": "code", |
5599 "execution_count": 7, | 5601 "execution_count": 18, |
5600 "metadata": { | 5602 "metadata": { |
5601 "collapsed": true | 5603 "collapsed": true |
5602 }, | 5604 }, |
5603 "outputs": [], | 5605 "outputs": [], |
5604 "source": [ | 5606 "source": [ |
5611 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)" | 5613 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)" |
5612 ] | 5614 ] |
5613 }, | 5615 }, |
5614 { | 5616 { |
5615 "cell_type": "code", | 5617 "cell_type": "code", |
5616 "execution_count": 8, | 5618 "execution_count": 19, |
5617 "metadata": {}, | 5619 "metadata": {}, |
5618 "outputs": [ | 5620 "outputs": [ |
5619 { | 5621 { |
5620 "data": { | 5622 "data": { |
5621 "text/plain": [ | 5623 "text/plain": [ |
5622 "(137, 10)" | 5624 "(137, 10)" |
5623 ] | 5625 ] |
5624 }, | 5626 }, |
5625 "execution_count": 8, | 5627 "execution_count": 19, |
5626 "metadata": {}, | 5628 "metadata": {}, |
5627 "output_type": "execute_result" | 5629 "output_type": "execute_result" |
5628 } | 5630 } |
5629 ], | 5631 ], |
5630 "source": [ | 5632 "source": [ |
5638 "Remove countries with 0% outliers as these are in random (probably alphabetical) order." | 5640 "Remove countries with 0% outliers as these are in random (probably alphabetical) order." |
5639 ] | 5641 ] |
5640 }, | 5642 }, |
5641 { | 5643 { |
5642 "cell_type": "code", | 5644 "cell_type": "code", |
5643 "execution_count": 9, | 5645 "execution_count": 20, |
5644 "metadata": {}, | 5646 "metadata": {}, |
5645 "outputs": [ | 5647 "outputs": [ |
5646 { | 5648 { |
5647 "name": "stdout", | 5649 "name": "stdout", |
5648 "output_type": "stream", | 5650 "output_type": "stream", |
5649 "text": [ | 5651 "text": [ |
5650 " Country Country Country Country Country \\\n", | 5652 " Country Country Country Country Country \\\n", |
5651 "0 Chad Chad Chad Chad Zimbabwe \n", | 5653 "0 Botswana Ivory Coast Chad Chad Benin \n", |
5652 "1 Gambia French Guiana Bhutan French Guiana Chad \n", | 5654 "1 Chad Botswana Botswana Botswana Botswana \n", |
5653 "2 French Guiana Gambia Gambia Gambia Gambia \n", | 5655 "2 Benin Chad Ivory Coast Gambia Ivory Coast \n", |
5654 "3 Benin Benin French Guiana Bolivia Benin \n", | 5656 "3 Ivory Coast Pakistan Pakistan Mozambique Chad \n", |
5655 "4 Liberia Bolivia El Salvador Botswana Bolivia \n", | 5657 "4 Pakistan Benin Gambia Ivory Coast Gambia \n", |
5656 "\n", | 5658 "\n", |
5657 " Country Country Country Country Country \n", | 5659 " Country Country Country Country Country \n", |
5658 "0 Chad Chad Chad Chad Bhutan \n", | 5660 "0 Chad Botswana Ivory Coast Botswana Ivory Coast \n", |
5659 "1 Benin French Guiana Gambia El Salvador Chad \n", | 5661 "1 Benin Ivory Coast Botswana Ivory Coast Botswana \n", |
5660 "2 Bhutan Zimbabwe Benin Benin Gambia \n", | 5662 "2 Botswana Gambia Gambia Gambia French Guiana \n", |
5661 "3 French Guiana Gambia French Guiana Bhutan El Salvador \n", | 5663 "3 Ivory Coast Nepal Chad Nepal Chad \n", |
5662 "4 Gambia El Salvador Senegal Gambia French Guiana \n", | 5664 "4 Pakistan Senegal Benin French Guiana El Salvador \n", |
5663 " Outliers Outliers Outliers Outliers Outliers Outliers Outliers \\\n", | 5665 " Outliers Outliers Outliers Outliers Outliers Outliers Outliers \\\n", |
5664 "0 0.555556 0.666667 0.666667 0.666667 0.583333 0.666667 0.666667 \n", | 5666 "0 0.625000 0.666667 0.666667 0.666667 0.619048 0.666667 0.597222 \n", |
5665 "1 0.525000 0.545455 0.555556 0.590909 0.555556 0.619048 0.590909 \n", | 5667 "1 0.555556 0.638889 0.625000 0.583333 0.597222 0.619048 0.583333 \n", |
5666 "2 0.500000 0.525000 0.550000 0.550000 0.550000 0.555556 0.583333 \n", | 5668 "2 0.523810 0.555556 0.583333 0.575000 0.583333 0.583333 0.500000 \n", |
5667 "3 0.476190 0.523810 0.545455 0.535714 0.523810 0.545455 0.575000 \n", | 5669 "3 0.500000 0.479452 0.534247 0.518519 0.555556 0.583333 0.500000 \n", |
5668 "4 0.468750 0.500000 0.538462 0.513889 0.500000 0.525000 0.538462 \n", | 5670 "4 0.493151 0.476190 0.525000 0.500000 0.525000 0.479452 0.484848 \n", |
5669 "\n", | 5671 "\n", |
5670 " Outliers Outliers Outliers \n", | 5672 " Outliers Outliers Outliers \n", |
5671 "0 0.555556 0.666667 0.555556 \n", | 5673 "0 0.666667 0.625000 0.666667 \n", |
5672 "1 0.525000 0.576923 0.555556 \n", | 5674 "1 0.611111 0.583333 0.611111 \n", |
5673 "2 0.523810 0.571429 0.550000 \n", | 5675 "2 0.575000 0.475000 0.590909 \n", |
5674 "3 0.500000 0.555556 0.538462 \n", | 5676 "3 0.555556 0.460526 0.555556 \n", |
5675 "4 0.484848 0.550000 0.500000 \n" | 5677 "4 0.523810 0.454545 0.538462 \n" |
5676 ] | 5678 ] |
5677 } | 5679 } |
5678 ], | 5680 ], |
5679 "source": [ | 5681 "source": [ |
5680 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n", | 5682 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n", |
5694 "And now kendalltau correlation" | 5696 "And now kendalltau correlation" |
5695 ] | 5697 ] |
5696 }, | 5698 }, |
5697 { | 5699 { |
5698 "cell_type": "code", | 5700 "cell_type": "code", |
5699 "execution_count": 71, | 5701 "execution_count": 21, |
5700 "metadata": { | 5702 "metadata": {}, |
5701 "collapsed": true | 5703 "outputs": [ |
5702 }, | 5704 { |
5703 "outputs": [], | 5705 "name": "stderr", |
5706 "output_type": "stream", | |
5707 "text": [ | |
5708 "/homes/mp305/anaconda/lib/python2.7/site-packages/scipy/stats/stats.py:250: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored.\n", | |
5709 " \"values. nan values will be ignored.\", RuntimeWarning)\n" | |
5710 ] | |
5711 } | |
5712 ], | |
5704 "source": [ | 5713 "source": [ |
5705 "from scipy.stats import kendalltau\n", | 5714 "from scipy.stats import kendalltau\n", |
5706 "r_, p_ = [], []\n", | 5715 "r_, p_ = [], []\n", |
5707 "ranked_countries_arr = ranked_countries.get_values()\n", | 5716 "ranked_countries_arr = ranked_countries.get_values()\n", |
5708 "for i in range(n_iters-1):\n", | 5717 "for i in range(n_iters-1):\n", |
5714 "p_ = np.array(p_)" | 5723 "p_ = np.array(p_)" |
5715 ] | 5724 ] |
5716 }, | 5725 }, |
5717 { | 5726 { |
5718 "cell_type": "code", | 5727 "cell_type": "code", |
5719 "execution_count": 72, | 5728 "execution_count": 22, |
5720 "metadata": {}, | 5729 "metadata": {}, |
5721 "outputs": [ | 5730 "outputs": [ |
5722 { | 5731 { |
5723 "name": "stdout", | 5732 "name": "stdout", |
5724 "output_type": "stream", | 5733 "output_type": "stream", |
5725 "text": [ | 5734 "text": [ |
5726 "0.0493253335359 0.410409379365\n" | 5735 "0.0554645319767 0.37638195368\n" |
5727 ] | 5736 ] |
5728 } | 5737 } |
5729 ], | 5738 ], |
5730 "source": [ | 5739 "source": [ |
5731 "print np.mean(r_), np.mean(p_)" | 5740 "print np.mean(r_), np.mean(p_)" |
5732 ] | 5741 ] |
5733 }, | 5742 }, |
5734 { | 5743 { |
5735 "cell_type": "code", | 5744 "cell_type": "code", |
5736 "execution_count": 80, | 5745 "execution_count": 23, |
5737 "metadata": {}, | 5746 "metadata": {}, |
5738 "outputs": [ | 5747 "outputs": [ |
5739 { | 5748 { |
5740 "name": "stdout", | 5749 "name": "stdout", |
5741 "output_type": "stream", | 5750 "output_type": "stream", |
5742 "text": [ | 5751 "text": [ |
5743 "0.240026302342 0.351418392739\n" | 5752 "0.248540800214 0.311313597605\n" |
5744 ] | 5753 ] |
5745 } | 5754 } |
5746 ], | 5755 ], |
5747 "source": [ | 5756 "source": [ |
5748 "from scipy.stats import spearmanr\n", | 5757 "from scipy.stats import spearmanr\n", |
5760 "let's focus only on the top K results" | 5769 "let's focus only on the top K results" |
5761 ] | 5770 ] |
5762 }, | 5771 }, |
5763 { | 5772 { |
5764 "cell_type": "code", | 5773 "cell_type": "code", |
5765 "execution_count": 81, | 5774 "execution_count": 24, |
5766 "metadata": {}, | 5775 "metadata": {}, |
5767 "outputs": [ | 5776 "outputs": [ |
5768 { | 5777 { |
5769 "name": "stdout", | 5778 "name": "stdout", |
5770 "output_type": "stream", | 5779 "output_type": "stream", |
5771 "text": [ | 5780 "text": [ |
5772 "0.237245179063 0.417925582965\n" | 5781 "0.294545454545 0.449007896087\n" |
5773 ] | 5782 ] |
5774 } | 5783 } |
5775 ], | 5784 ], |
5776 "source": [ | 5785 "source": [ |
5777 "k=10\n", | 5786 "k=10\n", |
5782 "print np.mean(r), np.mean(p)" | 5791 "print np.mean(r), np.mean(p)" |
5783 ] | 5792 ] |
5784 }, | 5793 }, |
5785 { | 5794 { |
5786 "cell_type": "code", | 5795 "cell_type": "code", |
5787 "execution_count": 75, | 5796 "execution_count": 25, |
5788 "metadata": { | 5797 "metadata": { |
5789 "collapsed": true | 5798 "collapsed": true |
5790 }, | 5799 }, |
5791 "outputs": [], | 5800 "outputs": [], |
5792 "source": [ | 5801 "source": [ |
5795 " common_set = common_set & set(ranked_countries_arr[:k, i])" | 5804 " common_set = common_set & set(ranked_countries_arr[:k, i])" |
5796 ] | 5805 ] |
5797 }, | 5806 }, |
5798 { | 5807 { |
5799 "cell_type": "code", | 5808 "cell_type": "code", |
5800 "execution_count": 76, | 5809 "execution_count": 26, |
5801 "metadata": {}, | 5810 "metadata": {}, |
5802 "outputs": [ | 5811 "outputs": [ |
5803 { | 5812 { |
5804 "data": { | 5813 "data": { |
5805 "text/plain": [ | 5814 "text/plain": [ |
5806 "{'Chad', 'French Guiana', 'Gambia'}" | 5815 "{'Botswana', 'Chad', 'Gambia', 'Ivory Coast', 'Pakistan'}" |
5807 ] | 5816 ] |
5808 }, | 5817 }, |
5809 "execution_count": 76, | 5818 "execution_count": 26, |
5810 "metadata": {}, | 5819 "metadata": {}, |
5811 "output_type": "execute_result" | 5820 "output_type": "execute_result" |
5812 } | 5821 } |
5813 ], | 5822 ], |
5814 "source": [ | 5823 "source": [ |
5822 "## Try precision at K" | 5831 "## Try precision at K" |
5823 ] | 5832 ] |
5824 }, | 5833 }, |
5825 { | 5834 { |
5826 "cell_type": "code", | 5835 "cell_type": "code", |
5827 "execution_count": 10, | 5836 "execution_count": 27, |
5828 "metadata": {}, | 5837 "metadata": { |
5838 "collapsed": true | |
5839 }, | |
5829 "outputs": [], | 5840 "outputs": [], |
5830 "source": [ | 5841 "source": [ |
5831 "# majority voting + precision at K (top5?)\n", | 5842 "# majority voting + precision at K (top5?)\n", |
5832 "from collections import Counter\n", | 5843 "from collections import Counter\n", |
5833 "K_vote = 10\n", | 5844 "K_vote = 10\n", |
5834 "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())" | 5845 "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())" |
5835 ] | 5846 ] |
5836 }, | 5847 }, |
5837 { | 5848 { |
5838 "cell_type": "code", | 5849 "cell_type": "code", |
5839 "execution_count": 11, | 5850 "execution_count": 28, |
5840 "metadata": {}, | 5851 "metadata": {}, |
5841 "outputs": [ | 5852 "outputs": [ |
5842 { | 5853 { |
5843 "data": { | 5854 "data": { |
5844 "text/html": [ | 5855 "text/html": [ |
5852 " </tr>\n", | 5863 " </tr>\n", |
5853 " </thead>\n", | 5864 " </thead>\n", |
5854 " <tbody>\n", | 5865 " <tbody>\n", |
5855 " <tr>\n", | 5866 " <tr>\n", |
5856 " <th>0</th>\n", | 5867 " <th>0</th>\n", |
5857 " <td>Brazil</td>\n", | 5868 " <td>Pakistan</td>\n", |
5858 " <td>1</td>\n", | 5869 " <td>10</td>\n", |
5859 " </tr>\n", | 5870 " </tr>\n", |
5860 " <tr>\n", | 5871 " <tr>\n", |
5861 " <th>1</th>\n", | 5872 " <th>1</th>\n", |
5862 " <td>Liberia</td>\n", | 5873 " <td>Bhutan</td>\n", |
5863 " <td>7</td>\n", | 5874 " <td>3</td>\n", |
5864 " </tr>\n", | 5875 " </tr>\n", |
5865 " <tr>\n", | 5876 " <tr>\n", |
5866 " <th>2</th>\n", | 5877 " <th>2</th>\n", |
5867 " <td>Belize</td>\n", | |
5868 " <td>2</td>\n", | |
5869 " </tr>\n", | |
5870 " <tr>\n", | |
5871 " <th>3</th>\n", | |
5872 " <td>Chad</td>\n", | 5878 " <td>Chad</td>\n", |
5873 " <td>10</td>\n", | 5879 " <td>10</td>\n", |
5874 " </tr>\n", | 5880 " </tr>\n", |
5875 " <tr>\n", | 5881 " <tr>\n", |
5882 " <th>3</th>\n", | |
5883 " <td>Liberia</td>\n", | |
5884 " <td>2</td>\n", | |
5885 " </tr>\n", | |
5886 " <tr>\n", | |
5876 " <th>4</th>\n", | 5887 " <th>4</th>\n", |
5877 " <td>Bhutan</td>\n", | 5888 " <td>El Salvador</td>\n", |
5878 " <td>7</td>\n", | 5889 " <td>5</td>\n", |
5879 " </tr>\n", | 5890 " </tr>\n", |
5880 " </tbody>\n", | 5891 " </tbody>\n", |
5881 "</table>\n", | 5892 "</table>\n", |
5882 "</div>" | 5893 "</div>" |
5883 ], | 5894 ], |
5884 "text/plain": [ | 5895 "text/plain": [ |
5885 " index 0\n", | 5896 " index 0\n", |
5886 "0 Brazil 1\n", | 5897 "0 Pakistan 10\n", |
5887 "1 Liberia 7\n", | 5898 "1 Bhutan 3\n", |
5888 "2 Belize 2\n", | 5899 "2 Chad 10\n", |
5889 "3 Chad 10\n", | 5900 "3 Liberia 2\n", |
5890 "4 Bhutan 7" | 5901 "4 El Salvador 5" |
5891 ] | 5902 ] |
5892 }, | 5903 }, |
5893 "execution_count": 11, | 5904 "execution_count": 28, |
5894 "metadata": {}, | 5905 "metadata": {}, |
5895 "output_type": "execute_result" | 5906 "output_type": "execute_result" |
5896 } | 5907 } |
5897 ], | 5908 ], |
5898 "source": [ | 5909 "source": [ |
5900 "df_country_vote.head()" | 5911 "df_country_vote.head()" |
5901 ] | 5912 ] |
5902 }, | 5913 }, |
5903 { | 5914 { |
5904 "cell_type": "code", | 5915 "cell_type": "code", |
5905 "execution_count": 12, | 5916 "execution_count": 29, |
5906 "metadata": {}, | 5917 "metadata": {}, |
5907 "outputs": [ | 5918 "outputs": [ |
5908 { | 5919 { |
5909 "data": { | 5920 "data": { |
5910 "text/html": [ | 5921 "text/html": [ |
5917 " <th>0</th>\n", | 5928 " <th>0</th>\n", |
5918 " </tr>\n", | 5929 " </tr>\n", |
5919 " </thead>\n", | 5930 " </thead>\n", |
5920 " <tbody>\n", | 5931 " <tbody>\n", |
5921 " <tr>\n", | 5932 " <tr>\n", |
5922 " <th>3</th>\n", | 5933 " <th>0</th>\n", |
5934 " <td>Pakistan</td>\n", | |
5935 " <td>10</td>\n", | |
5936 " </tr>\n", | |
5937 " <tr>\n", | |
5938 " <th>2</th>\n", | |
5923 " <td>Chad</td>\n", | 5939 " <td>Chad</td>\n", |
5924 " <td>10</td>\n", | 5940 " <td>10</td>\n", |
5925 " </tr>\n", | 5941 " </tr>\n", |
5926 " <tr>\n", | 5942 " <tr>\n", |
5927 " <th>6</th>\n", | 5943 " <th>5</th>\n", |
5928 " <td>Gambia</td>\n", | 5944 " <td>Gambia</td>\n", |
5929 " <td>10</td>\n", | 5945 " <td>10</td>\n", |
5930 " </tr>\n", | 5946 " </tr>\n", |
5931 " <tr>\n", | 5947 " <tr>\n", |
5932 " <th>12</th>\n", | 5948 " <th>10</th>\n", |
5933 " <td>French Guiana</td>\n", | 5949 " <td>Ivory Coast</td>\n", |
5934 " <td>10</td>\n", | 5950 " <td>10</td>\n", |
5935 " </tr>\n", | 5951 " </tr>\n", |
5936 " <tr>\n", | 5952 " <tr>\n", |
5937 " <th>18</th>\n", | 5953 " <th>12</th>\n", |
5938 " <td>Benin</td>\n", | 5954 " <td>Botswana</td>\n", |
5955 " <td>10</td>\n", | |
5956 " </tr>\n", | |
5957 " <tr>\n", | |
5958 " <th>6</th>\n", | |
5959 " <td>Nepal</td>\n", | |
5939 " <td>9</td>\n", | 5960 " <td>9</td>\n", |
5940 " </tr>\n", | 5961 " </tr>\n", |
5941 " <tr>\n", | 5962 " <tr>\n", |
5942 " <th>5</th>\n", | 5963 " <th>13</th>\n", |
5943 " <td>El Salvador</td>\n", | 5964 " <td>Benin</td>\n", |
5944 " <td>9</td>\n", | |
5945 " </tr>\n", | |
5946 " <tr>\n", | |
5947 " <th>17</th>\n", | |
5948 " <td>Botswana</td>\n", | |
5949 " <td>8</td>\n", | 5965 " <td>8</td>\n", |
5950 " </tr>\n", | 5966 " </tr>\n", |
5951 " <tr>\n", | 5967 " <tr>\n", |
5952 " <th>4</th>\n", | 5968 " <th>8</th>\n", |
5953 " <td>Bhutan</td>\n", | 5969 " <td>Senegal</td>\n", |
5954 " <td>7</td>\n", | 5970 " <td>7</td>\n", |
5955 " </tr>\n", | 5971 " </tr>\n", |
5956 " <tr>\n", | 5972 " <tr>\n", |
5957 " <th>1</th>\n", | 5973 " <th>9</th>\n", |
5958 " <td>Liberia</td>\n", | 5974 " <td>French Guiana</td>\n", |
5959 " <td>7</td>\n", | 5975 " <td>7</td>\n", |
5960 " </tr>\n", | 5976 " </tr>\n", |
5961 " <tr>\n", | 5977 " <tr>\n", |
5962 " <th>16</th>\n", | 5978 " <th>4</th>\n", |
5963 " <td>Bolivia</td>\n", | 5979 " <td>El Salvador</td>\n", |
5964 " <td>6</td>\n", | 5980 " <td>5</td>\n", |
5965 " </tr>\n", | |
5966 " <tr>\n", | |
5967 " <th>10</th>\n", | |
5968 " <td>Senegal</td>\n", | |
5969 " <td>6</td>\n", | |
5970 " </tr>\n", | 5981 " </tr>\n", |
5971 " <tr>\n", | 5982 " <tr>\n", |
5972 " <th>11</th>\n", | 5983 " <th>11</th>\n", |
5973 " <td>Zimbabwe</td>\n", | 5984 " <td>Mozambique</td>\n", |
5985 " <td>5</td>\n", | |
5986 " </tr>\n", | |
5987 " <tr>\n", | |
5988 " <th>7</th>\n", | |
5989 " <td>Uganda</td>\n", | |
5990 " <td>4</td>\n", | |
5991 " </tr>\n", | |
5992 " <tr>\n", | |
5993 " <th>1</th>\n", | |
5994 " <td>Bhutan</td>\n", | |
5974 " <td>3</td>\n", | 5995 " <td>3</td>\n", |
5975 " </tr>\n", | 5996 " </tr>\n", |
5976 " <tr>\n", | 5997 " <tr>\n", |
5977 " <th>14</th>\n", | 5998 " <th>3</th>\n", |
5978 " <td>Switzerland</td>\n", | 5999 " <td>Liberia</td>\n", |
5979 " <td>3</td>\n", | |
5980 " </tr>\n", | |
5981 " <tr>\n", | |
5982 " <th>15</th>\n", | |
5983 " <td>Mozambique</td>\n", | |
5984 " <td>3</td>\n", | |
5985 " </tr>\n", | |
5986 " <tr>\n", | |
5987 " <th>2</th>\n", | |
5988 " <td>Belize</td>\n", | |
5989 " <td>2</td>\n", | 6000 " <td>2</td>\n", |
5990 " </tr>\n", | |
5991 " <tr>\n", | |
5992 " <th>7</th>\n", | |
5993 " <td>Indonesia</td>\n", | |
5994 " <td>2</td>\n", | |
5995 " </tr>\n", | |
5996 " <tr>\n", | |
5997 " <th>8</th>\n", | |
5998 " <td>Guatemala</td>\n", | |
5999 " <td>2</td>\n", | |
6000 " </tr>\n", | |
6001 " <tr>\n", | |
6002 " <th>0</th>\n", | |
6003 " <td>Brazil</td>\n", | |
6004 " <td>1</td>\n", | |
6005 " </tr>\n", | |
6006 " <tr>\n", | |
6007 " <th>13</th>\n", | |
6008 " <td>Laos</td>\n", | |
6009 " <td>1</td>\n", | |
6010 " </tr>\n", | |
6011 " <tr>\n", | |
6012 " <th>9</th>\n", | |
6013 " <td>Malta</td>\n", | |
6014 " <td>1</td>\n", | |
6015 " </tr>\n", | 6001 " </tr>\n", |
6016 " </tbody>\n", | 6002 " </tbody>\n", |
6017 "</table>\n", | 6003 "</table>\n", |
6018 "</div>" | 6004 "</div>" |
6019 ], | 6005 ], |
6020 "text/plain": [ | 6006 "text/plain": [ |
6021 " index 0\n", | 6007 " index 0\n", |
6022 "3 Chad 10\n", | 6008 "0 Pakistan 10\n", |
6023 "6 Gambia 10\n", | 6009 "2 Chad 10\n", |
6024 "12 French Guiana 10\n", | 6010 "5 Gambia 10\n", |
6025 "18 Benin 9\n", | 6011 "10 Ivory Coast 10\n", |
6026 "5 El Salvador 9\n", | 6012 "12 Botswana 10\n", |
6027 "17 Botswana 8\n", | 6013 "6 Nepal 9\n", |
6028 "4 Bhutan 7\n", | 6014 "13 Benin 8\n", |
6029 "1 Liberia 7\n", | 6015 "8 Senegal 7\n", |
6030 "16 Bolivia 6\n", | 6016 "9 French Guiana 7\n", |
6031 "10 Senegal 6\n", | 6017 "4 El Salvador 5\n", |
6032 "11 Zimbabwe 3\n", | 6018 "11 Mozambique 5\n", |
6033 "14 Switzerland 3\n", | 6019 "7 Uganda 4\n", |
6034 "15 Mozambique 3\n", | 6020 "1 Bhutan 3\n", |
6035 "2 Belize 2\n", | 6021 "3 Liberia 2" |
6036 "7 Indonesia 2\n", | |
6037 "8 Guatemala 2\n", | |
6038 "0 Brazil 1\n", | |
6039 "13 Laos 1\n", | |
6040 "9 Malta 1" | |
6041 ] | 6022 ] |
6042 }, | 6023 }, |
6043 "execution_count": 12, | 6024 "execution_count": 29, |
6044 "metadata": {}, | 6025 "metadata": {}, |
6045 "output_type": "execute_result" | 6026 "output_type": "execute_result" |
6046 } | 6027 } |
6047 ], | 6028 ], |
6048 "source": [ | 6029 "source": [ |
6049 "df_country_vote.sort_values(0, ascending=False)" | 6030 "df_country_vote.sort_values(0, ascending=False)" |
6050 ] | 6031 ] |
6051 }, | 6032 }, |
6052 { | 6033 { |
6053 "cell_type": "code", | 6034 "cell_type": "code", |
6054 "execution_count": 14, | 6035 "execution_count": 30, |
6055 "metadata": {}, | 6036 "metadata": {}, |
6056 "outputs": [ | 6037 "outputs": [ |
6057 { | 6038 { |
6058 "name": "stdout", | 6039 "name": "stdout", |
6059 "output_type": "stream", | 6040 "output_type": "stream", |
6060 "text": [ | 6041 "text": [ |
6061 "0.51 0.0830662386292\n" | 6042 "0.67 0.0640312423743\n" |
6062 ] | 6043 ] |
6063 } | 6044 } |
6064 ], | 6045 ], |
6065 "source": [ | 6046 "source": [ |
6066 "def precision_at_k(array, gr_truth, k):\n", | 6047 "def precision_at_k(array, gr_truth, k):\n", |
6075 "print np.mean(p_), np.std(p_)" | 6056 "print np.mean(p_), np.std(p_)" |
6076 ] | 6057 ] |
6077 }, | 6058 }, |
6078 { | 6059 { |
6079 "cell_type": "code", | 6060 "cell_type": "code", |
6080 "execution_count": 15, | 6061 "execution_count": 31, |
6081 "metadata": {}, | 6062 "metadata": {}, |
6082 "outputs": [ | 6063 "outputs": [ |
6083 { | 6064 { |
6084 "data": { | 6065 "data": { |
6085 "text/plain": [ | 6066 "text/plain": [ |
6086 "array([ 0.6, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.7, 0.4])" | 6067 "array([ 0.6, 0.7, 0.7, 0.6, 0.6, 0.7, 0.8, 0.6, 0.7, 0.7])" |
6087 ] | 6068 ] |
6088 }, | 6069 }, |
6089 "execution_count": 15, | 6070 "execution_count": 31, |
6090 "metadata": {}, | 6071 "metadata": {}, |
6091 "output_type": "execute_result" | 6072 "output_type": "execute_result" |
6092 } | 6073 } |
6093 ], | 6074 ], |
6094 "source": [ | 6075 "source": [ |