comparison notebooks/sensitivity_experiment.ipynb @ 90:e279ccea5f9b branch-tests

results on 30sec
author mpanteli <m.x.panteli@gmail.com>
date Mon, 02 Oct 2017 15:32:23 +0100
parents 4395037087b6
children
comparison
equal deleted inserted replaced
89:8a2d56880050 90:e279ccea5f9b
1 { 1 {
2 "cells": [ 2 "cells": [
3 { 3 {
4 "cell_type": "code", 4 "cell_type": "code",
5 "execution_count": 1, 5 "execution_count": 16,
6 "metadata": {}, 6 "metadata": {},
7 "outputs": [ 7 "outputs": [
8 { 8 {
9 "name": "stdout", 9 "name": "stdout",
10 "output_type": "stream", 10 "output_type": "stream",
11 "text": [ 11 "text": [
12 "ERROR! Session/line number was not unique in database. History logging moved to new session 32\n" 12 "The autoreload extension is already loaded. To reload it, use:\n",
13 ] 13 " %reload_ext autoreload\n"
14 },
15 {
16 "name": "stderr",
17 "output_type": "stream",
18 "text": [
19 "/homes/mp305/anaconda/lib/python2.7/site-packages/librosa/core/audio.py:33: UserWarning: Could not import scikits.samplerate. Falling back to scipy.signal\n",
20 " warnings.warn('Could not import scikits.samplerate. '\n"
21 ] 14 ]
22 } 15 }
23 ], 16 ],
24 "source": [ 17 "source": [
25 "import numpy as np\n", 18 "import numpy as np\n",
40 ] 33 ]
41 }, 34 },
42 { 35 {
43 "cell_type": "code", 36 "cell_type": "code",
44 "execution_count": 2, 37 "execution_count": 2,
45 "metadata": {}, 38 "metadata": {
39 "collapsed": true
40 },
46 "outputs": [], 41 "outputs": [],
47 "source": [ 42 "source": [
48 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", 43 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n",
49 "n_iters = 10" 44 "n_iters = 10"
50 ] 45 ]
5253 "Let's try without changing the LDA mapping, so just load the original dataset and get outlier countries by selecting 80% of the recordigns (in stratified manner).'" 5248 "Let's try without changing the LDA mapping, so just load the original dataset and get outlier countries by selecting 80% of the recordigns (in stratified manner).'"
5254 ] 5249 ]
5255 }, 5250 },
5256 { 5251 {
5257 "cell_type": "code", 5252 "cell_type": "code",
5258 "execution_count": 67, 5253 "execution_count": 17,
5259 "metadata": {}, 5254 "metadata": {},
5260 "outputs": [ 5255 "outputs": [
5261 { 5256 {
5262 "name": "stdout", 5257 "name": "stdout",
5263 "output_type": "stream", 5258 "output_type": "stream",
5264 "text": [ 5259 "text": [
5265 "iteration 0\n", 5260 "iteration 0\n",
5266 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5261 "../data/lda_data_melodia_8_30sec.pickle\n",
5267 "(6560, 380) (6560,)\n", 5262 "(6560, 381) (6560,)\n",
5268 "detecting outliers...\n", 5263 "detecting outliers...\n",
5269 "most outliers \n", 5264 "most outliers \n",
5270 " Country Outliers N_Country N_Outliers\n", 5265 " Country Outliers N_Country N_Outliers\n",
5271 "95 Chad 0.555556 9 5\n", 5266 "136 Botswana 0.625000 72 45\n",
5272 "86 Gambia 0.525000 40 21\n", 5267 "59 Chad 0.555556 9 5\n",
5273 "135 French Guiana 0.500000 22 11\n", 5268 "42 Benin 0.523810 21 11\n",
5274 "44 Benin 0.476190 21 10\n", 5269 "31 Ivory Coast 0.500000 12 6\n",
5275 "15 Liberia 0.468750 32 15\n", 5270 "20 Pakistan 0.493151 73 36\n",
5276 "136 Botswana 0.458333 72 33\n", 5271 "63 Mozambique 0.481481 27 13\n",
5272 "106 Nepal 0.460526 76 35\n",
5273 "17 French Guiana 0.454545 22 10\n",
5277 "104 Bhutan 0.444444 9 4\n", 5274 "104 Bhutan 0.444444 9 4\n",
5278 "68 Brazil 0.437500 80 35\n", 5275 "86 Gambia 0.425000 40 17\n",
5279 "92 Switzerland 0.428571 42 18\n",
5280 "78 El Salvador 0.423077 26 11\n",
5281 "least outliers \n", 5276 "least outliers \n",
5282 " Country Outliers N_Country N_Outliers\n", 5277 " Country Outliers N_Country N_Outliers\n",
5283 "1 Lithuania 0.000000 38 0\n", 5278 "100 Antigua and Barbuda 0.000000 34 0\n",
5284 "29 Tajikistan 0.000000 15 0\n", 5279 "28 Tajikistan 0.000000 15 0\n",
5285 "32 Czech Republic 0.000000 33 0\n", 5280 "113 Iceland 0.000000 11 0\n",
5286 "107 Kiribati 0.000000 14 0\n", 5281 "119 Denmark 0.000000 13 0\n",
5287 "120 Kazakhstan 0.000000 70 0\n", 5282 "27 South Korea 0.000000 9 0\n",
5288 "119 Denmark 0.000000 13 0\n", 5283 "1 Lithuania 0.000000 38 0\n",
5289 "0 Canada 0.050000 80 4\n", 5284 "120 Kazakhstan 0.014286 70 1\n",
5290 "73 Nigeria 0.051948 77 4\n", 5285 "15 Netherlands 0.018519 54 1\n",
5291 "109 Democratic Republic of the Congo 0.052632 38 2\n", 5286 "74 Czech Republic 0.030303 33 1\n",
5292 "105 Sudan 0.055556 54 3\n", 5287 "105 Sudan 0.037037 54 2\n",
5293 "writing file\n", 5288 "writing file\n",
5294 "iteration 1\n", 5289 "iteration 1\n",
5295 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5290 "../data/lda_data_melodia_8_30sec.pickle\n",
5296 "(6560, 380) (6560,)\n", 5291 "(6560, 381) (6560,)\n",
5297 "detecting outliers...\n", 5292 "detecting outliers...\n",
5298 "most outliers \n", 5293 "most outliers \n",
5299 " Country Outliers N_Country N_Outliers\n", 5294 " Country Outliers N_Country N_Outliers\n",
5300 "95 Chad 0.666667 9 6\n", 5295 "31 Ivory Coast 0.666667 12 8\n",
5301 "17 French Guiana 0.545455 22 12\n", 5296 "136 Botswana 0.638889 72 46\n",
5302 "86 Gambia 0.525000 40 21\n", 5297 "95 Chad 0.555556 9 5\n",
5303 "44 Benin 0.523810 21 11\n", 5298 "20 Pakistan 0.479452 73 35\n",
5304 "6 Bolivia 0.500000 28 14\n", 5299 "43 Benin 0.476190 21 10\n",
5305 "78 El Salvador 0.500000 26 13\n", 5300 "86 Gambia 0.475000 40 19\n",
5306 "136 Botswana 0.486111 72 35\n", 5301 "78 El Salvador 0.461538 26 12\n",
5307 "10 Guatemala 0.465116 43 20\n",
5308 "115 Senegal 0.454545 33 15\n", 5302 "115 Senegal 0.454545 33 15\n",
5303 "135 French Guiana 0.454545 22 10\n",
5309 "104 Bhutan 0.444444 9 4\n", 5304 "104 Bhutan 0.444444 9 4\n",
5310 "least outliers \n", 5305 "least outliers \n",
5311 " Country Outliers N_Country N_Outliers\n", 5306 " Country Outliers N_Country N_Outliers\n",
5312 "120 Kazakhstan 0.000000 70 0\n", 5307 "1 Lithuania 0.000000 38 0\n",
5313 "1 Lithuania 0.000000 38 0\n", 5308 "107 Kiribati 0.000000 14 0\n",
5314 "107 Kiribati 0.000000 14 0\n", 5309 "119 Denmark 0.000000 13 0\n",
5315 "119 Denmark 0.000000 13 0\n", 5310 "27 South Korea 0.000000 9 0\n",
5316 "9 Saudi Arabia 0.000000 8 0\n", 5311 "120 Kazakhstan 0.014286 70 1\n",
5317 "98 Uzbekistan 0.030303 33 1\n", 5312 "105 Sudan 0.018519 54 1\n",
5318 "15 Netherlands 0.037037 54 2\n", 5313 "74 Czech Republic 0.030303 33 1\n",
5319 "57 Russia 0.037975 79 3\n", 5314 "93 Grenada 0.033333 30 1\n",
5320 "109 Democratic Republic of the Congo 0.052632 38 2\n", 5315 "15 Netherlands 0.037037 54 2\n",
5321 "105 Sudan 0.055556 54 3\n", 5316 "0 Canada 0.037500 80 3\n",
5322 "writing file\n", 5317 "writing file\n",
5323 "iteration 2\n", 5318 "iteration 2\n",
5324 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5319 "../data/lda_data_melodia_8_30sec.pickle\n",
5325 "(6560, 380) (6560,)\n", 5320 "(6560, 381) (6560,)\n",
5326 "detecting outliers...\n", 5321 "detecting outliers...\n",
5327 "most outliers \n", 5322 "most outliers \n",
5328 " Country Outliers N_Country N_Outliers\n", 5323 " Country Outliers N_Country N_Outliers\n",
5329 "95 Chad 0.666667 9 6\n", 5324 "61 Chad 0.666667 9 6\n",
5330 "104 Bhutan 0.555556 9 5\n", 5325 "136 Botswana 0.625000 72 45\n",
5331 "86 Gambia 0.550000 40 22\n", 5326 "72 Ivory Coast 0.583333 12 7\n",
5332 "135 French Guiana 0.545455 22 12\n", 5327 "20 Pakistan 0.534247 73 39\n",
5333 "78 El Salvador 0.538462 26 14\n", 5328 "86 Gambia 0.525000 40 21\n",
5334 "43 Benin 0.523810 21 11\n", 5329 "44 Benin 0.476190 21 10\n",
5335 "6 Bolivia 0.500000 28 14\n", 5330 "78 El Salvador 0.461538 26 12\n",
5336 "136 Botswana 0.486111 72 35\n", 5331 "106 Nepal 0.434211 76 33\n",
5337 "64 Mozambique 0.444444 27 12\n", 5332 "66 Uganda 0.426471 68 29\n",
5338 "14 Liberia 0.437500 32 14\n", 5333 "135 French Guiana 0.409091 22 9\n",
5339 "least outliers \n", 5334 "least outliers \n",
5340 " Country Outliers N_Country N_Outliers\n", 5335 " Country Outliers N_Country N_Outliers\n",
5341 "1 Lithuania 0.000000 38 0\n", 5336 "1 Lithuania 0.000000 38 0\n",
5342 "107 Kiribati 0.000000 14 0\n", 5337 "119 Denmark 0.000000 13 0\n",
5343 "119 Denmark 0.000000 13 0\n", 5338 "31 Czech Republic 0.000000 33 0\n",
5344 "120 Kazakhstan 0.000000 70 0\n", 5339 "30 Afghanistan 0.000000 19 0\n",
5345 "15 Netherlands 0.018519 54 1\n", 5340 "27 South Korea 0.000000 9 0\n",
5346 "105 Sudan 0.037037 54 2\n", 5341 "102 Nicaragua 0.000000 17 0\n",
5347 "0 Canada 0.050000 80 4\n", 5342 "120 Kazakhstan 0.014286 70 1\n",
5348 "109 Democratic Republic of the Congo 0.052632 38 2\n", 5343 "15 Netherlands 0.018519 54 1\n",
5349 "94 Iraq 0.057971 69 4\n", 5344 "43 Malawi 0.040000 25 1\n",
5350 "31 Czech Republic 0.060606 33 2\n", 5345 "0 Canada 0.050000 80 4\n",
5351 "writing file\n", 5346 "writing file\n",
5352 "iteration 3\n", 5347 "iteration 3\n",
5353 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5348 "../data/lda_data_melodia_8_30sec.pickle\n",
5354 "(6560, 380) (6560,)\n", 5349 "(6560, 381) (6560,)\n",
5350 "detecting outliers...\n",
5351 "most outliers \n",
5352 " Country Outliers N_Country N_Outliers\n",
5353 "95 Chad 0.666667 9 6\n",
5354 "136 Botswana 0.583333 72 42\n",
5355 "86 Gambia 0.575000 40 23\n",
5356 "63 Mozambique 0.518519 27 14\n",
5357 "31 Ivory Coast 0.500000 12 6\n",
5358 "42 Benin 0.476190 21 10\n",
5359 "106 Nepal 0.473684 76 36\n",
5360 "20 Pakistan 0.452055 73 33\n",
5361 "64 Uganda 0.426471 68 29\n",
5362 "62 Senegal 0.424242 33 14\n",
5363 "least outliers \n",
5364 " Country Outliers N_Country N_Outliers\n",
5365 "1 Lithuania 0.000000 38 0\n",
5366 "74 Czech Republic 0.000000 33 0\n",
5367 "27 South Korea 0.000000 9 0\n",
5368 "119 Denmark 0.000000 13 0\n",
5369 "120 Kazakhstan 0.014286 70 1\n",
5370 "105 Sudan 0.037037 54 2\n",
5371 "15 Netherlands 0.037037 54 2\n",
5372 "65 Hungary 0.049180 61 3\n",
5373 "0 Canada 0.050000 80 4\n",
5374 "44 United States of America 0.051282 78 4\n",
5375 "writing file\n",
5376 "iteration 4\n",
5377 "../data/lda_data_melodia_8_30sec.pickle\n",
5378 "(6560, 381) (6560,)\n",
5379 "detecting outliers...\n",
5380 "most outliers \n",
5381 " Country Outliers N_Country N_Outliers\n",
5382 "43 Benin 0.619048 21 13\n",
5383 "136 Botswana 0.597222 72 43\n",
5384 "72 Ivory Coast 0.583333 12 7\n",
5385 "95 Chad 0.555556 9 5\n",
5386 "86 Gambia 0.525000 40 21\n",
5387 "64 Mozambique 0.518519 27 14\n",
5388 "20 Pakistan 0.506849 73 37\n",
5389 "106 Nepal 0.486842 76 37\n",
5390 "65 Uganda 0.470588 68 32\n",
5391 "63 Senegal 0.454545 33 15\n",
5392 "least outliers \n",
5393 " Country Outliers N_Country N_Outliers\n",
5394 "120 Kazakhstan 0.000000 70 0\n",
5395 "119 Denmark 0.000000 13 0\n",
5396 "27 South Korea 0.000000 9 0\n",
5397 "1 Lithuania 0.000000 38 0\n",
5398 "107 Kiribati 0.000000 14 0\n",
5399 "31 Czech Republic 0.030303 33 1\n",
5400 "15 Netherlands 0.037037 54 2\n",
5401 "0 Canada 0.037500 80 3\n",
5402 "50 Finland 0.052632 19 1\n",
5403 "30 Afghanistan 0.052632 19 1\n",
5404 "writing file\n",
5405 "iteration 5\n",
5406 "../data/lda_data_melodia_8_30sec.pickle\n",
5407 "(6560, 381) (6560,)\n",
5355 "detecting outliers...\n", 5408 "detecting outliers...\n",
5356 "most outliers \n", 5409 "most outliers \n",
5357 " Country Outliers N_Country N_Outliers\n", 5410 " Country Outliers N_Country N_Outliers\n",
5358 "60 Chad 0.666667 9 6\n", 5411 "60 Chad 0.666667 9 6\n",
5359 "17 French Guiana 0.590909 22 13\n", 5412 "43 Benin 0.619048 21 13\n",
5360 "86 Gambia 0.550000 40 22\n", 5413 "136 Botswana 0.583333 72 42\n",
5361 "6 Bolivia 0.535714 28 15\n", 5414 "72 Ivory Coast 0.583333 12 7\n",
5362 "136 Botswana 0.513889 72 37\n", 5415 "20 Pakistan 0.479452 73 35\n",
5363 "64 Mozambique 0.481481 27 13\n", 5416 "86 Gambia 0.475000 40 19\n",
5364 "14 Liberia 0.468750 32 15\n",
5365 "78 El Salvador 0.461538 26 12\n", 5417 "78 El Salvador 0.461538 26 12\n",
5366 "115 Senegal 0.454545 33 15\n", 5418 "106 Nepal 0.460526 76 35\n",
5367 "108 Malta 0.437500 16 7\n", 5419 "63 Senegal 0.454545 33 15\n",
5420 "17 French Guiana 0.409091 22 9\n",
5368 "least outliers \n", 5421 "least outliers \n",
5369 " Country Outliers N_Country N_Outliers\n", 5422 " Country Outliers N_Country N_Outliers\n",
5370 "120 Kazakhstan 0.000000 70 0\n",
5371 "1 Lithuania 0.000000 38 0\n", 5423 "1 Lithuania 0.000000 38 0\n",
5372 "30 Afghanistan 0.000000 19 0\n", 5424 "27 South Korea 0.000000 9 0\n",
5373 "119 Denmark 0.000000 13 0\n", 5425 "119 Denmark 0.000000 13 0\n",
5374 "107 Kiribati 0.000000 14 0\n", 5426 "9 Saudi Arabia 0.000000 8 0\n",
5427 "120 Kazakhstan 0.014286 70 1\n",
5375 "31 Czech Republic 0.030303 33 1\n", 5428 "31 Czech Republic 0.030303 33 1\n",
5376 "98 Uzbekistan 0.030303 33 1\n",
5377 "15 Netherlands 0.037037 54 2\n", 5429 "15 Netherlands 0.037037 54 2\n",
5378 "105 Sudan 0.037037 54 2\n", 5430 "105 Sudan 0.037037 54 2\n",
5379 "84 Iraq 0.042857 70 3\n", 5431 "0 Canada 0.037500 80 3\n",
5432 "112 Israel 0.037500 80 3\n",
5380 "writing file\n", 5433 "writing file\n",
5381 "iteration 4\n", 5434 "iteration 6\n",
5382 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5435 "../data/lda_data_melodia_8_30sec.pickle\n",
5383 "(6560, 380) (6560,)\n", 5436 "(6560, 381) (6560,)\n",
5384 "detecting outliers...\n", 5437 "detecting outliers...\n",
5385 "most outliers \n", 5438 "most outliers \n",
5386 " Country Outliers N_Country N_Outliers\n", 5439 " Country Outliers N_Country N_Outliers\n",
5387 "117 Zimbabwe 0.583333 12 7\n", 5440 "136 Botswana 0.597222 72 43\n",
5388 "60 Chad 0.555556 9 5\n", 5441 "72 Ivory Coast 0.583333 12 7\n",
5389 "86 Gambia 0.550000 40 22\n", 5442 "106 Nepal 0.500000 76 38\n",
5390 "43 Benin 0.523810 21 11\n", 5443 "86 Gambia 0.500000 40 20\n",
5391 "6 Bolivia 0.500000 28 14\n", 5444 "115 Senegal 0.484848 33 16\n",
5392 "135 French Guiana 0.500000 22 11\n", 5445 "14 Liberia 0.468750 32 15\n",
5393 "136 Botswana 0.472222 72 34\n",
5394 "78 El Salvador 0.461538 26 12\n", 5446 "78 El Salvador 0.461538 26 12\n",
5395 "10 Guatemala 0.441860 43 19\n", 5447 "135 French Guiana 0.454545 22 10\n",
5396 "14 Liberia 0.437500 32 14\n", 5448 "20 Pakistan 0.452055 73 33\n",
5449 "95 Chad 0.444444 9 4\n",
5397 "least outliers \n", 5450 "least outliers \n",
5398 " Country Outliers N_Country N_Outliers\n", 5451 " Country Outliers N_Country N_Outliers\n",
5399 "1 Lithuania 0.000000 38 0\n", 5452 "113 Iceland 0.000000 11 0\n",
5400 "107 Kiribati 0.000000 14 0\n", 5453 "1 Lithuania 0.000000 38 0\n",
5401 "119 Denmark 0.000000 13 0\n", 5454 "119 Denmark 0.000000 13 0\n",
5402 "120 Kazakhstan 0.000000 70 0\n", 5455 "31 Czech Republic 0.000000 33 0\n",
5403 "27 South Korea 0.000000 9 0\n", 5456 "27 South Korea 0.000000 9 0\n",
5404 "109 Democratic Republic of the Congo 0.026316 38 1\n", 5457 "15 Netherlands 0.000000 54 0\n",
5405 "94 Iraq 0.028571 70 2\n", 5458 "120 Kazakhstan 0.014286 70 1\n",
5406 "31 Czech Republic 0.030303 33 1\n", 5459 "30 Afghanistan 0.052632 19 1\n",
5407 "105 Sudan 0.037037 54 2\n", 5460 "58 Bulgaria 0.054054 37 2\n",
5408 "85 Sierra Leone 0.050000 80 4\n", 5461 "105 Sudan 0.055556 54 3\n",
5409 "writing file\n", 5462 "writing file\n",
5410 "iteration 5\n", 5463 "iteration 7\n",
5411 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5464 "../data/lda_data_melodia_8_30sec.pickle\n"
5412 "(6560, 380) (6560,)\n", 5465 ]
5466 },
5467 {
5468 "name": "stdout",
5469 "output_type": "stream",
5470 "text": [
5471 "(6560, 381) (6560,)\n",
5472 "detecting outliers...\n",
5473 "most outliers \n",
5474 " Country Outliers N_Country N_Outliers\n",
5475 "72 Ivory Coast 0.666667 12 8\n",
5476 "136 Botswana 0.611111 72 44\n",
5477 "86 Gambia 0.575000 40 23\n",
5478 "95 Chad 0.555556 9 5\n",
5479 "44 Benin 0.523810 21 11\n",
5480 "64 Senegal 0.484848 33 16\n",
5481 "106 Nepal 0.460526 76 35\n",
5482 "20 Pakistan 0.452055 73 33\n",
5483 "65 Mozambique 0.444444 27 12\n",
5484 "66 Uganda 0.441176 68 30\n",
5485 "least outliers \n",
5486 " Country Outliers N_Country N_Outliers\n",
5487 "1 Lithuania 0.000000 38 0\n",
5488 "119 Denmark 0.000000 13 0\n",
5489 "113 Iceland 0.000000 11 0\n",
5490 "27 South Korea 0.000000 9 0\n",
5491 "120 Kazakhstan 0.014286 70 1\n",
5492 "57 Russia 0.025316 79 2\n",
5493 "46 United States of America 0.025641 78 2\n",
5494 "31 Czech Republic 0.030303 33 1\n",
5495 "15 Netherlands 0.037037 54 2\n",
5496 "0 Canada 0.050000 80 4\n",
5497 "writing file\n",
5498 "iteration 8\n",
5499 "../data/lda_data_melodia_8_30sec.pickle\n",
5500 "(6560, 381) (6560,)\n",
5413 "detecting outliers...\n", 5501 "detecting outliers...\n",
5414 "most outliers \n", 5502 "most outliers \n",
5415 " Country Outliers N_Country N_Outliers\n", 5503 " Country Outliers N_Country N_Outliers\n",
5416 "61 Chad 0.666667 9 6\n", 5504 "136 Botswana 0.625000 72 45\n",
5417 "44 Benin 0.619048 21 13\n", 5505 "72 Ivory Coast 0.583333 12 7\n",
5418 "104 Bhutan 0.555556 9 5\n", 5506 "86 Gambia 0.475000 40 19\n",
5419 "18 French Guiana 0.545455 22 12\n", 5507 "106 Nepal 0.460526 76 35\n",
5420 "86 Gambia 0.525000 40 21\n", 5508 "63 Senegal 0.454545 33 15\n",
5421 "136 Botswana 0.500000 72 36\n", 5509 "135 French Guiana 0.454545 22 10\n",
5422 "117 Zimbabwe 0.500000 12 6\n", 5510 "20 Pakistan 0.452055 73 33\n",
5423 "15 Liberia 0.500000 32 16\n", 5511 "60 Chad 0.444444 9 4\n",
5424 "64 Senegal 0.484848 33 16\n", 5512 "64 Mozambique 0.444444 27 12\n",
5425 "78 El Salvador 0.461538 26 12\n", 5513 "14 Liberia 0.437500 32 14\n",
5426 "least outliers \n", 5514 "least outliers \n",
5427 " Country Outliers N_Country N_Outliers\n", 5515 " Country Outliers N_Country N_Outliers\n",
5428 "1 Lithuania 0.000000 38 0\n", 5516 "1 Lithuania 0.000000 38 0\n",
5429 "120 Kazakhstan 0.000000 70 0\n", 5517 "27 South Korea 0.000000 9 0\n",
5430 "119 Denmark 0.000000 13 0\n", 5518 "30 Afghanistan 0.000000 19 0\n",
5431 "107 Kiribati 0.000000 14 0\n", 5519 "31 Czech Republic 0.000000 33 0\n",
5432 "9 Saudi Arabia 0.000000 8 0\n", 5520 "119 Denmark 0.000000 13 0\n",
5433 "0 Canada 0.025000 80 2\n", 5521 "120 Kazakhstan 0.014286 70 1\n",
5434 "57 Russia 0.050633 79 4\n", 5522 "15 Netherlands 0.037037 54 2\n",
5435 "109 Democratic Republic of the Congo 0.052632 38 2\n", 5523 "105 Sudan 0.037037 54 2\n",
5436 "51 Finland 0.052632 19 1\n", 5524 "45 United States of America 0.051282 78 4\n",
5437 "105 Sudan 0.055556 54 3\n", 5525 "134 Paraguay 0.055556 18 1\n",
5438 "writing file\n", 5526 "writing file\n",
5439 "iteration 6\n", 5527 "iteration 9\n",
5440 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n", 5528 "../data/lda_data_melodia_8_30sec.pickle\n",
5441 "(6560, 380) (6560,)\n", 5529 "(6560, 381) (6560,)\n",
5442 "detecting outliers...\n", 5530 "detecting outliers...\n",
5443 "most outliers \n", 5531 "most outliers \n",
5444 " Country Outliers N_Country N_Outliers\n", 5532 " Country Outliers N_Country N_Outliers\n",
5445 "60 Chad 0.666667 9 6\n", 5533 "31 Ivory Coast 0.666667 12 8\n",
5534 "136 Botswana 0.611111 72 44\n",
5446 "17 French Guiana 0.590909 22 13\n", 5535 "17 French Guiana 0.590909 22 13\n",
5447 "117 Zimbabwe 0.583333 12 7\n", 5536 "59 Chad 0.555556 9 5\n",
5448 "86 Gambia 0.575000 40 23\n",
5449 "78 El Salvador 0.538462 26 14\n", 5537 "78 El Salvador 0.538462 26 14\n",
5450 "43 Benin 0.523810 21 11\n", 5538 "20 Pakistan 0.493151 73 36\n",
5451 "115 Senegal 0.515152 33 17\n", 5539 "106 Nepal 0.486842 76 37\n",
5452 "136 Botswana 0.472222 72 34\n", 5540 "42 Benin 0.476190 21 10\n",
5541 "86 Gambia 0.450000 40 18\n",
5453 "104 Bhutan 0.444444 9 4\n", 5542 "104 Bhutan 0.444444 9 4\n",
5454 "84 Belize 0.441176 34 15\n",
5455 "least outliers \n", 5543 "least outliers \n",
5456 " Country Outliers N_Country N_Outliers\n", 5544 " Country Outliers N_Country N_Outliers\n",
5457 "1 Lithuania 0.000000 38 0\n", 5545 "1 Lithuania 0.000000 38 0\n",
5458 "107 Kiribati 0.000000 14 0\n", 5546 "27 South Korea 0.000000 9 0\n",
5459 "113 Iceland 0.000000 11 0\n", 5547 "119 Denmark 0.000000 13 0\n",
5460 "72 Ivory Coast 0.000000 12 0\n", 5548 "44 United States of America 0.012821 78 1\n",
5461 "119 Denmark 0.000000 13 0\n", 5549 "120 Kazakhstan 0.014286 70 1\n",
5462 "120 Kazakhstan 0.000000 70 0\n", 5550 "74 Czech Republic 0.030303 33 1\n",
5463 "28 Tajikistan 0.000000 15 0\n", 5551 "18 New Zealand 0.037037 27 1\n",
5464 "105 Sudan 0.018519 54 1\n", 5552 "15 Netherlands 0.037037 54 2\n",
5465 "15 Netherlands 0.018519 54 1\n", 5553 "105 Sudan 0.037037 54 2\n",
5466 "109 Democratic Republic of the Congo 0.026316 38 1\n", 5554 "0 Canada 0.050000 80 4\n",
5467 "writing file\n",
5468 "iteration 7\n",
5469 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n",
5470 "(6560, 380) (6560,)\n",
5471 "detecting outliers...\n",
5472 "most outliers \n",
5473 " Country Outliers N_Country N_Outliers\n",
5474 "95 Chad 0.555556 9 5\n",
5475 "86 Gambia 0.525000 40 21\n",
5476 "43 Benin 0.523810 21 11\n",
5477 "135 French Guiana 0.500000 22 11\n",
5478 "63 Senegal 0.484848 33 16\n",
5479 "14 Liberia 0.468750 32 15\n",
5480 "52 Indonesia 0.437500 80 35\n",
5481 "136 Botswana 0.430556 72 31\n",
5482 "6 Bolivia 0.428571 28 12\n",
5483 "92 Switzerland 0.428571 42 18\n",
5484 "least outliers \n",
5485 " Country Outliers N_Country N_Outliers\n",
5486 "119 Denmark 0.000000 13 0\n",
5487 "1 Lithuania 0.000000 38 0\n",
5488 "107 Kiribati 0.000000 14 0\n",
5489 "120 Kazakhstan 0.000000 70 0\n",
5490 "113 Iceland 0.000000 11 0\n",
5491 "94 Iraq 0.028571 70 2\n",
5492 "98 Uzbekistan 0.030303 33 1\n",
5493 "105 Sudan 0.037037 54 2\n",
5494 "85 Sierra Leone 0.037500 80 3\n",
5495 "109 Democratic Republic of the Congo 0.052632 38 2\n",
5496 "writing file\n",
5497 "iteration 8\n",
5498 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n",
5499 "(6560, 380) (6560,)\n",
5500 "detecting outliers...\n",
5501 "most outliers \n",
5502 " Country Outliers N_Country N_Outliers\n",
5503 "61 Chad 0.666667 9 6\n",
5504 "78 El Salvador 0.576923 26 15\n",
5505 "44 Benin 0.571429 21 12\n",
5506 "104 Bhutan 0.555556 9 5\n",
5507 "86 Gambia 0.550000 40 22\n",
5508 "17 French Guiana 0.545455 22 12\n",
5509 "94 Belize 0.470588 34 16\n",
5510 "14 Liberia 0.468750 32 15\n",
5511 "92 Switzerland 0.452381 42 19\n",
5512 "53 Indonesia 0.450000 80 36\n",
5513 "least outliers \n",
5514 " Country Outliers N_Country N_Outliers\n",
5515 "119 Denmark 0.000000 13 0\n",
5516 "1 Lithuania 0.000000 38 0\n",
5517 "120 Kazakhstan 0.000000 70 0\n",
5518 "107 Kiribati 0.000000 14 0\n",
5519 "98 Uzbekistan 0.030303 33 1\n",
5520 "105 Sudan 0.037037 54 2\n",
5521 "15 Netherlands 0.037037 54 2\n",
5522 "85 Sierra Leone 0.037500 80 3\n",
5523 "84 Iraq 0.042857 70 3\n",
5524 "109 Democratic Republic of the Congo 0.052632 38 2\n",
5525 "writing file\n",
5526 "iteration 9\n",
5527 "/import/c4dm-04/mariap/lda_data_melodia_8.pickle\n",
5528 "(6560, 380) (6560,)\n",
5529 "detecting outliers...\n",
5530 "most outliers \n",
5531 " Country Outliers N_Country N_Outliers\n",
5532 "95 Chad 0.555556 9 5\n",
5533 "104 Bhutan 0.555556 9 5\n",
5534 "86 Gambia 0.550000 40 22\n",
5535 "78 El Salvador 0.538462 26 14\n",
5536 "18 French Guiana 0.500000 22 11\n",
5537 "115 Senegal 0.484848 33 16\n",
5538 "44 Benin 0.476190 21 10\n",
5539 "41 Laos 0.470588 17 8\n",
5540 "6 Bolivia 0.464286 28 13\n",
5541 "65 Mozambique 0.444444 27 12\n",
5542 "least outliers \n",
5543 " Country Outliers N_Country N_Outliers\n",
5544 "119 Denmark 0.000000 13 0\n",
5545 "1 Lithuania 0.000000 38 0\n",
5546 "120 Kazakhstan 0.000000 70 0\n",
5547 "107 Kiribati 0.000000 14 0\n",
5548 "32 Czech Republic 0.000000 33 0\n",
5549 "85 Sierra Leone 0.050000 80 4\n",
5550 "0 Canada 0.050000 80 4\n",
5551 "109 Democratic Republic of the Congo 0.052632 38 2\n",
5552 "105 Sudan 0.055556 54 3\n",
5553 "16 Netherlands 0.055556 54 3\n",
5554 "writing file\n" 5555 "writing file\n"
5555 ] 5556 ]
5556 } 5557 }
5557 ], 5558 ],
5558 "source": [ 5559 "source": [
5559 "from sklearn.model_selection import train_test_split\n", 5560 "from sklearn.model_selection import train_test_split\n",
5560 "\n", 5561 "\n",
5562 "#results_file = mapper.OUTPUT_FILES[0]\n",
5563 "results_file = '../data/lda_data_melodia_8_30sec.pickle'\n",
5561 "n_iters = 10\n", 5564 "n_iters = 10\n",
5562 "for n in range(n_iters):\n", 5565 "for n in range(n_iters):\n",
5563 " print \"iteration %d\" % n\n", 5566 " print \"iteration %d\" % n\n",
5564 " results_file = mapper.OUTPUT_FILES[0]\n",
5565 " print results_file\n", 5567 " print results_file\n",
5566 " X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n", 5568 " X, Y, Yaudio = classification.load_data_from_pickle(results_file)\n",
5567 " # get only 80% of the dataset.. to vary the choice of outliers\n", 5569 " # get only 80% of the dataset.. to vary the choice of outliers\n",
5568 " X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n", 5570 " X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)\n",
5569 " print X.shape, Y.shape\n", 5571 " print X.shape, Y.shape\n",
5594 "<br> Sort by outlier percentage in descending order." 5596 "<br> Sort by outlier percentage in descending order."
5595 ] 5597 ]
5596 }, 5598 },
5597 { 5599 {
5598 "cell_type": "code", 5600 "cell_type": "code",
5599 "execution_count": 7, 5601 "execution_count": 18,
5600 "metadata": { 5602 "metadata": {
5601 "collapsed": true 5603 "collapsed": true
5602 }, 5604 },
5603 "outputs": [], 5605 "outputs": [],
5604 "source": [ 5606 "source": [
5611 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)" 5613 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)"
5612 ] 5614 ]
5613 }, 5615 },
5614 { 5616 {
5615 "cell_type": "code", 5617 "cell_type": "code",
5616 "execution_count": 8, 5618 "execution_count": 19,
5617 "metadata": {}, 5619 "metadata": {},
5618 "outputs": [ 5620 "outputs": [
5619 { 5621 {
5620 "data": { 5622 "data": {
5621 "text/plain": [ 5623 "text/plain": [
5622 "(137, 10)" 5624 "(137, 10)"
5623 ] 5625 ]
5624 }, 5626 },
5625 "execution_count": 8, 5627 "execution_count": 19,
5626 "metadata": {}, 5628 "metadata": {},
5627 "output_type": "execute_result" 5629 "output_type": "execute_result"
5628 } 5630 }
5629 ], 5631 ],
5630 "source": [ 5632 "source": [
5638 "Remove countries with 0% outliers as these are in random (probably alphabetical) order." 5640 "Remove countries with 0% outliers as these are in random (probably alphabetical) order."
5639 ] 5641 ]
5640 }, 5642 },
5641 { 5643 {
5642 "cell_type": "code", 5644 "cell_type": "code",
5643 "execution_count": 9, 5645 "execution_count": 20,
5644 "metadata": {}, 5646 "metadata": {},
5645 "outputs": [ 5647 "outputs": [
5646 { 5648 {
5647 "name": "stdout", 5649 "name": "stdout",
5648 "output_type": "stream", 5650 "output_type": "stream",
5649 "text": [ 5651 "text": [
5650 " Country Country Country Country Country \\\n", 5652 " Country Country Country Country Country \\\n",
5651 "0 Chad Chad Chad Chad Zimbabwe \n", 5653 "0 Botswana Ivory Coast Chad Chad Benin \n",
5652 "1 Gambia French Guiana Bhutan French Guiana Chad \n", 5654 "1 Chad Botswana Botswana Botswana Botswana \n",
5653 "2 French Guiana Gambia Gambia Gambia Gambia \n", 5655 "2 Benin Chad Ivory Coast Gambia Ivory Coast \n",
5654 "3 Benin Benin French Guiana Bolivia Benin \n", 5656 "3 Ivory Coast Pakistan Pakistan Mozambique Chad \n",
5655 "4 Liberia Bolivia El Salvador Botswana Bolivia \n", 5657 "4 Pakistan Benin Gambia Ivory Coast Gambia \n",
5656 "\n", 5658 "\n",
5657 " Country Country Country Country Country \n", 5659 " Country Country Country Country Country \n",
5658 "0 Chad Chad Chad Chad Bhutan \n", 5660 "0 Chad Botswana Ivory Coast Botswana Ivory Coast \n",
5659 "1 Benin French Guiana Gambia El Salvador Chad \n", 5661 "1 Benin Ivory Coast Botswana Ivory Coast Botswana \n",
5660 "2 Bhutan Zimbabwe Benin Benin Gambia \n", 5662 "2 Botswana Gambia Gambia Gambia French Guiana \n",
5661 "3 French Guiana Gambia French Guiana Bhutan El Salvador \n", 5663 "3 Ivory Coast Nepal Chad Nepal Chad \n",
5662 "4 Gambia El Salvador Senegal Gambia French Guiana \n", 5664 "4 Pakistan Senegal Benin French Guiana El Salvador \n",
5663 " Outliers Outliers Outliers Outliers Outliers Outliers Outliers \\\n", 5665 " Outliers Outliers Outliers Outliers Outliers Outliers Outliers \\\n",
5664 "0 0.555556 0.666667 0.666667 0.666667 0.583333 0.666667 0.666667 \n", 5666 "0 0.625000 0.666667 0.666667 0.666667 0.619048 0.666667 0.597222 \n",
5665 "1 0.525000 0.545455 0.555556 0.590909 0.555556 0.619048 0.590909 \n", 5667 "1 0.555556 0.638889 0.625000 0.583333 0.597222 0.619048 0.583333 \n",
5666 "2 0.500000 0.525000 0.550000 0.550000 0.550000 0.555556 0.583333 \n", 5668 "2 0.523810 0.555556 0.583333 0.575000 0.583333 0.583333 0.500000 \n",
5667 "3 0.476190 0.523810 0.545455 0.535714 0.523810 0.545455 0.575000 \n", 5669 "3 0.500000 0.479452 0.534247 0.518519 0.555556 0.583333 0.500000 \n",
5668 "4 0.468750 0.500000 0.538462 0.513889 0.500000 0.525000 0.538462 \n", 5670 "4 0.493151 0.476190 0.525000 0.500000 0.525000 0.479452 0.484848 \n",
5669 "\n", 5671 "\n",
5670 " Outliers Outliers Outliers \n", 5672 " Outliers Outliers Outliers \n",
5671 "0 0.555556 0.666667 0.555556 \n", 5673 "0 0.666667 0.625000 0.666667 \n",
5672 "1 0.525000 0.576923 0.555556 \n", 5674 "1 0.611111 0.583333 0.611111 \n",
5673 "2 0.523810 0.571429 0.550000 \n", 5675 "2 0.575000 0.475000 0.590909 \n",
5674 "3 0.500000 0.555556 0.538462 \n", 5676 "3 0.555556 0.460526 0.555556 \n",
5675 "4 0.484848 0.550000 0.500000 \n" 5677 "4 0.523810 0.454545 0.538462 \n"
5676 ] 5678 ]
5677 } 5679 }
5678 ], 5680 ],
5679 "source": [ 5681 "source": [
5680 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n", 5682 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n",
5694 "And now kendalltau correlation" 5696 "And now kendalltau correlation"
5695 ] 5697 ]
5696 }, 5698 },
5697 { 5699 {
5698 "cell_type": "code", 5700 "cell_type": "code",
5699 "execution_count": 71, 5701 "execution_count": 21,
5700 "metadata": { 5702 "metadata": {},
5701 "collapsed": true 5703 "outputs": [
5702 }, 5704 {
5703 "outputs": [], 5705 "name": "stderr",
5706 "output_type": "stream",
5707 "text": [
5708 "/homes/mp305/anaconda/lib/python2.7/site-packages/scipy/stats/stats.py:250: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored.\n",
5709 " \"values. nan values will be ignored.\", RuntimeWarning)\n"
5710 ]
5711 }
5712 ],
5704 "source": [ 5713 "source": [
5705 "from scipy.stats import kendalltau\n", 5714 "from scipy.stats import kendalltau\n",
5706 "r_, p_ = [], []\n", 5715 "r_, p_ = [], []\n",
5707 "ranked_countries_arr = ranked_countries.get_values()\n", 5716 "ranked_countries_arr = ranked_countries.get_values()\n",
5708 "for i in range(n_iters-1):\n", 5717 "for i in range(n_iters-1):\n",
5714 "p_ = np.array(p_)" 5723 "p_ = np.array(p_)"
5715 ] 5724 ]
5716 }, 5725 },
5717 { 5726 {
5718 "cell_type": "code", 5727 "cell_type": "code",
5719 "execution_count": 72, 5728 "execution_count": 22,
5720 "metadata": {}, 5729 "metadata": {},
5721 "outputs": [ 5730 "outputs": [
5722 { 5731 {
5723 "name": "stdout", 5732 "name": "stdout",
5724 "output_type": "stream", 5733 "output_type": "stream",
5725 "text": [ 5734 "text": [
5726 "0.0493253335359 0.410409379365\n" 5735 "0.0554645319767 0.37638195368\n"
5727 ] 5736 ]
5728 } 5737 }
5729 ], 5738 ],
5730 "source": [ 5739 "source": [
5731 "print np.mean(r_), np.mean(p_)" 5740 "print np.mean(r_), np.mean(p_)"
5732 ] 5741 ]
5733 }, 5742 },
5734 { 5743 {
5735 "cell_type": "code", 5744 "cell_type": "code",
5736 "execution_count": 80, 5745 "execution_count": 23,
5737 "metadata": {}, 5746 "metadata": {},
5738 "outputs": [ 5747 "outputs": [
5739 { 5748 {
5740 "name": "stdout", 5749 "name": "stdout",
5741 "output_type": "stream", 5750 "output_type": "stream",
5742 "text": [ 5751 "text": [
5743 "0.240026302342 0.351418392739\n" 5752 "0.248540800214 0.311313597605\n"
5744 ] 5753 ]
5745 } 5754 }
5746 ], 5755 ],
5747 "source": [ 5756 "source": [
5748 "from scipy.stats import spearmanr\n", 5757 "from scipy.stats import spearmanr\n",
5760 "let's focus only on the top K results" 5769 "let's focus only on the top K results"
5761 ] 5770 ]
5762 }, 5771 },
5763 { 5772 {
5764 "cell_type": "code", 5773 "cell_type": "code",
5765 "execution_count": 81, 5774 "execution_count": 24,
5766 "metadata": {}, 5775 "metadata": {},
5767 "outputs": [ 5776 "outputs": [
5768 { 5777 {
5769 "name": "stdout", 5778 "name": "stdout",
5770 "output_type": "stream", 5779 "output_type": "stream",
5771 "text": [ 5780 "text": [
5772 "0.237245179063 0.417925582965\n" 5781 "0.294545454545 0.449007896087\n"
5773 ] 5782 ]
5774 } 5783 }
5775 ], 5784 ],
5776 "source": [ 5785 "source": [
5777 "k=10\n", 5786 "k=10\n",
5782 "print np.mean(r), np.mean(p)" 5791 "print np.mean(r), np.mean(p)"
5783 ] 5792 ]
5784 }, 5793 },
5785 { 5794 {
5786 "cell_type": "code", 5795 "cell_type": "code",
5787 "execution_count": 75, 5796 "execution_count": 25,
5788 "metadata": { 5797 "metadata": {
5789 "collapsed": true 5798 "collapsed": true
5790 }, 5799 },
5791 "outputs": [], 5800 "outputs": [],
5792 "source": [ 5801 "source": [
5795 " common_set = common_set & set(ranked_countries_arr[:k, i])" 5804 " common_set = common_set & set(ranked_countries_arr[:k, i])"
5796 ] 5805 ]
5797 }, 5806 },
5798 { 5807 {
5799 "cell_type": "code", 5808 "cell_type": "code",
5800 "execution_count": 76, 5809 "execution_count": 26,
5801 "metadata": {}, 5810 "metadata": {},
5802 "outputs": [ 5811 "outputs": [
5803 { 5812 {
5804 "data": { 5813 "data": {
5805 "text/plain": [ 5814 "text/plain": [
5806 "{'Chad', 'French Guiana', 'Gambia'}" 5815 "{'Botswana', 'Chad', 'Gambia', 'Ivory Coast', 'Pakistan'}"
5807 ] 5816 ]
5808 }, 5817 },
5809 "execution_count": 76, 5818 "execution_count": 26,
5810 "metadata": {}, 5819 "metadata": {},
5811 "output_type": "execute_result" 5820 "output_type": "execute_result"
5812 } 5821 }
5813 ], 5822 ],
5814 "source": [ 5823 "source": [
5822 "## Try precision at K" 5831 "## Try precision at K"
5823 ] 5832 ]
5824 }, 5833 },
5825 { 5834 {
5826 "cell_type": "code", 5835 "cell_type": "code",
5827 "execution_count": 10, 5836 "execution_count": 27,
5828 "metadata": {}, 5837 "metadata": {
5838 "collapsed": true
5839 },
5829 "outputs": [], 5840 "outputs": [],
5830 "source": [ 5841 "source": [
5831 "# majority voting + precision at K (top5?)\n", 5842 "# majority voting + precision at K (top5?)\n",
5832 "from collections import Counter\n", 5843 "from collections import Counter\n",
5833 "K_vote = 10\n", 5844 "K_vote = 10\n",
5834 "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())" 5845 "country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())"
5835 ] 5846 ]
5836 }, 5847 },
5837 { 5848 {
5838 "cell_type": "code", 5849 "cell_type": "code",
5839 "execution_count": 11, 5850 "execution_count": 28,
5840 "metadata": {}, 5851 "metadata": {},
5841 "outputs": [ 5852 "outputs": [
5842 { 5853 {
5843 "data": { 5854 "data": {
5844 "text/html": [ 5855 "text/html": [
5852 " </tr>\n", 5863 " </tr>\n",
5853 " </thead>\n", 5864 " </thead>\n",
5854 " <tbody>\n", 5865 " <tbody>\n",
5855 " <tr>\n", 5866 " <tr>\n",
5856 " <th>0</th>\n", 5867 " <th>0</th>\n",
5857 " <td>Brazil</td>\n", 5868 " <td>Pakistan</td>\n",
5858 " <td>1</td>\n", 5869 " <td>10</td>\n",
5859 " </tr>\n", 5870 " </tr>\n",
5860 " <tr>\n", 5871 " <tr>\n",
5861 " <th>1</th>\n", 5872 " <th>1</th>\n",
5862 " <td>Liberia</td>\n", 5873 " <td>Bhutan</td>\n",
5863 " <td>7</td>\n", 5874 " <td>3</td>\n",
5864 " </tr>\n", 5875 " </tr>\n",
5865 " <tr>\n", 5876 " <tr>\n",
5866 " <th>2</th>\n", 5877 " <th>2</th>\n",
5867 " <td>Belize</td>\n",
5868 " <td>2</td>\n",
5869 " </tr>\n",
5870 " <tr>\n",
5871 " <th>3</th>\n",
5872 " <td>Chad</td>\n", 5878 " <td>Chad</td>\n",
5873 " <td>10</td>\n", 5879 " <td>10</td>\n",
5874 " </tr>\n", 5880 " </tr>\n",
5875 " <tr>\n", 5881 " <tr>\n",
5882 " <th>3</th>\n",
5883 " <td>Liberia</td>\n",
5884 " <td>2</td>\n",
5885 " </tr>\n",
5886 " <tr>\n",
5876 " <th>4</th>\n", 5887 " <th>4</th>\n",
5877 " <td>Bhutan</td>\n", 5888 " <td>El Salvador</td>\n",
5878 " <td>7</td>\n", 5889 " <td>5</td>\n",
5879 " </tr>\n", 5890 " </tr>\n",
5880 " </tbody>\n", 5891 " </tbody>\n",
5881 "</table>\n", 5892 "</table>\n",
5882 "</div>" 5893 "</div>"
5883 ], 5894 ],
5884 "text/plain": [ 5895 "text/plain": [
5885 " index 0\n", 5896 " index 0\n",
5886 "0 Brazil 1\n", 5897 "0 Pakistan 10\n",
5887 "1 Liberia 7\n", 5898 "1 Bhutan 3\n",
5888 "2 Belize 2\n", 5899 "2 Chad 10\n",
5889 "3 Chad 10\n", 5900 "3 Liberia 2\n",
5890 "4 Bhutan 7" 5901 "4 El Salvador 5"
5891 ] 5902 ]
5892 }, 5903 },
5893 "execution_count": 11, 5904 "execution_count": 28,
5894 "metadata": {}, 5905 "metadata": {},
5895 "output_type": "execute_result" 5906 "output_type": "execute_result"
5896 } 5907 }
5897 ], 5908 ],
5898 "source": [ 5909 "source": [
5900 "df_country_vote.head()" 5911 "df_country_vote.head()"
5901 ] 5912 ]
5902 }, 5913 },
5903 { 5914 {
5904 "cell_type": "code", 5915 "cell_type": "code",
5905 "execution_count": 12, 5916 "execution_count": 29,
5906 "metadata": {}, 5917 "metadata": {},
5907 "outputs": [ 5918 "outputs": [
5908 { 5919 {
5909 "data": { 5920 "data": {
5910 "text/html": [ 5921 "text/html": [
5917 " <th>0</th>\n", 5928 " <th>0</th>\n",
5918 " </tr>\n", 5929 " </tr>\n",
5919 " </thead>\n", 5930 " </thead>\n",
5920 " <tbody>\n", 5931 " <tbody>\n",
5921 " <tr>\n", 5932 " <tr>\n",
5922 " <th>3</th>\n", 5933 " <th>0</th>\n",
5934 " <td>Pakistan</td>\n",
5935 " <td>10</td>\n",
5936 " </tr>\n",
5937 " <tr>\n",
5938 " <th>2</th>\n",
5923 " <td>Chad</td>\n", 5939 " <td>Chad</td>\n",
5924 " <td>10</td>\n", 5940 " <td>10</td>\n",
5925 " </tr>\n", 5941 " </tr>\n",
5926 " <tr>\n", 5942 " <tr>\n",
5927 " <th>6</th>\n", 5943 " <th>5</th>\n",
5928 " <td>Gambia</td>\n", 5944 " <td>Gambia</td>\n",
5929 " <td>10</td>\n", 5945 " <td>10</td>\n",
5930 " </tr>\n", 5946 " </tr>\n",
5931 " <tr>\n", 5947 " <tr>\n",
5932 " <th>12</th>\n", 5948 " <th>10</th>\n",
5933 " <td>French Guiana</td>\n", 5949 " <td>Ivory Coast</td>\n",
5934 " <td>10</td>\n", 5950 " <td>10</td>\n",
5935 " </tr>\n", 5951 " </tr>\n",
5936 " <tr>\n", 5952 " <tr>\n",
5937 " <th>18</th>\n", 5953 " <th>12</th>\n",
5938 " <td>Benin</td>\n", 5954 " <td>Botswana</td>\n",
5955 " <td>10</td>\n",
5956 " </tr>\n",
5957 " <tr>\n",
5958 " <th>6</th>\n",
5959 " <td>Nepal</td>\n",
5939 " <td>9</td>\n", 5960 " <td>9</td>\n",
5940 " </tr>\n", 5961 " </tr>\n",
5941 " <tr>\n", 5962 " <tr>\n",
5942 " <th>5</th>\n", 5963 " <th>13</th>\n",
5943 " <td>El Salvador</td>\n", 5964 " <td>Benin</td>\n",
5944 " <td>9</td>\n",
5945 " </tr>\n",
5946 " <tr>\n",
5947 " <th>17</th>\n",
5948 " <td>Botswana</td>\n",
5949 " <td>8</td>\n", 5965 " <td>8</td>\n",
5950 " </tr>\n", 5966 " </tr>\n",
5951 " <tr>\n", 5967 " <tr>\n",
5952 " <th>4</th>\n", 5968 " <th>8</th>\n",
5953 " <td>Bhutan</td>\n", 5969 " <td>Senegal</td>\n",
5954 " <td>7</td>\n", 5970 " <td>7</td>\n",
5955 " </tr>\n", 5971 " </tr>\n",
5956 " <tr>\n", 5972 " <tr>\n",
5957 " <th>1</th>\n", 5973 " <th>9</th>\n",
5958 " <td>Liberia</td>\n", 5974 " <td>French Guiana</td>\n",
5959 " <td>7</td>\n", 5975 " <td>7</td>\n",
5960 " </tr>\n", 5976 " </tr>\n",
5961 " <tr>\n", 5977 " <tr>\n",
5962 " <th>16</th>\n", 5978 " <th>4</th>\n",
5963 " <td>Bolivia</td>\n", 5979 " <td>El Salvador</td>\n",
5964 " <td>6</td>\n", 5980 " <td>5</td>\n",
5965 " </tr>\n",
5966 " <tr>\n",
5967 " <th>10</th>\n",
5968 " <td>Senegal</td>\n",
5969 " <td>6</td>\n",
5970 " </tr>\n", 5981 " </tr>\n",
5971 " <tr>\n", 5982 " <tr>\n",
5972 " <th>11</th>\n", 5983 " <th>11</th>\n",
5973 " <td>Zimbabwe</td>\n", 5984 " <td>Mozambique</td>\n",
5985 " <td>5</td>\n",
5986 " </tr>\n",
5987 " <tr>\n",
5988 " <th>7</th>\n",
5989 " <td>Uganda</td>\n",
5990 " <td>4</td>\n",
5991 " </tr>\n",
5992 " <tr>\n",
5993 " <th>1</th>\n",
5994 " <td>Bhutan</td>\n",
5974 " <td>3</td>\n", 5995 " <td>3</td>\n",
5975 " </tr>\n", 5996 " </tr>\n",
5976 " <tr>\n", 5997 " <tr>\n",
5977 " <th>14</th>\n", 5998 " <th>3</th>\n",
5978 " <td>Switzerland</td>\n", 5999 " <td>Liberia</td>\n",
5979 " <td>3</td>\n",
5980 " </tr>\n",
5981 " <tr>\n",
5982 " <th>15</th>\n",
5983 " <td>Mozambique</td>\n",
5984 " <td>3</td>\n",
5985 " </tr>\n",
5986 " <tr>\n",
5987 " <th>2</th>\n",
5988 " <td>Belize</td>\n",
5989 " <td>2</td>\n", 6000 " <td>2</td>\n",
5990 " </tr>\n",
5991 " <tr>\n",
5992 " <th>7</th>\n",
5993 " <td>Indonesia</td>\n",
5994 " <td>2</td>\n",
5995 " </tr>\n",
5996 " <tr>\n",
5997 " <th>8</th>\n",
5998 " <td>Guatemala</td>\n",
5999 " <td>2</td>\n",
6000 " </tr>\n",
6001 " <tr>\n",
6002 " <th>0</th>\n",
6003 " <td>Brazil</td>\n",
6004 " <td>1</td>\n",
6005 " </tr>\n",
6006 " <tr>\n",
6007 " <th>13</th>\n",
6008 " <td>Laos</td>\n",
6009 " <td>1</td>\n",
6010 " </tr>\n",
6011 " <tr>\n",
6012 " <th>9</th>\n",
6013 " <td>Malta</td>\n",
6014 " <td>1</td>\n",
6015 " </tr>\n", 6001 " </tr>\n",
6016 " </tbody>\n", 6002 " </tbody>\n",
6017 "</table>\n", 6003 "</table>\n",
6018 "</div>" 6004 "</div>"
6019 ], 6005 ],
6020 "text/plain": [ 6006 "text/plain": [
6021 " index 0\n", 6007 " index 0\n",
6022 "3 Chad 10\n", 6008 "0 Pakistan 10\n",
6023 "6 Gambia 10\n", 6009 "2 Chad 10\n",
6024 "12 French Guiana 10\n", 6010 "5 Gambia 10\n",
6025 "18 Benin 9\n", 6011 "10 Ivory Coast 10\n",
6026 "5 El Salvador 9\n", 6012 "12 Botswana 10\n",
6027 "17 Botswana 8\n", 6013 "6 Nepal 9\n",
6028 "4 Bhutan 7\n", 6014 "13 Benin 8\n",
6029 "1 Liberia 7\n", 6015 "8 Senegal 7\n",
6030 "16 Bolivia 6\n", 6016 "9 French Guiana 7\n",
6031 "10 Senegal 6\n", 6017 "4 El Salvador 5\n",
6032 "11 Zimbabwe 3\n", 6018 "11 Mozambique 5\n",
6033 "14 Switzerland 3\n", 6019 "7 Uganda 4\n",
6034 "15 Mozambique 3\n", 6020 "1 Bhutan 3\n",
6035 "2 Belize 2\n", 6021 "3 Liberia 2"
6036 "7 Indonesia 2\n",
6037 "8 Guatemala 2\n",
6038 "0 Brazil 1\n",
6039 "13 Laos 1\n",
6040 "9 Malta 1"
6041 ] 6022 ]
6042 }, 6023 },
6043 "execution_count": 12, 6024 "execution_count": 29,
6044 "metadata": {}, 6025 "metadata": {},
6045 "output_type": "execute_result" 6026 "output_type": "execute_result"
6046 } 6027 }
6047 ], 6028 ],
6048 "source": [ 6029 "source": [
6049 "df_country_vote.sort_values(0, ascending=False)" 6030 "df_country_vote.sort_values(0, ascending=False)"
6050 ] 6031 ]
6051 }, 6032 },
6052 { 6033 {
6053 "cell_type": "code", 6034 "cell_type": "code",
6054 "execution_count": 14, 6035 "execution_count": 30,
6055 "metadata": {}, 6036 "metadata": {},
6056 "outputs": [ 6037 "outputs": [
6057 { 6038 {
6058 "name": "stdout", 6039 "name": "stdout",
6059 "output_type": "stream", 6040 "output_type": "stream",
6060 "text": [ 6041 "text": [
6061 "0.51 0.0830662386292\n" 6042 "0.67 0.0640312423743\n"
6062 ] 6043 ]
6063 } 6044 }
6064 ], 6045 ],
6065 "source": [ 6046 "source": [
6066 "def precision_at_k(array, gr_truth, k):\n", 6047 "def precision_at_k(array, gr_truth, k):\n",
6075 "print np.mean(p_), np.std(p_)" 6056 "print np.mean(p_), np.std(p_)"
6076 ] 6057 ]
6077 }, 6058 },
6078 { 6059 {
6079 "cell_type": "code", 6060 "cell_type": "code",
6080 "execution_count": 15, 6061 "execution_count": 31,
6081 "metadata": {}, 6062 "metadata": {},
6082 "outputs": [ 6063 "outputs": [
6083 { 6064 {
6084 "data": { 6065 "data": {
6085 "text/plain": [ 6066 "text/plain": [
6086 "array([ 0.6, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.7, 0.4])" 6067 "array([ 0.6, 0.7, 0.7, 0.6, 0.6, 0.7, 0.8, 0.6, 0.7, 0.7])"
6087 ] 6068 ]
6088 }, 6069 },
6089 "execution_count": 15, 6070 "execution_count": 31,
6090 "metadata": {}, 6071 "metadata": {},
6091 "output_type": "execute_result" 6072 "output_type": "execute_result"
6092 } 6073 }
6093 ], 6074 ],
6094 "source": [ 6075 "source": [