annotate notebooks/results_30_seconds_and_figures.ipynb @ 75:02faad4a996b branch-tests

results and figures
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:28 +0100
parents
children bde45ce0eeab
rev   line source
m@75 1 {
m@75 2 "cells": [
m@75 3 {
m@75 4 "cell_type": "code",
m@75 5 "execution_count": 12,
m@75 6 "metadata": {
m@75 7 "collapsed": false
m@75 8 },
m@75 9 "outputs": [
m@75 10 {
m@75 11 "name": "stdout",
m@75 12 "output_type": "stream",
m@75 13 "text": [
m@75 14 "The autoreload extension is already loaded. To reload it, use:\n",
m@75 15 " %reload_ext autoreload\n"
m@75 16 ]
m@75 17 }
m@75 18 ],
m@75 19 "source": [
m@75 20 "import numpy as np\n",
m@75 21 "import pandas as pd\n",
m@75 22 "import pickle \n",
m@75 23 "\n",
m@75 24 "%load_ext autoreload\n",
m@75 25 "%autoreload 2\n",
m@75 26 "\n",
m@75 27 "%matplotlib inline\n",
m@75 28 "import matplotlib.pyplot as plt\n",
m@75 29 "\n",
m@75 30 "import sys\n",
m@75 31 "sys.path.append('../')\n",
m@75 32 "import scripts.outliers as outliers\n",
m@75 33 "import scripts.utils as utils"
m@75 34 ]
m@75 35 },
m@75 36 {
m@75 37 "cell_type": "code",
m@75 38 "execution_count": 7,
m@75 39 "metadata": {
m@75 40 "collapsed": false
m@75 41 },
m@75 42 "outputs": [
m@75 43 {
m@75 44 "name": "stdout",
m@75 45 "output_type": "stream",
m@75 46 "text": [
m@75 47 "WARNING: there are 21 disconnected observations\n",
m@75 48 "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
m@75 49 "Antigua and Barbuda\n",
m@75 50 "Australia\n",
m@75 51 "Cuba\n",
m@75 52 "Fiji\n",
m@75 53 "French Polynesia\n",
m@75 54 "Grenada\n",
m@75 55 "Iceland\n",
m@75 56 "Jamaica\n",
m@75 57 "Japan\n",
m@75 58 "Kiribati\n",
m@75 59 "Malta\n",
m@75 60 "New Zealand\n",
m@75 61 "Philippines\n",
m@75 62 "Puerto Rico\n",
m@75 63 "Republic of Serbia\n",
m@75 64 "Saint Lucia\n",
m@75 65 "Samoa\n",
m@75 66 "Solomon Islands\n",
m@75 67 "South Korea\n",
m@75 68 "The Bahamas\n",
m@75 69 "Trinidad and Tobago\n"
m@75 70 ]
m@75 71 },
m@75 72 {
m@75 73 "name": "stderr",
m@75 74 "output_type": "stream",
m@75 75 "text": [
m@75 76 "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
m@75 77 " data = self._reader.read(nrows)\n"
m@75 78 ]
m@75 79 }
m@75 80 ],
m@75 81 "source": [
m@75 82 "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'  # relative path: resolved against the kernel's working directory\n",
m@75 83 "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'  # NOTE(review): absolute, machine-specific path; this cell cannot run on another machine as written\n",
m@75 84 "# Repo-relative alternative (TODO confirm it holds the same metadata before switching): METADATA_FILE = '../data/metadata.csv'\n",
m@75 85 "\n",
m@75 86 "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)  # returns three objects; the saved run also printed a disconnected-observations warning and a pandas DtypeWarning"
m@75 87 ]
m@75 88 },
m@75 89 {
m@75 90 "cell_type": "code",
m@75 91 "execution_count": 9,
m@75 92 "metadata": {
m@75 93 "collapsed": false
m@75 94 },
m@75 95 "outputs": [
m@75 96 {
m@75 97 "data": {
m@75 98 "text/plain": [
m@75 99 "(8200, 108)"
m@75 100 ]
m@75 101 },
m@75 102 "execution_count": 9,
m@75 103 "metadata": {},
m@75 104 "output_type": "execute_result"
m@75 105 }
m@75 106 ],
m@75 107 "source": [
m@75 108 "X_list, Y, Yaudio = dataset  # dataset unpacks into exactly three items; X_list holds the per-feature blocks joined below\n",
m@75 109 "X = np.concatenate(X_list, axis=1)  # join every block of X_list along axis 1 into a single array\n",
m@75 110 "ddf.shape  # last expression of the cell, so it is displayed as the cell result"
m@75 111 ]
m@75 112 },
m@75 113 {
m@75 114 "cell_type": "code",
m@75 115 "execution_count": 11,
m@75 116 "metadata": {
m@75 117 "collapsed": false
m@75 118 },
m@75 119 "outputs": [
m@75 120 {
m@75 121 "name": "stdout",
m@75 122 "output_type": "stream",
m@75 123 "text": [
m@75 124 "most outliers \n",
m@75 125 " Country Outliers N_Country N_Outliers\n",
m@75 126 "136 Botswana 0.611111 90 55\n",
m@75 127 "72 Ivory Coast 0.600000 15 9\n",
m@75 128 "95 Chad 0.545455 11 6\n",
m@75 129 "43 Benin 0.538462 26 14\n",
m@75 130 "86 Gambia 0.500000 50 25\n",
m@75 131 "20 Pakistan 0.494505 91 45\n",
m@75 132 "106 Nepal 0.473684 95 45\n",
m@75 133 "78 El Salvador 0.454545 33 15\n",
m@75 134 "64 Mozambique 0.441176 34 15\n",
m@75 135 "135 French Guiana 0.428571 28 12\n",
m@75 136 "least outliers \n",
m@75 137 " Country Outliers N_Country N_Outliers\n",
m@75 138 "1 Lithuania 0.000000 47 0\n",
m@75 139 "119 Denmark 0.000000 16 0\n",
m@75 140 "27 South Korea 0.000000 11 0\n",
m@75 141 "120 Kazakhstan 0.011364 88 1\n",
m@75 142 "31 Czech Republic 0.024390 41 1\n",
m@75 143 "15 Netherlands 0.029851 67 2\n",
m@75 144 "30 Afghanistan 0.041667 24 1\n",
m@75 145 "105 Sudan 0.044118 68 3\n",
m@75 146 "102 Nicaragua 0.047619 21 1\n",
m@75 147 "0 Canada 0.050000 100 5\n"
m@75 148 ]
m@75 149 }
m@75 150 ],
m@75 151 "source": [
m@75 152 "# Global outliers: run the detector on X (all feature blocks joined) against the labels Y.\n",
m@75 153 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')  # NOTE(review): out_file presumably makes the helper write this CSV; confirm in scripts/outliers.py\n",
m@75 154 "outliers.print_most_least_outliers_topN(df_global, N=10)  # prints the 'most outliers' and 'least outliers' tables, N rows each"
m@75 155 ]
m@75 156 },
m@75 157 {
m@75 158 "cell_type": "code",
m@75 159 "execution_count": 14,
m@75 160 "metadata": {
m@75 161 "collapsed": false,
m@75 162 "scrolled": true
m@75 163 },
m@75 164 "outputs": [
m@75 165 {
m@75 166 "name": "stdout",
m@75 167 "output_type": "stream",
m@75 168 "text": [
m@75 169 "328\n",
m@75 170 "210\n",
m@75 171 "194\n",
m@75 172 "85\n",
m@75 173 "388\n",
m@75 174 "266\n",
m@75 175 "309\n",
m@75 176 "455\n",
m@75 177 "365\n",
m@75 178 "282\n",
m@75 179 "197\n",
m@75 180 "122\n",
m@75 181 "206\n",
m@75 182 "457\n",
m@75 183 "298\n",
m@75 184 "597\n",
m@75 185 "354\n",
m@75 186 "191\n",
m@75 187 "193\n",
m@75 188 "198\n",
m@75 189 "263\n",
m@75 190 "334\n",
m@75 191 "812\n",
m@75 192 "415\n",
m@75 193 "44\n",
m@75 194 "107\n",
m@75 195 "366\n",
m@75 196 "323\n",
m@75 197 "450\n",
m@75 198 "116\n",
m@75 199 "150\n",
m@75 200 "260\n",
m@75 201 "230\n",
m@75 202 "118\n",
m@75 203 "389\n",
m@75 204 "237\n",
m@75 205 "274\n",
m@75 206 "466\n",
m@75 207 "147\n",
m@75 208 "134\n",
m@75 209 "86\n",
m@75 210 "91\n",
m@75 211 "574\n",
m@75 212 "111\n",
m@75 213 "296\n",
m@75 214 "221\n",
m@75 215 "261\n",
m@75 216 "224\n",
m@75 217 "190\n",
m@75 218 "150\n",
m@75 219 "139\n",
m@75 220 "350\n",
m@75 221 "268\n",
m@75 222 "453\n",
m@75 223 "192\n",
m@75 224 "468\n",
m@75 225 "266\n",
m@75 226 "187\n",
m@75 227 "275\n",
m@75 228 "337\n",
m@75 229 "179\n",
m@75 230 "366\n",
m@75 231 "211\n",
m@75 232 "213\n",
m@75 233 "428\n",
m@75 234 "468\n",
m@75 235 "164\n",
m@75 236 "348\n",
m@75 237 "328\n",
m@75 238 "193\n",
m@75 239 "197\n",
m@75 240 "193\n",
m@75 241 "166\n",
m@75 242 "290\n",
m@75 243 "196\n",
m@75 244 "224\n",
m@75 245 "111\n",
m@75 246 "258\n",
m@75 247 "295\n",
m@75 248 "227\n",
m@75 249 "252\n",
m@75 250 "433\n",
m@75 251 "305\n",
m@75 252 "290\n",
m@75 253 "183\n",
m@75 254 "243\n",
m@75 255 "63\n",
m@75 256 "197\n",
m@75 257 "274\n",
m@75 258 "363\n",
m@75 259 "113\n",
m@75 260 "192\n",
m@75 261 "258\n",
m@75 262 "494\n",
m@75 263 "299\n",
m@75 264 "484\n",
m@75 265 "198\n",
m@75 266 "191\n",
m@75 267 "174\n",
m@75 268 "280\n",
m@75 269 "735\n",
m@75 270 "211\n",
m@75 271 "221\n",
m@75 272 "134\n",
m@75 273 "125\n",
m@75 274 "119\n",
m@75 275 "151\n",
m@75 276 "203\n",
m@75 277 "229\n",
m@75 278 "430\n",
m@75 279 "311\n",
m@75 280 "424\n",
m@75 281 "337\n",
m@75 282 "268\n",
m@75 283 "175\n",
m@75 284 "228\n",
m@75 285 "175\n",
m@75 286 "437\n",
m@75 287 "284\n",
m@75 288 "129\n",
m@75 289 "366\n",
m@75 290 "222\n",
m@75 291 "66\n",
m@75 292 "498\n",
m@75 293 "400\n",
m@75 294 "430\n",
m@75 295 "187\n",
m@75 296 "470\n",
m@75 297 "298\n",
m@75 298 "231\n",
m@75 299 "272\n",
m@75 300 "261\n",
m@75 301 "239\n",
m@75 302 "154\n",
m@75 303 "22\n",
m@75 304 "426\n",
m@75 305 "332\n",
m@75 306 "most outliers \n",
m@75 307 " Country Outliers N_Country N_Outliers\n",
m@75 308 "46 China 0.260000 100 26\n",
m@75 309 "67 Brazil 0.240000 100 24\n",
m@75 310 "101 Colombia 0.211111 90 19\n",
m@75 311 "64 Mozambique 0.205882 34 7\n",
m@75 312 "76 Iran 0.188679 53 10\n",
m@75 313 "65 Uganda 0.176471 85 15\n",
m@75 314 "27 Kenya 0.164948 97 16\n",
m@75 315 "126 South Sudan 0.163043 92 15\n",
m@75 316 "24 Azerbaijan 0.153846 13 2\n",
m@75 317 "23 India 0.147368 95 14\n",
m@75 318 "least outliers \n",
m@75 319 " Country Outliers N_Country N_Outliers\n",
m@75 320 "0 Canada 0 100 0\n",
m@75 321 "95 Portugal 0 100 0\n",
m@75 322 "94 Iraq 0 87 0\n",
m@75 323 "93 Grenada 0 37 0\n",
m@75 324 "90 French Polynesia 0 15 0\n",
m@75 325 "89 Croatia 0 31 0\n",
m@75 326 "88 Morocco 0 40 0\n",
m@75 327 "87 Philippines 0 100 0\n",
m@75 328 "86 Gambia 0 50 0\n",
m@75 329 "85 Sierra Leone 0 100 0\n"
m@75 330 ]
m@75 331 }
m@75 332 ],
m@75 333 "source": [
m@75 334 "# Local (spatial) outliers: same X and Y as the global run, plus w_dict as returned by load_data.\n",
m@75 335 "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')  # NOTE(review): the saved output has a long run of bare integers before the tables, presumably progress/debug prints from this helper; consider silencing them\n",
m@75 336 "outliers.print_most_least_outliers_topN(df_local, N=10)  # prints the 'most outliers' and 'least outliers' tables, N rows each"
m@75 337 ]
m@75 338 },
m@75 339 {
m@75 340 "cell_type": "code",
m@75 341 "execution_count": 16,
m@75 342 "metadata": {
m@75 343 "collapsed": false,
m@75 344 "scrolled": true
m@75 345 },
m@75 346 "outputs": [
m@75 347 {
m@75 348 "name": "stdout",
m@75 349 "output_type": "stream",
m@75 350 "text": [
m@75 351 "most outliers \n",
m@75 352 " Country Outliers N_Country N_Outliers\n",
m@75 353 "43 Benin 0.500000 26 13\n",
m@75 354 "136 Botswana 0.488889 90 44\n",
m@75 355 "106 Nepal 0.421053 95 40\n",
m@75 356 "84 Belize 0.418605 43 18\n",
m@75 357 "19 Yemen 0.416667 12 5\n",
m@75 358 "least outliers \n",
m@75 359 " Country Outliers N_Country N_Outliers\n",
m@75 360 "28 Tajikistan 0 19 0\n",
m@75 361 "119 Denmark 0 16 0\n",
m@75 362 "96 Uruguay 0 31 0\n",
m@75 363 "25 Republic of Serbia 0 16 0\n",
m@75 364 "27 South Korea 0 11 0\n",
m@75 365 "most outliers \n",
m@75 366 " Country Outliers N_Country N_Outliers\n",
m@75 367 "117 Zimbabwe 0.533333 15 8\n",
m@75 368 "96 Uruguay 0.483871 31 15\n",
m@75 369 "68 Guinea 0.454545 11 5\n",
m@75 370 "63 Senegal 0.390244 41 16\n",
m@75 371 "86 Gambia 0.380000 50 19\n",
m@75 372 "least outliers \n",
m@75 373 " Country Outliers N_Country N_Outliers\n",
m@75 374 "90 French Polynesia 0.000000 15 0\n",
m@75 375 "37 Rwanda 0.000000 17 0\n",
m@75 376 "119 Denmark 0.000000 16 0\n",
m@75 377 "18 New Zealand 0.000000 34 0\n",
m@75 378 "120 Kazakhstan 0.022727 88 2\n",
m@75 379 "most outliers \n",
m@75 380 " Country Outliers N_Country N_Outliers\n",
m@75 381 "17 French Guiana 0.678571 28 19\n",
m@75 382 "136 Botswana 0.477778 90 43\n",
m@75 383 "72 Ivory Coast 0.400000 15 6\n",
m@75 384 "23 Azerbaijan 0.384615 13 5\n",
m@75 385 "106 Nepal 0.347368 95 33\n",
m@75 386 "least outliers \n",
m@75 387 " Country Outliers N_Country N_Outliers\n",
m@75 388 "68 Guinea 0 11 0\n",
m@75 389 "55 Mali 0 17 0\n",
m@75 390 "77 Algeria 0 27 0\n",
m@75 391 "33 Saint Lucia 0 43 0\n",
m@75 392 "31 Czech Republic 0 41 0\n",
m@75 393 "most outliers \n",
m@75 394 " Country Outliers N_Country N_Outliers\n",
m@75 395 "43 Benin 0.538462 26 14\n",
m@75 396 "20 Pakistan 0.461538 91 42\n",
m@75 397 "86 Gambia 0.360000 50 18\n",
m@75 398 "52 Indonesia 0.350000 100 35\n",
m@75 399 "136 Botswana 0.311111 90 28\n",
m@75 400 "least outliers \n",
m@75 401 " Country Outliers N_Country N_Outliers\n",
m@75 402 "107 Kiribati 0 17 0\n",
m@75 403 "1 Lithuania 0 47 0\n",
m@75 404 "134 Paraguay 0 23 0\n",
m@75 405 "131 Tunisia 0 39 0\n",
m@75 406 "19 Yemen 0 12 0\n"
m@75 407 ]
m@75 408 }
m@75 409 ],
m@75 410 "source": [
m@75 411 "# outliers for features\n",
m@75 412 "feat = X_list\n",
m@75 413 "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
m@75 414 "tabs_feat = []\n",
m@75 415 "for i in range(len(feat)):\n",
m@75 416 " XX = feat[i]\n",
m@75 417 " output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n",
m@75 418 " df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
m@75 419 " outliers.print_most_least_outliers_topN(df_feat, N=5)"
m@75 420 ]
m@75 421 },
m@75 422 {
m@75 423 "cell_type": "code",
m@75 424 "execution_count": null,
m@75 425 "metadata": {
m@75 426 "collapsed": false
m@75 427 },
m@75 428 "outputs": [
m@75 429 {
m@75 430 "name": "stdout",
m@75 431 "output_type": "stream",
m@75 432 "text": [
m@75 433 "5\n",
m@75 434 "6\n",
m@75 435 "7\n",
m@75 436 "8\n",
m@75 437 "9\n",
m@75 438 "10\n",
m@75 439 "11\n",
m@75 440 "12\n",
m@75 441 "13\n",
m@75 442 "14\n",
m@75 443 "15\n",
m@75 444 "16\n",
m@75 445 "17\n",
m@75 446 "18"
m@75 447 ]
m@75 448 }
m@75 449 ],
m@75 450 "source": [
m@75 451 "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
m@75 452 "ddf['Clusters'] = cl_pred\n",
m@75 453 "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
m@75 454 "print len(np.unique(cl_pred))\n",
m@75 455 "outliers.print_clusters_metadata(ddf, cl_pred)"
m@75 456 ]
m@75 457 },
m@75 458 {
m@75 459 "cell_type": "code",
m@75 460 "execution_count": null,
m@75 461 "metadata": {
m@75 462 "collapsed": true
m@75 463 },
m@75 464 "outputs": [],
m@75 465 "source": []
m@75 466 }
m@75 467 ],
m@75 468 "metadata": {
m@75 469 "kernelspec": {
m@75 470 "display_name": "Python 2",
m@75 471 "language": "python",
m@75 472 "name": "python2"
m@75 473 },
m@75 474 "language_info": {
m@75 475 "codemirror_mode": {
m@75 476 "name": "ipython",
m@75 477 "version": 2
m@75 478 },
m@75 479 "file_extension": ".py",
m@75 480 "mimetype": "text/x-python",
m@75 481 "name": "python",
m@75 482 "nbconvert_exporter": "python",
m@75 483 "pygments_lexer": "ipython2",
m@75 484 "version": "2.7.12"
m@75 485 }
m@75 486 },
m@75 487 "nbformat": 4,
m@75 488 "nbformat_minor": 0
m@75 489 }