Mercurial > hg > plosone_underreview
view notebooks/results_30_seconds_and_figures.ipynb @ 75:02faad4a996b branch-tests
results and figures
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 22 Sep 2017 16:30:28 +0100 |
parents | |
children | bde45ce0eeab |
line wrap: on
line source
{ "cells": [ { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import pickle \n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('../')\n", "import scripts.outliers as outliers\n", "import scripts.utils as utils" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING: there are 21 disconnected observations\n", "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n", "Antigua and Barbuda\n", "Australia\n", "Cuba\n", "Fiji\n", "French Polynesia\n", "Grenada\n", "Iceland\n", "Jamaica\n", "Japan\n", "Kiribati\n", "Malta\n", "New Zealand\n", "Philippines\n", "Puerto Rico\n", "Republic of Serbia\n", "Saint Lucia\n", "Samoa\n", "Solomon Islands\n", "South Korea\n", "The Bahamas\n", "Trinidad and Tobago\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " data = self._reader.read(nrows)\n" ] } ],
"source": [
"import os\n",
"\n",
"DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n",
"\n",
"# The metadata file used for the saved results lives outside the repository, in one user's\n",
"# home directory. Use it when it is present (same behaviour as before on that machine) and\n",
"# otherwise fall back to the repository-relative path, so the notebook can run elsewhere.\n",
"# NOTE(review): confirm that ../data/metadata.csv holds the same content as the local file.\n",
"LOCAL_METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n",
"REPO_METADATA_FILE = '../data/metadata.csv'\n",
"METADATA_FILE = LOCAL_METADATA_FILE if os.path.exists(LOCAL_METADATA_FILE) else REPO_METADATA_FILE\n",
"\n",
"dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)"
] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false },
"outputs": [ { "data": { "text/plain": [ "(8200, 108)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"X_list, Y, Yaudio = dataset\n",
"X = np.concatenate(X_list, axis=1)\n",
"ddf.shape"
] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false },
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"136 Botswana 0.611111 90 55\n",
"72 Ivory Coast 0.600000 15 9\n",
"95 Chad 0.545455 11 6\n",
"43 Benin 0.538462 26 14\n",
"86 Gambia 0.500000 50 25\n",
"20 Pakistan 0.494505 91 45\n",
"106 Nepal 0.473684 95 45\n",
"78 El Salvador 0.454545 33 15\n",
"64 Mozambique 0.441176 34 15\n",
"135 French Guiana 0.428571 28 12\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"1 Lithuania 0.000000 47 0\n",
"119 Denmark 0.000000 16 0\n",
"27 South Korea 0.000000 11 0\n",
"120 Kazakhstan 0.011364 88 1\n",
"31 Czech Republic 0.024390 41 1\n",
"15 Netherlands 0.029851 67 2\n",
"30 Afghanistan 0.041667 24 1\n",
"105 Sudan 0.044118 68 3\n",
"102 Nicaragua 0.047619 21 1\n",
"0 Canada 0.050000 100 5\n"
] } ],
"source": [
"# global outliers\n",
"df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n",
"outliers.print_most_least_outliers_topN(df_global, N=10)"
] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true
}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"328\n", "210\n", "194\n", "85\n", "388\n", "266\n", "309\n", "455\n", "365\n", "282\n",
"197\n", "122\n", "206\n", "457\n", "298\n", "597\n", "354\n", "191\n", "193\n", "198\n",
"263\n", "334\n", "812\n", "415\n", "44\n", "107\n", "366\n", "323\n", "450\n", "116\n",
"150\n", "260\n", "230\n", "118\n", "389\n", "237\n", "274\n", "466\n", "147\n", "134\n",
"86\n", "91\n", "574\n", "111\n", "296\n", "221\n", "261\n", "224\n", "190\n", "150\n",
"139\n", "350\n", "268\n", "453\n", "192\n", "468\n", "266\n", "187\n", "275\n", "337\n",
"179\n", "366\n", "211\n", "213\n", "428\n", "468\n", "164\n", "348\n", "328\n", "193\n",
"197\n", "193\n", "166\n", "290\n", "196\n", "224\n", "111\n", "258\n", "295\n", "227\n",
"252\n", "433\n", "305\n", "290\n", "183\n", "243\n", "63\n", "197\n", "274\n", "363\n",
"113\n", "192\n", "258\n", "494\n", "299\n", "484\n", "198\n", "191\n", "174\n", "280\n",
"735\n", "211\n", "221\n", "134\n", "125\n", "119\n", "151\n", "203\n", "229\n", "430\n",
"311\n", "424\n", "337\n", "268\n", "175\n", "228\n", "175\n", "437\n", "284\n", "129\n",
"366\n", "222\n", "66\n", "498\n", "400\n", "430\n", "187\n", "470\n", "298\n", "231\n",
"272\n", "261\n", "239\n", "154\n", "22\n", "426\n", "332\n",
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"46 China 0.260000 100 26\n",
"67 Brazil 0.240000 100 24\n",
"101 Colombia 0.211111 90 19\n",
"64 Mozambique 0.205882 34 7\n",
"76 Iran 0.188679 53 10\n",
"65 Uganda 0.176471 85 15\n",
"27 Kenya 0.164948 97 16\n",
"126 South Sudan 0.163043 92 15\n",
"24 Azerbaijan 0.153846 13 2\n",
"23 India 0.147368 95 14\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"0 Canada 0 100 0\n",
"95 Portugal 0 100 0\n",
"94 Iraq 0 87 0\n",
"93 Grenada 0 37 0\n",
"90 French Polynesia 0 15 0\n",
"89 Croatia 0 31 0\n",
"88 Morocco 0 40 0\n",
"87 Philippines 0 100 0\n",
"86 Gambia 0 50 0\n",
"85 Sierra Leone 0 100 0\n"
] } ],
"source": [
"# Local outliers: w_dict (returned by load_data) is passed together with X and Y;\n",
"# out_file points at the spatial outliers CSV in the results folder.\n",
"spatial_csv = '../data/results/spatial_outliers.csv'\n",
"df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file=spatial_csv)\n",
"outliers.print_most_least_outliers_topN(df_local, N=10)"
] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "scrolled": true },
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"43 Benin 0.500000 26 13\n",
"136 Botswana 0.488889 90 44\n",
"106 Nepal 0.421053 95 40\n",
"84 Belize 0.418605 43 18\n",
"19 Yemen 0.416667 12 5\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"28 Tajikistan 0 19 0\n",
"119 Denmark 0 16 0\n",
"96 Uruguay 0 31 0\n",
"25 Republic of Serbia 0 16 0\n",
"27 South Korea 0 11 0\n",
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"117 Zimbabwe 0.533333 15 8\n",
"96 Uruguay 0.483871 31 15\n",
"68 Guinea 0.454545 11 5\n",
"63 Senegal 0.390244 41 16\n",
"86 Gambia 0.380000 50 19\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"90 French Polynesia 0.000000 15 0\n",
"37 Rwanda 0.000000 17 0\n",
"119 Denmark 0.000000 16 0\n",
"18 New Zealand 0.000000 34 0\n",
"120 Kazakhstan 0.022727 88 2\n",
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"17 French Guiana 0.678571 28 19\n",
"136 Botswana 0.477778 90 43\n",
"72 Ivory Coast 0.400000 15 6\n",
"23 Azerbaijan 0.384615 13 5\n",
"106 Nepal 0.347368 95 33\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"68 Guinea 0 11 0\n",
"55 Mali 0 17 0\n",
"77 Algeria 0 27 0\n",
"33 Saint Lucia 0 43 0\n",
"31 Czech Republic 0 41 0\n",
"most outliers \n",
" Country Outliers N_Country N_Outliers\n",
"43 Benin 0.538462 26 14\n",
"20 Pakistan 0.461538 91 42\n",
"86 Gambia 0.360000 50 18\n",
"52 Indonesia 0.350000 100 35\n",
"136 Botswana 0.311111 90 28\n",
"least outliers \n",
" Country Outliers N_Country N_Outliers\n",
"107 Kiribati 0 17 0\n",
"1 Lithuania 0 47 0\n",
"134 
Paraguay 0 23 0\n", "131 Tunisia 0 39 0\n", "19 Yemen 0 12 0\n" ] } ],
"source": [
"# Outliers per feature set: one results CSV and one top-5 summary for each array in X_list.\n",
"feat = X_list\n",
"feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
"# NOTE(review): assumes X_list holds one array per label, in this order -- confirm against load_data.\n",
"assert len(feat) == len(feat_labels), 'expected one label per feature array'\n",
"for feat_label, XX in zip(feat_labels, feat):\n",
"    output_csv = '../data/results/global_outliers_' + feat_label + '.csv'\n",
"    # Separate names keep `threshold` and `MD` from the global-outliers cell intact.\n",
"    df_feat, thr_feat, MD_feat = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
"    outliers.print_most_least_outliers_topN(df_feat, N=5)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false },
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"5\n", "6\n", "7\n", "8\n", "9\n", "10\n", "11\n", "12\n", "13\n", "14\n", "15\n", "16\n", "17\n", "18"
] } ],
"source": [
"# Cluster the recordings, store the predicted cluster of each row next to its metadata,\n",
"# and save the combined table.\n",
"# NOTE(review): if get_country_clusters is stochastic, fix a random seed so results are reproducible.\n",
"centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
"ddf['Clusters'] = cl_pred\n",
"ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
"# Call form prints the same value under Python 2 and is also valid Python 3.\n",
"print(len(np.unique(cl_pred)))\n",
"outliers.print_clusters_metadata(ddf, cl_pred)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ],
"metadata": {
"kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" },
"language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" }
},
"nbformat": 4, "nbformat_minor": 0 }