{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      " %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "import scripts.outliers as outliers\n",
    "import scripts.utils as utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: there are 21 disconnected observations\n",
      "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
      "Antigua and Barbuda\n",
      "Australia\n",
      "Cuba\n",
      "Fiji\n",
      "French Polynesia\n",
      "Grenada\n",
      "Iceland\n",
      "Jamaica\n",
      "Japan\n",
      "Kiribati\n",
      "Malta\n",
      "New Zealand\n",
      "Philippines\n",
      "Puerto Rico\n",
      "Republic of Serbia\n",
      "Saint Lucia\n",
      "Samoa\n",
      "Solomon Islands\n",
      "South Korea\n",
      "The Bahamas\n",
      "Trinidad and Tobago\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      " data = self._reader.read(nrows)\n"
     ]
    }
   ],
   "source": [
    "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n",
    "# NOTE(review): absolute, machine-specific path; the repo-relative alternative is kept\n",
    "# commented out below - confirm which metadata file is intended before sharing.\n",
    "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n",
    "#METADATA_FILE = '../data/metadata.csv'\n",
    "\n",
    "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 108)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# dataset unpacks into the per-feature matrices plus two label arrays;\n",
    "# X stacks the feature matrices column-wise.\n",
    "X_list, Y, Yaudio = dataset\n",
    "X = np.concatenate(X_list, axis=1)\n",
    "ddf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "136 Botswana 0.611111 90 55\n",
      "72 Ivory Coast 0.600000 15 9\n",
      "95 Chad 0.545455 11 6\n",
      "43 Benin 0.538462 26 14\n",
      "86 Gambia 0.500000 50 25\n",
      "20 Pakistan 0.494505 91 45\n",
      "106 Nepal 0.473684 95 45\n",
      "78 El Salvador 0.454545 33 15\n",
      "64 Mozambique 0.441176 34 15\n",
      "135 French Guiana 0.428571 28 12\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "1 Lithuania 0.000000 47 0\n",
      "119 Denmark 0.000000 16 0\n",
      "27 South Korea 0.000000 11 0\n",
      "120 Kazakhstan 0.011364 88 1\n",
      "31 Czech Republic 0.024390 41 1\n",
      "15 Netherlands 0.029851 67 2\n",
      "30 Afghanistan 0.041667 24 1\n",
      "105 Sudan 0.044118 68 3\n",
      "102 Nicaragua 0.047619 21 1\n",
      "0 Canada 0.050000 100 5\n"
     ]
    }
   ],
   "source": [
    "# global outliers\n",
    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n",
    "outliers.print_most_least_outliers_topN(df_global, N=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "328\n",
      "210\n",
      "194\n",
      "85\n",
      "388\n",
      "266\n",
      "309\n",
      "455\n",
      "365\n",
      "282\n",
      "197\n",
      "122\n",
      "206\n",
      "457\n",
      "298\n",
      "597\n",
      "354\n",
      "191\n",
      "193\n",
      "198\n",
      "263\n",
      "334\n",
      "812\n",
      "415\n",
      "44\n",
      "107\n",
      "366\n",
      "323\n",
      "450\n",
      "116\n",
      "150\n",
      "260\n",
      "230\n",
      "118\n",
      "389\n",
      "237\n",
      "274\n",
      "466\n",
      "147\n",
      "134\n",
      "86\n",
      "91\n",
      "574\n",
      "111\n",
      "296\n",
      "221\n",
      "261\n",
      "224\n",
      "190\n",
      "150\n",
      "139\n",
      "350\n",
      "268\n",
      "453\n",
      "192\n",
      "468\n",
      "266\n",
      "187\n",
      "275\n",
      "337\n",
      "179\n",
      "366\n",
      "211\n",
      "213\n",
      "428\n",
      "468\n",
      "164\n",
      "348\n",
      "328\n",
      "193\n",
      "197\n",
      "193\n",
      "166\n",
      "290\n",
      "196\n",
      "224\n",
      "111\n",
      "258\n",
      "295\n",
      "227\n",
      "252\n",
      "433\n",
      "305\n",
      "290\n",
      "183\n",
      "243\n",
      "63\n",
      "197\n",
      "274\n",
      "363\n",
      "113\n",
      "192\n",
      "258\n",
      "494\n",
      "299\n",
      "484\n",
      "198\n",
      "191\n",
      "174\n",
      "280\n",
      "735\n",
      "211\n",
      "221\n",
      "134\n",
      "125\n",
      "119\n",
      "151\n",
      "203\n",
      "229\n",
      "430\n",
      "311\n",
      "424\n",
      "337\n",
      "268\n",
      "175\n",
      "228\n",
      "175\n",
      "437\n",
      "284\n",
      "129\n",
      "366\n",
      "222\n",
      "66\n",
      "498\n",
      "400\n",
      "430\n",
      "187\n",
      "470\n",
      "298\n",
      "231\n",
      "272\n",
      "261\n",
      "239\n",
      "154\n",
      "22\n",
      "426\n",
      "332\n",
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "46 China 0.260000 100 26\n",
      "67 Brazil 0.240000 100 24\n",
      "101 Colombia 0.211111 90 19\n",
      "64 Mozambique 0.205882 34 7\n",
      "76 Iran 0.188679 53 10\n",
      "65 Uganda 0.176471 85 15\n",
      "27 Kenya 0.164948 97 16\n",
      "126 South Sudan 0.163043 92 15\n",
      "24 Azerbaijan 0.153846 13 2\n",
      "23 India 0.147368 95 14\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "0 Canada 0 100 0\n",
      "95 Portugal 0 100 0\n",
      "94 Iraq 0 87 0\n",
      "93 Grenada 0 37 0\n",
      "90 French Polynesia 0 15 0\n",
      "89 Croatia 0 31 0\n",
      "88 Morocco 0 40 0\n",
      "87 Philippines 0 100 0\n",
      "86 Gambia 0 50 0\n",
      "85 Sierra Leone 0 100 0\n"
     ]
    }
   ],
   "source": [
    "# local outliers\n",
    "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')\n",
    "outliers.print_most_least_outliers_topN(df_local, N=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "43 Benin 0.500000 26 13\n",
      "136 Botswana 0.488889 90 44\n",
      "106 Nepal 0.421053 95 40\n",
      "84 Belize 0.418605 43 18\n",
      "19 Yemen 0.416667 12 5\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "28 Tajikistan 0 19 0\n",
      "119 Denmark 0 16 0\n",
      "96 Uruguay 0 31 0\n",
      "25 Republic of Serbia 0 16 0\n",
      "27 South Korea 0 11 0\n",
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "117 Zimbabwe 0.533333 15 8\n",
      "96 Uruguay 0.483871 31 15\n",
      "68 Guinea 0.454545 11 5\n",
      "63 Senegal 0.390244 41 16\n",
      "86 Gambia 0.380000 50 19\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "90 French Polynesia 0.000000 15 0\n",
      "37 Rwanda 0.000000 17 0\n",
      "119 Denmark 0.000000 16 0\n",
      "18 New Zealand 0.000000 34 0\n",
      "120 Kazakhstan 0.022727 88 2\n",
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "17 French Guiana 0.678571 28 19\n",
      "136 Botswana 0.477778 90 43\n",
      "72 Ivory Coast 0.400000 15 6\n",
      "23 Azerbaijan 0.384615 13 5\n",
      "106 Nepal 0.347368 95 33\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "68 Guinea 0 11 0\n",
      "55 Mali 0 17 0\n",
      "77 Algeria 0 27 0\n",
      "33 Saint Lucia 0 43 0\n",
      "31 Czech Republic 0 41 0\n",
      "most outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "43 Benin 0.538462 26 14\n",
      "20 Pakistan 0.461538 91 42\n",
      "86 Gambia 0.360000 50 18\n",
      "52 Indonesia 0.350000 100 35\n",
      "136 Botswana 0.311111 90 28\n",
      "least outliers \n",
      " Country Outliers N_Country N_Outliers\n",
      "107 Kiribati 0 17 0\n",
      "1 Lithuania 0 47 0\n",
      "134 Paraguay 0 23 0\n",
      "131 Tunisia 0 39 0\n",
      "19 Yemen 0 12 0\n"
     ]
    }
   ],
   "source": [
    "# outliers for features\n",
    "# Runs the same outlier analysis once per feature matrix in X_list and writes one CSV per label.\n",
    "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
    "for i, XX in enumerate(X_list):\n",
    "    output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n",
    "    # NOTE: this rebinds the notebook-level threshold and MD set by the global-outliers cell.\n",
    "    df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
    "    outliers.print_most_least_outliers_topN(df_feat, N=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n",
      "12\n",
      "13\n",
      "14\n",
      "15\n",
      "16\n",
      "17\n",
      "18"
     ]
    }
   ],
   "source": [
    "# NOTE(review): bestncl=None presumably lets get_country_clusters choose the number of\n",
    "# clusters (bounded by max_ncl) - confirm in scripts/outliers.py.\n",
    "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
    "ddf['Clusters'] = cl_pred\n",
    "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
    "# Single-argument call form prints identically on the Python 2 kernel and also parses on Python 3.\n",
    "print(len(np.unique(cl_pred)))\n",
    "outliers.print_clusters_metadata(ddf, cl_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}