Mercurial > hg > plosone_underreview

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pickle\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "sys.path.append('../')\n",
    "import scripts.outliers as outliers\n",
    "import scripts.utils as utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: there are 21 disconnected observations\n",
      "Island ids:  [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
      "Antigua and Barbuda\n",
      "Australia\n",
      "Cuba\n",
      "Fiji\n",
      "French Polynesia\n",
      "Grenada\n",
      "Iceland\n",
      "Jamaica\n",
      "Japan\n",
      "Kiribati\n",
      "Malta\n",
      "New Zealand\n",
      "Philippines\n",
      "Puerto Rico\n",
      "Republic of Serbia\n",
      "Saint Lucia\n",
      "Samoa\n",
      "Solomon Islands\n",
      "South Korea\n",
      "The Bahamas\n",
      "Trinidad and Tobago\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data = self._reader.read(nrows)\n"
     ]
    }
   ],
   "source": [
    "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n",
    "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n",
    "#METADATA_FILE = '../data/metadata.csv'\n",
    "\n",
    "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 108)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_list, Y, Yaudio = dataset\n",
    "X = np.concatenate(X_list, axis=1)\n",
    "ddf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "most outliers \n",
      "           Country  Outliers  N_Country  N_Outliers\n",
      "136       Botswana  0.611111         90          55\n",
      "72     Ivory Coast  0.600000         15           9\n",
      "95            Chad  0.545455         11           6\n",
      "43           Benin  0.538462         26          14\n",
      "86          Gambia  0.500000         50          25\n",
      "20        Pakistan  0.494505         91          45\n",
      "106          Nepal  0.473684         95          45\n",
      "78     El Salvador  0.454545         33          15\n",
      "64      Mozambique  0.441176         34          15\n",
      "135  French Guiana  0.428571         28          12\n",
      "least outliers \n",
      "            Country  Outliers  N_Country  N_Outliers\n",
      "1         Lithuania  0.000000         47           0\n",
      "119         Denmark  0.000000         16           0\n",
      "27      South Korea  0.000000         11           0\n",
      "120      Kazakhstan  0.011364         88           1\n",
      "31   Czech Republic  0.024390         41           1\n",
      "15      Netherlands  0.029851         67           2\n",
      "30      Afghanistan  0.041667         24           1\n",
      "105           Sudan  0.044118         68           3\n",
      "102       Nicaragua  0.047619         21           1\n",
      "0            Canada  0.050000        100           5\n"
     ]
    }
   ],
   "source": [
    "# global outliers\n",
    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n",
    "outliers.print_most_least_outliers_topN(df_global, N=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "328\n",
      "210\n",
      "194\n",
      "85\n",
      "388\n",
      "266\n",
      "309\n",
      "455\n",
      "365\n",
      "282\n",
      "197\n",
      "122\n",
      "206\n",
      "457\n",
      "298\n",
      "597\n",
      "354\n",
      "191\n",
      "193\n",
      "198\n",
      "263\n",
      "334\n",
      "812\n",
      "415\n",
      "44\n",
      "107\n",
      "366\n",
      "323\n",
      "450\n",
      "116\n",
      "150\n",
      "260\n",
      "230\n",
      "118\n",
      "389\n",
      "237\n",
      "274\n",
      "466\n",
      "147\n",
      "134\n",
      "86\n",
      "91\n",
      "574\n",
      "111\n",
      "296\n",
      "221\n",
      "261\n",
      "224\n",
      "190\n",
      "150\n",
      "139\n",
      "350\n",
      "268\n",
      "453\n",
      "192\n",
      "468\n",
      "266\n",
      "187\n",
      "275\n",
      "337\n",
      "179\n",
      "366\n",
      "211\n",
      "213\n",
      "428\n",
      "468\n",
      "164\n",
      "348\n",
      "328\n",
      "193\n",
      "197\n",
      "193\n",
      "166\n",
      "290\n",
      "196\n",
      "224\n",
      "111\n",
      "258\n",
      "295\n",
      "227\n",
      "252\n",
      "433\n",
      "305\n",
      "290\n",
      "183\n",
      "243\n",
      "63\n",
      "197\n",
      "274\n",
      "363\n",
      "113\n",
      "192\n",
      "258\n",
      "494\n",
      "299\n",
      "484\n",
      "198\n",
      "191\n",
      "174\n",
      "280\n",
      "735\n",
      "211\n",
      "221\n",
      "134\n",
      "125\n",
      "119\n",
      "151\n",
      "203\n",
      "229\n",
      "430\n",
      "311\n",
      "424\n",
      "337\n",
      "268\n",
      "175\n",
      "228\n",
      "175\n",
      "437\n",
      "284\n",
      "129\n",
      "366\n",
      "222\n",
      "66\n",
      "498\n",
      "400\n",
      "430\n",
      "187\n",
      "470\n",
      "298\n",
      "231\n",
      "272\n",
      "261\n",
      "239\n",
      "154\n",
      "22\n",
      "426\n",
      "332\n",
      "most outliers \n",
      "         Country  Outliers  N_Country  N_Outliers\n",
      "46         China  0.260000        100          26\n",
      "67        Brazil  0.240000        100          24\n",
      "101     Colombia  0.211111         90          19\n",
      "64    Mozambique  0.205882         34           7\n",
      "76          Iran  0.188679         53          10\n",
      "65        Uganda  0.176471         85          15\n",
      "27         Kenya  0.164948         97          16\n",
      "126  South Sudan  0.163043         92          15\n",
      "24    Azerbaijan  0.153846         13           2\n",
      "23         India  0.147368         95          14\n",
      "least outliers \n",
      "             Country  Outliers  N_Country  N_Outliers\n",
      "0             Canada         0        100           0\n",
      "95          Portugal         0        100           0\n",
      "94              Iraq         0         87           0\n",
      "93           Grenada         0         37           0\n",
      "90  French Polynesia         0         15           0\n",
      "89           Croatia         0         31           0\n",
      "88           Morocco         0         40           0\n",
      "87       Philippines         0        100           0\n",
      "86            Gambia         0         50           0\n",
      "85      Sierra Leone         0        100           0\n"
     ]
    }
   ],
   "source": [
    "# local outliers\n",
    "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')\n",
    "outliers.print_most_least_outliers_topN(df_local, N=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "most outliers \n",
      "      Country  Outliers  N_Country  N_Outliers\n",
      "43      Benin  0.500000         26          13\n",
      "136  Botswana  0.488889         90          44\n",
      "106     Nepal  0.421053         95          40\n",
      "84     Belize  0.418605         43          18\n",
      "19      Yemen  0.416667         12           5\n",
      "least outliers \n",
      "                Country  Outliers  N_Country  N_Outliers\n",
      "28           Tajikistan         0         19           0\n",
      "119             Denmark         0         16           0\n",
      "96              Uruguay         0         31           0\n",
      "25   Republic of Serbia         0         16           0\n",
      "27          South Korea         0         11           0\n",
      "most outliers \n",
      "      Country  Outliers  N_Country  N_Outliers\n",
      "117  Zimbabwe  0.533333         15           8\n",
      "96    Uruguay  0.483871         31          15\n",
      "68     Guinea  0.454545         11           5\n",
      "63    Senegal  0.390244         41          16\n",
      "86     Gambia  0.380000         50          19\n",
      "least outliers \n",
      "              Country  Outliers  N_Country  N_Outliers\n",
      "90   French Polynesia  0.000000         15           0\n",
      "37             Rwanda  0.000000         17           0\n",
      "119           Denmark  0.000000         16           0\n",
      "18        New Zealand  0.000000         34           0\n",
      "120        Kazakhstan  0.022727         88           2\n",
      "most outliers \n",
      "           Country  Outliers  N_Country  N_Outliers\n",
      "17   French Guiana  0.678571         28          19\n",
      "136       Botswana  0.477778         90          43\n",
      "72     Ivory Coast  0.400000         15           6\n",
      "23      Azerbaijan  0.384615         13           5\n",
      "106          Nepal  0.347368         95          33\n",
      "least outliers \n",
      "           Country  Outliers  N_Country  N_Outliers\n",
      "68          Guinea         0         11           0\n",
      "55            Mali         0         17           0\n",
      "77         Algeria         0         27           0\n",
      "33     Saint Lucia         0         43           0\n",
      "31  Czech Republic         0         41           0\n",
      "most outliers \n",
      "       Country  Outliers  N_Country  N_Outliers\n",
      "43       Benin  0.538462         26          14\n",
      "20    Pakistan  0.461538         91          42\n",
      "86      Gambia  0.360000         50          18\n",
      "52   Indonesia  0.350000        100          35\n",
      "136   Botswana  0.311111         90          28\n",
      "least outliers \n",
      "       Country  Outliers  N_Country  N_Outliers\n",
      "107   Kiribati         0         17           0\n",
      "1    Lithuania         0         47           0\n",
      "134   Paraguay         0         23           0\n",
      "131    Tunisia         0         39           0\n",
      "19       Yemen         0         12           0\n"
     ]
    }
   ],
   "source": [
    "# outliers for features\n",
    "feat = X_list\n",
    "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
    "tabs_feat = []\n",
    "for i in range(len(feat)):\n",
    "    XX = feat[i]\n",
    "    output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n",
    "    df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
    "    outliers.print_most_least_outliers_topN(df_feat, N=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n",
      "12\n",
      "13\n",
      "14\n",
      "15\n",
      "16\n",
      "17\n",
      "18"
     ]
    }
   ],
   "source": [
    "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
    "ddf['Clusters'] = cl_pred\n",
    "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
    "print len(np.unique(cl_pred))\n",
    "outliers.print_clusters_metadata(ddf, cl_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
author	Maria Panteli <m.x.panteli@gmail.com>
date	Fri, 22 Sep 2017 16:30:28 +0100
parents
children	bde45ce0eeab