changeset 76:d17833be50ca branch-tests

merged
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:36 +0100
parents 02faad4a996b (diff) cc028157502a (current diff)
children bde45ce0eeab
files
diffstat 3 files changed, 529 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/notebooks/results_30_seconds_and_figures.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,489 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pickle \n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "import scripts.outliers as outliers\n",
+    "import scripts.utils as utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING: there are 21 disconnected observations\n",
+      "Island ids:  [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
+      "Antigua and Barbuda\n",
+      "Australia\n",
+      "Cuba\n",
+      "Fiji\n",
+      "French Polynesia\n",
+      "Grenada\n",
+      "Iceland\n",
+      "Jamaica\n",
+      "Japan\n",
+      "Kiribati\n",
+      "Malta\n",
+      "New Zealand\n",
+      "Philippines\n",
+      "Puerto Rico\n",
+      "Republic of Serbia\n",
+      "Saint Lucia\n",
+      "Samoa\n",
+      "Solomon Islands\n",
+      "South Korea\n",
+      "The Bahamas\n",
+      "Trinidad and Tobago\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  data = self._reader.read(nrows)\n"
+     ]
+    }
+   ],
+   "source": [
+    "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n",
+    "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n",
+    "#METADATA_FILE = '../data/metadata.csv'\n",
+    "\n",
+    "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8200, 108)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_list, Y, Yaudio = dataset\n",
+    "X = np.concatenate(X_list, axis=1)\n",
+    "ddf.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "most outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "136       Botswana  0.611111         90          55\n",
+      "72     Ivory Coast  0.600000         15           9\n",
+      "95            Chad  0.545455         11           6\n",
+      "43           Benin  0.538462         26          14\n",
+      "86          Gambia  0.500000         50          25\n",
+      "20        Pakistan  0.494505         91          45\n",
+      "106          Nepal  0.473684         95          45\n",
+      "78     El Salvador  0.454545         33          15\n",
+      "64      Mozambique  0.441176         34          15\n",
+      "135  French Guiana  0.428571         28          12\n",
+      "least outliers \n",
+      "            Country  Outliers  N_Country  N_Outliers\n",
+      "1         Lithuania  0.000000         47           0\n",
+      "119         Denmark  0.000000         16           0\n",
+      "27      South Korea  0.000000         11           0\n",
+      "120      Kazakhstan  0.011364         88           1\n",
+      "31   Czech Republic  0.024390         41           1\n",
+      "15      Netherlands  0.029851         67           2\n",
+      "30      Afghanistan  0.041667         24           1\n",
+      "105           Sudan  0.044118         68           3\n",
+      "102       Nicaragua  0.047619         21           1\n",
+      "0            Canada  0.050000        100           5\n"
+     ]
+    }
+   ],
+   "source": [
+    "# global outliers\n",
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n",
+    "outliers.print_most_least_outliers_topN(df_global, N=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "328\n",
+      "210\n",
+      "194\n",
+      "85\n",
+      "388\n",
+      "266\n",
+      "309\n",
+      "455\n",
+      "365\n",
+      "282\n",
+      "197\n",
+      "122\n",
+      "206\n",
+      "457\n",
+      "298\n",
+      "597\n",
+      "354\n",
+      "191\n",
+      "193\n",
+      "198\n",
+      "263\n",
+      "334\n",
+      "812\n",
+      "415\n",
+      "44\n",
+      "107\n",
+      "366\n",
+      "323\n",
+      "450\n",
+      "116\n",
+      "150\n",
+      "260\n",
+      "230\n",
+      "118\n",
+      "389\n",
+      "237\n",
+      "274\n",
+      "466\n",
+      "147\n",
+      "134\n",
+      "86\n",
+      "91\n",
+      "574\n",
+      "111\n",
+      "296\n",
+      "221\n",
+      "261\n",
+      "224\n",
+      "190\n",
+      "150\n",
+      "139\n",
+      "350\n",
+      "268\n",
+      "453\n",
+      "192\n",
+      "468\n",
+      "266\n",
+      "187\n",
+      "275\n",
+      "337\n",
+      "179\n",
+      "366\n",
+      "211\n",
+      "213\n",
+      "428\n",
+      "468\n",
+      "164\n",
+      "348\n",
+      "328\n",
+      "193\n",
+      "197\n",
+      "193\n",
+      "166\n",
+      "290\n",
+      "196\n",
+      "224\n",
+      "111\n",
+      "258\n",
+      "295\n",
+      "227\n",
+      "252\n",
+      "433\n",
+      "305\n",
+      "290\n",
+      "183\n",
+      "243\n",
+      "63\n",
+      "197\n",
+      "274\n",
+      "363\n",
+      "113\n",
+      "192\n",
+      "258\n",
+      "494\n",
+      "299\n",
+      "484\n",
+      "198\n",
+      "191\n",
+      "174\n",
+      "280\n",
+      "735\n",
+      "211\n",
+      "221\n",
+      "134\n",
+      "125\n",
+      "119\n",
+      "151\n",
+      "203\n",
+      "229\n",
+      "430\n",
+      "311\n",
+      "424\n",
+      "337\n",
+      "268\n",
+      "175\n",
+      "228\n",
+      "175\n",
+      "437\n",
+      "284\n",
+      "129\n",
+      "366\n",
+      "222\n",
+      "66\n",
+      "498\n",
+      "400\n",
+      "430\n",
+      "187\n",
+      "470\n",
+      "298\n",
+      "231\n",
+      "272\n",
+      "261\n",
+      "239\n",
+      "154\n",
+      "22\n",
+      "426\n",
+      "332\n",
+      "most outliers \n",
+      "         Country  Outliers  N_Country  N_Outliers\n",
+      "46         China  0.260000        100          26\n",
+      "67        Brazil  0.240000        100          24\n",
+      "101     Colombia  0.211111         90          19\n",
+      "64    Mozambique  0.205882         34           7\n",
+      "76          Iran  0.188679         53          10\n",
+      "65        Uganda  0.176471         85          15\n",
+      "27         Kenya  0.164948         97          16\n",
+      "126  South Sudan  0.163043         92          15\n",
+      "24    Azerbaijan  0.153846         13           2\n",
+      "23         India  0.147368         95          14\n",
+      "least outliers \n",
+      "             Country  Outliers  N_Country  N_Outliers\n",
+      "0             Canada         0        100           0\n",
+      "95          Portugal         0        100           0\n",
+      "94              Iraq         0         87           0\n",
+      "93           Grenada         0         37           0\n",
+      "90  French Polynesia         0         15           0\n",
+      "89           Croatia         0         31           0\n",
+      "88           Morocco         0         40           0\n",
+      "87       Philippines         0        100           0\n",
+      "86            Gambia         0         50           0\n",
+      "85      Sierra Leone         0        100           0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# local outliers\n",
+    "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')\n",
+    "outliers.print_most_least_outliers_topN(df_local, N=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "most outliers \n",
+      "      Country  Outliers  N_Country  N_Outliers\n",
+      "43      Benin  0.500000         26          13\n",
+      "136  Botswana  0.488889         90          44\n",
+      "106     Nepal  0.421053         95          40\n",
+      "84     Belize  0.418605         43          18\n",
+      "19      Yemen  0.416667         12           5\n",
+      "least outliers \n",
+      "                Country  Outliers  N_Country  N_Outliers\n",
+      "28           Tajikistan         0         19           0\n",
+      "119             Denmark         0         16           0\n",
+      "96              Uruguay         0         31           0\n",
+      "25   Republic of Serbia         0         16           0\n",
+      "27          South Korea         0         11           0\n",
+      "most outliers \n",
+      "      Country  Outliers  N_Country  N_Outliers\n",
+      "117  Zimbabwe  0.533333         15           8\n",
+      "96    Uruguay  0.483871         31          15\n",
+      "68     Guinea  0.454545         11           5\n",
+      "63    Senegal  0.390244         41          16\n",
+      "86     Gambia  0.380000         50          19\n",
+      "least outliers \n",
+      "              Country  Outliers  N_Country  N_Outliers\n",
+      "90   French Polynesia  0.000000         15           0\n",
+      "37             Rwanda  0.000000         17           0\n",
+      "119           Denmark  0.000000         16           0\n",
+      "18        New Zealand  0.000000         34           0\n",
+      "120        Kazakhstan  0.022727         88           2\n",
+      "most outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "17   French Guiana  0.678571         28          19\n",
+      "136       Botswana  0.477778         90          43\n",
+      "72     Ivory Coast  0.400000         15           6\n",
+      "23      Azerbaijan  0.384615         13           5\n",
+      "106          Nepal  0.347368         95          33\n",
+      "least outliers \n",
+      "           Country  Outliers  N_Country  N_Outliers\n",
+      "68          Guinea         0         11           0\n",
+      "55            Mali         0         17           0\n",
+      "77         Algeria         0         27           0\n",
+      "33     Saint Lucia         0         43           0\n",
+      "31  Czech Republic         0         41           0\n",
+      "most outliers \n",
+      "       Country  Outliers  N_Country  N_Outliers\n",
+      "43       Benin  0.538462         26          14\n",
+      "20    Pakistan  0.461538         91          42\n",
+      "86      Gambia  0.360000         50          18\n",
+      "52   Indonesia  0.350000        100          35\n",
+      "136   Botswana  0.311111         90          28\n",
+      "least outliers \n",
+      "       Country  Outliers  N_Country  N_Outliers\n",
+      "107   Kiribati         0         17           0\n",
+      "1    Lithuania         0         47           0\n",
+      "134   Paraguay         0         23           0\n",
+      "131    Tunisia         0         39           0\n",
+      "19       Yemen         0         12           0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# outliers for features\n",
+    "feat = X_list\n",
+    "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
+    "tabs_feat = []\n",
+    "for i in range(len(feat)):\n",
+    "    XX = feat[i]\n",
+    "    output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n",
+    "    df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
+    "    outliers.print_most_least_outliers_topN(df_feat, N=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5\n",
+      "6\n",
+      "7\n",
+      "8\n",
+      "9\n",
+      "10\n",
+      "11\n",
+      "12\n",
+      "13\n",
+      "14\n",
+      "15\n",
+      "16\n",
+      "17\n",
+      "18"
+     ]
+    }
+   ],
+   "source": [
+    "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
+    "ddf['Clusters'] = cl_pred\n",
+    "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
+    "print len(np.unique(cl_pred))\n",
+    "outliers.print_clusters_metadata(ddf, cl_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/notebooks/results_for_30_seconds.ipynb	Fri Sep 22 16:29:32 2017 +0100
+++ b/notebooks/results_for_30_seconds.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -3,7 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": 36,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -88,7 +90,9 @@
   {
    "cell_type": "code",
    "execution_count": 37,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -124,7 +128,9 @@
   {
    "cell_type": "code",
    "execution_count": 38,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -145,7 +151,9 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -216,6 +224,7 @@
    "cell_type": "code",
    "execution_count": 29,
    "metadata": {
+    "collapsed": false,
     "scrolled": true
    },
    "outputs": [
@@ -270,6 +279,7 @@
    "cell_type": "code",
    "execution_count": 63,
    "metadata": {
+    "collapsed": false,
     "scrolled": true
    },
    "outputs": [
@@ -324,7 +334,9 @@
   {
    "cell_type": "code",
    "execution_count": 43,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -371,7 +383,9 @@
   {
    "cell_type": "code",
    "execution_count": 46,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -450,7 +464,9 @@
   {
    "cell_type": "code",
    "execution_count": 48,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -662,7 +678,9 @@
   {
    "cell_type": "code",
    "execution_count": 53,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -718,7 +736,9 @@
   {
    "cell_type": "code",
    "execution_count": 55,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -738,7 +758,9 @@
   {
    "cell_type": "code",
    "execution_count": 62,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
--- a/scripts/outliers.py	Fri Sep 22 16:29:32 2017 +0100
+++ b/scripts/outliers.py	Fri Sep 22 16:30:36 2017 +0100
@@ -15,7 +15,7 @@
 import utils_spatial
 
 
-def country_outlier_df(counts, labels, normalize=False):
+def country_outlier_df(counts, labels, normalize=False, out_file=None):
     if len(counts.keys()) < len(np.unique(labels)):
         for label in np.unique(labels):
             if not counts.has_key(label):
@@ -33,6 +33,8 @@
     df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
     df = pd.merge(df, df_n_country, on='Country', how='left')
     df = pd.merge(df, df_n_outliers, on='Country', how='left')
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
     return df
 
 
@@ -50,9 +52,7 @@
 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
+    df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file)
     return df, threshold, MD
 
 
@@ -100,16 +100,16 @@
     return [X_list, Y, Yaudio], ddf, w_dict
 
 
-def get_local_outliers_df(X, Y, w_dict):
+def get_local_outliers_df(X, Y, w_dict, out_file=None):
     spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
     spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
     return df_local
 
 
-def get_country_clusters(X, bestncl=None):
+def get_country_clusters(X, bestncl=None, max_ncl=50):
     if bestncl is None:
-        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=max_ncl, metric="cosine")
     # get cluster predictions and metadata for each cluster
     cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
     centroids = cluster_model.cluster_centers_