changeset 42:90f8a2ea6f6f branch-tests

notebook results and load_features minor edits
author mpanteli <m.x.panteli@gmail.com>
date Fri, 15 Sep 2017 16:17:17 +0100
parents 57f53b0d1eaa
children 06e5711f9f62
files notebooks/sensitivity_experiment.ipynb notebooks/test_hubness.ipynb notebooks/test_music_segments.ipynb scripts/load_features.py
diffstat 4 files changed, 507 insertions(+), 81 deletions(-) [+]
line wrap: on
line diff
--- a/notebooks/sensitivity_experiment.ipynb	Fri Sep 15 12:27:11 2017 +0100
+++ b/notebooks/sensitivity_experiment.ipynb	Fri Sep 15 16:17:17 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -16,6 +16,7 @@
    ],
    "source": [
     "import numpy as np\n",
+    "import pandas as pd\n",
     "\n",
     "%matplotlib inline\n",
     "import matplotlib.pyplot as plt\n",
@@ -27,19 +28,20 @@
     "sys.path.append('../')\n",
     "import scripts.load_dataset as load_dataset\n",
     "import scripts.map_and_average as mapper\n",
-    "import scripts.classification\n",
+    "import scripts.classification as classification\n",
     "import scripts.outliers as outliers"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {},
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n",
-    "n_iters = 10\n",
-    "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)"
+    "n_iters = 10"
    ]
   },
   {
@@ -59,6 +61,7 @@
     }
    ],
    "source": [
+    "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)\n",
     "df.shape"
    ]
   },
@@ -4612,11 +4615,83 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iteration 0\n",
+      "mapping...\n",
+      "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n",
+      "(203219, 840) (68100, 840) (67143, 840)\n",
+      "mapping rhy\n",
+      "training with PCA transform...\n",
+      "variance explained 1.0\n",
+      "140 400\n",
+      "training with PCA transform...\n",
+      "variance explained 0.990203912455\n",
+      "training with LDA transform...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "  y = column_or_1d(y, warn=True)\n",
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/discriminant_analysis.py:455: UserWarning: The priors do not sum to 1. Renormalizing\n",
+      "  UserWarning)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "variance explained 1.0\n",
+      "transform test data...\n",
+      "mapping mel\n",
+      "training with PCA transform...\n",
+      "variance explained 1.0\n",
+      "214 240\n",
+      "training with PCA transform...\n",
+      "variance explained 0.990094273777\n",
+      "training with LDA transform...\n",
+      "variance explained 1.0\n",
+      "transform test data...\n",
+      "mapping mfc\n",
+      "training with PCA transform...\n",
+      "variance explained 1.0\n",
+      "39 80\n",
+      "training with PCA transform...\n",
+      "variance explained 0.9914399357\n",
+      "training with LDA transform...\n",
+      "variance explained 0.941390777379\n",
+      "transform test data...\n",
+      "mapping chr\n",
+      "training with PCA transform...\n",
+      "variance explained 1.0\n",
+      "70 120\n",
+      "training with PCA transform...\n",
+      "variance explained 0.990511935176\n",
+      "training with LDA transform...\n",
+      "variance explained 0.953613938607\n",
+      "transform test data...\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "all the input array dimensions except for the concatenation axis must match exactly",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-971892d5bd8d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      6\u001b[0m                                  output_file in OUTPUT_FILES]\n\u001b[1;32m      7\u001b[0m     \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m     \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m     \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mValueError\u001b[0m: all the input array dimensions except for the concatenation axis must match exactly"
+     ]
+    }
+   ],
    "source": [
     "for n in range(n_iters):\n",
     "    print \"iteration %d\" % n\n",
@@ -4625,7 +4700,7 @@
     "    mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
     "                                 output_file in OUTPUT_FILES]\n",
     "    _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n",
-    "    X = np.concatenate(ldadata_list)\n",
+    "    X = np.concatenate(ldadata_list, axis=1)\n",
     "    \n",
     "    # classification and confusion\n",
     "    print \"classifying...\"\n",
@@ -4636,14 +4711,232 @@
     "    \n",
     "    # outliers\n",
     "    print \"detecting outliers...\"\n",
-    "    ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
-    "    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)\n",
-    "    print_most_least_outliers_topN(df_global, N=10)\n",
+    "    #ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
+    "    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
+    "    outliers.print_most_least_outliers_topN(df_global, N=10)\n",
     "    \n",
     "    # write output\n",
     "    print \"writing file\"\n",
     "    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X = np.concatenate(ldadata_list, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8089, 381)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.176354062249\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
+      "  'precision', 'predicted', average, warn_for)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#traininds, testinds = classification.get_train_test_indices()\n",
+    "traininds = np.arange(5000)\n",
+    "testinds = np.arange(len(X)-1600, len(X))\n",
+    "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
+    "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
+    "print accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "detecting outliers...\n",
+      "most outliers \n",
+      "            Country  Outliers\n",
+      "136        Botswana  0.590909\n",
+      "71      Ivory Coast  0.571429\n",
+      "86           Gambia  0.541667\n",
+      "43            Benin  0.538462\n",
+      "62             Fiji  0.466667\n",
+      "20         Pakistan  0.461538\n",
+      "65           Uganda  0.437500\n",
+      "14          Liberia  0.425000\n",
+      "78      El Salvador  0.424242\n",
+      "51   Western Sahara  0.421687\n",
+      "least outliers \n",
+      "            Country  Outliers\n",
+      "119         Denmark  0.000000\n",
+      "30      Afghanistan  0.000000\n",
+      "113         Iceland  0.000000\n",
+      "28       Tajikistan  0.000000\n",
+      "74   Czech Republic  0.000000\n",
+      "27      South Korea  0.000000\n",
+      "1         Lithuania  0.000000\n",
+      "15      Netherlands  0.014925\n",
+      "121          Poland  0.040000\n",
+      "134        Paraguay  0.043478\n"
+     ]
+    }
+   ],
+   "source": [
+    "print \"detecting outliers...\"\n",
+    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
+    "outliers.print_most_least_outliers_topN(df_global, N=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## correlation of outlier results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's use Kendall's tau correlation to compare the ranked lists of countries, sorted from most to least outliers.\n",
+    "<br> First load the ranked list of outlier countries.\n",
+    "<br> Sort by outlier percentage in descending order."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "ranked_countries = pd.DataFrame()\n",
+    "ranked_outliers = pd.DataFrame()\n",
+    "for n in range(n_iters):\n",
+    "    df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n",
+    "    df_global = df_global.sort_values('Outliers', axis=0, ascending=False, inplace=True)\n",
+    "    ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n",
+    "    ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Remove countries with 0% outliers, as these are tied and appear in arbitrary (probably alphabetical) order."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n",
+    "first_zero_idx = np.min(zero_idx)\n",
+    "ranked_countries = ranked_countries.iloc[:first_zero_idx, :]\n",
+    "ranked_outliers = ranked_outliers.iloc[:first_zero_idx, :]\n",
+    "\n",
+    "print ranked_countries.head()\n",
+    "print ranked_outliers.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And now compute the Kendall's tau correlation between each pair of ranked lists."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "KendalltauResult(correlation=0.99999999999999989, pvalue=2.5428927239036995e-67)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from scipy.stats import kendalltau\n",
+    "for i in range(len(ranked_countries)-1):\n",
+    "    for j in range(i+1, len(ranked_countries)):\n",
+    "        print kendalltau(ranked_countries.iloc[:, i], ranked_countries.iloc[:, j])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SpearmanrResult(correlation=1.0, pvalue=0.0)"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from scipy.stats import spearmanr\n",
+    "spearmanr(ranked_countries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
--- a/notebooks/test_hubness.ipynb	Fri Sep 15 12:27:11 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Fri Sep 15 16:17:17 2017 +0100
@@ -27,57 +27,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
-      "  warnings.warn(\"There are %d disconnected observations\" % ni)\n",
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
-      "  warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Antigua and Barbuda\n",
-      "Australia\n",
-      "Cuba\n",
-      "Fiji\n",
-      "French Polynesia\n",
-      "Grenada\n",
-      "Iceland\n",
-      "Jamaica\n",
-      "Japan\n",
-      "Kiribati\n",
-      "Malta\n",
-      "New Zealand\n",
-      "Philippines\n",
-      "Puerto Rico\n",
-      "Republic of Serbia\n",
-      "Saint Lucia\n",
-      "Samoa\n",
-      "Solomon Islands\n",
-      "South Korea\n",
-      "The Bahamas\n",
-      "Trinidad and Tobago\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
-    "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
-    "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
-    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
-    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
+    "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "#w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
+    "#w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
+    "X = np.concatenate(X_list, axis=1)\n",
     "\n",
     "# global outliers\n",
-    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
+    "#df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
    ]
   },
   {
@@ -223,7 +184,9 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "sort_idx = np.argsort(D, axis=1)\n",
@@ -304,6 +267,186 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## let's get the audio url to listen to tracks identified as large hubs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  if self.run_code(code, result):\n"
+     ]
+    }
+   ],
+   "source": [
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8200, 108)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ddf.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"songurls_Album\" in ddf.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### First, fix the URL for BL tracks (it was changed recently and the metadata.csv file has not been updated)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+      "  self._setitem_with_indexer(indexer, value)\n"
+     ]
+    }
+   ],
+   "source": [
+    "bl_inds = np.where(np.isnan(ddf['BuyLinkTrackDownload']))[0]\n",
+    "for bl_ind in bl_inds:\n",
+    "    ddf['songurls_Album'].iloc[bl_ind] = ('https://sounds.bl.uk/World-and-traditional-music/' + \n",
+    "                                        ddf['Folder'].iloc[bl_ind] + '/' + \n",
+    "                                        ddf['MetaFile'].iloc[bl_ind].split('.')[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>songurls_Album</th>\n",
+       "      <th>Country</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>515</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Nigeria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2549</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Swaziland</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3486</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Kazakhstan</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5020</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Swaziland</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5119</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Pakistan</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from IPython.display import HTML\n",
+    "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
--- a/notebooks/test_music_segments.ipynb	Fri Sep 15 12:27:11 2017 +0100
+++ b/notebooks/test_music_segments.ipynb	Fri Sep 15 16:17:17 2017 +0100
@@ -64,9 +64,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -92,9 +90,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -120,9 +116,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -153,9 +147,7 @@
   {
    "cell_type": "code",
    "execution_count": 23,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -190,9 +182,7 @@
   {
    "cell_type": "code",
    "execution_count": 22,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -243,7 +233,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.11"
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,
--- a/scripts/load_features.py	Fri Sep 15 12:27:11 2017 +0100
+++ b/scripts/load_features.py	Fri Sep 15 16:17:17 2017 +0100
@@ -103,7 +103,7 @@
         return music_idx
     
     
-    def get_features(self, df, stop_sec=30.0, class_label='Country', precomp_melody=False):
+    def get_features(self, df, stop_sec=30.0, class_label='Country', precomp_melody=True):
         oplist = []
         mflist = []
         chlist = []
@@ -119,7 +119,7 @@
             if len(music_idx)==0:
                 # no music segments -> skip this file
                 continue
-            try:
+            if 1:
                 op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i], stop_sec=stop_sec)
                 ch = self.get_chroma_for_file(df['Chroma'].iloc[i], stop_sec=stop_sec)
                 pb = self.get_pb_for_file(df['Melodia'].iloc[i], precomp_melody=precomp_melody, stop_sec=stop_sec)
@@ -127,7 +127,7 @@
                 #    pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
                 #else:
                 #    pb = self.get_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
-            except:
+            else:
                 continue
             n_stop = np.int(np.ceil(stop_sec * self.framessr2))
             print n_stop, len(op), len(mfcc), len(ch), len(pb)