diff notebooks/test_hubness.ipynb @ 54:dbcd5b2a4efa branch-tests

additions in notebooks
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 19 Sep 2017 18:41:14 +0100
parents 90f8a2ea6f6f
children 98cd5317e504 b2c38538f127
line wrap: on
line diff
--- a/notebooks/test_hubness.ipynb	Mon Sep 18 11:25:05 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Tue Sep 19 18:41:14 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {
     "collapsed": true
    },
@@ -27,8 +27,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
@@ -44,7 +46,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -62,6 +66,13 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## distance matrix"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
@@ -75,7 +86,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -89,13 +102,17 @@
     }
    ],
    "source": [
+    "np.savetxt('../data/D_mahal.csv', D)  # write the distance matrix D to disk as text\n",
+    "D = np.loadtxt('../data/D_mahal.csv')  # read it straight back; NOTE(review): the file is rewritten on every run, so this is not a cache\n",
     "D.shape"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -113,6 +130,13 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## n-occurrence and stats"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {
@@ -132,7 +156,9 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -173,11 +199,21 @@
     }
    ],
    "source": [
-    "N_k = n_occurrence_from_D(D, k=100)\n",
+    "# choose k as the mean number of items per unique label in Y, rounded to the nearest integer\n",
+    "uniq_countries, uniq_counts = np.unique(Y, return_counts=True)\n",
+    "k = int(np.round(np.mean(uniq_counts)))  # builtin int; np.int was only an alias for it and is removed in newer numpy\n",
+    "print k\n",
+    "N_k = n_occurrence_from_D(D, k=k)\n",
     "print skew(N_k)\n",
+    "print np.median(N_k)\n",
+    "print np.mean(N_k)\n",
+    "print np.std(N_k)\n",
+    "print len(np.where(N_k>1000)[0])  # number of entries above 1000; np.where returns a tuple, so index it before len\n",
     "plt.figure()\n",
     "plt.hist(N_k, bins=100);\n",
     "plt.figure()\n",
+    "plt.hist(N_k[N_k<1000], bins=100);\n",
+    "plt.figure()\n",
     "plt.plot(np.sort(N_k))"
    ]
   },
@@ -197,7 +233,9 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -223,7 +261,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -249,7 +289,9 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -275,26 +317,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  if self.run_code(code, result):\n"
+      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  data = self._reader.read(nrows)\n"
      ]
     }
    ],
    "source": [
-    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')"
+    "import os  # local import so this cell stands alone; set BLSM_METADATA_FILE to use a metadata copy kept outside the repo\n",
+    "metadata_file = os.environ.get('BLSM_METADATA_FILE', '../data/metadata_BLSM_language_all.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file=metadata_file)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -302,7 +350,7 @@
        "(8200, 108)"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -313,8 +361,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -322,7 +372,7 @@
        "True"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -332,30 +382,6 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()"
-   ]
-  },
-  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -364,17 +390,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
-   "metadata": {},
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n",
+      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:121: SettingWithCopyWarning: \n",
       "A value is trying to be set on a copy of a slice from a DataFrame\n",
       "\n",
-      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+      "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
       "  self._setitem_with_indexer(indexer, value)\n"
      ]
     }
@@ -389,61 +417,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>songurls_Album</th>\n",
-       "      <th>Country</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>515</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Nigeria</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2549</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Swaziland</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3486</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Kazakhstan</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5020</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Swaziland</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5119</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Pakistan</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>"
-      ],
       "text/plain": [
-       "<IPython.core.display.HTML object>"
+       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025M-1CS0043663XX-0100V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-2000V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0085XX-3100V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-1300V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0031XX-0500V0'], dtype=object)"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "from IPython.display import HTML\n",
-    "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())"
+    "large_hubs_idx = np.array([515, 2549, 3486, 5020, 5119])  # NOTE(review): hard-coded row positions; confirm they still match the current N_k\n",
+    "ddf['songurls_Album'].iloc[large_hubs_idx].values  # .values instead of get_values(), which newer pandas no longer provides"
    ]
   },
   {