diff notebooks/test_hubness.ipynb @ 42:90f8a2ea6f6f branch-tests

notebook results and load_features minor edits
author mpanteli <m.x.panteli@gmail.com>
date Fri, 15 Sep 2017 16:17:17 +0100
parents 57f53b0d1eaa
children dbcd5b2a4efa
line wrap: on
line diff
--- a/notebooks/test_hubness.ipynb	Fri Sep 15 12:27:11 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Fri Sep 15 16:17:17 2017 +0100
@@ -27,57 +27,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
-      "  warnings.warn(\"There are %d disconnected observations\" % ni)\n",
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
-      "  warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Antigua and Barbuda\n",
-      "Australia\n",
-      "Cuba\n",
-      "Fiji\n",
-      "French Polynesia\n",
-      "Grenada\n",
-      "Iceland\n",
-      "Jamaica\n",
-      "Japan\n",
-      "Kiribati\n",
-      "Malta\n",
-      "New Zealand\n",
-      "Philippines\n",
-      "Puerto Rico\n",
-      "Republic of Serbia\n",
-      "Saint Lucia\n",
-      "Samoa\n",
-      "Solomon Islands\n",
-      "South Korea\n",
-      "The Bahamas\n",
-      "Trinidad and Tobago\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
-    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
-    "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
-    "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
-    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
-    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
+    "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
+    "#w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
+    "#w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
+    "X = np.concatenate(X_list, axis=1)\n",
     "\n",
     "# global outliers\n",
-    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
+    "#df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
    ]
   },
   {
@@ -223,7 +184,9 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "sort_idx = np.argsort(D, axis=1)\n",
@@ -304,6 +267,186 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get the audio URLs to listen to the tracks identified as large hubs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  if self.run_code(code, result):\n"
+     ]
+    }
+   ],
+   "source": [
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8200, 108)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ddf.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"songurls_Album\" in ddf.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ddf['songurls_Album'].iloc[large_hubs_idx].values  # .values replaces the deprecated Series.get_values()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### First, fix the URLs for BL tracks (the URL scheme changed recently and the metadata file has not been updated yet)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+      "  self._setitem_with_indexer(indexer, value)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# positional row indices of the BL tracks, i.e. rows whose 'BuyLinkTrackDownload' value is missing\n",
+    "bl_inds = np.where(ddf['BuyLinkTrackDownload'].isnull())[0]\n",
+    "bl_urls = ('https://sounds.bl.uk/World-and-traditional-music/' + ddf['Folder'].iloc[bl_inds] + '/' +\n",
+    "           ddf['MetaFile'].iloc[bl_inds].str.split('.').str[0])  # MetaFile name up to its first '.'\n",
+    "ddf.iloc[bl_inds, ddf.columns.get_loc('songurls_Album')] = bl_urls.values  # one indexer: writes into ddf, not a copy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>songurls_Album</th>\n",
+       "      <th>Country</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>515</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Nigeria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2549</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Swaziland</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3486</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Kazakhstan</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5020</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Swaziland</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5119</th>\n",
+       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
+       "      <td>Pakistan</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from IPython.display import HTML  # used to render the selected rows as an HTML table\n",
+    "HTML(ddf.iloc[large_hubs_idx][['songurls_Album', 'Country']].to_html())"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {