comparison notebooks/test_hubness.ipynb @ 42:90f8a2ea6f6f branch-tests

notebook results and load_features minor edits
author mpanteli <m.x.panteli@gmail.com>
date Fri, 15 Sep 2017 16:17:17 +0100
parents 57f53b0d1eaa
children dbcd5b2a4efa
comparison
equal deleted inserted replaced
41:57f53b0d1eaa 42:90f8a2ea6f6f
25 "import scripts.utils_spatial as utils_spatial" 25 "import scripts.utils_spatial as utils_spatial"
26 ] 26 ]
27 }, 27 },
28 { 28 {
29 "cell_type": "code", 29 "cell_type": "code",
30 "execution_count": 2, 30 "execution_count": 5,
31 "metadata": {}, 31 "metadata": {},
32 "outputs": [ 32 "outputs": [],
33 {
34 "name": "stderr",
35 "output_type": "stream",
36 "text": [
37 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
38 " warnings.warn(\"There are %d disconnected observations\" % ni)\n",
39 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
40 " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
41 ]
42 },
43 {
44 "name": "stdout",
45 "output_type": "stream",
46 "text": [
47 "Antigua and Barbuda\n",
48 "Australia\n",
49 "Cuba\n",
50 "Fiji\n",
51 "French Polynesia\n",
52 "Grenada\n",
53 "Iceland\n",
54 "Jamaica\n",
55 "Japan\n",
56 "Kiribati\n",
57 "Malta\n",
58 "New Zealand\n",
59 "Philippines\n",
60 "Puerto Rico\n",
61 "Republic of Serbia\n",
62 "Saint Lucia\n",
63 "Samoa\n",
64 "Solomon Islands\n",
65 "South Korea\n",
66 "The Bahamas\n",
67 "Trinidad and Tobago\n"
68 ]
69 }
70 ],
71 "source": [ 33 "source": [
72 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n", 34 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
73 "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n", 35 "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
74 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n", 36 "#w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
75 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n", 37 "#w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
76 "Xrhy, Xmel, Xmfc, Xchr = X_list\n", 38 "X = np.concatenate(X_list, axis=1)\n",
77 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
78 "\n", 39 "\n",
79 "# global outliers\n", 40 "# global outliers\n",
80 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)" 41 "#df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
81 ] 42 ]
82 }, 43 },
83 { 44 {
84 "cell_type": "code", 45 "cell_type": "code",
85 "execution_count": 3, 46 "execution_count": 3,
221 ] 182 ]
222 }, 183 },
223 { 184 {
224 "cell_type": "code", 185 "cell_type": "code",
225 "execution_count": 16, 186 "execution_count": 16,
226 "metadata": {}, 187 "metadata": {
188 "collapsed": true
189 },
227 "outputs": [], 190 "outputs": [],
228 "source": [ 191 "source": [
229 "sort_idx = np.argsort(D, axis=1)\n", 192 "sort_idx = np.argsort(D, axis=1)\n",
230 "k = 100\n", 193 "k = 100\n",
231 "D_k = sort_idx[:, 1:(k+1)]" 194 "D_k = sort_idx[:, 1:(k+1)]"
302 "source": [ 265 "source": [
303 "np.where(D_k==515)[0].shape" 266 "np.where(D_k==515)[0].shape"
304 ] 267 ]
305 }, 268 },
306 { 269 {
270 "cell_type": "markdown",
271 "metadata": {},
272 "source": [
273 "## Get the audio URLs to listen to the tracks identified as large hubs"
274 ]
275 },
276 {
277 "cell_type": "code",
278 "execution_count": 6,
279 "metadata": {},
280 "outputs": [
281 {
282 "name": "stderr",
283 "output_type": "stream",
284 "text": [
285 "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
286 " if self.run_code(code, result):\n"
287 ]
288 }
289 ],
290 "source": [
291 "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')"
292 ]
293 },
294 {
295 "cell_type": "code",
296 "execution_count": 8,
297 "metadata": {},
298 "outputs": [
299 {
300 "data": {
301 "text/plain": [
302 "(8200, 108)"
303 ]
304 },
305 "execution_count": 8,
306 "metadata": {},
307 "output_type": "execute_result"
308 }
309 ],
310 "source": [
311 "ddf.shape"
312 ]
313 },
314 {
315 "cell_type": "code",
316 "execution_count": 12,
317 "metadata": {},
318 "outputs": [
319 {
320 "data": {
321 "text/plain": [
322 "True"
323 ]
324 },
325 "execution_count": 12,
326 "metadata": {},
327 "output_type": "execute_result"
328 }
329 ],
330 "source": [
331 "\"songurls_Album\" in ddf.columns"
332 ]
333 },
334 {
335 "cell_type": "code",
336 "execution_count": 37,
337 "metadata": {},
338 "outputs": [
339 {
340 "data": {
341 "text/plain": [
342 "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n",
343 " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n",
344 " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n",
345 " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n",
346 " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)"
347 ]
348 },
349 "execution_count": 37,
350 "metadata": {},
351 "output_type": "execute_result"
352 }
353 ],
354 "source": [
355 "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()"
356 ]
357 },
358 {
359 "cell_type": "markdown",
360 "metadata": {},
361 "source": [
362 "### First, fix the URLs for BL tracks (the URL scheme was changed recently and the metadata.csv file has not been updated)"
363 ]
364 },
365 {
366 "cell_type": "code",
367 "execution_count": 41,
368 "metadata": {},
369 "outputs": [
370 {
371 "name": "stderr",
372 "output_type": "stream",
373 "text": [
374 "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n",
375 "A value is trying to be set on a copy of a slice from a DataFrame\n",
376 "\n",
377 "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
378 " self._setitem_with_indexer(indexer, value)\n"
379 ]
380 }
381 ],
382 "source": [
383 "bl_inds = np.where(np.isnan(ddf['BuyLinkTrackDownload']))[0]\n",
384 "for bl_ind in bl_inds:\n",
385 " ddf['songurls_Album'].iloc[bl_ind] = ('https://sounds.bl.uk/World-and-traditional-music/' + \n",
386 " ddf['Folder'].iloc[bl_ind] + '/' + \n",
387 " ddf['MetaFile'].iloc[bl_ind].split('.')[0])"
388 ]
389 },
390 {
391 "cell_type": "code",
392 "execution_count": 32,
393 "metadata": {},
394 "outputs": [
395 {
396 "data": {
397 "text/html": [
398 "<table border=\"1\" class=\"dataframe\">\n",
399 " <thead>\n",
400 " <tr style=\"text-align: right;\">\n",
401 " <th></th>\n",
402 " <th>songurls_Album</th>\n",
403 " <th>Country</th>\n",
404 " </tr>\n",
405 " </thead>\n",
406 " <tbody>\n",
407 " <tr>\n",
408 " <th>515</th>\n",
409 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
410 " <td>Nigeria</td>\n",
411 " </tr>\n",
412 " <tr>\n",
413 " <th>2549</th>\n",
414 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
415 " <td>Swaziland</td>\n",
416 " </tr>\n",
417 " <tr>\n",
418 " <th>3486</th>\n",
419 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
420 " <td>Kazakhstan</td>\n",
421 " </tr>\n",
422 " <tr>\n",
423 " <th>5020</th>\n",
424 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
425 " <td>Swaziland</td>\n",
426 " </tr>\n",
427 " <tr>\n",
428 " <th>5119</th>\n",
429 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
430 " <td>Pakistan</td>\n",
431 " </tr>\n",
432 " </tbody>\n",
433 "</table>"
434 ],
435 "text/plain": [
436 "<IPython.core.display.HTML object>"
437 ]
438 },
439 "execution_count": 32,
440 "metadata": {},
441 "output_type": "execute_result"
442 }
443 ],
444 "source": [
445 "from IPython.display import HTML\n",
446 "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())"
447 ]
448 },
449 {
307 "cell_type": "code", 450 "cell_type": "code",
308 "execution_count": null, 451 "execution_count": null,
309 "metadata": { 452 "metadata": {
310 "collapsed": true 453 "collapsed": true
311 }, 454 },