Mercurial > hg > plosone_underreview
comparison notebooks/test_hubness.ipynb @ 42:90f8a2ea6f6f branch-tests
notebook results and load_features minor edits
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 15 Sep 2017 16:17:17 +0100 |
parents | 57f53b0d1eaa |
children | dbcd5b2a4efa |
comparison
equal
deleted
inserted
replaced
41:57f53b0d1eaa | 42:90f8a2ea6f6f |
---|---|
25 "import scripts.utils_spatial as utils_spatial" | 25 "import scripts.utils_spatial as utils_spatial" |
26 ] | 26 ] |
27 }, | 27 }, |
28 { | 28 { |
29 "cell_type": "code", | 29 "cell_type": "code", |
30 "execution_count": 2, | 30 "execution_count": 5, |
31 "metadata": {}, | 31 "metadata": {}, |
32 "outputs": [ | 32 "outputs": [], |
33 { | |
34 "name": "stderr", | |
35 "output_type": "stream", | |
36 "text": [ | |
37 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n", | |
38 " warnings.warn(\"There are %d disconnected observations\" % ni)\n", | |
39 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n", | |
40 " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n" | |
41 ] | |
42 }, | |
43 { | |
44 "name": "stdout", | |
45 "output_type": "stream", | |
46 "text": [ | |
47 "Antigua and Barbuda\n", | |
48 "Australia\n", | |
49 "Cuba\n", | |
50 "Fiji\n", | |
51 "French Polynesia\n", | |
52 "Grenada\n", | |
53 "Iceland\n", | |
54 "Jamaica\n", | |
55 "Japan\n", | |
56 "Kiribati\n", | |
57 "Malta\n", | |
58 "New Zealand\n", | |
59 "Philippines\n", | |
60 "Puerto Rico\n", | |
61 "Republic of Serbia\n", | |
62 "Saint Lucia\n", | |
63 "Samoa\n", | |
64 "Solomon Islands\n", | |
65 "South Korea\n", | |
66 "The Bahamas\n", | |
67 "Trinidad and Tobago\n" | |
68 ] | |
69 } | |
70 ], | |
71 "source": [ | 33 "source": [ |
72 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n", | 34 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n", |
73 "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n", | 35 "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n", |
74 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n", | 36 "#w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n", |
75 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n", | 37 "#w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n", |
76 "Xrhy, Xmel, Xmfc, Xchr = X_list\n", | 38 "X = np.concatenate(X_list, axis=1)\n", |
77 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n", | |
78 "\n", | 39 "\n", |
79 "# global outliers\n", | 40 "# global outliers\n", |
80 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)" | 41 "#df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)" |
81 ] | 42 ] |
82 }, | 43 }, |
83 { | 44 { |
84 "cell_type": "code", | 45 "cell_type": "code", |
85 "execution_count": 3, | 46 "execution_count": 3, |
221 ] | 182 ] |
222 }, | 183 }, |
223 { | 184 { |
224 "cell_type": "code", | 185 "cell_type": "code", |
225 "execution_count": 16, | 186 "execution_count": 16, |
226 "metadata": {}, | 187 "metadata": { |
188 "collapsed": true | |
189 }, | |
227 "outputs": [], | 190 "outputs": [], |
228 "source": [ | 191 "source": [ |
229 "sort_idx = np.argsort(D, axis=1)\n", | 192 "sort_idx = np.argsort(D, axis=1)\n", |
230 "k = 100\n", | 193 "k = 100\n", |
231 "D_k = sort_idx[:, 1:(k+1)]" | 194 "D_k = sort_idx[:, 1:(k+1)]" |
302 "source": [ | 265 "source": [ |
303 "np.where(D_k==515)[0].shape" | 266 "np.where(D_k==515)[0].shape" |
304 ] | 267 ] |
305 }, | 268 }, |
306 { | 269 { |
270 "cell_type": "markdown", | |
271 "metadata": {}, | |
272 "source": [ | |
273 "## let's get the audio url to listen to tracks identified as large hubs" | |
274 ] | |
275 }, | |
276 { | |
277 "cell_type": "code", | |
278 "execution_count": 6, | |
279 "metadata": {}, | |
280 "outputs": [ | |
281 { | |
282 "name": "stderr", | |
283 "output_type": "stream", | |
284 "text": [ | |
285 "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
286 " if self.run_code(code, result):\n" | |
287 ] | |
288 } | |
289 ], | |
290 "source": [ | |
291 "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')" | |
292 ] | |
293 }, | |
294 { | |
295 "cell_type": "code", | |
296 "execution_count": 8, | |
297 "metadata": {}, | |
298 "outputs": [ | |
299 { | |
300 "data": { | |
301 "text/plain": [ | |
302 "(8200, 108)" | |
303 ] | |
304 }, | |
305 "execution_count": 8, | |
306 "metadata": {}, | |
307 "output_type": "execute_result" | |
308 } | |
309 ], | |
310 "source": [ | |
311 "ddf.shape" | |
312 ] | |
313 }, | |
314 { | |
315 "cell_type": "code", | |
316 "execution_count": 12, | |
317 "metadata": {}, | |
318 "outputs": [ | |
319 { | |
320 "data": { | |
321 "text/plain": [ | |
322 "True" | |
323 ] | |
324 }, | |
325 "execution_count": 12, | |
326 "metadata": {}, | |
327 "output_type": "execute_result" | |
328 } | |
329 ], | |
330 "source": [ | |
331 "\"songurls_Album\" in ddf.columns" | |
332 ] | |
333 }, | |
334 { | |
335 "cell_type": "code", | |
336 "execution_count": 37, | |
337 "metadata": {}, | |
338 "outputs": [ | |
339 { | |
340 "data": { | |
341 "text/plain": [ | |
342 "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n", | |
343 " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n", | |
344 " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n", | |
345 " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n", | |
346 " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)" | |
347 ] | |
348 }, | |
349 "execution_count": 37, | |
350 "metadata": {}, | |
351 "output_type": "execute_result" | |
352 } | |
353 ], | |
354 "source": [ | |
355 "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()" | |
356 ] | |
357 }, | |
358 { | |
359 "cell_type": "markdown", | |
360 "metadata": {}, | |
361 "source": [ | |
362 "### first, fix the url for BL tracks (because it was changed recently and the metadata.csv file is not updated) " | |
363 ] | |
364 }, | |
365 { | |
366 "cell_type": "code", | |
367 "execution_count": 41, | |
368 "metadata": {}, | |
369 "outputs": [ | |
370 { | |
371 "name": "stderr", | |
372 "output_type": "stream", | |
373 "text": [ | |
374 "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n", | |
375 "A value is trying to be set on a copy of a slice from a DataFrame\n", | |
376 "\n", | |
377 "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
378 " self._setitem_with_indexer(indexer, value)\n" | |
379 ] | |
380 } | |
381 ], | |
382 "source": [ | |
383 "bl_inds = np.where(np.isnan(ddf['BuyLinkTrackDownload']))[0]\n", | |
384 "for bl_ind in bl_inds:\n", | |
385 " ddf['songurls_Album'].iloc[bl_ind] = ('https://sounds.bl.uk/World-and-traditional-music/' + \n", | |
386 " ddf['Folder'].iloc[bl_ind] + '/' + \n", | |
387 " ddf['MetaFile'].iloc[bl_ind].split('.')[0])" | |
388 ] | |
389 }, | |
390 { | |
391 "cell_type": "code", | |
392 "execution_count": 32, | |
393 "metadata": {}, | |
394 "outputs": [ | |
395 { | |
396 "data": { | |
397 "text/html": [ | |
398 "<table border=\"1\" class=\"dataframe\">\n", | |
399 " <thead>\n", | |
400 " <tr style=\"text-align: right;\">\n", | |
401 " <th></th>\n", | |
402 " <th>songurls_Album</th>\n", | |
403 " <th>Country</th>\n", | |
404 " </tr>\n", | |
405 " </thead>\n", | |
406 " <tbody>\n", | |
407 " <tr>\n", | |
408 " <th>515</th>\n", | |
409 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", | |
410 " <td>Nigeria</td>\n", | |
411 " </tr>\n", | |
412 " <tr>\n", | |
413 " <th>2549</th>\n", | |
414 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", | |
415 " <td>Swaziland</td>\n", | |
416 " </tr>\n", | |
417 " <tr>\n", | |
418 " <th>3486</th>\n", | |
419 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", | |
420 " <td>Kazakhstan</td>\n", | |
421 " </tr>\n", | |
422 " <tr>\n", | |
423 " <th>5020</th>\n", | |
424 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", | |
425 " <td>Swaziland</td>\n", | |
426 " </tr>\n", | |
427 " <tr>\n", | |
428 " <th>5119</th>\n", | |
429 " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", | |
430 " <td>Pakistan</td>\n", | |
431 " </tr>\n", | |
432 " </tbody>\n", | |
433 "</table>" | |
434 ], | |
435 "text/plain": [ | |
436 "<IPython.core.display.HTML object>" | |
437 ] | |
438 }, | |
439 "execution_count": 32, | |
440 "metadata": {}, | |
441 "output_type": "execute_result" | |
442 } | |
443 ], | |
444 "source": [ | |
445 "from IPython.display import HTML\n", | |
446 "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())" | |
447 ] | |
448 }, | |
449 { | |
307 "cell_type": "code", | 450 "cell_type": "code", |
308 "execution_count": null, | 451 "execution_count": null, |
309 "metadata": { | 452 "metadata": { |
310 "collapsed": true | 453 "collapsed": true |
311 }, | 454 }, |