repository:  plosone_underreview
changeset:   54:dbcd5b2a4efa
branch:      branch-tests
author:      Maria Panteli <m.x.panteli@gmail.com>
date:        Tue, 19 Sep 2017 18:41:14 +0100
parents:     7532363b9dda
children:    98cd5317e504 b2c38538f127
summary:     additions in notebooks
files:       notebooks/correlation_samples_outliers.ipynb notebooks/test_hubness.ipynb scripts/outliers.py
diffstat:    3 files changed, 206 insertions(+), 114 deletions(-)
--- a/notebooks/correlation_samples_outliers.ipynb  Mon Sep 18 11:25:05 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb  Tue Sep 19 18:41:14 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
  "metadata": {
  "collapsed": false
  },
@@ -35,7 +35,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 2,
  "metadata": {
  "collapsed": false
  },
@@ -117,22 +117,22 @@
  "text": [
  "most outliers \n",
  " Country Outliers N_Country N_Outliers\n",
- "60 Chad 0.636364 11 7\n",
+ "59 Chad 0.636364 11 7\n",
  "86 Gambia 0.540000 50 27\n",
  "17 French Guiana 0.535714 28 15\n",
- "43 Benin 0.500000 26 13\n",
+ "42 Benin 0.500000 26 13\n",
  "78 El Salvador 0.484848 33 16\n",
  "136 Botswana 0.477778 90 43\n",
  "6 Bolivia 0.457143 35 16\n",
  "104 Bhutan 0.454545 11 5\n",
  "14 Liberia 0.450000 40 18\n",
- "63 Senegal 0.439024 41 18\n",
+ "62 Senegal 0.439024 41 18\n",
  "least outliers \n",
  " Country Outliers N_Country N_Outliers\n",
- "1 Lithuania 0.000000 47 NaN\n",
- "120 Kazakhstan 0.000000 88 NaN\n",
- "119 Denmark 0.000000 16 NaN\n",
- "107 Kiribati 0.000000 17 NaN\n",
+ "1 Lithuania 0.000000 47 0\n",
+ "107 Kiribati 0.000000 17 0\n",
+ "119 Denmark 0.000000 16 0\n",
+ "120 Kazakhstan 0.000000 88 0\n",
  "109 Democratic Republic of the Congo 0.042553 47 2\n",
  "105 Sudan 0.044118 68 3\n",
  "15 Netherlands 0.044776 67 3\n",
@@ -403,6 +403,99 @@
  },
  {
  "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "most outliers \n",
+ " Country Outliers N_Country N_Outliers\n",
+ "64 Mozambique 0.382353 34 13\n",
+ "117 Zimbabwe 0.333333 15 5\n",
+ "27 Kenya 0.288660 97 28\n",
+ "67 Brazil 0.270000 100 27\n",
+ "76 Iran 0.264151 53 14\n",
+ "30 Turkey 0.240000 100 24\n",
+ "65 Uganda 0.211765 85 18\n",
+ "4 Ethiopia 0.200000 35 7\n",
+ "126 South Sudan 0.195652 92 18\n",
+ "91 United Republic of Tanzania 0.193548 62 12\n",
+ "least outliers \n",
+ " Country Outliers N_Country N_Outliers\n",
+ "0 Canada 0 100 0\n",
+ "94 Iraq 0 87 0\n",
+ "93 Grenada 0 37 0\n",
+ "90 French Polynesia 0 15 0\n",
+ "89 Croatia 0 31 0\n",
+ "88 Morocco 0 40 0\n",
+ "87 Philippines 0 100 0\n",
+ "86 Gambia 0 50 0\n",
+ "85 Sierra Leone 0 100 0\n",
+ "84 Belize 0 43 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "sys.path.append('../')\n",
+ "import scripts.utils as utils\n",
+ "from collections import Counter\n",
+ "#spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)\n",
+ "spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))\n",
+ "df_local = outliers.country_outlier_df(spatial_counts, Y, normalize=True)\n",
+ "outliers.print_most_least_outliers_topN(df_local, N=10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(8200, 380)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(8200,)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Y.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
  "execution_count": null,
  "metadata": {
  "collapsed": true
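The new cells above aggregate spatial (local) outlier decisions per country: a Counter keyed by country is passed to outliers.country_outlier_df(spatial_counts, Y, normalize=True) and the ten most and least outlier-heavy countries are printed with outliers.print_most_least_outliers_topN; with the refactored script (see scripts/outliers.py below), countries without any outliers now report N_Outliers as 0 rather than NaN. The following is a minimal, self-contained sketch of that aggregation; the country labels and the 10% outlier rate are invented for illustration, and the two print calls stand in for print_most_least_outliers_topN:

import numpy as np
import pandas as pd
from collections import Counter

rng = np.random.RandomState(0)

# Toy stand-ins for the notebook's Y (one country label per recording)
# and its per-recording outlier decisions; both are invented here.
labels = rng.choice(['Chad', 'Gambia', 'Denmark', 'Canada'], size=400)
is_outlier = rng.rand(400) < 0.1

outlier_counts = Counter(labels[is_outlier])   # outliers per country
country_counts = Counter(labels)               # recordings per country

countries = sorted(country_counts)
df = pd.DataFrame({
    'Country': countries,
    'N_Country': [country_counts[c] for c in countries],
    'N_Outliers': [outlier_counts.get(c, 0) for c in countries],
})
# normalized outlier rate, as produced by country_outlier_df(..., normalize=True)
df['Outliers'] = df['N_Outliers'] / df['N_Country']

# stands in for outliers.print_most_least_outliers_topN(df, N=10)
print('most outliers\n', df.sort_values('Outliers', ascending=False).head(10))
print('least outliers\n', df.sort_values('Outliers').head(10))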
--- a/notebooks/test_hubness.ipynb Mon Sep 18 11:25:05 2017 +0100 +++ b/notebooks/test_hubness.ipynb Tue Sep 19 18:41:14 2017 +0100 @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -27,8 +27,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, + "execution_count": 3, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n", @@ -44,7 +46,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -62,6 +66,13 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## distance matrix" + ] + }, + { "cell_type": "code", "execution_count": 4, "metadata": { @@ -75,7 +86,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -89,13 +102,17 @@ } ], "source": [ + "np.savetxt('../data/D_mahal.csv', D)\n", + "D = np.loadtxt('../data/D_mahal.csv')\n", "D.shape" ] }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -113,6 +130,13 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## n-occurrence and stats" + ] + }, + { "cell_type": "code", "execution_count": 7, "metadata": { @@ -132,7 +156,9 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -173,11 +199,21 @@ } ], "source": [ - "N_k = n_occurrence_from_D(D, k=100)\n", + "# take k mean of country sample\n", + "uniq_countries, uniq_counts = np.unique(Y, return_counts=True)\n", + "k = np.int(np.round(np.mean(uniq_counts)))\n", + "print k\n", + "N_k = n_occurrence_from_D(D, k=k)\n", "print skew(N_k)\n", + "print np.median(N_k)\n", + "print np.mean(N_k)\n", + "print np.std(N_k)\n", + "print len(np.where(N_k>1000))\n", "plt.figure()\n", "plt.hist(N_k, bins=100);\n", "plt.figure()\n", + "plt.hist(N_k[N_k<1000], bins=100);\n", + "plt.figure()\n", "plt.plot(np.sort(N_k))" ] }, @@ -197,7 +233,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -223,7 +261,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -249,7 +289,9 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -275,26 +317,32 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 5, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", - " if self.run_code(code, result):\n" + "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " data = self._reader.read(nrows)\n" ] } ], "source": [ - "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')" + "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')\n", + "ddf = outliers.load_metadata(Yaudio, metadata_file='/Users/mariapanteli/Documents/'+\n", + " 'QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv')" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 6, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -302,7 +350,7 @@ "(8200, 108)" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -313,8 +361,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 7, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -322,7 +372,7 @@ "True" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -332,30 +382,6 @@ ] }, { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n", - " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n", - " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n", - " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n", - " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()" - ] - }, - { "cell_type": "markdown", "metadata": {}, "source": [ @@ -364,17 +390,19 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, + "execution_count": 8, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n", + "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:121: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " self._setitem_with_indexer(indexer, value)\n" ] } @@ -389,61 +417,29 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, + "execution_count": 10, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { - "text/html": [ - "<table border=\"1\" class=\"dataframe\">\n", - " 
<thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>songurls_Album</th>\n", - " <th>Country</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>515</th>\n", - " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", - " <td>Nigeria</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2549</th>\n", - " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", - " <td>Swaziland</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3486</th>\n", - " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", - " <td>Kazakhstan</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5020</th>\n", - " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", - " <td>Swaziland</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5119</th>\n", - " <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n", - " <td>Pakistan</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>" - ], "text/plain": [ - "<IPython.core.display.HTML object>" + "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025M-1CS0043663XX-0100V0',\n", + " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-2000V0',\n", + " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0085XX-3100V0',\n", + " 'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-1300V0',\n", + " 'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0031XX-0500V0'], dtype=object)" ] }, - "execution_count": 32, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from IPython.display import HTML\n", - "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())" + "large_hubs_idx = np.array([515, 2549, 3486, 5020, 5119])\n", + "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()" ] }, {
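In test_hubness.ipynb the distance matrix is now written to ../data/D_mahal.csv and reloaded, k is set to the mean number of recordings per country, and the k-occurrence counts N_k returned by n_occurrence_from_D are summarised (skewness, median, mean, standard deviation, and histograms with and without the extreme hubs). The body of n_occurrence_from_D is not part of this changeset, so the sketch below is only a plausible reading of that function, run on an invented Euclidean distance matrix instead of D_mahal.csv:

import numpy as np
from scipy.stats import skew

def n_occurrence_from_D(D, k=100):
    """Sketch of a k-occurrence count: how many times each point appears
    among the k nearest neighbours of the other points (the notebook's own
    implementation is not shown in this changeset)."""
    n = D.shape[0]
    D = np.array(D, dtype=float, copy=True)
    np.fill_diagonal(D, np.inf)            # a point is not its own neighbour
    knn = np.argsort(D, axis=1)[:, :k]     # k nearest neighbours of each row
    return np.bincount(knn.ravel(), minlength=n)

# Invented data standing in for the notebook's Mahalanobis distance matrix D
rng = np.random.RandomState(0)
X = rng.rand(500, 8)
D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)

# As in the updated cell: k is the mean number of recordings per country
# (the per-country counts are invented here).
uniq_counts = rng.randint(10, 101, size=20)
k = int(round(np.mean(uniq_counts)))

N_k = n_occurrence_from_D(D, k=k)
print(skew(N_k), np.median(N_k), np.mean(N_k), np.std(N_k))
# count of strong hubs, thresholded ad hoc at 2*k here; note that
# len(np.where(cond)) without [0] is the number of axes (always 1), not a count
print(np.sum(N_k > 2 * k))

A strongly right-skewed N_k (a few recordings turning up in almost everyone's neighbour lists) is the usual symptom of hubness, which is what the skewness value and the truncated histogram are checking.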
--- a/scripts/outliers.py  Mon Sep 18 11:25:05 2017 +0100
+++ b/scripts/outliers.py  Tue Sep 19 18:41:14 2017 +0100
@@ -21,9 +21,18 @@
         if not counts.has_key(label):
             counts.update({label:0})
     if normalize:
-        counts = normalize_outlier_counts(counts, Counter(labels))
-        df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+        norm_counts = normalize_outlier_counts(counts, Counter(labels))
+        df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
+    else:
+        df = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
     df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    # append number of recordings and number of outliers per country
+    df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
+    df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
+    df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
+    df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
+    df = pd.merge(df, df_n_country, on='Country', how='left')
+    df = pd.merge(df, df_n_outliers, on='Country', how='left')
     return df
@@ -31,23 +40,17 @@
     '''Normalize a dictionary of outlier counts per country
     by the total number of recordings per country
     '''
+    norm_counts = {}
     for key in outlier_counts.keys():
         # dictionaries should have the same keys
-        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
-    return outlier_counts
+        norm_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return norm_counts

 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
     df = country_outlier_df(global_counts, Y, normalize=True)
-    # append number of recordings and number of outliers per country
-    df_n_country = pd.DataFrame.from_dict(Counter(Y), orient='index').reset_index()
-    df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
-    df_n_outliers = pd.DataFrame.from_dict(Counter(Y[y_pred]), orient='index').reset_index()
-    df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
-    df = pd.merge(df, df_n_country, on='Country', how='left')
-    df = pd.merge(df, df_n_outliers, on='Country', how='left')
     if out_file is not None:
         df.to_csv(out_file, index=False)
     return df, threshold, MD
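The effect of this refactor is that country_outlier_df now builds the normalized rates in a fresh dict (so the caller's Counter is no longer modified in place) and merges the absolute N_Country and N_Outliers columns itself, which is why get_outliers_df drops its own merging block. A small usage sketch with made-up counts (Chad's 7 outliers out of 11 recordings matches the 0.636364 printed in the notebook above; the rest is invented):

from collections import Counter

import pandas as pd

def normalize_outlier_counts(outlier_counts, country_counts):
    """Return a new dict of outlier fractions, as the patched helper does,
    leaving the input counts untouched."""
    return {key: float(outlier_counts[key]) / float(country_counts[key])
            for key in outlier_counts}

# Made-up inputs standing in for Y (country per recording) and the raw
# per-country outlier counts.
labels = ['Chad'] * 11 + ['Gambia'] * 50 + ['Denmark'] * 16
counts = Counter({'Chad': 7, 'Gambia': 27, 'Denmark': 0})

norm_counts = normalize_outlier_counts(counts, Counter(labels))
df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
df.rename(columns={'index': 'Country', 0: 'Outliers'}, inplace=True)

# The patched country_outlier_df now also merges in the absolute counts.
df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
df_n_country.rename(columns={'index': 'Country', 0: 'N_Country'}, inplace=True)
df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
df_n_outliers.rename(columns={'index': 'Country', 0: 'N_Outliers'}, inplace=True)
df = pd.merge(df, df_n_country, on='Country', how='left')
df = pd.merge(df, df_n_outliers, on='Country', how='left')

print(df)  # Chad 0.636..., Gambia 0.54, Denmark 0.0, with N_Country and N_Outliers filled in

Because the normalization no longer overwrites the input dictionary, calling country_outlier_df twice on the same Counter gives the same result, and the zero-outlier countries come through with an explicit N_Outliers of 0 instead of NaN.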