Mercurial > hg > plosone_underreview

--- a/notebooks/correlation_samples_outliers.ipynb	Mon Sep 18 11:25:05 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb	Tue Sep 19 18:41:14 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {
     "collapsed": false
    },
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false
    },
@@ -117,22 +117,22 @@
      "text": [
       "most outliers \n",
       "           Country  Outliers  N_Country  N_Outliers\n",
-      "60            Chad  0.636364         11           7\n",
+      "59            Chad  0.636364         11           7\n",
       "86          Gambia  0.540000         50          27\n",
       "17   French Guiana  0.535714         28          15\n",
-      "43           Benin  0.500000         26          13\n",
+      "42           Benin  0.500000         26          13\n",
       "78     El Salvador  0.484848         33          16\n",
       "136       Botswana  0.477778         90          43\n",
       "6          Bolivia  0.457143         35          16\n",
       "104         Bhutan  0.454545         11           5\n",
       "14         Liberia  0.450000         40          18\n",
-      "63         Senegal  0.439024         41          18\n",
+      "62         Senegal  0.439024         41          18\n",
       "least outliers \n",
       "                              Country  Outliers  N_Country  N_Outliers\n",
-      "1                           Lithuania  0.000000         47         NaN\n",
-      "120                        Kazakhstan  0.000000         88         NaN\n",
-      "119                           Denmark  0.000000         16         NaN\n",
-      "107                          Kiribati  0.000000         17         NaN\n",
+      "1                           Lithuania  0.000000         47           0\n",
+      "107                          Kiribati  0.000000         17           0\n",
+      "119                           Denmark  0.000000         16           0\n",
+      "120                        Kazakhstan  0.000000         88           0\n",
       "109  Democratic Republic of the Congo  0.042553         47           2\n",
       "105                             Sudan  0.044118         68           3\n",
       "15                        Netherlands  0.044776         67           3\n",
@@ -403,6 +403,99 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "most outliers \n",
+      "                         Country  Outliers  N_Country  N_Outliers\n",
+      "64                    Mozambique  0.382353         34          13\n",
+      "117                     Zimbabwe  0.333333         15           5\n",
+      "27                         Kenya  0.288660         97          28\n",
+      "67                        Brazil  0.270000        100          27\n",
+      "76                          Iran  0.264151         53          14\n",
+      "30                        Turkey  0.240000        100          24\n",
+      "65                        Uganda  0.211765         85          18\n",
+      "4                       Ethiopia  0.200000         35           7\n",
+      "126                  South Sudan  0.195652         92          18\n",
+      "91   United Republic of Tanzania  0.193548         62          12\n",
+      "least outliers \n",
+      "             Country  Outliers  N_Country  N_Outliers\n",
+      "0             Canada         0        100           0\n",
+      "94              Iraq         0         87           0\n",
+      "93           Grenada         0         37           0\n",
+      "90  French Polynesia         0         15           0\n",
+      "89           Croatia         0         31           0\n",
+      "88           Morocco         0         40           0\n",
+      "87       Philippines         0        100           0\n",
+      "86            Gambia         0         50           0\n",
+      "85      Sierra Leone         0        100           0\n",
+      "84            Belize         0         43           0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "import scripts.utils as utils\n",
+    "from collections import Counter\n",
+    "#spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)\n",
+    "spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))\n",
+    "df_local = outliers.country_outlier_df(spatial_counts, Y, normalize=True)\n",
+    "outliers.print_most_least_outliers_topN(df_local, N=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8200, 380)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8200,)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "collapsed": true
--- a/notebooks/test_hubness.ipynb	Mon Sep 18 11:25:05 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Tue Sep 19 18:41:14 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {
     "collapsed": true
    },
@@ -27,8 +27,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
@@ -44,7 +46,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -62,6 +66,13 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## distance matrix"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
@@ -75,7 +86,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -89,13 +102,17 @@
     }
    ],
    "source": [
+    "np.savetxt('../data/D_mahal.csv', D)\n",
+    "D = np.loadtxt('../data/D_mahal.csv')\n",
     "D.shape"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -113,6 +130,13 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## n-occurrence and stats"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {
@@ -132,7 +156,9 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -173,11 +199,21 @@
     }
    ],
    "source": [
-    "N_k = n_occurrence_from_D(D, k=100)\n",
+    "# take k mean of country sample\n",
+    "uniq_countries, uniq_counts = np.unique(Y, return_counts=True)\n",
+    "k = np.int(np.round(np.mean(uniq_counts)))\n",
+    "print k\n",
+    "N_k = n_occurrence_from_D(D, k=k)\n",
     "print skew(N_k)\n",
+    "print np.median(N_k)\n",
+    "print np.mean(N_k)\n",
+    "print np.std(N_k)\n",
+    "print len(np.where(N_k>1000))\n",
     "plt.figure()\n",
     "plt.hist(N_k, bins=100);\n",
     "plt.figure()\n",
+    "plt.hist(N_k[N_k<1000], bins=100);\n",
+    "plt.figure()\n",
     "plt.plot(np.sort(N_k))"
    ]
   },
@@ -197,7 +233,9 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -223,7 +261,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -249,7 +289,9 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -275,26 +317,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2822: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  if self.run_code(code, result):\n"
+      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  data = self._reader.read(nrows)\n"
      ]
     }
    ],
    "source": [
-    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')"
+    "#ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata_BLSM_language_all.csv')\n",
+    "ddf = outliers.load_metadata(Yaudio, metadata_file='/Users/mariapanteli/Documents/'+\n",
+    "                             'QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -302,7 +350,7 @@
        "(8200, 108)"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -313,8 +361,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -322,7 +372,7 @@
        "True"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -332,30 +382,6 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025A-1CS0043663XX-0100A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-2000A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0085XX-3100A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025A-C0811X0005XX-1300A0.mp3',\n",
-       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025A-C0485X0031XX-0500A0.mp3'], dtype=object)"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()"
-   ]
-  },
-  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -364,17 +390,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
-   "metadata": {},
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: \n",
+      "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:121: SettingWithCopyWarning: \n",
       "A value is trying to be set on a copy of a slice from a DataFrame\n",
       "\n",
-      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+      "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
       "  self._setitem_with_indexer(indexer, value)\n"
      ]
     }
@@ -389,61 +417,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>songurls_Album</th>\n",
-       "      <th>Country</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>515</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Nigeria</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2549</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Swaziland</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3486</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Kazakhstan</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5020</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Swaziland</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5119</th>\n",
-       "      <td>https://sounds.bl.uk/World-and-traditional-mus...</td>\n",
-       "      <td>Pakistan</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>"
-      ],
       "text/plain": [
-       "<IPython.core.display.HTML object>"
+       "array([ 'https://sounds.bl.uk/World-and-traditional-music/Decca-West-African-recordings/025M-1CS0043663XX-0100V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-2000V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0085XX-3100V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Rycroft/025M-C0811X0005XX-1300V0',\n",
+       "       'https://sounds.bl.uk/World-and-traditional-music/Colin-Huehns-Pakistan/025M-C0485X0031XX-0500V0'], dtype=object)"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "from IPython.display import HTML\n",
-    "HTML(ddf[['songurls_Album', 'Country']].iloc[large_hubs_idx, :].to_html())"
+    "large_hubs_idx = np.array([515, 2549, 3486, 5020, 5119])\n",
+    "ddf['songurls_Album'].iloc[large_hubs_idx].get_values()"
    ]
   },
   {
--- a/scripts/outliers.py	Mon Sep 18 11:25:05 2017 +0100
+++ b/scripts/outliers.py	Tue Sep 19 18:41:14 2017 +0100
@@ -21,9 +21,18 @@
             if not counts.has_key(label):
                 counts.update({label:0})
     if normalize:
-        counts = normalize_outlier_counts(counts, Counter(labels))
-    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+        norm_counts = normalize_outlier_counts(counts, Counter(labels))
+        df = pd.DataFrame.from_dict(norm_counts, orient='index').reset_index()
+    else:
+        df = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
     df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    # append number of recordings and number of outliers per country
+    df_n_country = pd.DataFrame.from_dict(Counter(labels), orient='index').reset_index()
+    df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
+    df_n_outliers = pd.DataFrame.from_dict(Counter(counts), orient='index').reset_index()
+    df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
+    df = pd.merge(df, df_n_country, on='Country', how='left')
+    df = pd.merge(df, df_n_outliers, on='Country', how='left')
     return df


@@ -31,23 +40,17 @@
     '''Normalize a dictionary of outlier counts per country by
         the total number of recordings per country
     '''
+    norm_counts = {}
     for key in outlier_counts.keys():
         # dictionaries should have the same keys
-        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
-    return outlier_counts
+        norm_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return norm_counts


 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
     df = country_outlier_df(global_counts, Y, normalize=True)
-    # append number of recordings and number of outliers per country
-    df_n_country = pd.DataFrame.from_dict(Counter(Y), orient='index').reset_index()
-    df_n_country.rename(columns={'index':'Country', 0:'N_Country'}, inplace=True)
-    df_n_outliers = pd.DataFrame.from_dict(Counter(Y[y_pred]), orient='index').reset_index()
-    df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
-    df = pd.merge(df, df_n_country, on='Country', how='left')
-    df = pd.merge(df, df_n_outliers, on='Country', how='left')
     if out_file is not None:
         df.to_csv(out_file, index=False)
     return df, threshold, MD