view notebooks/test_hubness.ipynb @ 18:ed109218dd4b branch-tests

rename result scripts and more tests
author Maria Panteli
date Tue, 12 Sep 2017 23:18:19 +0100
parents ff18f364bbac
children 29b5ee381305 bd284065aeb6
line wrap: on
line source
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "from scipy.stats import pearsonr\n",
    "from scipy.stats import skew\n",
    "import sys\n",
    "from sklearn.metrics.pairwise import pairwise_distances\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "sys.path.append('../')\n",
    "import scripts.outliers as outliers\n",
    "import scripts.utils_spatial as utils_spatial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
      "  warnings.warn(\"There are %d disconnected observations\" % ni)\n",
      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
      "  warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Antigua and Barbuda\n",
      "Australia\n",
      "Cuba\n",
      "Fiji\n",
      "French Polynesia\n",
      "Grenada\n",
      "Iceland\n",
      "Jamaica\n",
      "Japan\n",
      "Kiribati\n",
      "Malta\n",
      "New Zealand\n",
      "Philippines\n",
      "Puerto Rico\n",
      "Republic of Serbia\n",
      "Saint Lucia\n",
      "Samoa\n",
      "Solomon Islands\n",
      "South Korea\n",
      "The Bahamas\n",
      "Trinidad and Tobago\n"
     ]
    }
   ],
   "source": [
    "# Load the pickled feature data. NOTE: pickle can execute arbitrary code,\n",
    "# so only load files produced by this project.\n",
    "with open('../data/lda_data_melodia_8.pickle', 'rb') as data_file:\n",
    "    X_list, Y, Yaudio = pickle.load(data_file)\n",
    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
    "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
    "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
    "\n",
    "# stack the four feature blocks column-wise into one matrix\n",
    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
    "\n",
    "# global outliers\n",
    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 380)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "D = pairwise_distances(X, metric='mahalanobis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 8200)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "D.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEACAYAAAB78OvLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGWtJREFUeJzt3W2sXdV95/HvDwyUPIwd05F5sCFWa6Q4StrADM50WvUy\nJGCiCniRgKsJeBKrL0pnYPqiE8xIYKtRO54KUaoRRGrzYFCDQdASoqBgB7iaSiNwEiClIYzNqJ7B\nl9pEBpO0o05t8Z8XZ128ubm2r303vvccfz/SkddZe6919jpczu/svfbeJ1WFJEl9OGWuN0CSNDoM\nFUlSbwwVSVJvDBVJUm8MFUlSbwwVSVJvZhQqSRYleSjJj5K8mGRVksVJtiXZkWRrkkWd9dcn2Znk\npSSXd+ovTvJCW3ZXp/6MJA+0+qeTXNBZtra9xo4kN3Tqlyd5prXZkuS02b8dkqTZmOmeyl3AY1X1\nIeCjwEvALcC2qroQeKI9J8lK4DpgJbAauDtJWj/3AOuqagWwIsnqVr8O2Nfq7wQ2tb4WA7cBl7TH\n7UkWtjabgDtamzdaH5KkOXTUUGkf4r9WVV8BqKqDVfUmcBWwua22Gbimla8G7q+qA1W1C3gZWJXk\nHOD9VbW9rXdvp023r4eBy1r5CmBrVe2vqv3ANuDKFlKXAg9N8/qSpDkykz2V5cCPk3w1ybNJ/jTJ\ne4ElVbW3rbMXWNLK5wK7O+13A+dNUz/R6mn/vgKD0ALeTHLWEfpaDOyvqrem6UuSNEdmEioLgIuA\nu6vqIuAfaIe6JtXgXi8n6n4v3ldGkuapBTNYZzewu6q+254/BKwH9iQ5u6r2tENbr7XlE8CyTvul\nrY+JVp5aP9nmfODVJAuAhVW1L8kEMNZpswx4EngdWJTklLa3srT18Q5JDCBJOkZVlaOvNb2j7qlU\n1R7glSQXtqpPAD8EvgmsbXVrgUda+VFgTZLTkywHVgDbWz8/aWeOBbge+EanzWRfn2Yw8Q+wFbi8\nnX32AeCTwONtz+gp4DPTvP7U7R/Jx+233z7n2+D4HJ/jG73HbM1kTwXgPwB/nuR04H8BnwNOBR5M\nsg7YBVzbPsRfTPIg8CJwELixDm3pjcDXgDMZnE327Vb/ZeC+JDuBfcCa1tfrSX4fmNxL2liDCXuA\nLwBbknwReLb1IUmaQzMKlar6AfAvp1n0icOs/wfAH0xT/33gI9PU/z9aKE2z7KvAV6ep/1tg1RE3\nXBw6m3ugj28iknQ4XlE/pMbGxo5h7RN5HkU/jm18w8fxDbdRH99sZJS/uSapUR7fTAz2VCbfg7in\nIumIklDv5kS9JEkzNdOJeo2I7hyLey2S+uaeykln+OZXJA0PQ0WS1BtDRZLUG0NFktQbQ0WS1BvP\n/hpBU6+il6QTxT2VkeVZXpJOPENFktQbQ0WS1BtDRZLUG0NFktQbQ0WS1BtDRZLUG0NFktQbQ0WS\n1BuvqD+J+dsqkvrmnspJzavuJfXLUJEk9cZQkST1xlCRJPXGUJEk9cZQkST1xlCRJPXGUJEk9WZG\noZJkV5K/TvJcku2tbnGSbUl2JNmaZFFn/fVJdiZ5KcnlnfqLk7zQlt3VqT8jyQOt/ukkF3SWrW2v\nsSPJDZ365UmeaW22JDlttm+GJGl2ZrqnUsBYVX2sqi5pdbcA26rqQuCJ9pwkK4HrgJXAauDuHLp0\n+x5gXVWtAFYkWd3q1wH7Wv2dwKbW12LgNuCS9rg9ycLWZhNwR2vzRutDkjSHjuXwV6Y8vwrY3Mqb\ngWta+Wrg/qo6UFW7gJeBVUnOAd5fVdvbevd22nT7ehi4rJWvALZW1f6q2g9sA65sIXUp8NA0ry9J\nmiPHsqfynSTfS/JbrW5JVe1t5b3AklY+F9jdabsbOG+a+olWT/v3FYCqOgi8meSsI/S1GNhfVW9N\n05ckaY7M9IaS/7qq/i7JPwe2JXmpu7
CqKsmJuomUN6uSpHlqRqFSVX/X/v1xkr9kML+xN8nZVbWn\nHdp6ra0+ASzrNF/KYA9jopWn1k+2OR94NckCYGFV7UsyAYx12iwDngReBxYlOaXtrSxtffyMDRs2\nvF0eGxtjbGxsutWGXveOw5I0U+Pj44yPj/fWX452y/Mk7wFOraqfJnkvsBXYCHyCweT6piS3AIuq\n6pY2Uf91BsFzHvAd4Bfb3swzwE3AduBbwJ9U1beT3Ah8pKp+O8ka4JqqWtMm6r8HXMRgTuf7wEVV\ntT/Jg8DDVfVAki8Bz1fVl6Zse50st3QfhMrkWI+9fLK8T5KOLAlVddzfUmcSKsuBv2xPFwB/XlV/\n2D7wH2Swh7ELuLZNppPkVuDzwEHg5qp6vNVfDHwNOBN4rKpuavVnAPcBHwP2AWvaJD9JPgfc2l7/\ni1W1ubNdWxjMrzwLfLaqDkzZdkPFUJF0DN71UBlmhsrMyyfL+yTpyGYbKl5RL0nqjaEiSeqNoSJJ\n6s1Mr1PRiOuekuz8iqTj5Z6KmsLrSiXNlqEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEi\nSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknq\njaEiSeqNoSJJ6o2hIknqjaEiSerNgrneAM0/Sd4uV9UcbomkYeOeiqZR7SFJx2ZGoZLk1CTPJflm\ne744ybYkO5JsTbKos+76JDuTvJTk8k79xUleaMvu6tSfkeSBVv90kgs6y9a219iR5IZO/fIkz7Q2\nW5KcNts3QpI0ezPdU7kZeJFDX19vAbZV1YXAE+05SVYC1wErgdXA3Tl0LOUeYF1VrQBWJFnd6tcB\n+1r9ncCm1tdi4Dbgkva4PcnC1mYTcEdr80brQ5I0x44aKkmWAp8C/gyYDIirgM2tvBm4ppWvBu6v\nqgNVtQt4GViV5Bzg/VW1va13b6dNt6+Hgcta+Qpga1Xtr6r9wDbgyhZSlwIPTfP6kqQ5NJM9lTuB\n3wPe6tQtqaq9rbwXWNLK5wK7O+vtBs6bpn6i1dP+fQWgqg4CbyY56wh9LQb2V9Vb0/R1Ukny9kOS\n5oMjnv2V5DeA16rquSRj061TVZXkRM3qHvPrbNiw4e3y2NgYY2NjPW7OfDD5lhgsko7d+Pg44+Pj\nvfV3tFOKfwW4KsmngJ8D/lmS+4C9Sc6uqj3t0NZrbf0JYFmn/VIGexgTrTy1frLN+cCrSRYAC6tq\nX5IJYKzTZhnwJPA6sCjJKW1vZWnrY1rdUJEkvdPUL9sbN26cVX9HPPxVVbdW1bKqWg6sAZ6squuB\nR4G1bbW1wCOt/CiwJsnpSZYDK4DtVbUH+EmSVW1O5HrgG502k319msHEP8BW4PIki5J8APgk8HgN\nLpx4CvjMNK8vSZpDx3rx4+Sxlv8CPJhkHbALuBagql5M8iCDM8UOAjfWoavnbgS+BpwJPFZV3271\nXwbuS7IT2McgvKiq15P8PvDdtt7GNmEP8AVgS5IvAs+2PiRJcyyjfMV0khrx8fHOOZX+y6P8/kn6\nWUmoquOepPWKeklSbwwVSVJvDBVJUm8MFUlSbwwVSVJvDBVJUm8MFUlSbwwVSVJvDBVJUm8MFUlS\nbwwVSVJvDBVJUm8MFUlSb4711vc6yXR/qtg7Fks6GvdUdBTFcfyKs6STlKEiSeqNoSJJ6o2hIknq\njaEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEiSerNEUMlyc8l\neSbJ80leTPKHrX5xkm1JdiTZmmRRp836JDuTvJTk8k79xUleaMvu6tSfkeSBVv90kgs6y9a219iR\n5IZO/fK2XTuTbElyWl9viCTp+B0xVKrqH4FLq+qXgY8Clyb5VeAWYFtVXQg80Z6TZCVwHbASWA3c\nnU
M/yHEPsK6qVgArkqxu9euAfa3+TmBT62sxcBtwSXvcnmRha7MJuKO1eaP1IUmaY0c9/FVV/7cV\nTwdOZfAhfhWwudVvBq5p5auB+6vqQFXtAl4GViU5B3h/VW1v693badPt62Hgsla+AthaVfuraj+w\nDbiyhdSlwEPTvL4kaQ4dNVSSnJLkeWAv8FRV/RBYUlV72yp7gSWtfC6wu9N8N3DeNPUTrZ727ysA\nVXUQeDPJWUfoazGwv6remqYvSdIcOurPCbcP719uh54eT3LplOWV5ET9NOAxv86GDRveLo+NjTE2\nNtbj5kjScBsfH2d8fLy3/mb8G/VV9WaSbwEXA3uTnF1Ve9qhrdfaahPAsk6zpQz2MCZaeWr9ZJvz\ngVeTLAAWVtW+JBPAWKfNMuBJ4HVgUZJTWuAtbX1MqxsqkqR3mvple+PGjbPq72hnf/385JldSc4E\nPgk8BzwKrG2rrQUeaeVHgTVJTk+yHFgBbK+qPcBPkqxqcyLXA9/otJns69MMJv4BtgKXJ1mU5APt\ntR+vqgKeAj4zzetLkuZQBp/Rh1mYfITBRPgp7XFfVf1ROzPrQQZ7GLuAa9tkOkluBT4PHARurqrH\nW/3FwNeAM4HHquqmVn8GcB/wMWAfsKZN8pPkc8CtbXO+WFWbW/1yYAuD+ZVngc9W1YFptr+ONL5h\ndOhkukmT48sJKHdedcTeV0kDSaiqqR80M28/yh8OoxsqJzJIpi+P2vsqaWC2oeIV9ZKk3hgqkqTe\nGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgq\nkqTeGCqSpN4YKpKk3hgqkqTeLJjrDdBw6v6ssb8CKWmSeyo6TsWhnxeWpAFDRZLUG0NFktQbQ0WS\n1BtDRZLUG0NFktQbQ0WS1BtDRZLUG0NFktQbQ0WS1JujhkqSZUmeSvLDJH+T5KZWvzjJtiQ7kmxN\nsqjTZn2SnUleSnJ5p/7iJC+0ZXd16s9I8kCrfzrJBZ1la9tr7EhyQ6d+eZJnWpstSU7r4w2RJB2/\nmeypHAB+t6o+DHwc+J0kHwJuAbZV1YXAE+05SVYC1wErgdXA3Tl0o6h7gHVVtQJYkWR1q18H7Gv1\ndwKbWl+LgduAS9rj9iQLW5tNwB2tzRutD0nSHDpqqFTVnqp6vpX/HvgRcB5wFbC5rbYZuKaVrwbu\nr6oDVbULeBlYleQc4P1Vtb2td2+nTbevh4HLWvkKYGtV7a+q/cA24MoWUpcCD03z+pKkOXJMcypJ\nPgh8DHgGWFJVe9uivcCSVj4X2N1ptptBCE2tn2j1tH9fAaiqg8CbSc46Ql+Lgf1V9dY0fUmS5siM\nb32f5H0M9iJurqqfTrn1eSU5UbesPabX2bBhw9vlsbExxsbGet4cSRpe4+PjjI+P99bfjEKlTYI/\nDNxXVY+06r1Jzq6qPe3Q1mutfgJY1mm+lMEexkQrT62fbHM+8GqSBcDCqtqXZAIY67RZBjwJvA4s\nSnJK21tZ2vr4Gd1Q0bvD31aRhtfUL9sbN26cVX8zOfsrwJeBF6vqjzuLHgXWtvJa4JFO/ZokpydZ\nDqwAtlfVHuAnSVa1Pq8HvjFNX59mMPEPsBW4PMmiJB8APgk8XoNPrqeAz0zz+jrh/G0VSQM52jfL\nJL8K/Hfgrzn0ybEe2A48yGAPYxdwbZtMJ8mtwOeBgwwOlz3e6i8GvgacCTxWVZOnJ58B3MdgvmYf\nsKZN8pPkc8Ct7XW/WFWbW/1yYAuD+ZVngc9W1YEp216j9s15kMeTY5p/5VF7v6WTTRKqKkdf8zDt\nR/lDYFRCpXt4aWDuw8NQkUbTbEPFK+qHhoeYJM1/hookqTeGiiSpN4aKJKk3hookqTeGiiSpN4aK\nJKk3hookqTeGiiSpNzO+S7E0E95cUjq5uaeinnnlv3QyM1QkSb0x
VCRJvTFUJEm9MVQkSb0xVCRJ\nvTFUJEm9MVQkSb0xVCRJvTFUJEm9MVQkSb3x3l9613gfMOnk456K3kXeB0w62RgqkqTeGCqSpN4Y\nKpKk3hgqkqTeGCqSpN4cNVSSfCXJ3iQvdOoWJ9mWZEeSrUkWdZatT7IzyUtJLu/UX5zkhbbsrk79\nGUkeaPVPJ7mgs2xte40dSW7o1C9P8kxrsyXJabN9IyRJszeTPZWvAqun1N0CbKuqC4En2nOSrASu\nA1a2Nnfn0MUK9wDrqmoFsCLJZJ/rgH2t/k5gU+trMXAbcEl73J5kYWuzCbijtXmj9TFSkrz9kKRh\ncdRQqaq/YvDB3XUVsLmVNwPXtPLVwP1VdaCqdgEvA6uSnAO8v6q2t/Xu7bTp9vUwcFkrXwFsrar9\nVbUf2AZc2ULqUuChaV5/xHidh6ThcrxzKkuqam8r7wWWtPK5wO7OeruB86apn2j1tH9fAaiqg8Cb\nSc46Ql+Lgf1V9dY0fWmecs9LOjnMeqK+BvffOFFfp/3aPrTc65JOBsd776+9Sc6uqj3t0NZrrX4C\nWNZZbymDPYyJVp5aP9nmfODVJAuAhVW1L8kEMNZpswx4EngdWJTklLa3srT1Ma0NGza8XR4bG2Ns\nbOxwq0rSSWd8fJzx8fHe+stMbvSX5IPAN6vqI+35f2Uwub4pyS3Aoqq6pU3Uf53BxPp5wHeAX6yq\nSvIMcBOwHfgW8CdV9e0kNwIfqarfTrIGuKaq1rSJ+u8BFwEBvg9cVFX7kzwIPFxVDyT5EvB8VX1p\nmu2uYb2R4eAw0eS2j1Z5WP+bSCeDJFTVcR+nPmqoJLkf+HXg5xnMn9wGfAN4kMEexi7g2jaZTpJb\ngc8DB4Gbq+rxVn8x8DXgTOCxqrqp1Z8B3Ad8DNgHrGmT/CT5HHBr25QvVtXmVr8c2MJgfuVZ4LNV\ndWCabTdU5mF5WP+bSCeDdz1UhpmhMj/Lw/rfRDoZzDZUvKJektQbQ0WS1Bt/+VEnnL8IKY0u91Q0\nB7xmRRpVhookqTeGiiSpN4aKJKk3hookqTee/aU55Zlg0mhxT0VzzDPBpFFiqEiSeuPhr3nEH7CS\nNOwMlXmnexPGk4vzK9Lw8/CX5hHnV6RhZ6hIknpjqEiSeuOciuYl51ek4eSeiuYp51ekYWSoSJJ6\nY6hIknrjnIrmPedXpOHhnoqGgPMr0rAwVCRJvfHw1xzzfl/HxkNh0vzmnsq84OGdmfO9kuYz91Q0\ntNxrkeYfQ0VD7NAdnQ93GNGwkU4sQ0UjovuTAdOHjQEjvfuGek4lyeokLyXZmeQLc709mo8OzcEk\nefsh6d0xtKGS5FTgvwGrgZXAbyb50Nxu1cz08+E23tfmzFPj70Kf8ydgxsfH5+R1TxTHd/Ia2lAB\nLgFerqpdVXUA2AJcPcfbdAxmexbTeE/bMV+Nv8v9z23AjPqHkuM7eQ3znMp5wCud57uBVVNX6h5H\n97CHpnf0Cf8Z9eKcjTTUoTKj/4NPOWWwM7Zr1y4uuOCCd3WDjsRAGxbTT/jPpHys/403btx4/Jt5\nGDMNtiNtq+Go2ciw/gEl+TiwoapWt+frgbeqalNnneEcnCTNoao67m/BwxwqC4D/CVwGvApsB36z\nqn40pxsmSSexoT38VVUHk/x74HHgVODLBookza2h3VORJM0/w3xK8WGN2kWRSZYleSrJD5P8TZKb\nWv3iJNuS7EiyNcmiud7W45Xk1CTPJflmez5KY1uU5KEkP0ryYpJVIza+9e1v84UkX09yxjCPL8lX\nkuxN8kKn7rDjaePf2T5zLp+brZ65w4zvj9rf5w+S/EWShZ1lxzS+kQuVYb4o8ggOAL9bVR8GPg78\nThvTLcC2qroQeKI9H1Y3Ay9y6HSqURrbXcBjVfUh4KPAS4zI+JJ8EPgt4KKq+giDQ9FrGO7xfZXB\n50fXtONJshK4jsFnzWrg7iTz
/XN1uvFtBT5cVb8E7ADWw/GNb74P/ngM+UWRP6uq9lTV863898CP\nGFyncxWwua22GbhmbrZwdpIsBT4F/BmD83RhdMa2EPi1qvoKDOYCq+pNRmR8wE8YfOl5Tzt55j0M\nTpwZ2vFV1V8Bb0ypPtx4rgbur6oDVbULeJnBZ9C8Nd34qmpbVb3Vnj4DLG3lYx7fKIbKdBdFnjdH\n29K79s3wYwz+wy+pqr1t0V5gyRxt1mzdCfwe8FanblTGthz4cZKvJnk2yZ8meS8jMr6qeh24A/g/\nDMJkf1VtY0TG13G48ZzL4DNm0ih83nweeKyVj3l8oxgqI3vmQZL3AQ8DN1fVT7vLanDGxdCNPclv\nAK9V1XMc2kt5h2EdW7MAuAi4u6ouAv6BKYeChnl8SX4B+I/ABxl8AL0vyWe76wzz+KYzg/EM7ViT\n/Gfgn6rq60dY7YjjG8VQmQCWdZ4v451JO5SSnMYgUO6rqkda9d4kZ7fl5wCvzdX2zcKvAFcl+Vvg\nfuDfJLmP0RgbDP72dlfVd9vzhxiEzJ4RGd+/AP5HVe2rqoPAXwD/itEZ36TD/T1O/bxZ2uqGTpJ/\nx+Aw9L/tVB/z+EYxVL4HrEjywSSnM5hkenSOt2lWMrinxpeBF6vqjzuLHgXWtvJa4JGpbee7qrq1\nqpZV1XIGE7xPVtX1jMDYYDAfBryS5MJW9Qngh8A3GYHxMTjp4ONJzmx/p59gcMLFqIxv0uH+Hh8F\n1iQ5PclyYAWDC7GHSpLVDA5BX11V/9hZdOzjq6qRewBXMrja/mVg/VxvTw/j+VUG8w3PA8+1x2pg\nMfAdBmdrbAUWzfW2znKcvw482sojMzbgl4DvAj9g8E1+4YiN7z8xCMoXGExinzbM42Owx/wq8E8M\n5mc/d6TxALe2z5qXgCvmevuPY3yfB3YC/7vz+XL38Y7Pix8lSb0ZxcNfkqQ5YqhIknpjqEiSemOo\nSJJ6Y6hIknpjqEiSemOoSJJ6Y6hIknrz/wF0zsvts73EjAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f3668585f50>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(D.ravel(), bins=100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def n_occurrence_from_D(D, k=10, n_items=None):\n",
    "    \"\"\"Count how often each item is among the k nearest neighbours of the others.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    D : array-like, shape (n, n)\n",
    "        Pairwise distances; row i holds the distances from item i, so\n",
    "        D[i, i] is the distance of an item to itself.\n",
    "    k : int\n",
    "        Number of nearest neighbours taken per item (the item itself excluded).\n",
    "    n_items : int or None\n",
    "        Minimum length of the returned array; defaults to len(D).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    N_k : ndarray of int\n",
    "        N_k[j] is the number of rows whose k nearest neighbours include item j.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If k is negative.\n",
    "    \"\"\"\n",
    "    if k < 0:\n",
    "        raise ValueError('k must be non-negative, got %r' % (k,))\n",
    "    D = np.asarray(D)\n",
    "    n_rows = len(D)\n",
    "    if n_items is None:\n",
    "        n_items = n_rows\n",
    "    # k + 1 candidates per row: the k neighbours plus the item itself\n",
    "    nearest = np.argsort(D, axis=1)[:, :(k + 1)]\n",
    "    # Discard exactly one candidate per row. Normally that is the item itself;\n",
    "    # with tied distances (duplicate items) it is not guaranteed to sort first,\n",
    "    # so locate it by index instead of assuming column 0.\n",
    "    discard = nearest == np.arange(n_rows)[:, np.newaxis]\n",
    "    # An item with more than k duplicates can be missing from its own\n",
    "    # candidates; drop the farthest candidate of that row instead.\n",
    "    discard[~discard.any(axis=1), -1] = True\n",
    "    neighbours = nearest[~discard].reshape(n_rows, nearest.shape[1] - 1)\n",
    "    N_k = np.bincount(neighbours.ravel(), minlength=n_items)\n",
    "    return N_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.18316065981\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEACAYAAABcXmojAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFsBJREFUeJzt3X+QXWd93/H3BwuDbC+oKh3554zVZL22MqQYpYgUU69b\nxxUl2P5DY8szeDSpSodRUpN2hiAx09p/EcNMJhh17E5TwDIDShUCHrk4imXHdyaTtJYMchAWiq0m\nIqyC1hgSnBCqWuNv/7jPHl02QruSrlZX9vs1c8fPec5zzv2eXet+9nnOvbupKiRJAnjd2S5AkjQ6\nDAVJUsdQkCR1DAVJUsdQkCR1DAVJUmfOUEiyKcmzSfYm+UKSNyRZmmRnkueSPJZkyazxzyfZn+Sm\ngf6V7RzPJ7nvTF2QJOnUnTAUklwJfAB4e1W9FTgPWAtsBHZW1VXAE22bJCuA24EVwGrg/iRpp3sA\nWF9V48B4ktVDvxpJ0mmZa6bwEvAycEGSRcAFwF8CNwNb2pgtwK2tfQuwtaperqqDwAFgVZJLgLGq\n2tXGPTRwjCRpRJwwFKrq+8BvAH9BPwz+uqp2AsuqaroNmwaWtfalwNTAKaaAy47Tf6j1S5JGyFzL\nRz8F/CpwJf0X9ouSvH9wTPV/T4a/K0OSXgUWzbH/54A/rqrvAST5EvDzwOEkF1fV4bY09EIbfwi4\nYuD4y+nPEA619mD/oeM9YRIDRpJOUlVl7lFzm+uewn7gnUkWtxvGNwL7gEeAdW3MOuDh1t4OrE1y\nfpLlwDiwq6oOAy8lWdXOc+fAMX9PVY304+677z7rNVindVqnNc48humEM4Wq+pMkDwFPA68AXwP+\nGzAGbEuyHjgI3NbG70uyjX5wHAU21LGKNwAPAouBR6tqx1CvRJJ02uZaPqKqPgF8Ylb39+nPGo43\n/mPAx47T/1XgradQoyRpgfiJ5lMwOTl5tkuYF+scLuscrnOhznOhxmHLsNejTleSGrWaJGmUJaEW\n6EazJOk1xFCQJHUMBUlSx1CQJHUMBUlSZyRD4brr3ssNN7yXqampuQdLkoZmzg+vnQ1/9EcbWLz4\n3/HDH/7wbJciSa8pIzlTgPeyaNFFZ7sISXrNGdFQkCSdDYaCJKljKEiSOoaCJKljKEiSOoaCJKlj\nKEiSOoaCJKljKEiSOnOGQpKJJHsGHj9IcleSpUl2JnkuyWNJlgwcsynJ80n2J7lpoH9lkr1t331n\n6qIkSadmzlCoqj+tqmur6lpgJfB3wJeBjcDOqroKeKJtk2QFcDuwAlgN3J9k5s/EPQCsr6pxYDzJ\n6mFfkCTp1J3s8tGNwIGq+jZwM7Cl9W8Bbm3tW4CtVfVyVR0EDgCrklwCjFXVrjbuoYFjJEkj4GRD\nYS2wtbWXVdV0a08Dy1r7UmDwd15PAZcdp/9Q65ckjYh5h0KS84H3Ab8ze19VFVBDrEuSdBaczN9T\neA/w1ar6btueTnJxVR1uS0MvtP5DwBUDx11Of4ZwqLUH+w8d/6nu4ciRF9m8eTNr1qxhcnLyJMqU\npFe3Xq9Hr9c7I+dO/4f8eQxMfhv4vara0rY/AXyvqj6eZCOwpKo2thvNXwDeQX956HHgp6uqkjwF\n3AXsAr4CfKqqdsx6noJibGyC3bu3MzExMaRLlaRXpyRUVeYeObd5zRSSXEj/JvMHBrrvBbYlWQ8c\nBG4DqKp9SbYB+4CjwIY6ljwbgAeBxcCjswNBknR2zXumsFCcKUjSyRnmTMFPNEuSOoaCJKljKEiS\nOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaC\nJKljKEiSOoaCJKljKEiSOvMKhSRLknwxyTeT7EuyKsnSJDuTPJfksSRLBsZvSvJ8kv1JbhroX5lk\nb9t335m4IEnSqZvvTOE+4NGqugb4WW
A/sBHYWVVXAU+0bZKsAG4HVgCrgfuTzPxB6QeA9VU1Down\nWT20K5EknbY5QyHJm4F3V9VnAKrqaFX9ALgZ2NKGbQFube1bgK1V9XJVHQQOAKuSXAKMVdWuNu6h\ngWMkSSNgPjOF5cB3k3w2ydeS/FaSC4FlVTXdxkwDy1r7UmBq4Pgp4LLj9B9q/ZKkEbFonmPeDvxK\nVe1O8knaUtGMqqokNbyy7uHIkRfZvHkza9asYXJycninlqRzXK/Xo9frnZFzp+rEr+VJLgb+V1Ut\nb9vXAZuAfwzcUFWH29LQk1V1dZKNAFV1bxu/A7gb+FYbc03rvwO4vqo+OOv5CoqxsQl2797OxMTE\nMK9Xkl51klBVmXvk3OZcPqqqw8C3k1zVum4EngUeAda1vnXAw629HVib5Pwky4FxYFc7z0vtnUsB\n7hw4RpI0AuazfATw74HPJzkf+D/ALwHnAduSrAcOArcBVNW+JNuAfcBRYEMdm45sAB4EFtN/N9OO\nIV2HJGkI5lw+WmguH0nSyVnQ5SNJ0muHoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgK\nkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqTOvEIhycEkX0+y\nJ8mu1rc0yc4kzyV5LMmSgfGbkjyfZH+Smwb6VybZ2/bdN/zLkSSdjvnOFAqYrKprq+odrW8jsLOq\nrgKeaNskWQHcDqwAVgP3J5n5g9IPAOurahwYT7J6SNchSRqCk1k+yqztm4Etrb0FuLW1bwG2VtXL\nVXUQOACsSnIJMFZVu9q4hwaOkSSNgJOZKTye5OkkH2h9y6pqurWngWWtfSkwNXDsFHDZcfoPtX5J\n0ohYNM9x76qq7yT5R8DOJPsHd1ZVJanhlXUPR468yObNm1mzZg2Tk5PDO7UkneN6vR69Xu+MnDtV\nJ/danuRu4G+BD9C/z3C4LQ09WVVXJ9kIUFX3tvE7gLuBb7Ux17T+O4Drq+qDs85fUIyNTbB793Ym\nJiZO8xIl6dUtCVU1e4n/lMy5fJTkgiRjrX0hcBOwF9gOrGvD1gEPt/Z2YG2S85MsB8aBXVV1GHgp\nyap24/nOgWMkSSNgPstHy4AvtzcQLQI+X1WPJXka2JZkPXAQuA2gqvYl2QbsA44CG+rYdGQD8CCw\nGHi0qnYM8VokSafppJePzjSXjyTp5Czo8pEk6bXDUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLH\nUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVJnXqGQ\n5Lwke5I80raXJtmZ5LkkjyVZMjB2U5Lnk+xPctNA/8oke9u++4Z/KZKk0zXfmcKHgH1Ate2NwM6q\nugp4om2TZAVwO7ACWA3cn2Tmj0k/AKyvqnFgPMnq4VyCJGlY5gyFJJcD/xr478DMC/zNwJbW3gLc\n2tq3AFur6uWqOggcAFYluQQYq6pdbdxDA8dIkkbEfGYKvwl8GHhloG9ZVU239jSwrLUvBaYGxk0B\nlx2n/1DrlySNkEUn2pnkF4EXqmpPksnjjamqSlLH23fq7uHIkRfZvHkza9asYXLyuE8tSa9JvV6P\nXq93Rs6dqp/8ep7kY8CdwFHgjcCbgC8B/xSYrKrDbWnoyaq6OslGgKq6tx2/A7gb+FYbc03rvwO4\nvqo+eJznLCjGxibYvXs7ExMTQ7xcSXr1SUJVZe6Rczvh8lFVfbSqrqiq5cBa4A+q6k5gO7CuDVsH\nPNza24G1Sc5PshwYB3ZV1WHgpSSr2o3nOweOkSSNiBMuHx3HzLTiXmBbkvXAQeA2gKral2Qb/Xcq\nHQU21LGpyAbgQWAx8GhV7Ti90iVJw3bC5aOzweUjSTo5C7Z8JEl6bTEUJEkdQ0GS1DEUJEkdQ0GS\n1D
EUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEU\nJEkdQ0GS1DlhKCR5Y5KnkjyTZF+SX2/9S5PsTPJckseSLBk4ZlOS55PsT3LTQP/KJHvbvvvO3CVJ\nkk7VCUOhqv4vcENVvQ34WeCGJNcBG4GdVXUV8ETbJskK4HZgBbAauD/JzB+TfgBYX1XjwHiS1Wfi\ngiRJp27O5aOq+rvWPB84D/gr4GZgS+vfAtza2rcAW6vq5ao6CBwAViW5BBirql1t3EMDx0iSRsSc\noZDkdUmeAaaBJ6vqWWBZVU23IdPAsta+FJgaOHwKuOw4/YdavyRphCyaa0BVvQK8Lcmbgd9PcsOs\n/ZWkhlvWPRw58iKbN29mzZo1TE5ODvf0knQO6/V69Hq9M3LuVM3/9TzJfwJ+BPxbYLKqDreloSer\n6uokGwGq6t42fgdwN/CtNuaa1n8HcH1VffA4z1FQjI1NsHv3diYmJk7zEiXp1S0JVZW5R85trncf\nvWXmnUVJFgO/AOwBtgPr2rB1wMOtvR1Ym+T8JMuBcWBXVR0GXkqyqt14vnPgGEnSiJhr+egSYEuS\n19EPkM9V1RNJ9gDbkqwHDgK3AVTVviTbgH3AUWBDHZuKbAAeBBYDj1bVjmFfjCTp9JzU8tFCcPlI\nkk7Ogi0fSZJeWwwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwF\nSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdeYMhSRXJHkyybNJvpHkrta/NMnOJM8l\neSzJkoFjNiV5Psn+JDcN9K9Msrftu+/MXJIk6VTNZ6bwMvAfqupngHcCv5zkGmAjsLOqrgKeaNsk\nWQHcDqwAVgP3J5n5g9IPAOurahwYT7J6qFcjSTotc4ZCVR2uqmda+2+BbwKXATcDW9qwLcCtrX0L\nsLWqXq6qg8ABYFWSS4CxqtrVxj00cIwkaQSc1D2FJFcC1wJPAcuqarrtmgaWtfalwNTAYVP0Q2R2\n/6HWL0kaEYvmOzDJRcDvAh+qqr85tiIEVVVJanhl3cORIy+yefNm1qxZw+Tk5PBOLUnnuF6vR6/X\nOyPnTtXcr+VJXg/8T+D3quqTrW8/MFlVh9vS0JNVdXWSjQBVdW8btwO4G/hWG3NN678DuL6qPjjr\nuQqKsbEJdu/ezsTExNAuVpJejZJQVZl75Nzm8+6jAJ8G9s0EQrMdWNfa64CHB/rXJjk/yXJgHNhV\nVYeBl5Ksaue8c+AYSdIImM/y0buA9wNfT7Kn9W0C7gW2JVkPHARuA6iqfUm2AfuAo8CGOjYd2QA8\nCCwGHq2qHUO6DknSEMxr+WghuXwkSSdnQZePJEmvHYaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiS\nOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKkzZygk+UyS\n6SR7B/qWJtmZ5LkkjyVZMrBvU5Lnk+xPctNA/8oke9u++4Z/KZKk0zWfmcJngdWz+jYCO6vqKuCJ\ntk2SFcDtwIp2zP1JZv5u6APA+qoaB8aTzD6nJOksmzMUquoPgb+a1X0zsKW1twC3tvYtwNaqermq\nDgIHgFVJLgHGqmpXG/fQwDGSpBFxqvcUllXVdGtPA8ta+1JgamDcFHDZcfoPtX5J0gg57RvNVVVA\nDaEWSdJZtugUj5tOcnFVHW5LQy+0/kPAFQPjLqc/QzjU2oP9h37y6e/hyJEX2bx5M2vWrGFycvIU\ny5SkV59er0ev1zsj507/B/05BiVXAo9U1Vvb9ieA71XVx5NsBJZU1cZ2o/kLwDvoLw89Dvx0VVWS\np4C7gF3AV4BPVdWO4zxXQTE2NsHu3duZmJgYyoVK0qtVEqoqc4+c
25wzhSRbgeuBtyT5NvCfgXuB\nbUnWAweB2wCqal+SbcA+4CiwoY6lzgbgQWAx8OjxAkGSdHbNa6awkJwpSNLJGeZMwU80S5I6hoIk\nqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMo\nSJI6hoIkqWMoSJI6hoIkqWMoSJI6Cx4KSVYn2Z/k+SQfWejnlyT9ZAsaCknOA/4LsBpYAdyR5Jqf\nNP7qq68mSfcYFb1e72yXMC/WOVzWOVznQp3nQo3DttAzhXcAB6rqYFW9DPw2cMuJD6n2GB3nyv8o\n1jlc1jlc50Kd50KNw7bQoXAZ8O2B7anWNy+Ds4ZRnEFI0rlu0QI/37x+5H/Tm97Hj340dYLD82Pt\nMxUMVWd2hjK77jP9fJI0lyzkC1GSdwL3VNXqtr0JeKWqPj4wxldGSTpJVTWUn44XOhQWAX8K/Evg\nL4FdwB1V9c0FK0KS9BMt6PJRVR1N8ivA7wPnAZ82ECRpdCzoTEGSNNpG5hPNZ/tDbUk+k2Q6yd6B\nvqVJdiZ5LsljSZYM7NvUat2f5KaB/pVJ9rZ99w25xiuSPJnk2STfSHLXiNb5xiRPJXkmyb4kvz6K\ndQ48x3lJ9iR5ZFTrTHIwyddbnbtGuM4lSb6Y5Jvte79qlOpMMtG+hjOPHyS5a5RqnPW8z7bn+EKS\nNyxInVV11h/0l5IOAFcCrweeAa5Z4BreDVwL7B3o+wTwa639EeDe1l7Ranx9q/kAx2Zdu4B3tPaj\nwOoh1ngx8LbWvoj+/ZlrRq3Ods4L2n8XAf8buG4U62zn/Y/A54Hto/h9b+f8c2DprL5RrHML8G8G\nvvdvHsU623lfB3wHuGLUamzP9WfAG9r2/wDWLUSdQ/0in8YX4OeBHQPbG4GNZ6GOK/nxUNgPLGvt\ni4H9rb0J+MjAuB3AO4FLgG8O9K8F/usZrPdh4MZRrhO4ANgN/Mwo1glcDjwO3AA8Mqrfd/qh8A9n\n9Y1UnfQD4M+O0z9SdQ6c9ybgD0exRmAp/R/6/gH9cH0E+IWFqHNUlo9O60NtZ9Cyqppu7WlgWWtf\nSr/GGTP1zu4/xBm6jiRX0p/ZPDWKdSZ5XZJnWj1PVtWzo1gn8JvAh4FXBvpGsc4CHk/ydJIPjGid\ny4HvJvlskq8l+a0kF45gnTPWAltbe6RqrKrvA78B/AX9d2r+dVXtXIg6RyUURv5ud/VjdiTqTHIR\n8LvAh6rqbwb3jUqdVfVKVb2N/k/i/zzJDbP2n/U6k/wi8EJV7aH/ici/ZxTqbN5VVdcC7wF+Ocm7\nB3eOSJ2LgLcD91fV24Ef0p/1d0akTpKcD7wP+J3Z+0ahxiQ/Bfwq/dWLS4GLkrx/cMyZqnNUQuEQ\n/XW9GVfw4+l2tkwnuRggySXAC61/dr2X06/3UGsP9h8aZkFJXk8/ED5XVQ+Pap0zquoHwFeAlSNY\n5z8Dbk7y5/R/YvwXST43gnVSVd9p//0u8GX6v0ds1OqcAqaqanfb/iL9kDg8YnVCP1y/2r6eMHpf\ny58D/riqvldVR4Ev0V9mP+Nfy1EJhaeB8SRXtgS/Hdh+lmuCfg3rWnsd/TX8mf61Sc5PshwYB3ZV\n1WHgpfaOiwB3Dhxz2to5Pw3sq6pPjnCdb5l5V0SSxfTXQveMWp1V9dGquqKqltNfSviDqrpz1OpM\nckGSsda+kP5a+N5Rq7Od/9tJrmpdNwLP0l8PH5k6mzs4tnQ0U8so1bgfeGeSxe38NwL7WIiv5bBv\n3pzGjZX30L+xcgDYdBaefyv9tbv/R//+xi/Rv9nzOPAc8BiwZGD8R1ut+4F/NdC/kv4/2APAp4Zc\n43X0176fof8iu4f+ryEftTrfCnyt1fl14MOtf6TqnFXz9Rx799FI1Ul/rf6Z9vjGzL+PUauznf+f\n0H9jwZ/Q/+n2zaNWJ3Ah8CIw
NtA3UjW28/8a/VDdS/9dXa9fiDr98JokqTMqy0eSpBFgKEiSOoaC\nJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOv8fBtDhFf0jZeoAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f35fe65fe50>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# k-occurrence counts for k=100, their skewness, and their distribution\n",
    "N_k = n_occurrence_from_D(D, k=100)\n",
    "print(skew(N_k))\n",
    "plt.hist(N_k, bins=100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# column indices of every row of D ordered by increasing distance\n",
    "sort_idx = np.argsort(D, axis=1)\n",
    "k = 10\n",
    "D_k = sort_idx[:, 1:(k+1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[4650, 2942, 3520, ..., 1318, 6678, 6056],\n",
       "       [1933, 6143, 6757, ..., 7269, 4321, 1563],\n",
       "       [3170, 2549, 4860, ..., 6678, 7414, 6056],\n",
       "       ..., \n",
       "       [6016, 2243, 1616, ..., 7627, 2018,  515],\n",
       "       [7027, 4860, 6346, ...,  997, 3892, 1846],\n",
       "       [5119, 1563, 4035, ..., 3486, 7617, 3854]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "D_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}