view notebooks/test_hubness.ipynb @ 29:6aa08c9c95e9 branch-tests

merged
author Maria Panteli <m.x.panteli@gmail.com>
date Wed, 13 Sep 2017 17:35:06 +0100
parents bd284065aeb6 29b5ee381305
children 03ff14ba9fa2
line wrap: on
line source
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "from scipy.stats import pearsonr\n",
    "from scipy.stats import skew\n",
    "import sys\n",
    "from sklearn.metrics.pairwise import pairwise_distances\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "sys.path.append('../')\n",
    "import scripts.outliers as outliers\n",
    "import scripts.utils_spatial as utils_spatial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: there are 21 disconnected observations\n",
      "Island ids:  [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
      "Antigua and Barbuda\n",
      "Australia\n",
      "Cuba\n",
      "Fiji\n",
      "French Polynesia\n",
      "Grenada\n",
      "Iceland\n",
      "Jamaica\n",
      "Japan\n",
      "Kiribati\n",
      "Malta\n",
      "New Zealand\n",
      "Philippines\n",
      "Puerto Rico\n",
      "Republic of Serbia\n",
      "Saint Lucia\n",
      "Samoa\n",
      "Solomon Islands\n",
      "South Korea\n",
      "The Bahamas\n",
      "Trinidad and Tobago\n"
     ]
    }
   ],
   "source": [
    "# Unpickle the feature blocks and labels. The `with` block closes the file\n",
    "# handle once loading finishes (a bare open() call would leave it open).\n",
    "# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.\n",
    "with open('../data/lda_data_melodia_8.pickle', 'rb') as pickle_file:\n",
    "    X_list, Y, Yaudio = pickle.load(pickle_file)\n",
    "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
    "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
    "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
    "\n",
    "# Join the four feature blocks column-wise into a single matrix\n",
    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
    "\n",
    "# global outliers\n",
    "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 380)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "D = pairwise_distances(X, metric='mahalanobis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "D.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.hist(D.ravel(), bins=100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def n_occurrence_from_D(D, k=10, n_items=None):\n",
    "    \"\"\"Count how often each item is among the k nearest neighbours of the others.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    D : 2-D array-like\n",
    "        Distance matrix; row i holds the distances from item i. When D is\n",
    "        square, column i of row i is treated as the item's distance to itself.\n",
    "    k : int\n",
    "        Number of nearest neighbours taken from each row.\n",
    "    n_items : int or None\n",
    "        Minimum length of the returned array; defaults to ``len(D)``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    N_k : ndarray of int\n",
    "        N_k[j] is the number of rows that list item j among their k nearest\n",
    "        neighbours.\n",
    "    \"\"\"\n",
    "    if n_items is None:\n",
    "        n_items = len(D)\n",
    "    sort_idx = np.argsort(D, axis=1)\n",
    "    n_rows, n_cols = sort_idx.shape\n",
    "    if n_rows == n_cols and n_rows > 0:\n",
    "        # Drop each row's own index explicitly instead of assuming it sorts\n",
    "        # first: with duplicate items (zero distance) the tie can put another\n",
    "        # item in column 0, and the item would then count as its own neighbour.\n",
    "        not_self = sort_idx != np.arange(n_rows)[:, np.newaxis]\n",
    "        # Every row loses exactly one entry, so the flat result reshapes cleanly\n",
    "        # while keeping the nearest-first order within each row.\n",
    "        neighbours = sort_idx[not_self].reshape(n_rows, n_cols - 1)[:, :k]\n",
    "    else:\n",
    "        # Non-square (or empty) input has no self-distance on the diagonal;\n",
    "        # keep the previous behaviour of skipping the first sorted column.\n",
    "        neighbours = sort_idx[:, 1:(k+1)]\n",
    "    N_k = np.bincount(neighbours.ravel(), minlength=n_items)\n",
    "    return N_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k-occurrence counts for k=100 and the skewness of their distribution\n",
    "N_k = n_occurrence_from_D(D, k=100)\n",
    "# print() with a single argument gives the same output on Python 2 and 3\n",
    "print(skew(N_k))\n",
    "plt.figure()\n",
    "plt.hist(N_k, bins=100);\n",
    "plt.figure()\n",
    "plt.plot(np.sort(N_k));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sort_idx is otherwise only a local inside n_occurrence_from_D, so compute\n",
    "# it here; without this line the cell raises NameError on a fresh kernel.\n",
    "sort_idx = np.argsort(D, axis=1)\n",
    "k = 10\n",
    "D_k = sort_idx[:, 1:(k+1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[4650, 2942, 3520, ..., 1318, 6678, 6056],\n",
       "       [1933, 6143, 6757, ..., 7269, 4321, 1563],\n",
       "       [3170, 2549, 4860, ..., 6678, 7414, 6056],\n",
       "       ..., \n",
       "       [6016, 2243, 1616, ..., 7627, 2018,  515],\n",
       "       [7027, 4860, 6346, ...,  997, 3892, 1846],\n",
       "       [5119, 1563, 4035, ..., 3486, 7617, 3854]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "D_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}