view notebooks/test_hubness.ipynb @ 10:8e897e82af51 branch-tests

edits in feature learning
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 12 Sep 2017 13:31:42 +0100
parents 0f3eba42b425
children a1a9b472bcdb
line wrap: on
line source
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "from scipy.stats import pearsonr\n",
    "from scipy.stats import skew\n",
    "import sys\n",
    "from sklearn.metrics.pairwise import pairwise_distances\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "sys.path.append('../')\n",
    "import scripts.results as results\n",
    "import scripts.utils_spatial as utils_spatial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: there are 21 disconnected observations\n",
      "Island ids:  [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
      "Antigua and Barbuda\n",
      "Australia\n",
      "Cuba\n",
      "Fiji\n",
      "French Polynesia\n",
      "Grenada\n",
      "Iceland\n",
      "Jamaica\n",
      "Japan\n",
      "Kiribati\n",
      "Malta\n",
      "New Zealand\n",
      "Philippines\n",
      "Puerto Rico\n",
      "Republic of Serbia\n",
      "Saint Lucia\n",
      "Samoa\n",
      "Solomon Islands\n",
      "South Korea\n",
      "The Bahamas\n",
      "Trinidad and Tobago\n"
     ]
    }
   ],
   "source": [
    "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
    "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
    "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
    "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
    "\n",
    "# global outliers\n",
    "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8200, 380)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "D = pairwise_distances(X, metric='mahalanobis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.hist(D.ravel(), bins=100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def n_occurrence_from_D(D, k=10, n_items=None):\n",
    "    if n_items is None:\n",
    "        n_items = len(D)\n",
    "    sort_idx = np.argsort(D, axis=1)\n",
    "    D_k = sort_idx[:, 1:(k+1)]  # nearest neighbour is the item itself\n",
    "    N_k = np.bincount(D_k.astype(int).ravel(), minlength=n_items)\n",
    "    return N_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "N_k = n_occurrence_from_D(D, k=100)\n",
    "print skew(N_k)\n",
    "plt.hist(N_k, bins=100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "N_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}