annotate notebooks/test_hubness.ipynb @ 10:8e897e82af51 branch-tests

edits in feature learning
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 12 Sep 2017 13:31:42 +0100
parents 0f3eba42b425
children a1a9b472bcdb
rev   line source
m@8 1 {
m@8 2 "cells": [
m@8 3 {
m@8 4 "cell_type": "code",
m@10 5 "execution_count": 20,
m@8 6 "metadata": {
m@8 7 "collapsed": false
m@8 8 },
m@8 9 "outputs": [
m@8 10 {
m@8 11 "name": "stdout",
m@8 12 "output_type": "stream",
m@8 13 "text": [
m@8 14 "The autoreload extension is already loaded. To reload it, use:\n",
m@8 15 " %reload_ext autoreload\n"
m@8 16 ]
m@8 17 }
m@8 18 ],
m@8 19 "source": [
m@8 20 "import numpy as np\n",
m@8 21 "import pickle\n",
m@8 22 "from scipy.stats import pearsonr\n",
m@8 23 "from scipy.stats import skew\n",
m@8 24 "import sys\n",
m@8 25 "from sklearn.metrics.pairwise import pairwise_distances\n",
m@8 26 "%matplotlib inline\n",
m@8 27 "import matplotlib.pyplot as plt\n",
m@8 28 "\n",
m@8 29 "%load_ext autoreload\n",
m@8 30 "%autoreload 2\n",
m@8 31 "\n",
m@8 32 "sys.path.append('../')\n",
m@8 33 "import scripts.results as results\n",
m@8 34 "import scripts.utils_spatial as utils_spatial"
m@8 35 ]
m@8 36 },
m@8 37 {
m@8 38 "cell_type": "code",
m@10 39 "execution_count": 21,
m@8 40 "metadata": {
m@8 41 "collapsed": false
m@8 42 },
m@8 43 "outputs": [
m@8 44 {
m@8 45 "name": "stdout",
m@8 46 "output_type": "stream",
m@8 47 "text": [
m@8 48 "WARNING: there are 21 disconnected observations\n",
m@8 49 "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
m@8 50 "Antigua and Barbuda\n",
m@8 51 "Australia\n",
m@8 52 "Cuba\n",
m@8 53 "Fiji\n",
m@8 54 "French Polynesia\n",
m@8 55 "Grenada\n",
m@8 56 "Iceland\n",
m@8 57 "Jamaica\n",
m@8 58 "Japan\n",
m@8 59 "Kiribati\n",
m@8 60 "Malta\n",
m@8 61 "New Zealand\n",
m@8 62 "Philippines\n",
m@8 63 "Puerto Rico\n",
m@8 64 "Republic of Serbia\n",
m@8 65 "Saint Lucia\n",
m@8 66 "Samoa\n",
m@8 67 "Solomon Islands\n",
m@8 68 "South Korea\n",
m@8 69 "The Bahamas\n",
m@8 70 "Trinidad and Tobago\n"
m@8 71 ]
m@8 72 }
m@8 73 ],
m@8 74 "source": [
m@8 75 "with open('../data/lda_data_melodia_8.pickle','rb') as f:  # context manager closes the handle after loading\n",
m@8 76 "    X_list, Y, Yaudio = pickle.load(f)  # NOTE: unpickling can execute arbitrary code; only load trusted files\n",
m@8 77 "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
m@8 78 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
m@8 79 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
m@8 80 "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
m@8 81 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)  # stack the four feature blocks column-wise\n",
m@8 82 "# global outliers\n",
m@8 83 "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
m@8 84 ]
m@8 85 },
m@8 86 {
m@8 87 "cell_type": "code",
m@10 88 "execution_count": null,
m@8 89 "metadata": {
m@8 90 "collapsed": false
m@8 91 },
m@8 92 "outputs": [
m@8 93 {
m@8 94 "data": {
m@8 95 "text/plain": [
m@8 96 "(8200, 380)"
m@8 97 ]
m@8 98 },
m@10 99 "execution_count": 22,
m@8 100 "metadata": {},
m@8 101 "output_type": "execute_result"
m@8 102 }
m@8 103 ],
m@8 104 "source": [
m@8 105 "X.shape"
m@8 106 ]
m@8 107 },
m@8 108 {
m@8 109 "cell_type": "code",
m@8 110 "execution_count": null,
m@8 111 "metadata": {
m@8 112 "collapsed": false
m@8 113 },
m@8 114 "outputs": [],
m@8 115 "source": [
m@8 116 "D = pairwise_distances(X, metric='mahalanobis')"
m@8 117 ]
m@8 118 },
m@8 119 {
m@8 120 "cell_type": "code",
m@10 121 "execution_count": null,
m@8 122 "metadata": {
m@8 123 "collapsed": false
m@8 124 },
m@10 125 "outputs": [],
m@8 126 "source": [
m@8 127 "plt.hist(D.ravel(), bins=100);"
m@8 128 ]
m@8 129 },
m@8 130 {
m@8 131 "cell_type": "code",
m@10 132 "execution_count": null,
m@8 133 "metadata": {
m@8 134 "collapsed": true
m@8 135 },
m@8 136 "outputs": [],
m@8 137 "source": [
m@8 138 "def n_occurrence_from_D(D, k=10, n_items=None):\n",
m@8 139 "    \"\"\"Return N_k, the number of times each item is among the k nearest neighbours of the other items (D: square distance matrix).\"\"\"\n",
m@8 140 "    n_items = len(D) if n_items is None else n_items  # length of the returned count vector\n",
m@8 141 "    D_masked = np.array(D, dtype=float)  # work on a copy so the caller's matrix is left untouched\n",
m@8 142 "    np.fill_diagonal(D_masked, np.inf)  # exclude self explicitly: with tied distances self is not always sorted first\n",
m@8 143 "    knn_idx = np.argsort(D_masked, axis=1)[:, :min(k, D_masked.shape[1] - 1)]  # cap k so the masked self entry is never picked\n",
m@8 144 "    return np.bincount(knn_idx.ravel(), minlength=n_items)"
m@8 145 ]
m@8 146 },
m@8 147 {
m@8 148 "cell_type": "code",
m@10 149 "execution_count": null,
m@8 150 "metadata": {
m@8 151 "collapsed": false
m@8 152 },
m@10 153 "outputs": [],
m@8 154 "source": [
m@8 155 "N_k = n_occurrence_from_D(D, k=100)\n",
m@8 156 "print(skew(N_k))  # skewness of the N_k counts; the parenthesised form runs on both Python 2 and 3\n",
m@8 157 "plt.hist(N_k, bins=100);"
m@8 158 ]
m@8 159 },
m@8 160 {
m@8 161 "cell_type": "code",
m@8 162 "execution_count": 9,
m@8 163 "metadata": {
m@8 164 "collapsed": true
m@8 165 },
m@8 166 "outputs": [],
m@8 167 "source": [
m@8 168 "N_k"
m@8 169 ]
m@8 170 },
m@8 171 {
m@8 172 "cell_type": "code",
m@8 173 "execution_count": null,
m@8 174 "metadata": {
m@8 175 "collapsed": true
m@8 176 },
m@8 177 "outputs": [],
m@8 178 "source": []
m@8 179 }
m@8 180 ],
m@8 181 "metadata": {
m@8 182 "kernelspec": {
m@8 183 "display_name": "Python 2",
m@8 184 "language": "python",
m@8 185 "name": "python2"
m@8 186 },
m@8 187 "language_info": {
m@8 188 "codemirror_mode": {
m@8 189 "name": "ipython",
m@8 190 "version": 2
m@8 191 },
m@8 192 "file_extension": ".py",
m@8 193 "mimetype": "text/x-python",
m@8 194 "name": "python",
m@8 195 "nbconvert_exporter": "python",
m@8 196 "pygments_lexer": "ipython2",
m@8 197 "version": "2.7.12"
m@8 198 }
m@8 199 },
m@8 200 "nbformat": 4,
m@8 201 "nbformat_minor": 0
m@8 202 }