annotate notebooks/test_hubness.ipynb @ 26:29b5ee381305 branch-tests

notebooks rerunning, and changes in load features for melodia
author mpanteli <m.x.panteli@gmail.com>
date Wed, 13 Sep 2017 17:33:48 +0100
parents ed109218dd4b
children e4736064d282 6aa08c9c95e9
rev   line source
m@8 1 {
m@8 2 "cells": [
m@8 3 {
m@8 4 "cell_type": "code",
m@11 5 "execution_count": 1,
m@26 6 "metadata": {
m@26 7 "collapsed": true
m@26 8 },
m@11 9 "outputs": [],
m@8 10 "source": [
m@8 11 "import numpy as np\n",
m@8 12 "import pickle\n",
m@8 13 "from scipy.stats import pearsonr\n",
m@8 14 "from scipy.stats import skew\n",
m@8 15 "import sys\n",
m@8 16 "from sklearn.metrics.pairwise import pairwise_distances\n",
m@8 17 "%matplotlib inline\n",
m@8 18 "import matplotlib.pyplot as plt\n",
m@8 19 "\n",
m@8 20 "%load_ext autoreload\n",
m@8 21 "%autoreload 2\n",
m@8 22 "\n",
m@8 23 "sys.path.append('../')\n",
Maria@18 24 "import scripts.outliers as outliers\n",
m@8 25 "import scripts.utils_spatial as utils_spatial"
m@8 26 ]
m@8 27 },
m@8 28 {
m@8 29 "cell_type": "code",
m@12 30 "execution_count": 2,
m@11 31 "metadata": {},
m@8 32 "outputs": [
m@8 33 {
m@12 34 "name": "stderr",
m@12 35 "output_type": "stream",
m@12 36 "text": [
m@12 37 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
m@12 38 " warnings.warn(\"There are %d disconnected observations\" % ni)\n",
m@12 39 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
m@12 40 " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
m@12 41 ]
m@12 42 },
m@12 43 {
m@8 44 "name": "stdout",
m@8 45 "output_type": "stream",
m@8 46 "text": [
m@8 47 "Antigua and Barbuda\n",
m@8 48 "Australia\n",
m@8 49 "Cuba\n",
m@8 50 "Fiji\n",
m@8 51 "French Polynesia\n",
m@8 52 "Grenada\n",
m@8 53 "Iceland\n",
m@8 54 "Jamaica\n",
m@8 55 "Japan\n",
m@8 56 "Kiribati\n",
m@8 57 "Malta\n",
m@8 58 "New Zealand\n",
m@8 59 "Philippines\n",
m@8 60 "Puerto Rico\n",
m@8 61 "Republic of Serbia\n",
m@8 62 "Saint Lucia\n",
m@8 63 "Samoa\n",
m@8 64 "Solomon Islands\n",
m@8 65 "South Korea\n",
m@8 66 "The Bahamas\n",
m@8 67 "Trinidad and Tobago\n"
m@8 68 ]
m@8 69 }
m@8 70 ],
m@8 71 "source": [
m@8 72 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
Maria@18 73 "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
m@8 74 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
m@8 75 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
m@8 76 "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
m@8 77 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
m@8 78 "\n",
m@8 79 "# global outliers\n",
Maria@18 80 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)"
m@8 81 ]
m@8 82 },
m@8 83 {
m@8 84 "cell_type": "code",
m@26 85 "execution_count": null,
m@11 86 "metadata": {},
m@8 87 "outputs": [
m@8 88 {
m@8 89 "data": {
m@8 90 "text/plain": [
m@8 91 "(8200, 380)"
m@8 92 ]
m@8 93 },
m@12 94 "execution_count": 3,
m@8 95 "metadata": {},
m@8 96 "output_type": "execute_result"
m@8 97 }
m@8 98 ],
m@8 99 "source": [
m@8 100 "X.shape"
m@8 101 ]
m@8 102 },
m@8 103 {
m@8 104 "cell_type": "code",
m@26 105 "execution_count": null,
m@26 106 "metadata": {
m@26 107 "collapsed": true
m@26 108 },
m@8 109 "outputs": [],
m@8 110 "source": [
m@8 111 "# All-pairs Mahalanobis distances between the rows of X (one row and one column of D per row of X).\n",
m@8 111 "D = pairwise_distances(X, metric='mahalanobis')"
m@8 112 ]
m@8 113 },
m@8 114 {
m@8 115 "cell_type": "code",
m@26 116 "execution_count": null,
m@12 117 "metadata": {},
m@26 118 "outputs": [],
m@12 119 "source": [
m@12 120 "D.shape"
m@12 121 ]
m@12 122 },
m@12 123 {
m@12 124 "cell_type": "code",
m@26 125 "execution_count": null,
m@11 126 "metadata": {},
m@26 127 "outputs": [],
m@8 128 "source": [
m@8 129 "plt.hist(D.ravel(), bins=100);"
m@8 130 ]
m@8 131 },
m@8 132 {
m@8 133 "cell_type": "code",
m@26 134 "execution_count": null,
m@8 135 "metadata": {
m@8 136 "collapsed": true
m@8 137 },
m@8 138 "outputs": [],
m@8 139 "source": [
m@8 140 "def n_occurrence_from_D(D, k=10, n_items=None):\n",
m@8 141 " if n_items is None:\n",
m@8 142 " n_items = len(D)\n",
m@8 143 " sort_idx = np.argsort(D, axis=1)\n",
m@8 144 " D_k = sort_idx[:, 1:(k+1)] # nearest neighbour is the item itself\n",
m@8 145 " N_k = np.bincount(D_k.astype(int).ravel(), minlength=n_items)\n",
m@8 146 " return N_k"
m@8 147 ]
m@8 148 },
m@8 149 {
m@8 150 "cell_type": "code",
m@26 151 "execution_count": null,
m@11 152 "metadata": {},
m@26 153 "outputs": [],
m@8 154 "source": [
m@8 155 "N_k = n_occurrence_from_D(D, k=100)\n",
m@8 156 "print skew(N_k)\n",
m@26 157 "plt.figure()\n",
m@26 158 "plt.hist(N_k, bins=100);\n",
m@26 159 "plt.figure()\n",
m@26 160 "plt.plot(np.sort(N_k))"
m@8 161 ]
m@8 162 },
m@8 163 {
m@8 164 "cell_type": "code",
m@12 165 "execution_count": 11,
m@26 166 "metadata": {
m@26 167 "collapsed": true
m@26 168 },
m@8 169 "outputs": [],
m@8 170 "source": [
m@12 171 "#sort_idx = np.argsort(D, axis=1)\n",
m@12 172 "k = 10\n",
m@12 173 "D_k = sort_idx[:, 1:(k+1)]"
m@12 174 ]
m@12 175 },
m@12 176 {
m@12 177 "cell_type": "code",
m@12 178 "execution_count": 12,
m@12 179 "metadata": {},
m@12 180 "outputs": [
m@12 181 {
m@12 182 "data": {
m@12 183 "text/plain": [
m@12 184 "array([[4650, 2942, 3520, ..., 1318, 6678, 6056],\n",
m@12 185 " [1933, 6143, 6757, ..., 7269, 4321, 1563],\n",
m@12 186 " [3170, 2549, 4860, ..., 6678, 7414, 6056],\n",
m@12 187 " ..., \n",
m@12 188 " [6016, 2243, 1616, ..., 7627, 2018, 515],\n",
m@12 189 " [7027, 4860, 6346, ..., 997, 3892, 1846],\n",
m@12 190 " [5119, 1563, 4035, ..., 3486, 7617, 3854]])"
m@12 191 ]
m@12 192 },
m@12 193 "execution_count": 12,
m@12 194 "metadata": {},
m@12 195 "output_type": "execute_result"
m@12 196 }
m@12 197 ],
m@12 198 "source": [
m@12 199 "D_k"
m@8 200 ]
m@8 201 },
m@8 202 {
m@8 203 "cell_type": "code",
m@8 204 "execution_count": null,
m@8 205 "metadata": {
m@8 206 "collapsed": true
m@8 207 },
m@8 208 "outputs": [],
m@8 209 "source": []
m@8 210 }
m@8 211 ],
m@8 212 "metadata": {
m@8 213 "kernelspec": {
m@8 214 "display_name": "Python 2",
m@8 215 "language": "python",
m@8 216 "name": "python2"
m@8 217 },
m@8 218 "language_info": {
m@8 219 "codemirror_mode": {
m@8 220 "name": "ipython",
m@8 221 "version": 2
m@8 222 },
m@8 223 "file_extension": ".py",
m@8 224 "mimetype": "text/x-python",
m@8 225 "name": "python",
m@8 226 "nbconvert_exporter": "python",
m@8 227 "pygments_lexer": "ipython2",
m@8 228 "version": "2.7.12"
m@8 229 }
m@8 230 },
m@8 231 "nbformat": 4,
m@11 232 "nbformat_minor": 1
m@8 233 }