plosone_underreview: notebooks/test

annotate notebooks/test_hubness.ipynb @ 10:8e897e82af51 branch-tests

edits in feature learning

author	Maria Panteli <m.x.panteli@gmail.com>
date	Tue, 12 Sep 2017 13:31:42 +0100
parents	0f3eba42b425
children	a1a9b472bcdb

rev	line source
m@8	1 {
m@8	2 "cells": [
m@8	3 {
m@8	4 "cell_type": "code",
m@10	5 "execution_count": 20,
m@8	6 "metadata": {
m@8	7 "collapsed": false
m@8	8 },
m@8	9 "outputs": [
m@8	10 {
m@8	11 "name": "stdout",
m@8	12 "output_type": "stream",
m@8	13 "text": [
m@8	14 "The autoreload extension is already loaded. To reload it, use:\n",
m@8	15 " %reload_ext autoreload\n"
m@8	16 ]
m@8	17 }
m@8	18 ],
m@8	19 "source": [
m@8	20 "import numpy as np\n",
m@8	21 "import pickle\n",
m@8	22 "from scipy.stats import pearsonr\n",
m@8	23 "from scipy.stats import skew\n",
m@8	24 "import sys\n",
m@8	25 "from sklearn.metrics.pairwise import pairwise_distances\n",
m@8	26 "%matplotlib inline\n",
m@8	27 "import matplotlib.pyplot as plt\n",
m@8	28 "\n",
m@8	29 "%load_ext autoreload\n",
m@8	30 "%autoreload 2\n",
m@8	31 "\n",
m@8	32 "sys.path.append('../')\n",
m@8	33 "import scripts.results as results\n",
m@8	34 "import scripts.utils_spatial as utils_spatial"
m@8	35 ]
m@8	36 },
m@8	37 {
m@8	38 "cell_type": "code",
m@10	39 "execution_count": 21,
m@8	40 "metadata": {
m@8	41 "collapsed": false
m@8	42 },
m@8	43 "outputs": [
m@8	44 {
m@8	45 "name": "stdout",
m@8	46 "output_type": "stream",
m@8	47 "text": [
m@8	48 "WARNING: there are 21 disconnected observations\n",
m@8	49 "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
m@8	50 "Antigua and Barbuda\n",
m@8	51 "Australia\n",
m@8	52 "Cuba\n",
m@8	53 "Fiji\n",
m@8	54 "French Polynesia\n",
m@8	55 "Grenada\n",
m@8	56 "Iceland\n",
m@8	57 "Jamaica\n",
m@8	58 "Japan\n",
m@8	59 "Kiribati\n",
m@8	60 "Malta\n",
m@8	61 "New Zealand\n",
m@8	62 "Philippines\n",
m@8	63 "Puerto Rico\n",
m@8	64 "Republic of Serbia\n",
m@8	65 "Saint Lucia\n",
m@8	66 "Samoa\n",
m@8	67 "Solomon Islands\n",
m@8	68 "South Korea\n",
m@8	69 "The Bahamas\n",
m@8	70 "Trinidad and Tobago\n"
m@8	71 ]
m@8	72 }
m@8	73 ],
m@8	74 "source": [
m@8	75 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
m@8	76 "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
m@8	77 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
m@8	78 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
m@8	79 "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
m@8	80 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
m@8	81 "\n",
m@8	82 "# global outliers\n",
m@8	83 "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
m@8	84 ]
m@8	85 },
m@8	86 {
m@8	87 "cell_type": "code",
m@10	88 "execution_count": null,
m@8	89 "metadata": {
m@8	90 "collapsed": false
m@8	91 },
m@8	92 "outputs": [
m@8	93 {
m@8	94 "data": {
m@8	95 "text/plain": [
m@8	96 "(8200, 380)"
m@8	97 ]
m@8	98 },
m@10	99 "execution_count": 22,
m@8	100 "metadata": {},
m@8	101 "output_type": "execute_result"
m@8	102 }
m@8	103 ],
m@8	104 "source": [
m@8	105 "X.shape"
m@8	106 ]
m@8	107 },
m@8	108 {
m@8	109 "cell_type": "code",
m@8	110 "execution_count": null,
m@8	111 "metadata": {
m@8	112 "collapsed": false
m@8	113 },
m@8	114 "outputs": [],
m@8	115 "source": [
m@8	116 "D = pairwise_distances(X, metric='mahalanobis')"
m@8	117 ]
m@8	118 },
m@8	119 {
m@8	120 "cell_type": "code",
m@10	121 "execution_count": null,
m@8	122 "metadata": {
m@8	123 "collapsed": false
m@8	124 },
m@10	125 "outputs": [],
m@8	126 "source": [
m@8	127 "plt.hist(D.ravel(), bins=100);"
m@8	128 ]
m@8	129 },
m@8	130 {
m@8	131 "cell_type": "code",
m@10	132 "execution_count": null,
m@8	133 "metadata": {
m@8	134 "collapsed": true
m@8	135 },
m@8	136 "outputs": [],
m@8	137 "source": [
m@8	138 "def n_occurrence_from_D(D, k=10, n_items=None):\n",
m@8	139 " if n_items is None:\n",
m@8	140 " n_items = len(D)\n",
m@8	141 " sort_idx = np.argsort(D, axis=1)\n",
m@8	142 " D_k = sort_idx[:, 1:(k+1)] # nearest neighbour is the item itself\n",
m@8	143 " N_k = np.bincount(D_k.astype(int).ravel(), minlength=n_items)\n",
m@8	144 " return N_k"
m@8	145 ]
m@8	146 },
m@8	147 {
m@8	148 "cell_type": "code",
m@10	149 "execution_count": null,
m@8	150 "metadata": {
m@8	151 "collapsed": false
m@8	152 },
m@10	153 "outputs": [],
m@8	154 "source": [
m@8	155 "N_k = n_occurrence_from_D(D, k=100)\n",
m@8	156 "print skew(N_k)\n",
m@8	157 "plt.hist(N_k, bins=100);"
m@8	158 ]
m@8	159 },
m@8	160 {
m@8	161 "cell_type": "code",
m@8	162 "execution_count": 9,
m@8	163 "metadata": {
m@8	164 "collapsed": true
m@8	165 },
m@8	166 "outputs": [],
m@8	167 "source": [
m@8	168 "N_k"
m@8	169 ]
m@8	170 },
m@8	171 {
m@8	172 "cell_type": "code",
m@8	173 "execution_count": null,
m@8	174 "metadata": {
m@8	175 "collapsed": true
m@8	176 },
m@8	177 "outputs": [],
m@8	178 "source": []
m@8	179 }
m@8	180 ],
m@8	181 "metadata": {
m@8	182 "kernelspec": {
m@8	183 "display_name": "Python 2",
m@8	184 "language": "python",
m@8	185 "name": "python2"
m@8	186 },
m@8	187 "language_info": {
m@8	188 "codemirror_mode": {
m@8	189 "name": "ipython",
m@8	190 "version": 2
m@8	191 },
m@8	192 "file_extension": ".py",
m@8	193 "mimetype": "text/x-python",
m@8	194 "name": "python",
m@8	195 "nbconvert_exporter": "python",
m@8	196 "pygments_lexer": "ipython2",
m@8	197 "version": "2.7.12"
m@8	198 }
m@8	199 },
m@8	200 "nbformat": 4,
m@8	201 "nbformat_minor": 0
m@8	202 }

Mercurial > hg > plosone_underreview

annotate notebooks/test_hubness.ipynb @ 10:8e897e82af51 branch-tests