m@8
|
1 {
|
m@8
|
2 "cells": [
|
m@8
|
3 {
|
m@8
|
4 "cell_type": "code",
|
m@10
|
5 "execution_count": 20,
|
m@8
|
6 "metadata": {
|
m@8
|
7 "collapsed": false
|
m@8
|
8 },
|
m@8
|
9 "outputs": [
|
m@8
|
10 {
|
m@8
|
11 "name": "stdout",
|
m@8
|
12 "output_type": "stream",
|
m@8
|
13 "text": [
|
m@8
|
14 "The autoreload extension is already loaded. To reload it, use:\n",
|
m@8
|
15 " %reload_ext autoreload\n"
|
m@8
|
16 ]
|
m@8
|
17 }
|
m@8
|
18 ],
|
m@8
|
19 "source": [
|
m@8
|
20 "import numpy as np\n",
|
m@8
|
21 "import pickle\n",
|
m@8
|
22 "from scipy.stats import pearsonr\n",
|
m@8
|
23 "from scipy.stats import skew\n",
|
m@8
|
24 "import sys\n",
|
m@8
|
25 "from sklearn.metrics.pairwise import pairwise_distances\n",
|
m@8
|
26 "%matplotlib inline\n",
|
m@8
|
27 "import matplotlib.pyplot as plt\n",
|
m@8
|
28 "\n",
|
m@8
|
29 "%load_ext autoreload\n",
|
m@8
|
30 "%autoreload 2\n",
|
m@8
|
31 "\n",
|
m@8
|
32 "sys.path.append('../')\n",
|
m@8
|
33 "import scripts.results as results\n",
|
m@8
|
34 "import scripts.utils_spatial as utils_spatial"
|
m@8
|
35 ]
|
m@8
|
36 },
|
m@8
|
37 {
|
m@8
|
38 "cell_type": "code",
|
m@10
|
39 "execution_count": 21,
|
m@8
|
40 "metadata": {
|
m@8
|
41 "collapsed": false
|
m@8
|
42 },
|
m@8
|
43 "outputs": [
|
m@8
|
44 {
|
m@8
|
45 "name": "stdout",
|
m@8
|
46 "output_type": "stream",
|
m@8
|
47 "text": [
|
m@8
|
48 "WARNING: there are 21 disconnected observations\n",
|
m@8
|
49 "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
|
m@8
|
50 "Antigua and Barbuda\n",
|
m@8
|
51 "Australia\n",
|
m@8
|
52 "Cuba\n",
|
m@8
|
53 "Fiji\n",
|
m@8
|
54 "French Polynesia\n",
|
m@8
|
55 "Grenada\n",
|
m@8
|
56 "Iceland\n",
|
m@8
|
57 "Jamaica\n",
|
m@8
|
58 "Japan\n",
|
m@8
|
59 "Kiribati\n",
|
m@8
|
60 "Malta\n",
|
m@8
|
61 "New Zealand\n",
|
m@8
|
62 "Philippines\n",
|
m@8
|
63 "Puerto Rico\n",
|
m@8
|
64 "Republic of Serbia\n",
|
m@8
|
65 "Saint Lucia\n",
|
m@8
|
66 "Samoa\n",
|
m@8
|
67 "Solomon Islands\n",
|
m@8
|
68 "South Korea\n",
|
m@8
|
69 "The Bahamas\n",
|
m@8
|
70 "Trinidad and Tobago\n"
|
m@8
|
71 ]
|
m@8
|
72 }
|
m@8
|
73 ],
|
m@8
|
74 "source": [
|
m@8
|
75 "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n",
|
m@8
|
76 "ddf = results.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n",
|
m@8
|
77 "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n",
|
m@8
|
78 "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n",
|
m@8
|
79 "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
|
m@8
|
80 "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n",
|
m@8
|
81 "\n",
|
m@8
|
82 "# global outliers\n",
|
m@8
|
83 "df_global, threshold, MD = results.get_outliers_df(X, Y, chi2thr=0.999)"
|
m@8
|
84 ]
|
m@8
|
85 },
|
m@8
|
86 {
|
m@8
|
87 "cell_type": "code",
|
m@10
|
88 "execution_count": null,
|
m@8
|
89 "metadata": {
|
m@8
|
90 "collapsed": false
|
m@8
|
91 },
|
m@8
|
92 "outputs": [
|
m@8
|
93 {
|
m@8
|
94 "data": {
|
m@8
|
95 "text/plain": [
|
m@8
|
96 "(8200, 380)"
|
m@8
|
97 ]
|
m@8
|
98 },
|
m@10
|
99 "execution_count": 22,
|
m@8
|
100 "metadata": {},
|
m@8
|
101 "output_type": "execute_result"
|
m@8
|
102 }
|
m@8
|
103 ],
|
m@8
|
104 "source": [
|
m@8
|
105 "X.shape"
|
m@8
|
106 ]
|
m@8
|
107 },
|
m@8
|
108 {
|
m@8
|
109 "cell_type": "code",
|
m@8
|
110 "execution_count": null,
|
m@8
|
111 "metadata": {
|
m@8
|
112 "collapsed": false
|
m@8
|
113 },
|
m@8
|
114 "outputs": [],
|
m@8
|
115 "source": [
|
m@8
|
116 "D = pairwise_distances(X, metric='mahalanobis')"
|
m@8
|
117 ]
|
m@8
|
118 },
|
m@8
|
119 {
|
m@8
|
120 "cell_type": "code",
|
m@10
|
121 "execution_count": null,
|
m@8
|
122 "metadata": {
|
m@8
|
123 "collapsed": false
|
m@8
|
124 },
|
m@10
|
125 "outputs": [],
|
m@8
|
126 "source": [
|
m@8
|
127 "plt.hist(D.ravel(), bins=100);"
|
m@8
|
128 ]
|
m@8
|
129 },
|
m@8
|
130 {
|
m@8
|
131 "cell_type": "code",
|
m@10
|
132 "execution_count": null,
|
m@8
|
133 "metadata": {
|
m@8
|
134 "collapsed": true
|
m@8
|
135 },
|
m@8
|
136 "outputs": [],
|
m@8
|
137 "source": [
|
m@8
|
138 "def n_occurrence_from_D(D, k=10, n_items=None):\n",
|
m@8
|
139 " if n_items is None:\n",
|
m@8
|
140 " n_items = len(D)\n",
|
m@8
|
141 " sort_idx = np.argsort(D, axis=1)\n",
|
m@8
|
142 " D_k = sort_idx[:, 1:(k+1)] # nearest neighbour is the item itself\n",
|
m@8
|
143 " N_k = np.bincount(D_k.astype(int).ravel(), minlength=n_items)\n",
|
m@8
|
144 " return N_k"
|
m@8
|
145 ]
|
m@8
|
146 },
|
m@8
|
147 {
|
m@8
|
148 "cell_type": "code",
|
m@10
|
149 "execution_count": null,
|
m@8
|
150 "metadata": {
|
m@8
|
151 "collapsed": false
|
m@8
|
152 },
|
m@10
|
153 "outputs": [],
|
m@8
|
154 "source": [
|
m@8
|
155 "N_k = n_occurrence_from_D(D, k=100)\n",
|
m@8
|
156 "print skew(N_k)\n",
|
m@8
|
157 "plt.hist(N_k, bins=100);"
|
m@8
|
158 ]
|
m@8
|
159 },
|
m@8
|
160 {
|
m@8
|
161 "cell_type": "code",
|
m@8
|
162 "execution_count": 9,
|
m@8
|
163 "metadata": {
|
m@8
|
164 "collapsed": true
|
m@8
|
165 },
|
m@8
|
166 "outputs": [],
|
m@8
|
167 "source": [
|
m@8
|
168 "N_k"
|
m@8
|
169 ]
|
m@8
|
170 },
|
m@8
|
171 {
|
m@8
|
172 "cell_type": "code",
|
m@8
|
173 "execution_count": null,
|
m@8
|
174 "metadata": {
|
m@8
|
175 "collapsed": true
|
m@8
|
176 },
|
m@8
|
177 "outputs": [],
|
m@8
|
178 "source": []
|
m@8
|
179 }
|
m@8
|
180 ],
|
m@8
|
181 "metadata": {
|
m@8
|
182 "kernelspec": {
|
m@8
|
183 "display_name": "Python 2",
|
m@8
|
184 "language": "python",
|
m@8
|
185 "name": "python2"
|
m@8
|
186 },
|
m@8
|
187 "language_info": {
|
m@8
|
188 "codemirror_mode": {
|
m@8
|
189 "name": "ipython",
|
m@8
|
190 "version": 2
|
m@8
|
191 },
|
m@8
|
192 "file_extension": ".py",
|
m@8
|
193 "mimetype": "text/x-python",
|
m@8
|
194 "name": "python",
|
m@8
|
195 "nbconvert_exporter": "python",
|
m@8
|
196 "pygments_lexer": "ipython2",
|
m@8
|
197 "version": "2.7.12"
|
m@8
|
198 }
|
m@8
|
199 },
|
m@8
|
200 "nbformat": 4,
|
m@8
|
201 "nbformat_minor": 0
|
m@8
|
202 }
|