m@8
|
1 {
|
m@8
|
2 "cells": [
|
m@8
|
3 {
|
m@8
|
4 "cell_type": "code",
|
m@11
|
5 "execution_count": 1,
|
m@26
|
6 "metadata": {
|
m@26
|
7 "collapsed": true
|
m@26
|
8 },
|
m@11
|
9 "outputs": [],
|
m@8
|
10 "source": [
|
m@8
|
11 "import numpy as np\n",
|
m@8
|
12 "import pickle\n",
|
m@8
|
13 "from scipy.stats import pearsonr\n",
|
m@8
|
14 "from scipy.stats import skew\n",
|
m@8
|
15 "import sys\n",
|
m@8
|
16 "from sklearn.metrics.pairwise import pairwise_distances\n",
|
m@8
|
17 "%matplotlib inline\n",
|
m@8
|
18 "import matplotlib.pyplot as plt\n",
|
m@8
|
19 "\n",
|
m@8
|
20 "%load_ext autoreload\n",
|
m@8
|
21 "%autoreload 2\n",
|
m@8
|
22 "\n",
|
m@8
|
23 "sys.path.append('../')\n",
|
Maria@18
|
24 "import scripts.outliers as outliers\n",
|
m@8
|
25 "import scripts.utils_spatial as utils_spatial"
|
m@8
|
26 ]
|
m@8
|
27 },
|
m@8
|
28 {
|
m@8
|
29 "cell_type": "code",
|
m@12
|
30 "execution_count": 2,
|
m@11
|
31 "metadata": {},
|
m@8
|
32 "outputs": [
|
m@8
|
33 {
|
m@12
|
34 "name": "stderr",
|
m@12
|
35 "output_type": "stream",
|
m@12
|
36 "text": [
|
m@12
|
37 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
|
m@12
|
38 " warnings.warn(\"There are %d disconnected observations\" % ni)\n",
|
m@12
|
39 "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
|
m@12
|
40 " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
|
m@12
|
41 ]
|
m@12
|
42 },
|
m@12
|
43 {
|
m@8
|
44 "name": "stdout",
|
m@8
|
45 "output_type": "stream",
|
m@8
|
46 "text": [
|
m@8
|
47 "Antigua and Barbuda\n",
|
m@8
|
48 "Australia\n",
|
m@8
|
49 "Cuba\n",
|
m@8
|
50 "Fiji\n",
|
m@8
|
51 "French Polynesia\n",
|
m@8
|
52 "Grenada\n",
|
m@8
|
53 "Iceland\n",
|
m@8
|
54 "Jamaica\n",
|
m@8
|
55 "Japan\n",
|
m@8
|
56 "Kiribati\n",
|
m@8
|
57 "Malta\n",
|
m@8
|
58 "New Zealand\n",
|
m@8
|
59 "Philippines\n",
|
m@8
|
60 "Puerto Rico\n",
|
m@8
|
61 "Republic of Serbia\n",
|
m@8
|
62 "Saint Lucia\n",
|
m@8
|
63 "Samoa\n",
|
m@8
|
64 "Solomon Islands\n",
|
m@8
|
65 "South Korea\n",
|
m@8
|
66 "The Bahamas\n",
|
m@8
|
67 "Trinidad and Tobago\n"
|
m@8
|
68 ]
|
m@8
|
69 }
|
m@8
|
70 ],
|
m@8
|
71 "source": [
|
m@8
|
# Load the pickled dataset: a list of four arrays plus two label arrays.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only run this against a trusted data file.
with open('../data/lda_data_melodia_8.pickle', 'rb') as pickle_file:
    X_list, Y, Yaudio = pickle.load(pickle_file)
ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')
w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
w_dict = utils_spatial.from_weights_to_dict(w, data_countries)

# Stack the four arrays column-wise into a single matrix.
Xrhy, Xmel, Xmfc, Xchr = X_list
X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)

# global outliers
df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
|
m@8
|
81 ]
|
m@8
|
82 },
|
m@8
|
83 {
|
m@8
|
84 "cell_type": "code",
|
m@26
|
85 "execution_count": null,
|
m@11
|
86 "metadata": {},
|
m@8
|
87 "outputs": [
|
m@8
|
88 {
|
m@8
|
89 "data": {
|
m@8
|
90 "text/plain": [
|
m@8
|
91 "(8200, 380)"
|
m@8
|
92 ]
|
m@8
|
93 },
|
m@12
|
94 "execution_count": 3,
|
m@8
|
95 "metadata": {},
|
m@8
|
96 "output_type": "execute_result"
|
m@8
|
97 }
|
m@8
|
98 ],
|
m@8
|
99 "source": [
|
m@8
|
100 "X.shape"
|
m@8
|
101 ]
|
m@8
|
102 },
|
m@8
|
103 {
|
m@8
|
104 "cell_type": "code",
|
m@26
|
105 "execution_count": null,
|
m@26
|
106 "metadata": {
|
m@26
|
107 "collapsed": true
|
m@26
|
108 },
|
m@8
|
109 "outputs": [],
|
m@8
|
110 "source": [
|
m@8
|
111 "D = pairwise_distances(X, metric='mahalanobis')"
|
m@8
|
112 ]
|
m@8
|
113 },
|
m@8
|
114 {
|
m@8
|
115 "cell_type": "code",
|
m@26
|
116 "execution_count": null,
|
m@12
|
117 "metadata": {},
|
m@26
|
118 "outputs": [],
|
m@12
|
119 "source": [
|
m@12
|
120 "D.shape"
|
m@12
|
121 ]
|
m@12
|
122 },
|
m@12
|
123 {
|
m@12
|
124 "cell_type": "code",
|
m@26
|
125 "execution_count": null,
|
m@11
|
126 "metadata": {},
|
m@26
|
127 "outputs": [],
|
m@8
|
128 "source": [
|
m@8
|
129 "plt.hist(D.ravel(), bins=100);"
|
m@8
|
130 ]
|
m@8
|
131 },
|
m@8
|
132 {
|
m@8
|
133 "cell_type": "code",
|
m@26
|
134 "execution_count": null,
|
m@8
|
135 "metadata": {
|
m@8
|
136 "collapsed": true
|
m@8
|
137 },
|
m@8
|
138 "outputs": [],
|
m@8
|
139 "source": [
|
m@8
|
def n_occurrence_from_D(D, k=10, n_items=None):
    """Count how often each item appears among other items' k nearest neighbours.

    Parameters
    ----------
    D : array-like, 2-D
        Distance matrix; row ``i`` holds the distances from item ``i`` to
        every candidate neighbour.
    k : int, optional
        Number of nearest neighbours taken from each row (default 10).
        Values larger than the number of available neighbours are clipped
        by the slicing; negative values are treated as 0.
    n_items : int, optional
        Minimum length of the returned count vector. Defaults to ``len(D)``.

    Returns
    -------
    N_k : ndarray of int
        ``N_k[j]`` is the number of rows whose k nearest neighbours
        include item ``j``.

    Notes
    -----
    For a square matrix the item's own index is removed explicitly from each
    row's ranking. Simply dropping the first sorted column is wrong when
    duplicates or zero-distance ties put another item first: the item would
    then be counted as its own neighbour. For a non-square matrix "self" is
    not defined by position, so the first sorted column is dropped as before.
    """
    D = np.asarray(D)
    n_rows = len(D)
    if n_items is None:
        n_items = n_rows
    k = max(k, 0)  # a negative k must select nothing, not "all but the last"

    sort_idx = np.argsort(D, axis=1)
    if D.ndim == 2 and n_rows > 0 and D.shape[1] == n_rows:
        # Exactly one entry per row equals the row's own index; masking it
        # out leaves n_rows - 1 candidates per row, still ordered by distance.
        not_self = sort_idx != np.arange(n_rows)[:, np.newaxis]
        neighbours = sort_idx[not_self].reshape(n_rows, n_rows - 1)[:, :k]
    else:
        neighbours = sort_idx[:, 1:(k + 1)]

    N_k = np.bincount(neighbours.astype(int).ravel(), minlength=n_items)
    return N_k
|
m@8
|
147 ]
|
m@8
|
148 },
|
m@8
|
149 {
|
m@8
|
150 "cell_type": "code",
|
m@26
|
151 "execution_count": null,
|
m@11
|
152 "metadata": {},
|
m@26
|
153 "outputs": [],
|
m@8
|
154 "source": [
|
m@8
|
# k-occurrence: how many times each item shows up among the k=100 nearest
# neighbours of the other items.
N_k = n_occurrence_from_D(D, k=100)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(skew(N_k))

# Histogram of the k-occurrence counts.
plt.figure()
plt.hist(N_k, bins=100)
plt.xlabel('N_k')
plt.ylabel('number of items')

# The same counts sorted in ascending order.
plt.figure()
plt.plot(np.sort(N_k))
plt.xlabel('item rank (sorted by N_k)')
plt.ylabel('N_k');
|
m@8
|
161 ]
|
m@8
|
162 },
|
m@8
|
163 {
|
m@8
|
164 "cell_type": "code",
|
m@12
|
165 "execution_count": 11,
|
m@26
|
166 "metadata": {
|
m@26
|
167 "collapsed": true
|
m@26
|
168 },
|
m@8
|
169 "outputs": [],
|
m@8
|
170 "source": [
|
m@12
|
# Rank every row of D by increasing distance. This must be computed here:
# the only other `sort_idx` is a local variable inside n_occurrence_from_D,
# so without this line the cell raises NameError on a fresh kernel.
sort_idx = np.argsort(D, axis=1)
k = 10
# Skip column 0, which is normally the item itself (distance 0).
D_k = sort_idx[:, 1:(k+1)]
|
m@12
|
174 ]
|
m@12
|
175 },
|
m@12
|
176 {
|
m@12
|
177 "cell_type": "code",
|
m@12
|
178 "execution_count": 12,
|
m@12
|
179 "metadata": {},
|
m@12
|
180 "outputs": [
|
m@12
|
181 {
|
m@12
|
182 "data": {
|
m@12
|
183 "text/plain": [
|
m@12
|
184 "array([[4650, 2942, 3520, ..., 1318, 6678, 6056],\n",
|
m@12
|
185 " [1933, 6143, 6757, ..., 7269, 4321, 1563],\n",
|
m@12
|
186 " [3170, 2549, 4860, ..., 6678, 7414, 6056],\n",
|
m@12
|
187 " ..., \n",
|
m@12
|
188 " [6016, 2243, 1616, ..., 7627, 2018, 515],\n",
|
m@12
|
189 " [7027, 4860, 6346, ..., 997, 3892, 1846],\n",
|
m@12
|
190 " [5119, 1563, 4035, ..., 3486, 7617, 3854]])"
|
m@12
|
191 ]
|
m@12
|
192 },
|
m@12
|
193 "execution_count": 12,
|
m@12
|
194 "metadata": {},
|
m@12
|
195 "output_type": "execute_result"
|
m@12
|
196 }
|
m@12
|
197 ],
|
m@12
|
198 "source": [
|
m@12
|
199 "D_k"
|
m@8
|
200 ]
|
m@8
|
201 },
|
m@8
|
202 {
|
m@8
|
203 "cell_type": "code",
|
m@8
|
204 "execution_count": null,
|
m@8
|
205 "metadata": {
|
m@8
|
206 "collapsed": true
|
m@8
|
207 },
|
m@8
|
208 "outputs": [],
|
m@8
|
209 "source": []
|
m@8
|
210 }
|
m@8
|
211 ],
|
m@8
|
212 "metadata": {
|
m@8
|
213 "kernelspec": {
|
m@8
|
214 "display_name": "Python 2",
|
m@8
|
215 "language": "python",
|
m@8
|
216 "name": "python2"
|
m@8
|
217 },
|
m@8
|
218 "language_info": {
|
m@8
|
219 "codemirror_mode": {
|
m@8
|
220 "name": "ipython",
|
m@8
|
221 "version": 2
|
m@8
|
222 },
|
m@8
|
223 "file_extension": ".py",
|
m@8
|
224 "mimetype": "text/x-python",
|
m@8
|
225 "name": "python",
|
m@8
|
226 "nbconvert_exporter": "python",
|
m@8
|
227 "pygments_lexer": "ipython2",
|
m@8
|
228 "version": "2.7.12"
|
m@8
|
229 }
|
m@8
|
230 },
|
m@8
|
231 "nbformat": 4,
|
m@11
|
232 "nbformat_minor": 1
|
m@8
|
233 }
|