m@75
|
1 {
|
m@75
|
2 "cells": [
|
m@75
|
3 {
|
m@75
|
4 "cell_type": "code",
|
m@75
|
5 "execution_count": 12,
|
m@75
|
6 "metadata": {
|
m@75
|
7 "collapsed": false
|
m@75
|
8 },
|
m@75
|
9 "outputs": [
|
m@75
|
10 {
|
m@75
|
11 "name": "stdout",
|
m@75
|
12 "output_type": "stream",
|
m@75
|
13 "text": [
|
m@75
|
14 "The autoreload extension is already loaded. To reload it, use:\n",
|
m@75
|
15 " %reload_ext autoreload\n"
|
m@75
|
16 ]
|
m@75
|
17 }
|
m@75
|
18 ],
|
m@75
|
19 "source": [
|
m@75
|
20 "import numpy as np\n",
|
m@75
|
21 "import pandas as pd\n",
|
m@75
|
22 "import pickle \n",
|
m@75
|
23 "\n",
|
m@75
|
24 "%load_ext autoreload\n",
|
m@75
|
25 "%autoreload 2\n",
|
m@75
|
26 "\n",
|
m@75
|
27 "%matplotlib inline\n",
|
m@75
|
28 "import matplotlib.pyplot as plt\n",
|
m@75
|
29 "\n",
|
m@75
|
30 "import sys\n",
|
m@75
|
31 "sys.path.append('../')\n",
|
m@75
|
32 "import scripts.outliers as outliers\n",
|
m@75
|
33 "import scripts.utils as utils"
|
m@75
|
34 ]
|
m@75
|
35 },
|
m@75
|
36 {
|
m@75
|
37 "cell_type": "code",
|
m@75
|
38 "execution_count": 7,
|
m@75
|
39 "metadata": {
|
m@75
|
40 "collapsed": false
|
m@75
|
41 },
|
m@75
|
42 "outputs": [
|
m@75
|
43 {
|
m@75
|
44 "name": "stdout",
|
m@75
|
45 "output_type": "stream",
|
m@75
|
46 "text": [
|
m@75
|
47 "WARNING: there are 21 disconnected observations\n",
|
m@75
|
48 "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
|
m@75
|
49 "Antigua and Barbuda\n",
|
m@75
|
50 "Australia\n",
|
m@75
|
51 "Cuba\n",
|
m@75
|
52 "Fiji\n",
|
m@75
|
53 "French Polynesia\n",
|
m@75
|
54 "Grenada\n",
|
m@75
|
55 "Iceland\n",
|
m@75
|
56 "Jamaica\n",
|
m@75
|
57 "Japan\n",
|
m@75
|
58 "Kiribati\n",
|
m@75
|
59 "Malta\n",
|
m@75
|
60 "New Zealand\n",
|
m@75
|
61 "Philippines\n",
|
m@75
|
62 "Puerto Rico\n",
|
m@75
|
63 "Republic of Serbia\n",
|
m@75
|
64 "Saint Lucia\n",
|
m@75
|
65 "Samoa\n",
|
m@75
|
66 "Solomon Islands\n",
|
m@75
|
67 "South Korea\n",
|
m@75
|
68 "The Bahamas\n",
|
m@75
|
69 "Trinidad and Tobago\n"
|
m@75
|
70 ]
|
m@75
|
71 },
|
m@75
|
72 {
|
m@75
|
73 "name": "stderr",
|
m@75
|
74 "output_type": "stream",
|
m@75
|
75 "text": [
|
m@75
|
76 "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
m@75
|
77 " data = self._reader.read(nrows)\n"
|
m@75
|
78 ]
|
m@75
|
79 }
|
m@75
|
80 ],
|
m@75
|
81 "source": [
|
m@75
|
82 "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n",
|
m@75
|
83 "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n",
|
m@75
|
84 "#METADATA_FILE = '../data/metadata.csv'\n",
|
m@75
|
85 "\n",
|
m@75
|
86 "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)"
|
m@75
|
87 ]
|
m@75
|
88 },
|
m@75
|
89 {
|
m@75
|
90 "cell_type": "code",
|
m@75
|
91 "execution_count": 9,
|
m@75
|
92 "metadata": {
|
m@75
|
93 "collapsed": false
|
m@75
|
94 },
|
m@75
|
95 "outputs": [
|
m@75
|
96 {
|
m@75
|
97 "data": {
|
m@75
|
98 "text/plain": [
|
m@75
|
99 "(8200, 108)"
|
m@75
|
100 ]
|
m@75
|
101 },
|
m@75
|
102 "execution_count": 9,
|
m@75
|
103 "metadata": {},
|
m@75
|
104 "output_type": "execute_result"
|
m@75
|
105 }
|
m@75
|
106 ],
|
m@75
|
107 "source": [
|
m@75
|
108 "X_list, Y, Yaudio = dataset\n",
|
m@75
|
109 "X = np.concatenate(X_list, axis=1)\n",
|
m@75
|
110 "ddf.shape"
|
m@75
|
111 ]
|
m@75
|
112 },
|
m@75
|
113 {
|
m@75
|
114 "cell_type": "code",
|
m@75
|
115 "execution_count": 11,
|
m@75
|
116 "metadata": {
|
m@75
|
117 "collapsed": false
|
m@75
|
118 },
|
m@75
|
119 "outputs": [
|
m@75
|
120 {
|
m@75
|
121 "name": "stdout",
|
m@75
|
122 "output_type": "stream",
|
m@75
|
123 "text": [
|
m@75
|
124 "most outliers \n",
|
m@75
|
125 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
126 "136 Botswana 0.611111 90 55\n",
|
m@75
|
127 "72 Ivory Coast 0.600000 15 9\n",
|
m@75
|
128 "95 Chad 0.545455 11 6\n",
|
m@75
|
129 "43 Benin 0.538462 26 14\n",
|
m@75
|
130 "86 Gambia 0.500000 50 25\n",
|
m@75
|
131 "20 Pakistan 0.494505 91 45\n",
|
m@75
|
132 "106 Nepal 0.473684 95 45\n",
|
m@75
|
133 "78 El Salvador 0.454545 33 15\n",
|
m@75
|
134 "64 Mozambique 0.441176 34 15\n",
|
m@75
|
135 "135 French Guiana 0.428571 28 12\n",
|
m@75
|
136 "least outliers \n",
|
m@75
|
137 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
138 "1 Lithuania 0.000000 47 0\n",
|
m@75
|
139 "119 Denmark 0.000000 16 0\n",
|
m@75
|
140 "27 South Korea 0.000000 11 0\n",
|
m@75
|
141 "120 Kazakhstan 0.011364 88 1\n",
|
m@75
|
142 "31 Czech Republic 0.024390 41 1\n",
|
m@75
|
143 "15 Netherlands 0.029851 67 2\n",
|
m@75
|
144 "30 Afghanistan 0.041667 24 1\n",
|
m@75
|
145 "105 Sudan 0.044118 68 3\n",
|
m@75
|
146 "102 Nicaragua 0.047619 21 1\n",
|
m@75
|
147 "0 Canada 0.050000 100 5\n"
|
m@75
|
148 ]
|
m@75
|
149 }
|
m@75
|
150 ],
|
m@75
|
151 "source": [
|
m@75
|
152 "# global outliers\n",
|
m@75
|
153 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n",
|
m@75
|
154 "outliers.print_most_least_outliers_topN(df_global, N=10)"
|
m@75
|
155 ]
|
m@75
|
156 },
|
m@75
|
157 {
|
m@75
|
158 "cell_type": "code",
|
m@75
|
159 "execution_count": 14,
|
m@75
|
160 "metadata": {
|
m@75
|
161 "collapsed": false,
|
m@75
|
162 "scrolled": true
|
m@75
|
163 },
|
m@75
|
164 "outputs": [
|
m@75
|
165 {
|
m@75
|
166 "name": "stdout",
|
m@75
|
167 "output_type": "stream",
|
m@75
|
168 "text": [
|
m@75
|
169 "328\n",
|
m@75
|
170 "210\n",
|
m@75
|
171 "194\n",
|
m@75
|
172 "85\n",
|
m@75
|
173 "388\n",
|
m@75
|
174 "266\n",
|
m@75
|
175 "309\n",
|
m@75
|
176 "455\n",
|
m@75
|
177 "365\n",
|
m@75
|
178 "282\n",
|
m@75
|
179 "197\n",
|
m@75
|
180 "122\n",
|
m@75
|
181 "206\n",
|
m@75
|
182 "457\n",
|
m@75
|
183 "298\n",
|
m@75
|
184 "597\n",
|
m@75
|
185 "354\n",
|
m@75
|
186 "191\n",
|
m@75
|
187 "193\n",
|
m@75
|
188 "198\n",
|
m@75
|
189 "263\n",
|
m@75
|
190 "334\n",
|
m@75
|
191 "812\n",
|
m@75
|
192 "415\n",
|
m@75
|
193 "44\n",
|
m@75
|
194 "107\n",
|
m@75
|
195 "366\n",
|
m@75
|
196 "323\n",
|
m@75
|
197 "450\n",
|
m@75
|
198 "116\n",
|
m@75
|
199 "150\n",
|
m@75
|
200 "260\n",
|
m@75
|
201 "230\n",
|
m@75
|
202 "118\n",
|
m@75
|
203 "389\n",
|
m@75
|
204 "237\n",
|
m@75
|
205 "274\n",
|
m@75
|
206 "466\n",
|
m@75
|
207 "147\n",
|
m@75
|
208 "134\n",
|
m@75
|
209 "86\n",
|
m@75
|
210 "91\n",
|
m@75
|
211 "574\n",
|
m@75
|
212 "111\n",
|
m@75
|
213 "296\n",
|
m@75
|
214 "221\n",
|
m@75
|
215 "261\n",
|
m@75
|
216 "224\n",
|
m@75
|
217 "190\n",
|
m@75
|
218 "150\n",
|
m@75
|
219 "139\n",
|
m@75
|
220 "350\n",
|
m@75
|
221 "268\n",
|
m@75
|
222 "453\n",
|
m@75
|
223 "192\n",
|
m@75
|
224 "468\n",
|
m@75
|
225 "266\n",
|
m@75
|
226 "187\n",
|
m@75
|
227 "275\n",
|
m@75
|
228 "337\n",
|
m@75
|
229 "179\n",
|
m@75
|
230 "366\n",
|
m@75
|
231 "211\n",
|
m@75
|
232 "213\n",
|
m@75
|
233 "428\n",
|
m@75
|
234 "468\n",
|
m@75
|
235 "164\n",
|
m@75
|
236 "348\n",
|
m@75
|
237 "328\n",
|
m@75
|
238 "193\n",
|
m@75
|
239 "197\n",
|
m@75
|
240 "193\n",
|
m@75
|
241 "166\n",
|
m@75
|
242 "290\n",
|
m@75
|
243 "196\n",
|
m@75
|
244 "224\n",
|
m@75
|
245 "111\n",
|
m@75
|
246 "258\n",
|
m@75
|
247 "295\n",
|
m@75
|
248 "227\n",
|
m@75
|
249 "252\n",
|
m@75
|
250 "433\n",
|
m@75
|
251 "305\n",
|
m@75
|
252 "290\n",
|
m@75
|
253 "183\n",
|
m@75
|
254 "243\n",
|
m@75
|
255 "63\n",
|
m@75
|
256 "197\n",
|
m@75
|
257 "274\n",
|
m@75
|
258 "363\n",
|
m@75
|
259 "113\n",
|
m@75
|
260 "192\n",
|
m@75
|
261 "258\n",
|
m@75
|
262 "494\n",
|
m@75
|
263 "299\n",
|
m@75
|
264 "484\n",
|
m@75
|
265 "198\n",
|
m@75
|
266 "191\n",
|
m@75
|
267 "174\n",
|
m@75
|
268 "280\n",
|
m@75
|
269 "735\n",
|
m@75
|
270 "211\n",
|
m@75
|
271 "221\n",
|
m@75
|
272 "134\n",
|
m@75
|
273 "125\n",
|
m@75
|
274 "119\n",
|
m@75
|
275 "151\n",
|
m@75
|
276 "203\n",
|
m@75
|
277 "229\n",
|
m@75
|
278 "430\n",
|
m@75
|
279 "311\n",
|
m@75
|
280 "424\n",
|
m@75
|
281 "337\n",
|
m@75
|
282 "268\n",
|
m@75
|
283 "175\n",
|
m@75
|
284 "228\n",
|
m@75
|
285 "175\n",
|
m@75
|
286 "437\n",
|
m@75
|
287 "284\n",
|
m@75
|
288 "129\n",
|
m@75
|
289 "366\n",
|
m@75
|
290 "222\n",
|
m@75
|
291 "66\n",
|
m@75
|
292 "498\n",
|
m@75
|
293 "400\n",
|
m@75
|
294 "430\n",
|
m@75
|
295 "187\n",
|
m@75
|
296 "470\n",
|
m@75
|
297 "298\n",
|
m@75
|
298 "231\n",
|
m@75
|
299 "272\n",
|
m@75
|
300 "261\n",
|
m@75
|
301 "239\n",
|
m@75
|
302 "154\n",
|
m@75
|
303 "22\n",
|
m@75
|
304 "426\n",
|
m@75
|
305 "332\n",
|
m@75
|
306 "most outliers \n",
|
m@75
|
307 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
308 "46 China 0.260000 100 26\n",
|
m@75
|
309 "67 Brazil 0.240000 100 24\n",
|
m@75
|
310 "101 Colombia 0.211111 90 19\n",
|
m@75
|
311 "64 Mozambique 0.205882 34 7\n",
|
m@75
|
312 "76 Iran 0.188679 53 10\n",
|
m@75
|
313 "65 Uganda 0.176471 85 15\n",
|
m@75
|
314 "27 Kenya 0.164948 97 16\n",
|
m@75
|
315 "126 South Sudan 0.163043 92 15\n",
|
m@75
|
316 "24 Azerbaijan 0.153846 13 2\n",
|
m@75
|
317 "23 India 0.147368 95 14\n",
|
m@75
|
318 "least outliers \n",
|
m@75
|
319 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
320 "0 Canada 0 100 0\n",
|
m@75
|
321 "95 Portugal 0 100 0\n",
|
m@75
|
322 "94 Iraq 0 87 0\n",
|
m@75
|
323 "93 Grenada 0 37 0\n",
|
m@75
|
324 "90 French Polynesia 0 15 0\n",
|
m@75
|
325 "89 Croatia 0 31 0\n",
|
m@75
|
326 "88 Morocco 0 40 0\n",
|
m@75
|
327 "87 Philippines 0 100 0\n",
|
m@75
|
328 "86 Gambia 0 50 0\n",
|
m@75
|
329 "85 Sierra Leone 0 100 0\n"
|
m@75
|
330 ]
|
m@75
|
331 }
|
m@75
|
332 ],
|
m@75
|
333 "source": [
|
m@75
|
334 "# local outliers\n",
|
m@75
|
335 "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')\n",
|
m@75
|
336 "outliers.print_most_least_outliers_topN(df_local, N=10)"
|
m@75
|
337 ]
|
m@75
|
338 },
|
m@75
|
339 {
|
m@75
|
340 "cell_type": "code",
|
m@75
|
341 "execution_count": 16,
|
m@75
|
342 "metadata": {
|
m@75
|
343 "collapsed": false,
|
m@75
|
344 "scrolled": true
|
m@75
|
345 },
|
m@75
|
346 "outputs": [
|
m@75
|
347 {
|
m@75
|
348 "name": "stdout",
|
m@75
|
349 "output_type": "stream",
|
m@75
|
350 "text": [
|
m@75
|
351 "most outliers \n",
|
m@75
|
352 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
353 "43 Benin 0.500000 26 13\n",
|
m@75
|
354 "136 Botswana 0.488889 90 44\n",
|
m@75
|
355 "106 Nepal 0.421053 95 40\n",
|
m@75
|
356 "84 Belize 0.418605 43 18\n",
|
m@75
|
357 "19 Yemen 0.416667 12 5\n",
|
m@75
|
358 "least outliers \n",
|
m@75
|
359 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
360 "28 Tajikistan 0 19 0\n",
|
m@75
|
361 "119 Denmark 0 16 0\n",
|
m@75
|
362 "96 Uruguay 0 31 0\n",
|
m@75
|
363 "25 Republic of Serbia 0 16 0\n",
|
m@75
|
364 "27 South Korea 0 11 0\n",
|
m@75
|
365 "most outliers \n",
|
m@75
|
366 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
367 "117 Zimbabwe 0.533333 15 8\n",
|
m@75
|
368 "96 Uruguay 0.483871 31 15\n",
|
m@75
|
369 "68 Guinea 0.454545 11 5\n",
|
m@75
|
370 "63 Senegal 0.390244 41 16\n",
|
m@75
|
371 "86 Gambia 0.380000 50 19\n",
|
m@75
|
372 "least outliers \n",
|
m@75
|
373 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
374 "90 French Polynesia 0.000000 15 0\n",
|
m@75
|
375 "37 Rwanda 0.000000 17 0\n",
|
m@75
|
376 "119 Denmark 0.000000 16 0\n",
|
m@75
|
377 "18 New Zealand 0.000000 34 0\n",
|
m@75
|
378 "120 Kazakhstan 0.022727 88 2\n",
|
m@75
|
379 "most outliers \n",
|
m@75
|
380 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
381 "17 French Guiana 0.678571 28 19\n",
|
m@75
|
382 "136 Botswana 0.477778 90 43\n",
|
m@75
|
383 "72 Ivory Coast 0.400000 15 6\n",
|
m@75
|
384 "23 Azerbaijan 0.384615 13 5\n",
|
m@75
|
385 "106 Nepal 0.347368 95 33\n",
|
m@75
|
386 "least outliers \n",
|
m@75
|
387 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
388 "68 Guinea 0 11 0\n",
|
m@75
|
389 "55 Mali 0 17 0\n",
|
m@75
|
390 "77 Algeria 0 27 0\n",
|
m@75
|
391 "33 Saint Lucia 0 43 0\n",
|
m@75
|
392 "31 Czech Republic 0 41 0\n",
|
m@75
|
393 "most outliers \n",
|
m@75
|
394 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
395 "43 Benin 0.538462 26 14\n",
|
m@75
|
396 "20 Pakistan 0.461538 91 42\n",
|
m@75
|
397 "86 Gambia 0.360000 50 18\n",
|
m@75
|
398 "52 Indonesia 0.350000 100 35\n",
|
m@75
|
399 "136 Botswana 0.311111 90 28\n",
|
m@75
|
400 "least outliers \n",
|
m@75
|
401 " Country Outliers N_Country N_Outliers\n",
|
m@75
|
402 "107 Kiribati 0 17 0\n",
|
m@75
|
403 "1 Lithuania 0 47 0\n",
|
m@75
|
404 "134 Paraguay 0 23 0\n",
|
m@75
|
405 "131 Tunisia 0 39 0\n",
|
m@75
|
406 "19 Yemen 0 12 0\n"
|
m@75
|
407 ]
|
m@75
|
408 }
|
m@75
|
409 ],
|
m@75
|
410 "source": [
|
m@75
|
411 "# outliers for features\n",
|
m@75
|
412 "feat = X_list\n",
|
m@75
|
413 "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n",
|
m@75
|
414 "tabs_feat = []\n",
|
m@75
|
415 "for i in range(len(feat)):\n",
|
m@75
|
416 " XX = feat[i]\n",
|
m@75
|
417 " output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n",
|
m@75
|
418 " df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n",
|
m@75
|
419 " outliers.print_most_least_outliers_topN(df_feat, N=5)"
|
m@75
|
420 ]
|
m@75
|
421 },
|
m@75
|
422 {
|
m@75
|
423 "cell_type": "code",
|
m@75
|
424 "execution_count": null,
|
m@75
|
425 "metadata": {
|
m@75
|
426 "collapsed": false
|
m@75
|
427 },
|
m@75
|
428 "outputs": [
|
m@75
|
429 {
|
m@75
|
430 "name": "stdout",
|
m@75
|
431 "output_type": "stream",
|
m@75
|
432 "text": [
|
m@75
|
433 "5\n",
|
m@75
|
434 "6\n",
|
m@75
|
435 "7\n",
|
m@75
|
436 "8\n",
|
m@75
|
437 "9\n",
|
m@75
|
438 "10\n",
|
m@75
|
439 "11\n",
|
m@75
|
440 "12\n",
|
m@75
|
441 "13\n",
|
m@75
|
442 "14\n",
|
m@75
|
443 "15\n",
|
m@75
|
444 "16\n",
|
m@75
|
445 "17\n",
|
m@75
|
446 "18"
|
m@75
|
447 ]
|
m@75
|
448 }
|
m@75
|
449 ],
|
m@75
|
450 "source": [
|
m@75
|
451 "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n",
|
m@75
|
452 "ddf['Clusters'] = cl_pred\n",
|
m@75
|
453 "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n",
|
m@75
|
454 "print(len(np.unique(cl_pred)))\n",
|
m@75
|
455 "outliers.print_clusters_metadata(ddf, cl_pred)"
|
m@75
|
456 ]
|
m@75
|
457 },
|
m@75
|
458 {
|
m@75
|
459 "cell_type": "code",
|
m@75
|
460 "execution_count": null,
|
m@75
|
461 "metadata": {
|
m@75
|
462 "collapsed": true
|
m@75
|
463 },
|
m@75
|
464 "outputs": [],
|
m@75
|
465 "source": []
|
m@75
|
466 }
|
m@75
|
467 ],
|
m@75
|
468 "metadata": {
|
m@75
|
469 "kernelspec": {
|
m@75
|
470 "display_name": "Python 2",
|
m@75
|
471 "language": "python",
|
m@75
|
472 "name": "python2"
|
m@75
|
473 },
|
m@75
|
474 "language_info": {
|
m@75
|
475 "codemirror_mode": {
|
m@75
|
476 "name": "ipython",
|
m@75
|
477 "version": 2
|
m@75
|
478 },
|
m@75
|
479 "file_extension": ".py",
|
m@75
|
480 "mimetype": "text/x-python",
|
m@75
|
481 "name": "python",
|
m@75
|
482 "nbconvert_exporter": "python",
|
m@75
|
483 "pygments_lexer": "ipython2",
|
m@75
|
484 "version": "2.7.12"
|
m@75
|
485 }
|
m@75
|
486 },
|
m@75
|
487 "nbformat": 4,
|
m@75
|
488 "nbformat_minor": 0
|
m@75
|
489 }
|