Mercurial > hg > confint
comparison sampling/sampling.Rmd @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:205974c9568c |
---|---|
1 --- | |
2 title: "Sampling (Amendment 7/3/2018)" | |
3 output: html_notebook | |
4 --- | |
5 | |
6 ```{r initialisation} | |
7 library(tidyverse) | |
8 if(!require("RSQLite")){ | |
9 install.packages("RSQLite") | |
10 library("RSQLite") | |
11 } | |
12 | |
13 source('../db/access_db.R') | |
14 source('../sampling/strategies.R') | |
15 ``` | |
16 | |
17 ## Examples | |
18 | |
19 ```{r param_simul} | |
# Number of samples requested per strategy: used directly as `iter` for the
# 'bs' draws and divided by num_folds to get the number of 'cv' repetitions.
num_simul <- 10000
# Passed as `iter` to every get_samples('cv', ...) call.
num_folds <- 4
# Value given to set.seed() at the start of each sampling chunk.
seed <- 1986
23 ``` | |
24 | |
25 ```{r get_info} | |
26 | |
# Summarise a list of train/test splits, one data-frame row per split.
#
# samples_list: list whose elements are each accepted by get_info().
# Returns a data.frame with numeric columns ex_tr, ar_tr, ex_te_orig,
# ar_te_orig, ex_te_filt and ar_te_filt and length(samples_list) rows
# (zero rows for an empty list).
get_info_list <- function(samples_list) {

  # Preallocate every column so rows are filled in place, not appended.
  aux <- numeric(length(samples_list))
  df <- data.frame(
    ex_tr = aux, ar_tr = aux,
    ex_te_orig = aux, ar_te_orig = aux,
    ex_te_filt = aux, ar_te_filt = aux
  )

  # seq_along() is empty for an empty list, whereas 1:length() would iterate
  # over c(1, 0) and fail on samples_list[[1]].
  for (i in seq_along(samples_list)) {
    df[i, ] <- get_info(samples_list[[i]])
  }

  df

}
42 | |
# Count excerpts and artists in a single train/test split.
#
# samples: list with elements `train` and `test`; both are handed to
#   get_artists() / filter_excerpts(), which are defined outside this chunk.
# Returns a one-row data.frame:
#   ex_tr / ar_tr           - length of the training set / of its artists
#   ex_te_orig / ar_te_orig - the same for the test set as given
#   ex_te_filt / ar_te_filt - the same after filter_excerpts(test, ar_tr)
#     (presumably this drops test excerpts by artists seen in training --
#     confirm against the helper's definition).
get_info <- function(samples) {

  # NOTE(review): the flag used to be spelled both `unique_artist` and
  # `unique_artists`. The plural is used everywhere here on the assumption
  # that it is the formal name in get_artists() (the singular then only
  # worked through partial matching) -- confirm against the helper.
  ar_tr <- get_artists(samples$train, unique_artists = TRUE)
  ex_filt <- filter_excerpts(samples$test, ar_tr)

  data.frame(
    ex_tr = length(samples$train),
    ar_tr = length(ar_tr),
    ex_te_orig = length(samples$test),
    ar_te_orig = length(get_artists(samples$test, unique_artists = TRUE)),
    ex_te_filt = length(ex_filt),
    ar_te_filt = length(get_artists(ex_filt, unique_artists = TRUE))
  )

}
60 | |
61 ``` | |
62 | |
63 ```{r bs_no_strat, eval = F} | |
# Reseed so the draw below is reproducible.
set.seed(seed)
max_iter <- num_simul

# Request max_iter non-stratified 'bs' samples from get_samples(), which is
# defined outside this notebook (presumably 'bs' means bootstrap -- confirm).
bs_no_strat <- get_samples('bs', iter = max_iter)
68 ``` | |
69 ```{r bs_no_strat_info, eval = F} | |
# One row of excerpt/artist counts per 'bs' sample, then per-column summaries.
bs_no_strat_info <- get_info_list(bs_no_strat)
summary(bs_no_strat_info)
72 ``` | |
73 | |
74 | |
75 ```{r cv_example, eval = F} | |
76 | |
# Reseed so the repeated 'cv' draws are reproducible.
set.seed(seed)

# Integer division keeps the repetition count whole even when num_simul is
# not a multiple of num_folds.
max_iter <- num_simul %/% num_folds
cv_no_strat <- vector("list", max_iter)
# seq_len() is empty when max_iter is 0; 1:max_iter would run for c(1, 0).
for (i in seq_len(max_iter)) {
  cv_no_strat[[i]] <- get_samples('cv', iter = num_folds)
}

# Flatten one level so each element is a single get_samples() item rather
# than the list returned by one call.
cv_no_strat <- unlist(cv_no_strat, recursive = FALSE)
86 ``` | |
87 ```{r cv_no_strat_info, eval = F} | |
# One row of excerpt/artist counts per 'cv' sample, then per-column summaries.
cv_no_strat_info <- get_info_list(cv_no_strat)
summary(cv_no_strat_info)
90 ``` | |
91 | |
92 | |
93 ```{r compare_no_strat, eval = F} | |
# Reshape each summary to long form and tag it with its sampling mode.
# NOTE(review): melt() is not attached by library(tidyverse); presumably
# reshape2 is loaded by one of the sourced scripts -- confirm.
bs_no_strat_info_plot <- mutate(melt(bs_no_strat_info), mode = 'bs')
cv_no_strat_info_plot <- mutate(melt(cv_no_strat_info), mode = 'cv')

# Stack both modes and keep the three plotting columns, renaming melt()'s
# `variable` / `value` to `set` / `num` in the same step.
info_no_strat_plot <- select(
  rbind(bs_no_strat_info_plot, cv_no_strat_info_plot),
  mode, set = variable, num = value
)
106 ``` | |
107 | |
108 ```{r plot_ex_no_strat} | |
109 | |
# Excerpt counts of the test sets only: names containing 'ex' but not 'tr'.
info_no_strat_plot_ex <- info_no_strat_plot %>%
  filter(grepl('ex', set), !grepl('tr', set))

# Count-scaled densities: colour separates the sampling modes, line type the
# two remaining count columns.
info_no_strat_plot_ex %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots, one box per mode and count column.
info_no_strat_plot_ex %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
123 | |
124 ``` | |
125 | |
126 ```{r plot_ar_no_strat} | |
127 | |
# Artist counts only: every count column whose name contains 'ar'.
info_no_strat_plot_ar <- filter(info_no_strat_plot, grepl('ar', set))

# Count-scaled densities: colour separates the sampling modes, line type the
# three artist-count columns.
info_no_strat_plot_ar %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots, one box per mode and count column.
info_no_strat_plot_ar %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
141 ``` | |
142 | |
143 ```{r bs_strat, eval = F} | |
# Reseed so the draw below is reproducible.
set.seed(seed)
max_iter <- num_simul

# Same request as the non-stratified 'bs' draw, but with stratified = T
# forwarded to get_samples() (defined outside this notebook).
bs_strat <-
  get_samples('bs', iter = max_iter, stratified = T)
149 ``` | |
150 ```{r bs_strat_info, eval = F} | |
# One row of excerpt/artist counts per stratified 'bs' sample, then
# per-column summaries.
bs_strat_info <- get_info_list(bs_strat)
summary(bs_strat_info)
153 ``` | |
154 | |
155 | |
156 ```{r cv_strat, eval = F} | |
# Reseed so the repeated stratified 'cv' draws are reproducible.
set.seed(seed)

# Integer division keeps the repetition count whole even when num_simul is
# not a multiple of num_folds.
max_iter <- num_simul %/% num_folds
cv_strat <- vector("list", max_iter)
# seq_len() is empty when max_iter is 0; 1:max_iter would run for c(1, 0).
for (i in seq_len(max_iter)) {
  cv_strat[[i]] <-
    get_samples('cv', iter = num_folds, stratified = TRUE)
}
# Flatten one level so each element is a single get_samples() item rather
# than the list returned by one call.
cv_strat <- unlist(cv_strat, recursive = FALSE)
166 ``` | |
167 ```{r cv_strat_info} | |
# One row of excerpt/artist counts per stratified 'cv' sample, then
# per-column summaries.
cv_strat_info <- get_info_list(cv_strat)
summary(cv_strat_info)
170 ``` | |
171 | |
172 ```{r compare_strat} | |
# Reshape each stratified summary to long form and tag it with its mode.
# NOTE(review): melt() is not attached by library(tidyverse); presumably
# reshape2 is loaded by one of the sourced scripts -- confirm.
bs_strat_info_plot <- mutate(melt(bs_strat_info), mode = 'bs')
cv_strat_info_plot <- mutate(melt(cv_strat_info), mode = 'cv')

# Stack both modes and keep the three plotting columns, renaming melt()'s
# `variable` / `value` to `set` / `num` in the same step.
info_strat_plot <- select(
  rbind(bs_strat_info_plot, cv_strat_info_plot),
  mode, set = variable, num = value
)
185 ``` | |
186 | |
187 ```{r plot_ex_strat} | |
188 | |
# Excerpt counts of the test sets only: names containing 'ex' but not 'tr'.
info_strat_plot_ex <- info_strat_plot %>%
  filter(grepl('ex', set), !grepl('tr', set))

# Box plots of the counts, one box per mode and count column.
info_strat_plot_ex %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
197 | |
198 ``` | |
199 | |
200 ```{r plot_ar_strat} | |
201 | |
# Artist counts only: every count column whose name contains 'ar'.
info_strat_plot_ar <- filter(info_strat_plot, grepl('ar', set))

# Count-scaled densities: colour separates the sampling modes, line type the
# three artist-count columns.
info_strat_plot_ar %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots, one box per mode and count column.
info_strat_plot_ar %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
215 ``` | |
216 | |
217 | |
218 | |
219 | |
220 | |
221 ## Full Sets | |
222 | |
223 ```{r create_sets, eval = F} | |
# Flatten a list of train/test splits into one long data frame of excerpt ids.
#
# a_list:   list of splits. The expression passed here is deparsed and split
#           into `mode` and `strat`, so it must be a bare variable named like
#           cv_no_strat or bs_strat; passing the list through another
#           variable or a loop changes the result.
# num_sets: number of leading splits to include; NULL means all of them.
# set:      name of the element extracted from every split.
# Returns a data.frame with columns mode, strat, iteration and ex_id, where
# `strat` is FALSE when the second token of the variable name is "no" and
# TRUE otherwise.
create_set_df <- function(a_list, num_sets = NULL, set = 'train') {

  if (is.null(num_sets)) num_sets <- length(a_list)
  # Name of the variable the caller passed, e.g. "cv_no_strat".
  mode <- deparse(substitute(a_list))

  # Build one frame per split and bind them in a single call; rbind()-ing
  # inside the loop re-copies all accumulated rows on every iteration.
  # seq_len() also avoids the c(1, 0) sequence that 1:num_sets gives for 0.
  pieces <- lapply(seq_len(num_sets), function(i) {
    data.frame(
      mode = mode,
      iteration = i,
      ex_id = a_list[[i]][[set]],
      stringsAsFactors = FALSE
    )
  })
  df <- do.call(rbind, c(list(data.frame(stringsAsFactors = FALSE)), pieces))

  df %>%
    # Default separator splits on non-alphanumerics: "cv_no_strat" becomes
    # mode = "cv", strat = "no"; any further tokens are dropped.
    separate(mode, c("mode", "strat"),
             remove = TRUE, extra = "drop") %>%
    mutate(strat = !(strat == 'no')) %>%
    select(mode, strat, iteration, ex_id)

}
249 ``` | |
250 | |
251 ```{r train_sets} | |
# Stack the training excerpts of all four strategy lists.
# create_set_df() derives `mode` and `strat` from the name of the variable it
# receives, so every list is passed by name; num_sets is left at its NULL
# default, which keeps every split.
train_sets <- rbind(
  create_set_df(cv_no_strat, set = 'train'),
  create_set_df(bs_no_strat, set = 'train'),
  create_set_df(cv_strat, set = 'train'),
  create_set_df(bs_strat, set = 'train')
)
258 ``` | |
259 | |
260 ```{r test_sets} | |
# Stack the test excerpts of all four strategy lists.
# create_set_df() derives `mode` and `strat` from the name of the variable it
# receives, so every list is passed by name; num_sets is left at its NULL
# default, which keeps every split.
test_sets <- rbind(
  create_set_df(cv_no_strat, set = 'test'),
  create_set_df(bs_no_strat, set = 'test'),
  create_set_df(cv_strat, set = 'test'),
  create_set_df(bs_strat, set = 'test')
)
267 ``` | |
268 | |
269 | |
270 ```{r get_train_artists} | |
# Distinct artists present in each training set: attach the artist of every
# excerpt, then keep one row per (mode, strat, iteration, artist_id).
train_sets_artists <- train_sets %>%
  inner_join(get_excerpts_artists(), by = 'ex_id') %>%
  as_tibble() %>%
  distinct(mode, strat, iteration, artist_id)
277 ``` | |
278 | |
279 ```{r get_test_artists} | |
# Attach the artist of every test excerpt; rows whose ex_id has no match in
# get_excerpts_artists() are dropped by the inner join.
test_sets_artists <- test_sets %>%
  inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
  ungroup()
283 ``` | |
284 | |
285 ```{r filter_test_sets} | |
286 | |
# Remove from every test set the excerpts whose artist also occurs in the
# training set of the same (mode, strat, iteration) split.
# This is done with set-based joins: filtering the full tables once per split
# and growing the result with rbind() scales quadratically with the number of
# splits.

# Splits that have a training set; only these contribute to the result.
combinations <- unique(train_sets_artists %>% select(mode, strat, iteration))

split_keys <- c('mode', 'strat', 'iteration')

# Test excerpt/artist rows whose artist is absent from the training set of
# the same split.
unseen_test_ex <- test_sets_artists %>%
  semi_join(combinations, by = split_keys) %>%
  anti_join(train_sets_artists, by = c(split_keys, 'artist_id'))

# Keep the test-set rows that refer to one of those excerpts; rows stay in
# the order they have in test_sets.
filt_test_sets <- test_sets %>%
  semi_join(unseen_test_ex, by = c(split_keys, 'ex_id'))
321 ``` | |
322 | |
323 | |
324 ### Class-wise analysis | |
325 | |
326 ```{r bs_strat_class} | |
# Tabulate classes for the first two stratified 'bs' splits: info_test from
# the test set as drawn, info_filt from the test set after filter_excerpts()
# against the training artists.
# NOTE(review): both frames are created with zero columns, so the row
# assignments below may not store the tables as intended -- confirm that this
# chunk runs and produces one column per class.
info_test <- data.frame()
info_filt <- data.frame()

for (i in 1:2){
  info_test[i,] <- table(get_classes(
    bs_strat[[i]]$test, unique_classes = F)
  )
  # NOTE(review): get_artists() is called here without the uniqueness flag
  # that get_info() passes -- confirm whether that matters to
  # filter_excerpts().
  info_filt[i,] <- table(get_classes(
    filter_excerpts(bs_strat[[i]]$test,
                    get_artists(bs_strat[[i]]$train)),
    unique_classes = F))
}
339 ``` | |
340 | |
341 | |
342 ## Create Sets for Prediction | |
343 | |
344 ```{r num_sets, eval = F} | |
# Presumably the number of iterations per strategy to keep for prediction --
# TODO confirm against the chunks that build the prediction sets.
num_sets <- 40
346 ``` | |
347 | |
348 ### Concatenate "Arbitrary" Sets | |
349 | |
350 | |
351 ```{r train_sets_pred, eval = F} | |
# Keep the first num_sets iterations of every mode/strat combination.
# The bound is read from num_sets rather than repeating the literal 40, so
# the set count is changed in one place.
train_sets_pred <- train_sets %>%
  filter(iteration <= num_sets)
354 ``` | |
355 | |
356 ```{r store_train_sets, eval = F} | |
# Write the selected training sets to disk without a row-name column.
write.csv(train_sets_pred, '../sets/train.csv', row.names = FALSE)
359 ``` | |
360 | |
361 ### Concatenate Test Set | |
362 | |
363 ```{r test_sets_pred, eval = F} | |
# Keep the first num_sets iterations of every mode/strat combination.
# The bound is read from num_sets rather than repeating the literal 40, so
# the set count is changed in one place.
test_sets_pred <- test_sets %>%
  filter(iteration <= num_sets)
366 ``` | |
367 | |
368 | |
369 ```{r store_test_sets, eval = F} | |
# Write the selected test sets to disk without a row-name column.
write.csv(test_sets_pred, '../sets/test.csv', row.names = FALSE)
372 ``` |