comparison sampling/sampling.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:205974c9568c
1 ---
2 title: "Sampling (Amendment 7/3/2018)"
3 output: html_notebook
4 ---
5
6 ```{r initialisation}
# Attach the packages used throughout the notebook.
library(tidyverse)

# RSQLite is installed on demand. requireNamespace() only checks that the
# package is available, so a missing package leads straight to installation
# instead of the warning-plus-FALSE that require() produces.
if (!requireNamespace("RSQLite", quietly = TRUE)) {
  install.packages("RSQLite")
}
library("RSQLite")

# Project helpers. The sampling and excerpt/artist functions called in the
# chunks below are presumably defined in these files (not visible here).
source('../db/access_db.R')
source('../sampling/strategies.R')
15 ```
16
17 ## Examples
18
19 ```{r param_simul}
# Settings shared by the sampling chunks below.
seed      <- 1986   # passed to set.seed() before every sampling run
num_folds <- 4      # `iter` value used for each 'cv' call
num_simul <- 10000  # `iter` for 'bs'; divided by num_folds for 'cv'
23 ```
24
25 ```{r get_info}
26
# Summarise every train/test split in `samples_list`.
#
# Args:
#   samples_list: list of splits; each element is passed unchanged to
#                 get_info().
#
# Returns:
#   A data frame with one row per split and the six numeric columns produced
#   by get_info(); a zero-row data frame when `samples_list` is empty.
get_info_list <- function(samples_list) {

  # Preallocate one zero-filled row per split so rows are filled in place.
  aux <- numeric(length(samples_list))
  df <-
    data.frame(ex_tr = aux, ar_tr = aux,
               ex_te_orig = aux, ar_te_orig = aux,
               ex_te_filt = aux, ar_te_filt = aux)

  # seq_along() is empty for an empty list, whereas 1:length() would iterate
  # over c(1, 0) and index past the end of the list.
  for (i in seq_along(samples_list)) {
    df[i, ] <- get_info(samples_list[[i]])
  }

  df

}
42
# Count excerpts and artists in one train/test split.
#
# Args:
#   samples: object with `train` and `test` elements (accessed with `$`).
#
# Returns:
#   A one-row data frame with:
#     ex_tr      - length of samples$train
#     ar_tr      - number of artists returned for samples$train
#     ex_te_orig - length of samples$test
#     ar_te_orig - number of artists returned for samples$test
#     ex_te_filt - length of the test set after filter_excerpts()
#     ar_te_filt - number of artists returned for the filtered test set
#
# NOTE(review): get_artists() and filter_excerpts() are defined outside this
# file. The original spelled the flag both `unique_artist` and
# `unique_artists`; the plural is used throughout here on the assumption that
# it is the formal's full name (the singular would only have worked through
# partial matching) - confirm against the definition of get_artists().
get_info <- function(samples) {

  train_artists <- get_artists(samples$train, unique_artists = TRUE)
  # Presumably drops test excerpts whose artist is in `train_artists`;
  # verify against filter_excerpts().
  test_filtered <- filter_excerpts(samples$test, train_artists)

  data.frame(
    ex_tr = length(samples$train),
    ar_tr = length(train_artists),
    ex_te_orig = length(samples$test),
    ar_te_orig = length(get_artists(samples$test, unique_artists = TRUE)),
    ex_te_filt = length(test_filtered),
    ar_te_filt = length(get_artists(test_filtered, unique_artists = TRUE))
  )

}
60
61 ```
62
63 ```{r bs_no_strat, eval = F}
# Non-stratified 'bs' sampling: one get_samples() call asked for
# `num_simul` iterations, with the RNG seeded first for repeatability.
max_iter <- num_simul
set.seed(seed)
bs_no_strat <- get_samples('bs', iter = max_iter)
68 ```
69 ```{r bs_no_strat_info, eval = F}
# Per-split excerpt/artist counts for the non-stratified 'bs' sets, then
# their column-wise summary.
bs_no_strat_info <- bs_no_strat %>% get_info_list()
bs_no_strat_info %>% summary()
72 ```
73
74
75 ```{r cv_example, eval = F}
76
# Non-stratified 'cv' sampling: num_simul / num_folds calls to get_samples(),
# each asked for `num_folds` iterations (presumably one split per fold, so
# the total matches `num_simul` - confirm against get_samples()).
set.seed(seed)

max_iter <- num_simul / num_folds

# lapply() over seq_len() makes the calls in the same order as the original
# loop, so the RNG stream is consumed identically. It is also safe when
# max_iter is 0, where 1:max_iter would iterate over c(1, 0).
cv_no_strat <- lapply(
  seq_len(max_iter),
  function(i) get_samples('cv', iter = num_folds)
)

# Flatten one level only: the per-call lists are merged into a single list.
cv_no_strat <- unlist(cv_no_strat, recursive = FALSE)
86 ```
87 ```{r cv_no_strat_info, eval = F}
# Per-split excerpt/artist counts for the non-stratified 'cv' sets, then
# their column-wise summary.
cv_no_strat_info <- cv_no_strat %>% get_info_list()
cv_no_strat_info %>% summary()
90 ```
91
92
93 ```{r compare_no_strat, eval = F}
# Reshape both summaries to long form, tag each with its sampling mode, and
# stack them for plotting.
# NOTE(review): melt() is not exported by the tidyverse packages; presumably
# reshape2 is attached by one of the sourced files - TODO confirm.
bs_no_strat_info_plot <- mutate(melt(bs_no_strat_info), mode = 'bs')
cv_no_strat_info_plot <- mutate(melt(cv_no_strat_info), mode = 'cv')

# Keep mode/variable/value, renamed to mode/set/num.
info_no_strat_plot <-
  rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>%
  select(mode, set = variable, num = value)
106 ```
107
108 ```{r plot_ex_no_strat}
109
# Excerpt counts of the test sets only: `set` names that contain 'ex' but
# not 'tr'.
info_no_strat_plot_ex <- info_no_strat_plot %>%
  filter(grepl('ex', set), !grepl('tr', set))

# Count-scaled density of the excerpt counts; colour = mode, linetype = set.
info_no_strat_plot_ex %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values=c("twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots per mode.
info_no_strat_plot_ex %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
123
124 ```
125
126 ```{r plot_ar_no_strat}
127
# Artist counts: every `set` name that contains 'ar'.
info_no_strat_plot_ar <- info_no_strat_plot %>%
  filter(grepl('ar', set))

# Count-scaled density of the artist counts; colour = mode, linetype = set.
info_no_strat_plot_ar %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots per mode.
info_no_strat_plot_ar %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
141 ```
142
143 ```{r bs_strat, eval = F}
# Stratified 'bs' sampling: one get_samples() call asked for `num_simul`
# iterations, with the RNG seeded first for repeatability.
set.seed(seed)
max_iter <- num_simul

# TRUE is spelled out: unlike T, it cannot be reassigned.
bs_strat <-
  get_samples('bs', iter = max_iter, stratified = TRUE)
149 ```
150 ```{r bs_strat_info, eval = F}
# Per-split excerpt/artist counts for the stratified 'bs' sets, then their
# column-wise summary.
bs_strat_info <- bs_strat %>% get_info_list()
bs_strat_info %>% summary()
153 ```
154
155
156 ```{r cv_strat, eval = F}
# Stratified 'cv' sampling: num_simul / num_folds calls to get_samples(),
# each asked for `num_folds` iterations (presumably one split per fold -
# confirm against get_samples()).
set.seed(seed)

max_iter <- num_simul / num_folds

# lapply() over seq_len() makes the calls in the same order as the original
# loop, so the RNG stream is consumed identically. It is also safe when
# max_iter is 0, where 1:max_iter would iterate over c(1, 0).
cv_strat <- lapply(
  seq_len(max_iter),
  function(i) get_samples('cv', iter = num_folds, stratified = TRUE)
)

# Flatten one level only: the per-call lists are merged into a single list.
cv_strat <- unlist(cv_strat, recursive = FALSE)
166 ```
167 ```{r cv_strat_info}
# Per-split excerpt/artist counts for the stratified 'cv' sets, then their
# column-wise summary.
cv_strat_info <- cv_strat %>% get_info_list()
cv_strat_info %>% summary()
170 ```
171
172 ```{r compare_strat}
# Reshape both stratified summaries to long form, tag each with its sampling
# mode, and stack them for plotting.
# NOTE(review): melt() is not exported by the tidyverse packages; presumably
# reshape2 is attached by one of the sourced files - TODO confirm.
bs_strat_info_plot <- mutate(melt(bs_strat_info), mode = 'bs')
cv_strat_info_plot <- mutate(melt(cv_strat_info), mode = 'cv')

# Keep mode/variable/value, renamed to mode/set/num.
info_strat_plot <-
  rbind(bs_strat_info_plot, cv_strat_info_plot) %>%
  select(mode, set = variable, num = value)
185 ```
186
187 ```{r plot_ex_strat}
188
# Excerpt counts of the test sets only: `set` names that contain 'ex' but
# not 'tr'.
info_strat_plot_ex <- info_strat_plot %>%
  filter(grepl('ex', set), !grepl('tr', set))

# Box plots of the excerpt counts per mode, coloured by set.
info_strat_plot_ex %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
197
198 ```
199
200 ```{r plot_ar_strat}
201
# Artist counts: every `set` name that contains 'ar'.
info_strat_plot_ar <- info_strat_plot %>%
  filter(grepl('ar', set))

# Count-scaled density of the artist counts; colour = mode, linetype = set.
info_strat_plot_ar %>%
  ggplot(aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

# Same data as box plots per mode.
info_strat_plot_ar %>%
  ggplot(aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
215 ```
216
217
218
219
220
221 ## Full Sets
222
223 ```{r create_sets, eval = F}
# Flatten the `set` element of the first `num_sets` splits of `a_list` into a
# long data frame with one row per excerpt id.
#
# The sampling mode and the stratification flag are parsed from the *name of
# the variable* passed as `a_list` (e.g. `cv_no_strat` gives mode "cv" and
# strat FALSE; `bs_strat` gives mode "bs" and strat TRUE), so the function
# must be called with a bare, suitably named variable.
#
# Args:
#   a_list:   list of splits; each element is indexed with `[[set]]`.
#   num_sets: number of leading splits to use; NULL (default) uses all.
#   set:      name of the element to extract from each split.
#
# Returns:
#   A data frame with columns mode, strat, iteration and ex_id. It has zero
#   rows when no split contributes any id.
create_set_df <- function(a_list, num_sets = NULL, set = 'train') {

  if (is.null(num_sets)) num_sets <- length(a_list)

  # Parse the label once rather than once per row. The pattern is the default
  # separator of tidyr::separate(), which the original applied to the label
  # column; pieces after the second are ignored, as with extra = "drop".
  label <- deparse(substitute(a_list))
  pieces <- strsplit(label, "[^[:alnum:]]+")[[1]]
  mode <- pieces[1]
  strat <- !(pieces[2] == 'no')

  # seq_len() is empty for num_sets == 0, where 1:num_sets gives c(1, 0).
  iterations <- seq_len(num_sets)
  ids <- lapply(iterations, function(i) a_list[[i]][[set]])
  sizes <- lengths(ids)

  ex_id <- unlist(ids, use.names = FALSE)
  # unlist() returns NULL when there are no ids at all; keep the column.
  # NOTE(review): the id type is unknown in that case; character is assumed.
  if (is.null(ex_id)) ex_id <- character(0)

  # Build the frame in one step instead of rbind()-ing inside a loop, which
  # copies all accumulated rows on every iteration.
  data.frame(
    mode = rep(mode, sum(sizes)),
    strat = rep(strat, sum(sizes)),
    iteration = rep(iterations, times = sizes),
    ex_id = ex_id,
    stringsAsFactors = FALSE
  )

}
249 ```
250
251 ```{r train_sets}
# Training excerpts of all four sampling configurations in one long frame.
# Each list must be passed as a bare variable: create_set_df() derives the
# mode/strat columns from the argument's name. `num_sets` is left at its
# NULL default, so every split is used.
train_sets <- rbind(
  create_set_df(cv_no_strat, set = 'train'),
  create_set_df(bs_no_strat, set = 'train'),
  create_set_df(cv_strat, set = 'train'),
  create_set_df(bs_strat, set = 'train')
)
258 ```
259
260 ```{r test_sets}
# Test excerpts of all four sampling configurations in one long frame.
# Each list must be passed as a bare variable: create_set_df() derives the
# mode/strat columns from the argument's name. `num_sets` is left at its
# NULL default, so every split is used.
test_sets <- rbind(
  create_set_df(cv_no_strat, set = 'test'),
  create_set_df(bs_no_strat, set = 'test'),
  create_set_df(cv_strat, set = 'test'),
  create_set_df(bs_strat, set = 'test')
)
267 ```
268
269
270 ```{r get_train_artists}
# Distinct artists per (mode, strat, iteration) training set. Excerpts are
# mapped to artists through get_excerpts_artists(), joined on ex_id.
# as_tibble() keeps the result an ungrouped tibble, as the original
# group_by()/ungroup() round trip did.
train_sets_artists <- train_sets %>%
  inner_join(get_excerpts_artists(), by = 'ex_id') %>%
  select(mode, strat, iteration, artist_id) %>%
  distinct() %>%
  as_tibble()
277 ```
278
279 ```{r get_test_artists}
# Test excerpts with their artists attached, joined on ex_id.
excerpt_artists <- get_excerpts_artists()
test_sets_artists <- ungroup(
  inner_join(test_sets, excerpt_artists, by = 'ex_id')
)
283 ```
284
285 ```{r filter_test_sets}
286
# Keep, for every (mode, strat, iteration) set, only the test excerpts that
# have an artist absent from that set's training artists.
#
# The original looped over every combination, filtered the full tables each
# time and grew the result with rbind(); the filtering joins below select
# the same rows in one pass and also work when there are no combinations.
# NOTE(review): joins treat NA keys as equal, whereas the original `==`
# filters dropped them; assumes the key columns contain no NA - TODO confirm.

set_keys <- c("mode", "strat", "iteration")

# Sets for which training artists are known (the original loop's domain).
combinations <- train_sets_artists %>%
  distinct(mode, strat, iteration)

# Test excerpt/artist pairs whose artist is not among the training artists
# of the same set, reduced to the distinct excerpts they belong to.
unseen_test_ex <- test_sets_artists %>%
  semi_join(combinations, by = set_keys) %>%
  anti_join(train_sets_artists, by = c(set_keys, "artist_id")) %>%
  distinct(mode, strat, iteration, ex_id)

# Rows of test_sets for those excerpts, in test_sets order.
filt_test_sets <- test_sets %>%
  semi_join(unseen_test_ex, by = c(set_keys, "ex_id"))
321 ```
322
323
324 ### Class-wise analysis
325
326 ```{r bs_strat_class}
# Per-class excerpt counts for the first two stratified 'bs' splits:
# info_test for the test set as sampled, info_filt for the test set after
# filter_excerpts() is applied with the training artists.
#
# The original assigned each table() into a row of a zero-column
# data.frame(), which has no columns to receive the values; the counts are
# now collected explicitly, one row per split and one column per class.

# Tabulate the classes of one excerpt set as a named numeric vector.
count_classes <- function(excerpts) {
  tab <- table(get_classes(excerpts, unique_classes = FALSE))
  setNames(as.numeric(tab), names(tab))
}

# Bind named count vectors into a data frame: one row per vector, one column
# per class seen in any of them (0 where a class is absent).
bind_counts <- function(counts) {
  classes <- sort(unique(unlist(lapply(counts, names))))
  rows <- lapply(counts, function(cnt) {
    row <- setNames(numeric(length(classes)), classes)
    row[names(cnt)] <- cnt
    row
  })
  as.data.frame(do.call(rbind, rows))
}

# Only the first two splits are inspected, as in the original loop.
inspected <- seq_len(2)

info_test <- bind_counts(lapply(inspected, function(i) {
  count_classes(bs_strat[[i]]$test)
}))

info_filt <- bind_counts(lapply(inspected, function(i) {
  kept <- filter_excerpts(bs_strat[[i]]$test,
                          get_artists(bs_strat[[i]]$train))
  count_classes(kept)
}))
339 ```
340
341
342 ## Create Sets for Prediction
343
344 ```{r num_sets, eval = F}
# Number of sets kept for prediction.
num_sets <- 40
346 ```
347
348 ### Concatenate "Arbitrary" Sets
349
350
351 ```{r train_sets_pred, eval = F}
# Training sets kept for prediction: the first `num_sets` iterations of
# every mode/strat combination. The limit comes from the `num_sets` chunk
# rather than a repeated literal, so that chunk must be run first.
train_sets_pred <- train_sets %>%
  filter(iteration <= num_sets)
354 ```
355
356 ```{r store_train_sets, eval = F}
# Store the selected training sets as CSV without a row-name column.
# FALSE is spelled out: unlike F, it cannot be reassigned.
write.csv(x = train_sets_pred, file = '../sets/train.csv',
          row.names = FALSE)
359 ```
360
361 ### Concatenate Test Set
362
363 ```{r test_sets_pred, eval = F}
# Test sets kept for prediction: the first `num_sets` iterations of every
# mode/strat combination. The limit comes from the `num_sets` chunk rather
# than a repeated literal, so that chunk must be run first.
test_sets_pred <- test_sets %>%
  filter(iteration <= num_sets)
366 ```
367
368
369 ```{r store_test_sets, eval = F}
# Store the selected test sets as CSV without a row-name column.
# FALSE is spelled out: unlike F, it cannot be reassigned.
write.csv(x = test_sets_pred, file = '../sets/test.csv',
          row.names = FALSE)
372 ```