f@0
|
1 ---
|
f@0
|
2 title: "Sampling (Amendment 7/3/2018)"
|
f@0
|
3 output: html_notebook
|
f@0
|
4 ---
|
f@0
|
5
|
f@0
|
6 ```{r initialisation}
|
f@0
|
7 library(tidyverse)
|
f@0
|
8 if(!require("RSQLite")){
|
f@0
|
9 install.packages("RSQLite")
|
f@0
|
10 library("RSQLite")
|
f@0
|
11 }
|
f@0
|
12
|
f@0
|
13 source('../db/access_db.R')
|
f@0
|
14 source('../sampling/strategies.R')
|
f@0
|
15 ```
|
f@0
|
16
|
f@0
|
17 ## Examples
|
f@0
|
18
|
f@0
|
19 ```{r param_simul}
|
f@0
|
20 num_simul <- 10000
|
f@0
|
21 num_folds <- 4
|
f@0
|
22 seed <- 1986
|
f@0
|
23 ```
|
f@0
|
24
|
f@0
|
25 ```{r get_info}
|
f@0
|
26
|
f@0
|
27 get_info_list <- function(samples_list){
|
f@0
|
28
|
f@0
|
29 aux <- numeric(length(samples_list))
|
f@0
|
30 df <-
|
f@0
|
31 data.frame(ex_tr = aux, ar_tr = aux,
|
f@0
|
32 ex_te_orig = aux, ar_te_orig = aux,
|
f@0
|
33 ex_te_filt = aux, ar_te_filt = aux)
|
f@0
|
34
|
f@0
|
35 for (i in 1:length(samples_list)){
|
f@0
|
36 df[i,] <- get_info(samples_list[[i]])
|
f@0
|
37 }
|
f@0
|
38
|
f@0
|
39 df
|
f@0
|
40
|
f@0
|
41 }
|
f@0
|
42
|
f@0
|
43 get_info <- function(samples){
|
f@0
|
44
|
f@0
|
45 ar_tr <-
|
f@0
|
46 get_artists(samples$train, unique_artist = T)
|
f@0
|
47 ex_filt <-
|
f@0
|
48 filter_excerpts(samples$test, ar_tr)
|
f@0
|
49
|
f@0
|
50 data.frame(
|
f@0
|
51 ex_tr = length(samples$train),
|
f@0
|
52 ar_tr = length(ar_tr),
|
f@0
|
53 ex_te_orig = length(samples$test),
|
f@0
|
54 ar_te_orig = length(get_artists(samples$test, unique_artists = T)),
|
f@0
|
55 ex_te_filt = length(ex_filt),
|
f@0
|
56 ar_te_filt = length(get_artists(ex_filt, unique_artist = T))
|
f@0
|
57 )
|
f@0
|
58
|
f@0
|
59 }
|
f@0
|
60
|
f@0
|
61 ```
|
f@0
|
62
|
f@0
|
63 ```{r bs_no_strat, eval = F}
|
f@0
|
64 set.seed(seed)
|
f@0
|
65 max_iter <- num_simul
|
f@0
|
66
|
f@0
|
67 bs_no_strat <- get_samples('bs', iter = max_iter)
|
f@0
|
68 ```
|
f@0
|
69 ```{r bs_no_strat_info, eval = F}
|
f@0
|
70 bs_no_strat_info <- get_info_list(bs_no_strat)
|
f@0
|
71 summary(bs_no_strat_info)
|
f@0
|
72 ```
|
f@0
|
73
|
f@0
|
74
|
f@0
|
75 ```{r cv_example, eval = F}
|
f@0
|
76
|
f@0
|
77 set.seed(seed)
|
f@0
|
78
|
f@0
|
79 max_iter <- num_simul / num_folds
|
f@0
|
80 cv_no_strat <- vector("list", max_iter)
|
f@0
|
81 for (i in 1:max_iter){
|
f@0
|
82 cv_no_strat[[i]] <- get_samples('cv', iter = num_folds)
|
f@0
|
83 }
|
f@0
|
84
|
f@0
|
85 cv_no_strat <- unlist(cv_no_strat, recursive = F)
|
f@0
|
86 ```
|
f@0
|
87 ```{r cv_no_strat_info, eval = F}
|
f@0
|
88 cv_no_strat_info <- get_info_list(cv_no_strat)
|
f@0
|
89 summary(cv_no_strat_info)
|
f@0
|
90 ```
|
f@0
|
91
|
f@0
|
92
|
f@0
|
93 ```{r compare_no_strat, eval = F}
|
f@0
|
94 bs_no_strat_info_plot <-
|
f@0
|
95 bs_no_strat_info %>%
|
f@0
|
96 melt %>%
|
f@0
|
97 mutate(mode = 'bs')
|
f@0
|
98 cv_no_strat_info_plot <-
|
f@0
|
99 cv_no_strat_info %>%
|
f@0
|
100 melt %>%
|
f@0
|
101 mutate(mode = 'cv')
|
f@0
|
102
|
f@0
|
103 info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>%
|
f@0
|
104 select(mode, variable, value)
|
f@0
|
105 names(info_no_strat_plot) <- c("mode", "set", "num")
|
f@0
|
106 ```
|
f@0
|
107
|
f@0
|
108 ```{r plot_ex_no_strat}
|
f@0
|
109
|
f@0
|
110 info_no_strat_plot_ex <- info_no_strat_plot %>%
|
f@0
|
111 filter(grepl('ex', set)) %>%
|
f@0
|
112 filter(!grepl('tr', set))
|
f@0
|
113 ggplot(info_no_strat_plot_ex,
|
f@0
|
114 aes(x = num, y = ..count.., color = mode, linetype = set)) +
|
f@0
|
115 geom_density() +
|
f@0
|
116 scale_linetype_manual(values=c("twodash", "dotted")) +
|
f@0
|
117 xlim(0, 500)
|
f@0
|
118
|
f@0
|
119 ggplot(info_no_strat_plot_ex,
|
f@0
|
120 aes(mode, num, color = set)) +
|
f@0
|
121 geom_boxplot() +
|
f@0
|
122 ylim(0, 500)
|
f@0
|
123
|
f@0
|
124 ```
|
f@0
|
125
|
f@0
|
126 ```{r plot_ar_no_strat}
|
f@0
|
127
|
f@0
|
128 info_no_strat_plot_ar <- info_no_strat_plot %>%
|
f@0
|
129 filter(grepl('ar', set))
|
f@0
|
130
|
f@0
|
131 ggplot(info_no_strat_plot_ar,
|
f@0
|
132 aes(x = num, y = ..count.., color = mode, linetype = set)) +
|
f@0
|
133 geom_density() +
|
f@0
|
134 scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
|
f@0
|
135 xlim(0, 500)
|
f@0
|
136
|
f@0
|
137 ggplot(info_no_strat_plot_ar,
|
f@0
|
138 aes(mode, num, color = set)) +
|
f@0
|
139 geom_boxplot() +
|
f@0
|
140 ylim(0, 500)
|
f@0
|
141 ```
|
f@0
|
142
|
f@0
|
143 ```{r bs_strat, eval = F}
|
f@0
|
144 set.seed(seed)
|
f@0
|
145 max_iter <- num_simul
|
f@0
|
146
|
f@0
|
147 bs_strat <-
|
f@0
|
148 get_samples('bs', iter = max_iter, stratified = T)
|
f@0
|
149 ```
|
f@0
|
150 ```{r bs_strat_info, eval = F}
|
f@0
|
151 bs_strat_info <- get_info_list(bs_strat)
|
f@0
|
152 summary(bs_strat_info)
|
f@0
|
153 ```
|
f@0
|
154
|
f@0
|
155
|
f@0
|
156 ```{r cv_strat, eval = F}
|
f@0
|
157 set.seed(seed)
|
f@0
|
158
|
f@0
|
159 max_iter <- num_simul / num_folds
|
f@0
|
160 cv_strat <- vector("list", max_iter)
|
f@0
|
161 for (i in 1:max_iter){
|
f@0
|
162 cv_strat[[i]] <-
|
f@0
|
163 get_samples('cv', iter = num_folds, stratified = T)
|
f@0
|
164 }
|
f@0
|
165 cv_strat <- unlist(cv_strat, recursive = F)
|
f@0
|
166 ```
|
f@0
|
167 ```{r cv_strat_info}
|
f@0
|
168 cv_strat_info <- get_info_list(cv_strat)
|
f@0
|
169 summary(cv_strat_info)
|
f@0
|
170 ```
|
f@0
|
171
|
f@0
|
172 ```{r compare_strat}
|
f@0
|
173 bs_strat_info_plot <-
|
f@0
|
174 bs_strat_info %>%
|
f@0
|
175 melt %>%
|
f@0
|
176 mutate(mode = 'bs')
|
f@0
|
177 cv_strat_info_plot <-
|
f@0
|
178 cv_strat_info %>%
|
f@0
|
179 melt %>%
|
f@0
|
180 mutate(mode = 'cv')
|
f@0
|
181
|
f@0
|
182 info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>%
|
f@0
|
183 select(mode, variable, value)
|
f@0
|
184 names(info_strat_plot) <- c("mode", "set", "num")
|
f@0
|
185 ```
|
f@0
|
186
|
f@0
|
187 ```{r plot_ex_strat}
|
f@0
|
188
|
f@0
|
189 info_strat_plot_ex <- info_strat_plot %>%
|
f@0
|
190 filter(grepl('ex', set)) %>%
|
f@0
|
191 filter(!grepl('tr', set))
|
f@0
|
192
|
f@0
|
193 ggplot(info_strat_plot_ex,
|
f@0
|
194 aes(mode, num, color = set)) +
|
f@0
|
195 geom_boxplot() +
|
f@0
|
196 ylim(0, 500)
|
f@0
|
197
|
f@0
|
198 ```
|
f@0
|
199
|
f@0
|
200 ```{r plot_ar_strat}
|
f@0
|
201
|
f@0
|
202 info_strat_plot_ar <- info_strat_plot %>%
|
f@0
|
203 filter(grepl('ar', set))
|
f@0
|
204
|
f@0
|
205 ggplot(info_strat_plot_ar,
|
f@0
|
206 aes(x = num, y = ..count.., color = mode, linetype = set)) +
|
f@0
|
207 geom_density() +
|
f@0
|
208 scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
|
f@0
|
209 xlim(0, 500)
|
f@0
|
210
|
f@0
|
211 ggplot(info_strat_plot_ar,
|
f@0
|
212 aes(mode, num, color = set)) +
|
f@0
|
213 geom_boxplot() +
|
f@0
|
214 ylim(0, 500)
|
f@0
|
215 ```
|
f@0
|
216
|
f@0
|
217
|
f@0
|
218
|
f@0
|
219
|
f@0
|
220
|
f@0
|
221 ## Full Sets
|
f@0
|
222
|
f@0
|
223 ```{r create_sets, eval = F}
|
f@0
|
224 create_set_df <- function(a_list, num_sets = NULL, set = 'train'){
|
f@0
|
225
|
f@0
|
226 if(is.null(num_sets)) num_sets <- length(a_list)
|
f@0
|
227 mode <- deparse(substitute(a_list))
|
f@0
|
228
|
f@0
|
229 df <- data.frame(stringsAsFactors = F)
|
f@0
|
230
|
f@0
|
231 for (i in 1:num_sets){
|
f@0
|
232 df <- rbind(df,
|
f@0
|
233 data.frame(
|
f@0
|
234 mode = mode,
|
f@0
|
235 iteration = i,
|
f@0
|
236 ex_id = a_list[[i]][[set]],
|
f@0
|
237 stringsAsFactors = F
|
f@0
|
238 )
|
f@0
|
239 )
|
f@0
|
240 }
|
f@0
|
241
|
f@0
|
242 df %>%
|
f@0
|
243 separate(mode, c("mode", "strat"),
|
f@0
|
244 remove = T, extra = "drop") %>%
|
f@0
|
245 mutate(strat = !(strat == 'no')) %>%
|
f@0
|
246 select(mode, strat, iteration, ex_id)
|
f@0
|
247
|
f@0
|
248 }
|
f@0
|
249 ```
|
f@0
|
250
|
f@0
|
251 ```{r train_sets}
|
f@0
|
252 train_sets <- rbind(
|
f@0
|
253 create_set_df(cv_no_strat, num_sets = NULL, set = 'train'),
|
f@0
|
254 create_set_df(bs_no_strat, num_sets = NULL, set = 'train'),
|
f@0
|
255 create_set_df(cv_strat, num_sets = NULL, set = 'train'),
|
f@0
|
256 create_set_df(bs_strat, num_sets = NULL, set = 'train')
|
f@0
|
257 )
|
f@0
|
258 ```
|
f@0
|
259
|
f@0
|
260 ```{r test_sets}
|
f@0
|
261 test_sets <- rbind(
|
f@0
|
262 create_set_df(cv_no_strat, num_sets = NULL, set = 'test'),
|
f@0
|
263 create_set_df(bs_no_strat, num_sets = NULL, set = 'test'),
|
f@0
|
264 create_set_df(cv_strat, num_sets = NULL, set = 'test'),
|
f@0
|
265 create_set_df(bs_strat, num_sets = NULL, set = 'test')
|
f@0
|
266 )
|
f@0
|
267 ```
|
f@0
|
268
|
f@0
|
269
|
f@0
|
270 ```{r get_train_artists}
|
f@0
|
271 train_sets_artists <- train_sets %>%
|
f@0
|
272 inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
|
f@0
|
273 group_by(mode, strat, iteration) %>%
|
f@0
|
274 select(mode, strat, iteration, artist_id) %>%
|
f@0
|
275 unique() %>%
|
f@0
|
276 ungroup()
|
f@0
|
277 ```
|
f@0
|
278
|
f@0
|
279 ```{r get_test_artists}
|
f@0
|
280 test_sets_artists <- test_sets %>%
|
f@0
|
281 inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
|
f@0
|
282 ungroup()
|
f@0
|
283 ```
|
f@0
|
284
|
f@0
|
285 ```{r filter_test_sets}
|
f@0
|
286
|
f@0
|
287 filt_test_sets <- data.frame(stringsAsFactors = F)
|
f@0
|
288
|
f@0
|
289 combinations <- unique(train_sets_artists %>% select(mode, strat, iteration))
|
f@0
|
290
|
f@0
|
291
|
f@0
|
292 for (row in 1:nrow(combinations)){
|
f@0
|
293
|
f@0
|
294 the_mode <- combinations[row, 'mode'] %>%
|
f@0
|
295 unlist() %>% unname()
|
f@0
|
296 the_strat <- combinations[row, 'strat'] %>%
|
f@0
|
297 unlist() %>% unname()
|
f@0
|
298 the_iteration <- combinations[row, 'iteration'] %>%
|
f@0
|
299 unlist() %>% unname()
|
f@0
|
300
|
f@0
|
301 unique_artists <- train_sets_artists %>%
|
f@0
|
302 filter(mode == the_mode, strat == the_strat,
|
f@0
|
303 iteration == the_iteration) %>%
|
f@0
|
304 select(artist_id) %>%
|
f@0
|
305 unlist() %>%
|
f@0
|
306 unname()
|
f@0
|
307
|
f@0
|
308 filt_test_ex <- test_sets_artists %>%
|
f@0
|
309 filter(mode == the_mode, strat == the_strat,
|
f@0
|
310 iteration == the_iteration, !(artist_id %in% unique_artists)) %>%
|
f@0
|
311 select(ex_id) %>%
|
f@0
|
312 unlist()
|
f@0
|
313
|
f@0
|
314 filt_test_sets <- rbind(filt_test_sets,
|
f@0
|
315 test_sets %>%
|
f@0
|
316 filter(mode == the_mode, strat == the_strat,
|
f@0
|
317 iteration == the_iteration,
|
f@0
|
318 ex_id %in% filt_test_ex))
|
f@0
|
319
|
f@0
|
320 }
|
f@0
|
321 ```
|
f@0
|
322
|
f@0
|
323
|
f@0
|
324 ### Class-wise analysis
|
f@0
|
325
|
f@0
|
326 ```{r bs_strat_class}
|
f@0
|
327 info_test <- data.frame()
|
f@0
|
328 info_filt <- data.frame()
|
f@0
|
329
|
f@0
|
330 for (i in 1:2){
|
f@0
|
331 info_test[i,] <- table(get_classes(
|
f@0
|
332 bs_strat[[i]]$test, unique_classes = F)
|
f@0
|
333 )
|
f@0
|
334 info_filt[i,] <- table(get_classes(
|
f@0
|
335 filter_excerpts(bs_strat[[i]]$test,
|
f@0
|
336 get_artists(bs_strat[[i]]$train)),
|
f@0
|
337 unique_classes = F))
|
f@0
|
338 }
|
f@0
|
339 ```
|
f@0
|
340
|
f@0
|
341
|
f@0
|
342 ## Create Sets for Prediction
|
f@0
|
343
|
f@0
|
344 ```{r num_sets, eval = F}
|
f@0
|
345 num_sets <- 40
|
f@0
|
346 ```
|
f@0
|
347
|
f@0
|
348 ### Concatenate "Arbitrary" Sets
|
f@0
|
349
|
f@0
|
350
|
f@0
|
351 ```{r train_sets_pred, eval = F}
|
f@0
|
352 train_sets_pred <- train_sets %>%
|
f@0
|
353 filter(iteration <= 40)
|
f@0
|
354 ```
|
f@0
|
355
|
f@0
|
356 ```{r store_train_sets, eval = F}
|
f@0
|
357 write.csv(x = train_sets_pred, file = '../sets/train.csv',
|
f@0
|
358 row.names = F)
|
f@0
|
359 ```
|
f@0
|
360
|
f@0
|
361 ### Concatenate Test Set
|
f@0
|
362
|
f@0
|
363 ```{r test_sets_pred, eval = F}
|
f@0
|
364 test_sets_pred <- test_sets %>%
|
f@0
|
365 filter(iteration <= 40)
|
f@0
|
366 ```
|
f@0
|
367
|
f@0
|
368
|
f@0
|
369 ```{r store_test_sets, eval = F}
|
f@0
|
370 write.csv(x = test_sets_pred, file = '../sets/test.csv',
|
f@0
|
371 row.names = F)
|
f@0
|
372 ```
|