annotate sampling/sampling.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
rev   line source
f@0 1 ---
f@0 2 title: "Sampling (Amendment 7/3/2018)"
f@0 3 output: html_notebook
f@0 4 ---
f@0 5
f@0 6 ```{r initialisation}
f@0 7 library(tidyverse)
f@0 8 if(!require("RSQLite")){
f@0 9 install.packages("RSQLite")
f@0 10 library("RSQLite")
f@0 11 }
f@0 12
f@0 13 source('../db/access_db.R')
f@0 14 source('../sampling/strategies.R')
f@0 15 ```
f@0 16
f@0 17 ## Examples
f@0 18
f@0 19 ```{r param_simul}
f@0 20 num_simul <- 10000
f@0 21 num_folds <- 4
f@0 22 seed <- 1986
f@0 23 ```
f@0 24
f@0 25 ```{r get_info}
f@0 26
f@0 27 get_info_list <- function(samples_list){
f@0 28
f@0 29 aux <- numeric(length(samples_list))
f@0 30 df <-
f@0 31 data.frame(ex_tr = aux, ar_tr = aux,
f@0 32 ex_te_orig = aux, ar_te_orig = aux,
f@0 33 ex_te_filt = aux, ar_te_filt = aux)
f@0 34
f@0 35 for (i in 1:length(samples_list)){
f@0 36 df[i,] <- get_info(samples_list[[i]])
f@0 37 }
f@0 38
f@0 39 df
f@0 40
f@0 41 }
f@0 42
f@0 43 get_info <- function(samples){
f@0 44
f@0 45 ar_tr <-
f@0 46 get_artists(samples$train, unique_artist = T)
f@0 47 ex_filt <-
f@0 48 filter_excerpts(samples$test, ar_tr)
f@0 49
f@0 50 data.frame(
f@0 51 ex_tr = length(samples$train),
f@0 52 ar_tr = length(ar_tr),
f@0 53 ex_te_orig = length(samples$test),
f@0 54 ar_te_orig = length(get_artists(samples$test, unique_artists = T)),
f@0 55 ex_te_filt = length(ex_filt),
f@0 56 ar_te_filt = length(get_artists(ex_filt, unique_artist = T))
f@0 57 )
f@0 58
f@0 59 }
f@0 60
f@0 61 ```
f@0 62
f@0 63 ```{r bs_no_strat, eval = F}
f@0 64 set.seed(seed)
f@0 65 max_iter <- num_simul
f@0 66
f@0 67 bs_no_strat <- get_samples('bs', iter = max_iter)
f@0 68 ```
f@0 69 ```{r bs_no_strat_info, eval = F}
f@0 70 bs_no_strat_info <- get_info_list(bs_no_strat)
f@0 71 summary(bs_no_strat_info)
f@0 72 ```
f@0 73
f@0 74
f@0 75 ```{r cv_example, eval = F}
f@0 76
f@0 77 set.seed(seed)
f@0 78
f@0 79 max_iter <- num_simul / num_folds
f@0 80 cv_no_strat <- vector("list", max_iter)
f@0 81 for (i in 1:max_iter){
f@0 82 cv_no_strat[[i]] <- get_samples('cv', iter = num_folds)
f@0 83 }
f@0 84
f@0 85 cv_no_strat <- unlist(cv_no_strat, recursive = F)
f@0 86 ```
f@0 87 ```{r cv_no_strat_info, eval = F}
f@0 88 cv_no_strat_info <- get_info_list(cv_no_strat)
f@0 89 summary(cv_no_strat_info)
f@0 90 ```
f@0 91
f@0 92
f@0 93 ```{r compare_no_strat, eval = F}
f@0 94 bs_no_strat_info_plot <-
f@0 95 bs_no_strat_info %>%
f@0 96 melt %>%
f@0 97 mutate(mode = 'bs')
f@0 98 cv_no_strat_info_plot <-
f@0 99 cv_no_strat_info %>%
f@0 100 melt %>%
f@0 101 mutate(mode = 'cv')
f@0 102
f@0 103 info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>%
f@0 104 select(mode, variable, value)
f@0 105 names(info_no_strat_plot) <- c("mode", "set", "num")
f@0 106 ```
f@0 107
f@0 108 ```{r plot_ex_no_strat}
f@0 109
f@0 110 info_no_strat_plot_ex <- info_no_strat_plot %>%
f@0 111 filter(grepl('ex', set)) %>%
f@0 112 filter(!grepl('tr', set))
f@0 113 ggplot(info_no_strat_plot_ex,
f@0 114 aes(x = num, y = ..count.., color = mode, linetype = set)) +
f@0 115 geom_density() +
f@0 116 scale_linetype_manual(values=c("twodash", "dotted")) +
f@0 117 xlim(0, 500)
f@0 118
f@0 119 ggplot(info_no_strat_plot_ex,
f@0 120 aes(mode, num, color = set)) +
f@0 121 geom_boxplot() +
f@0 122 ylim(0, 500)
f@0 123
f@0 124 ```
f@0 125
f@0 126 ```{r plot_ar_no_strat}
f@0 127
f@0 128 info_no_strat_plot_ar <- info_no_strat_plot %>%
f@0 129 filter(grepl('ar', set))
f@0 130
f@0 131 ggplot(info_no_strat_plot_ar,
f@0 132 aes(x = num, y = ..count.., color = mode, linetype = set)) +
f@0 133 geom_density() +
f@0 134 scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
f@0 135 xlim(0, 500)
f@0 136
f@0 137 ggplot(info_no_strat_plot_ar,
f@0 138 aes(mode, num, color = set)) +
f@0 139 geom_boxplot() +
f@0 140 ylim(0, 500)
f@0 141 ```
f@0 142
f@0 143 ```{r bs_strat, eval = F}
f@0 144 set.seed(seed)
f@0 145 max_iter <- num_simul
f@0 146
f@0 147 bs_strat <-
f@0 148 get_samples('bs', iter = max_iter, stratified = T)
f@0 149 ```
f@0 150 ```{r bs_strat_info, eval = F}
f@0 151 bs_strat_info <- get_info_list(bs_strat)
f@0 152 summary(bs_strat_info)
f@0 153 ```
f@0 154
f@0 155
f@0 156 ```{r cv_strat, eval = F}
f@0 157 set.seed(seed)
f@0 158
f@0 159 max_iter <- num_simul / num_folds
f@0 160 cv_strat <- vector("list", max_iter)
f@0 161 for (i in 1:max_iter){
f@0 162 cv_strat[[i]] <-
f@0 163 get_samples('cv', iter = num_folds, stratified = T)
f@0 164 }
f@0 165 cv_strat <- unlist(cv_strat, recursive = F)
f@0 166 ```
f@0 167 ```{r cv_strat_info}
f@0 168 cv_strat_info <- get_info_list(cv_strat)
f@0 169 summary(cv_strat_info)
f@0 170 ```
f@0 171
f@0 172 ```{r compare_strat}
f@0 173 bs_strat_info_plot <-
f@0 174 bs_strat_info %>%
f@0 175 melt %>%
f@0 176 mutate(mode = 'bs')
f@0 177 cv_strat_info_plot <-
f@0 178 cv_strat_info %>%
f@0 179 melt %>%
f@0 180 mutate(mode = 'cv')
f@0 181
f@0 182 info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>%
f@0 183 select(mode, variable, value)
f@0 184 names(info_strat_plot) <- c("mode", "set", "num")
f@0 185 ```
f@0 186
f@0 187 ```{r plot_ex_strat}
f@0 188
f@0 189 info_strat_plot_ex <- info_strat_plot %>%
f@0 190 filter(grepl('ex', set)) %>%
f@0 191 filter(!grepl('tr', set))
f@0 192
f@0 193 ggplot(info_strat_plot_ex,
f@0 194 aes(mode, num, color = set)) +
f@0 195 geom_boxplot() +
f@0 196 ylim(0, 500)
f@0 197
f@0 198 ```
f@0 199
f@0 200 ```{r plot_ar_strat}
f@0 201
f@0 202 info_strat_plot_ar <- info_strat_plot %>%
f@0 203 filter(grepl('ar', set))
f@0 204
f@0 205 ggplot(info_strat_plot_ar,
f@0 206 aes(x = num, y = ..count.., color = mode, linetype = set)) +
f@0 207 geom_density() +
f@0 208 scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
f@0 209 xlim(0, 500)
f@0 210
f@0 211 ggplot(info_strat_plot_ar,
f@0 212 aes(mode, num, color = set)) +
f@0 213 geom_boxplot() +
f@0 214 ylim(0, 500)
f@0 215 ```
f@0 216
f@0 217
f@0 218
f@0 219
f@0 220
f@0 221 ## Full Sets
f@0 222
f@0 223 ```{r create_sets, eval = F}
f@0 224 create_set_df <- function(a_list, num_sets = NULL, set = 'train'){
f@0 225
f@0 226 if(is.null(num_sets)) num_sets <- length(a_list)
f@0 227 mode <- deparse(substitute(a_list))
f@0 228
f@0 229 df <- data.frame(stringsAsFactors = F)
f@0 230
f@0 231 for (i in 1:num_sets){
f@0 232 df <- rbind(df,
f@0 233 data.frame(
f@0 234 mode = mode,
f@0 235 iteration = i,
f@0 236 ex_id = a_list[[i]][[set]],
f@0 237 stringsAsFactors = F
f@0 238 )
f@0 239 )
f@0 240 }
f@0 241
f@0 242 df %>%
f@0 243 separate(mode, c("mode", "strat"),
f@0 244 remove = T, extra = "drop") %>%
f@0 245 mutate(strat = !(strat == 'no')) %>%
f@0 246 select(mode, strat, iteration, ex_id)
f@0 247
f@0 248 }
f@0 249 ```
f@0 250
f@0 251 ```{r train_sets}
f@0 252 train_sets <- rbind(
f@0 253 create_set_df(cv_no_strat, num_sets = NULL, set = 'train'),
f@0 254 create_set_df(bs_no_strat, num_sets = NULL, set = 'train'),
f@0 255 create_set_df(cv_strat, num_sets = NULL, set = 'train'),
f@0 256 create_set_df(bs_strat, num_sets = NULL, set = 'train')
f@0 257 )
f@0 258 ```
f@0 259
f@0 260 ```{r test_sets}
f@0 261 test_sets <- rbind(
f@0 262 create_set_df(cv_no_strat, num_sets = NULL, set = 'test'),
f@0 263 create_set_df(bs_no_strat, num_sets = NULL, set = 'test'),
f@0 264 create_set_df(cv_strat, num_sets = NULL, set = 'test'),
f@0 265 create_set_df(bs_strat, num_sets = NULL, set = 'test')
f@0 266 )
f@0 267 ```
f@0 268
f@0 269
f@0 270 ```{r get_train_artists}
f@0 271 train_sets_artists <- train_sets %>%
f@0 272 inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
f@0 273 group_by(mode, strat, iteration) %>%
f@0 274 select(mode, strat, iteration, artist_id) %>%
f@0 275 unique() %>%
f@0 276 ungroup()
f@0 277 ```
f@0 278
f@0 279 ```{r get_test_artists}
f@0 280 test_sets_artists <- test_sets %>%
f@0 281 inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
f@0 282 ungroup()
f@0 283 ```
f@0 284
f@0 285 ```{r filter_test_sets}
f@0 286
f@0 287 filt_test_sets <- data.frame(stringsAsFactors = F)
f@0 288
f@0 289 combinations <- unique(train_sets_artists %>% select(mode, strat, iteration))
f@0 290
f@0 291
f@0 292 for (row in 1:nrow(combinations)){
f@0 293
f@0 294 the_mode <- combinations[row, 'mode'] %>%
f@0 295 unlist() %>% unname()
f@0 296 the_strat <- combinations[row, 'strat'] %>%
f@0 297 unlist() %>% unname()
f@0 298 the_iteration <- combinations[row, 'iteration'] %>%
f@0 299 unlist() %>% unname()
f@0 300
f@0 301 unique_artists <- train_sets_artists %>%
f@0 302 filter(mode == the_mode, strat == the_strat,
f@0 303 iteration == the_iteration) %>%
f@0 304 select(artist_id) %>%
f@0 305 unlist() %>%
f@0 306 unname()
f@0 307
f@0 308 filt_test_ex <- test_sets_artists %>%
f@0 309 filter(mode == the_mode, strat == the_strat,
f@0 310 iteration == the_iteration, !(artist_id %in% unique_artists)) %>%
f@0 311 select(ex_id) %>%
f@0 312 unlist()
f@0 313
f@0 314 filt_test_sets <- rbind(filt_test_sets,
f@0 315 test_sets %>%
f@0 316 filter(mode == the_mode, strat == the_strat,
f@0 317 iteration == the_iteration,
f@0 318 ex_id %in% filt_test_ex))
f@0 319
f@0 320 }
f@0 321 ```
f@0 322
f@0 323
f@0 324 ### Class-wise analysis
f@0 325
f@0 326 ```{r bs_strat_class}
f@0 327 info_test <- data.frame()
f@0 328 info_filt <- data.frame()
f@0 329
f@0 330 for (i in 1:2){
f@0 331 info_test[i,] <- table(get_classes(
f@0 332 bs_strat[[i]]$test, unique_classes = F)
f@0 333 )
f@0 334 info_filt[i,] <- table(get_classes(
f@0 335 filter_excerpts(bs_strat[[i]]$test,
f@0 336 get_artists(bs_strat[[i]]$train)),
f@0 337 unique_classes = F))
f@0 338 }
f@0 339 ```
f@0 340
f@0 341
f@0 342 ## Create Sets for Prediction
f@0 343
f@0 344 ```{r num_sets, eval = F}
f@0 345 num_sets <- 40
f@0 346 ```
f@0 347
f@0 348 ### Concatenate "Arbitrary" Sets
f@0 349
f@0 350
f@0 351 ```{r train_sets_pred, eval = F}
f@0 352 train_sets_pred <- train_sets %>%
f@0 353 filter(iteration <= 40)
f@0 354 ```
f@0 355
f@0 356 ```{r store_train_sets, eval = F}
f@0 357 write.csv(x = train_sets_pred, file = '../sets/train.csv',
f@0 358 row.names = F)
f@0 359 ```
f@0 360
f@0 361 ### Concatenate Test Set
f@0 362
f@0 363 ```{r test_sets_pred, eval = F}
f@0 364 test_sets_pred <- test_sets %>%
f@0 365 filter(iteration <= 40)
f@0 366 ```
f@0 367
f@0 368
f@0 369 ```{r store_test_sets, eval = F}
f@0 370 write.csv(x = test_sets_pred, file = '../sets/test.csv',
f@0 371 row.names = F)
f@0 372 ```