f@0: --- f@0: title: "Sampling (Amendment 7/3/2018)" f@0: output: html_notebook f@0: --- f@0: f@0: ```{r initialisation} f@0: library(tidyverse) f@0: if(!require("RSQLite")){ f@0: install.packages("RSQLite") f@0: library("RSQLite") f@0: } f@0: f@0: source('../db/access_db.R') f@0: source('../sampling/strategies.R') f@0: ``` f@0: f@0: ## Examples f@0: f@0: ```{r param_simul} f@0: num_simul <- 10000 f@0: num_folds <- 4 f@0: seed <- 1986 f@0: ``` f@0: f@0: ```{r get_info} f@0: f@0: get_info_list <- function(samples_list){ f@0: f@0: aux <- numeric(length(samples_list)) f@0: df <- f@0: data.frame(ex_tr = aux, ar_tr = aux, f@0: ex_te_orig = aux, ar_te_orig = aux, f@0: ex_te_filt = aux, ar_te_filt = aux) f@0: f@0: for (i in 1:length(samples_list)){ f@0: df[i,] <- get_info(samples_list[[i]]) f@0: } f@0: f@0: df f@0: f@0: } f@0: f@0: get_info <- function(samples){ f@0: f@0: ar_tr <- f@0: get_artists(samples$train, unique_artist = T) f@0: ex_filt <- f@0: filter_excerpts(samples$test, ar_tr) f@0: f@0: data.frame( f@0: ex_tr = length(samples$train), f@0: ar_tr = length(ar_tr), f@0: ex_te_orig = length(samples$test), f@0: ar_te_orig = length(get_artists(samples$test, unique_artists = T)), f@0: ex_te_filt = length(ex_filt), f@0: ar_te_filt = length(get_artists(ex_filt, unique_artist = T)) f@0: ) f@0: f@0: } f@0: f@0: ``` f@0: f@0: ```{r bs_no_strat, eval = F} f@0: set.seed(seed) f@0: max_iter <- num_simul f@0: f@0: bs_no_strat <- get_samples('bs', iter = max_iter) f@0: ``` f@0: ```{r bs_no_strat_info, eval = F} f@0: bs_no_strat_info <- get_info_list(bs_no_strat) f@0: summary(bs_no_strat_info) f@0: ``` f@0: f@0: f@0: ```{r cv_example, eval = F} f@0: f@0: set.seed(seed) f@0: f@0: max_iter <- num_simul / num_folds f@0: cv_no_strat <- vector("list", max_iter) f@0: for (i in 1:max_iter){ f@0: cv_no_strat[[i]] <- get_samples('cv', iter = num_folds) f@0: } f@0: f@0: cv_no_strat <- unlist(cv_no_strat, recursive = F) f@0: ``` f@0: ```{r cv_no_strat_info, eval = F} f@0: cv_no_strat_info <- get_info_list(cv_no_strat) f@0: summary(cv_no_strat_info) f@0: ``` f@0: f@0: f@0: ```{r compare_no_strat, eval = F} f@0: bs_no_strat_info_plot <- f@0: bs_no_strat_info %>% f@0: melt %>% f@0: mutate(mode = 'bs') f@0: cv_no_strat_info_plot <- f@0: cv_no_strat_info %>% f@0: melt %>% f@0: mutate(mode = 'cv') f@0: f@0: info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>% f@0: select(mode, variable, value) f@0: names(info_no_strat_plot) <- c("mode", "set", "num") f@0: ``` f@0: f@0: ```{r plot_ex_no_strat} f@0: f@0: info_no_strat_plot_ex <- info_no_strat_plot %>% f@0: filter(grepl('ex', set)) %>% f@0: filter(!grepl('tr', set)) f@0: ggplot(info_no_strat_plot_ex, f@0: aes(x = num, y = ..count.., color = mode, linetype = set)) + f@0: geom_density() + f@0: scale_linetype_manual(values=c("twodash", "dotted")) + f@0: xlim(0, 500) f@0: f@0: ggplot(info_no_strat_plot_ex, f@0: aes(mode, num, color = set)) + f@0: geom_boxplot() + f@0: ylim(0, 500) f@0: f@0: ``` f@0: f@0: ```{r plot_ar_no_strat} f@0: f@0: info_no_strat_plot_ar <- info_no_strat_plot %>% f@0: filter(grepl('ar', set)) f@0: f@0: ggplot(info_no_strat_plot_ar, f@0: aes(x = num, y = ..count.., color = mode, linetype = set)) + f@0: geom_density() + f@0: scale_linetype_manual(values = c("solid", "twodash", "dotted")) + f@0: xlim(0, 500) f@0: f@0: ggplot(info_no_strat_plot_ar, f@0: aes(mode, num, color = set)) + f@0: geom_boxplot() + f@0: ylim(0, 500) f@0: ``` f@0: f@0: ```{r bs_strat, eval = F} f@0: set.seed(seed) f@0: max_iter <- num_simul f@0: f@0: bs_strat <- f@0: get_samples('bs', iter = max_iter, stratified = T) f@0: ``` f@0: ```{r bs_strat_info, eval = F} f@0: bs_strat_info <- get_info_list(bs_strat) f@0: summary(bs_strat_info) f@0: ``` f@0: f@0: f@0: ```{r cv_strat, eval = F} f@0: set.seed(seed) f@0: f@0: max_iter <- num_simul / num_folds f@0: cv_strat <- vector("list", max_iter) f@0: for (i in 1:max_iter){ f@0: cv_strat[[i]] <- f@0: get_samples('cv', iter = num_folds, stratified = T) f@0: } f@0: cv_strat <- unlist(cv_strat, recursive = F) f@0: ``` f@0: ```{r cv_strat_info} f@0: cv_strat_info <- get_info_list(cv_strat) f@0: summary(cv_strat_info) f@0: ``` f@0: f@0: ```{r compare_strat} f@0: bs_strat_info_plot <- f@0: bs_strat_info %>% f@0: melt %>% f@0: mutate(mode = 'bs') f@0: cv_strat_info_plot <- f@0: cv_strat_info %>% f@0: melt %>% f@0: mutate(mode = 'cv') f@0: f@0: info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>% f@0: select(mode, variable, value) f@0: names(info_strat_plot) <- c("mode", "set", "num") f@0: ``` f@0: f@0: ```{r plot_ex_strat} f@0: f@0: info_strat_plot_ex <- info_strat_plot %>% f@0: filter(grepl('ex', set)) %>% f@0: filter(!grepl('tr', set)) f@0: f@0: ggplot(info_strat_plot_ex, f@0: aes(mode, num, color = set)) + f@0: geom_boxplot() + f@0: ylim(0, 500) f@0: f@0: ``` f@0: f@0: ```{r plot_ar_strat} f@0: f@0: info_strat_plot_ar <- info_strat_plot %>% f@0: filter(grepl('ar', set)) f@0: f@0: ggplot(info_strat_plot_ar, f@0: aes(x = num, y = ..count.., color = mode, linetype = set)) + f@0: geom_density() + f@0: scale_linetype_manual(values = c("solid", "twodash", "dotted")) + f@0: xlim(0, 500) f@0: f@0: ggplot(info_strat_plot_ar, f@0: aes(mode, num, color = set)) + f@0: geom_boxplot() + f@0: ylim(0, 500) f@0: ``` f@0: f@0: f@0: f@0: f@0: f@0: ## Full Sets f@0: f@0: ```{r create_sets, eval = F} f@0: create_set_df <- function(a_list, num_sets = NULL, set = 'train'){ f@0: f@0: if(is.null(num_sets)) num_sets <- length(a_list) f@0: mode <- deparse(substitute(a_list)) f@0: f@0: df <- data.frame(stringsAsFactors = F) f@0: f@0: for (i in 1:num_sets){ f@0: df <- rbind(df, f@0: data.frame( f@0: mode = mode, f@0: iteration = i, f@0: ex_id = a_list[[i]][[set]], f@0: stringsAsFactors = F f@0: ) f@0: ) f@0: } f@0: f@0: df %>% f@0: separate(mode, c("mode", "strat"), f@0: remove = T, extra = "drop") %>% f@0: mutate(strat = !(strat == 'no')) %>% f@0: select(mode, strat, iteration, ex_id) f@0: f@0: } f@0: ``` f@0: f@0: ```{r train_sets} f@0: train_sets <- rbind( f@0: create_set_df(cv_no_strat, num_sets = NULL, set = 'train'), f@0: create_set_df(bs_no_strat, num_sets = NULL, set = 'train'), f@0: create_set_df(cv_strat, num_sets = NULL, set = 'train'), f@0: create_set_df(bs_strat, num_sets = NULL, set = 'train') f@0: ) f@0: ``` f@0: f@0: ```{r test_sets} f@0: test_sets <- rbind( f@0: create_set_df(cv_no_strat, num_sets = NULL, set = 'test'), f@0: create_set_df(bs_no_strat, num_sets = NULL, set = 'test'), f@0: create_set_df(cv_strat, num_sets = NULL, set = 'test'), f@0: create_set_df(bs_strat, num_sets = NULL, set = 'test') f@0: ) f@0: ``` f@0: f@0: f@0: ```{r get_train_artists} f@0: train_sets_artists <- train_sets %>% f@0: inner_join(get_excerpts_artists(), by = c('ex_id')) %>% f@0: group_by(mode, strat, iteration) %>% f@0: select(mode, strat, iteration, artist_id) %>% f@0: unique() %>% f@0: ungroup() f@0: ``` f@0: f@0: ```{r get_test_artists} f@0: test_sets_artists <- test_sets %>% f@0: inner_join(get_excerpts_artists(), by = c('ex_id')) %>% f@0: ungroup() f@0: ``` f@0: f@0: ```{r filter_test_sets} f@0: f@0: filt_test_sets <- data.frame(stringsAsFactors = F) f@0: f@0: combinations <- unique(train_sets_artists %>% select(mode, strat, iteration)) f@0: f@0: f@0: for (row in 1:nrow(combinations)){ f@0: f@0: the_mode <- combinations[row, 'mode'] %>% f@0: unlist() %>% unname() f@0: the_strat <- combinations[row, 'strat'] %>% f@0: unlist() %>% unname() f@0: the_iteration <- combinations[row, 'iteration'] %>% f@0: unlist() %>% unname() f@0: f@0: unique_artists <- train_sets_artists %>% f@0: filter(mode == the_mode, strat == the_strat, f@0: iteration == the_iteration) %>% f@0: select(artist_id) %>% f@0: unlist() %>% f@0: unname() f@0: f@0: filt_test_ex <- test_sets_artists %>% f@0: filter(mode == the_mode, strat == the_strat, f@0: iteration == the_iteration, !(artist_id %in% unique_artists)) %>% f@0: select(ex_id) %>% f@0: unlist() f@0: f@0: filt_test_sets <- rbind(filt_test_sets, f@0: test_sets %>% f@0: filter(mode == the_mode, strat == the_strat, f@0: iteration == the_iteration, f@0: ex_id %in% filt_test_ex)) f@0: f@0: } f@0: ``` f@0: f@0: f@0: ### Class-wise analysis f@0: f@0: ```{r bs_strat_class} f@0: info_test <- data.frame() f@0: info_filt <- data.frame() f@0: f@0: for (i in 1:2){ f@0: info_test[i,] <- table(get_classes( f@0: bs_strat[[i]]$test, unique_classes = F) f@0: ) f@0: info_filt[i,] <- table(get_classes( f@0: filter_excerpts(bs_strat[[i]]$test, f@0: get_artists(bs_strat[[i]]$train)), f@0: unique_classes = F)) f@0: } f@0: ``` f@0: f@0: f@0: ## Create Sets for Prediction f@0: f@0: ```{r num_sets, eval = F} f@0: num_sets <- 40 f@0: ``` f@0: f@0: ### Concatenate "Arbitrary" Sets f@0: f@0: f@0: ```{r train_sets_pred, eval = F} f@0: train_sets_pred <- train_sets %>% f@0: filter(iteration <= 40) f@0: ``` f@0: f@0: ```{r store_train_sets, eval = F} f@0: write.csv(x = train_sets_pred, file = '../sets/train.csv', f@0: row.names = F) f@0: ``` f@0: f@0: ### Concatenate Test Set f@0: f@0: ```{r test_sets_pred, eval = F} f@0: test_sets_pred <- test_sets %>% f@0: filter(iteration <= 40) f@0: ``` f@0: f@0: f@0: ```{r store_test_sets, eval = F} f@0: write.csv(x = test_sets_pred, file = '../sets/test.csv', f@0: row.names = F) f@0: ```