Mercurial > hg > confint
view sampling/sampling.Rmd @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
line wrap: on
line source
--- title: "Sampling (Amendment 7/3/2018)" output: html_notebook --- ```{r initialisation} library(tidyverse) if(!require("RSQLite")){ install.packages("RSQLite") library("RSQLite") } source('../db/access_db.R') source('../sampling/strategies.R') ``` ## Examples ```{r param_simul} num_simul <- 10000 num_folds <- 4 seed <- 1986 ``` ```{r get_info} get_info_list <- function(samples_list){ aux <- numeric(length(samples_list)) df <- data.frame(ex_tr = aux, ar_tr = aux, ex_te_orig = aux, ar_te_orig = aux, ex_te_filt = aux, ar_te_filt = aux) for (i in 1:length(samples_list)){ df[i,] <- get_info(samples_list[[i]]) } df } get_info <- function(samples){ ar_tr <- get_artists(samples$train, unique_artist = T) ex_filt <- filter_excerpts(samples$test, ar_tr) data.frame( ex_tr = length(samples$train), ar_tr = length(ar_tr), ex_te_orig = length(samples$test), ar_te_orig = length(get_artists(samples$test, unique_artists = T)), ex_te_filt = length(ex_filt), ar_te_filt = length(get_artists(ex_filt, unique_artist = T)) ) } ``` ```{r bs_no_strat, eval = F} set.seed(seed) max_iter <- num_simul bs_no_strat <- get_samples('bs', iter = max_iter) ``` ```{r bs_no_strat_info, eval = F} bs_no_strat_info <- get_info_list(bs_no_strat) summary(bs_no_strat_info) ``` ```{r cv_example, eval = F} set.seed(seed) max_iter <- num_simul / num_folds cv_no_strat <- vector("list", max_iter) for (i in 1:max_iter){ cv_no_strat[[i]] <- get_samples('cv', iter = num_folds) } cv_no_strat <- unlist(cv_no_strat, recursive = F) ``` ```{r cv_no_strat_info, eval = F} cv_no_strat_info <- get_info_list(cv_no_strat) summary(cv_no_strat_info) ``` ```{r compare_no_strat, eval = F} bs_no_strat_info_plot <- bs_no_strat_info %>% melt %>% mutate(mode = 'bs') cv_no_strat_info_plot <- cv_no_strat_info %>% melt %>% mutate(mode = 'cv') info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>% select(mode, variable, value) names(info_no_strat_plot) <- c("mode", "set", "num") ``` ```{r plot_ex_no_strat} info_no_strat_plot_ex <- info_no_strat_plot %>% filter(grepl('ex', set)) %>% filter(!grepl('tr', set)) ggplot(info_no_strat_plot_ex, aes(x = num, y = ..count.., color = mode, linetype = set)) + geom_density() + scale_linetype_manual(values=c("twodash", "dotted")) + xlim(0, 500) ggplot(info_no_strat_plot_ex, aes(mode, num, color = set)) + geom_boxplot() + ylim(0, 500) ``` ```{r plot_ar_no_strat} info_no_strat_plot_ar <- info_no_strat_plot %>% filter(grepl('ar', set)) ggplot(info_no_strat_plot_ar, aes(x = num, y = ..count.., color = mode, linetype = set)) + geom_density() + scale_linetype_manual(values = c("solid", "twodash", "dotted")) + xlim(0, 500) ggplot(info_no_strat_plot_ar, aes(mode, num, color = set)) + geom_boxplot() + ylim(0, 500) ``` ```{r bs_strat, eval = F} set.seed(seed) max_iter <- num_simul bs_strat <- get_samples('bs', iter = max_iter, stratified = T) ``` ```{r bs_strat_info, eval = F} bs_strat_info <- get_info_list(bs_strat) summary(bs_strat_info) ``` ```{r cv_strat, eval = F} set.seed(seed) max_iter <- num_simul / num_folds cv_strat <- vector("list", max_iter) for (i in 1:max_iter){ cv_strat[[i]] <- get_samples('cv', iter = num_folds, stratified = T) } cv_strat <- unlist(cv_strat, recursive = F) ``` ```{r cv_strat_info} cv_strat_info <- get_info_list(cv_strat) summary(cv_strat_info) ``` ```{r compare_strat} bs_strat_info_plot <- bs_strat_info %>% melt %>% mutate(mode = 'bs') cv_strat_info_plot <- cv_strat_info %>% melt %>% mutate(mode = 'cv') info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>% select(mode, variable, value) names(info_strat_plot) <- c("mode", "set", "num") ``` ```{r plot_ex_strat} info_strat_plot_ex <- info_strat_plot %>% filter(grepl('ex', set)) %>% filter(!grepl('tr', set)) ggplot(info_strat_plot_ex, aes(mode, num, color = set)) + geom_boxplot() + ylim(0, 500) ``` ```{r plot_ar_strat} info_strat_plot_ar <- info_strat_plot %>% filter(grepl('ar', set)) ggplot(info_strat_plot_ar, aes(x = num, y = ..count.., color = mode, linetype = set)) + geom_density() + scale_linetype_manual(values = c("solid", "twodash", "dotted")) + xlim(0, 500) ggplot(info_strat_plot_ar, aes(mode, num, color = set)) + geom_boxplot() + ylim(0, 500) ``` ## Full Sets ```{r create_sets, eval = F} create_set_df <- function(a_list, num_sets = NULL, set = 'train'){ if(is.null(num_sets)) num_sets <- length(a_list) mode <- deparse(substitute(a_list)) df <- data.frame(stringsAsFactors = F) for (i in 1:num_sets){ df <- rbind(df, data.frame( mode = mode, iteration = i, ex_id = a_list[[i]][[set]], stringsAsFactors = F ) ) } df %>% separate(mode, c("mode", "strat"), remove = T, extra = "drop") %>% mutate(strat = !(strat == 'no')) %>% select(mode, strat, iteration, ex_id) } ``` ```{r train_sets} train_sets <- rbind( create_set_df(cv_no_strat, num_sets = NULL, set = 'train'), create_set_df(bs_no_strat, num_sets = NULL, set = 'train'), create_set_df(cv_strat, num_sets = NULL, set = 'train'), create_set_df(bs_strat, num_sets = NULL, set = 'train') ) ``` ```{r test_sets} test_sets <- rbind( create_set_df(cv_no_strat, num_sets = NULL, set = 'test'), create_set_df(bs_no_strat, num_sets = NULL, set = 'test'), create_set_df(cv_strat, num_sets = NULL, set = 'test'), create_set_df(bs_strat, num_sets = NULL, set = 'test') ) ``` ```{r get_train_artists} train_sets_artists <- train_sets %>% inner_join(get_excerpts_artists(), by = c('ex_id')) %>% group_by(mode, strat, iteration) %>% select(mode, strat, iteration, artist_id) %>% unique() %>% ungroup() ``` ```{r get_test_artists} test_sets_artists <- test_sets %>% inner_join(get_excerpts_artists(), by = c('ex_id')) %>% ungroup() ``` ```{r filter_test_sets} filt_test_sets <- data.frame(stringsAsFactors = F) combinations <- unique(train_sets_artists %>% select(mode, strat, iteration)) for (row in 1:nrow(combinations)){ the_mode <- combinations[row, 'mode'] %>% unlist() %>% unname() the_strat <- combinations[row, 'strat'] %>% unlist() %>% unname() the_iteration <- combinations[row, 'iteration'] %>% unlist() %>% unname() unique_artists <- train_sets_artists %>% filter(mode == the_mode, strat == the_strat, iteration == the_iteration) %>% select(artist_id) %>% unlist() %>% unname() filt_test_ex <- test_sets_artists %>% filter(mode == the_mode, strat == the_strat, iteration == the_iteration, !(artist_id %in% unique_artists)) %>% select(ex_id) %>% unlist() filt_test_sets <- rbind(filt_test_sets, test_sets %>% filter(mode == the_mode, strat == the_strat, iteration == the_iteration, ex_id %in% filt_test_ex)) } ``` ### Class-wise analysis ```{r bs_strat_class} info_test <- data.frame() info_filt <- data.frame() for (i in 1:2){ info_test[i,] <- table(get_classes( bs_strat[[i]]$test, unique_classes = F) ) info_filt[i,] <- table(get_classes( filter_excerpts(bs_strat[[i]]$test, get_artists(bs_strat[[i]]$train)), unique_classes = F)) } ``` ## Create Sets for Prediction ```{r num_sets, eval = F} num_sets <- 40 ``` ### Concatenate "Arbitrary" Sets ```{r train_sets_pred, eval = F} train_sets_pred <- train_sets %>% filter(iteration <= 40) ``` ```{r store_train_sets, eval = F} write.csv(x = train_sets_pred, file = '../sets/train.csv', row.names = F) ``` ### Concatenate Test Set ```{r test_sets_pred, eval = F} test_sets_pred <- test_sets %>% filter(iteration <= 40) ``` ```{r store_test_sets, eval = F} write.csv(x = test_sets_pred, file = '../sets/test.csv', row.names = F) ```