Mercurial > hg > confint
diff sampling/sampling.Rmd @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sampling/sampling.Rmd Sat Jun 29 18:45:50 2019 +0100 @@ -0,0 +1,372 @@ +--- +title: "Sampling (Amendment 7/3/2018)" +output: html_notebook +--- + +```{r initialisation} +library(tidyverse) +if(!require("RSQLite")){ + install.packages("RSQLite") + library("RSQLite") +} + +source('../db/access_db.R') +source('../sampling/strategies.R') +``` + +## Examples + +```{r param_simul} +num_simul <- 10000 +num_folds <- 4 +seed <- 1986 +``` + +```{r get_info} + +get_info_list <- function(samples_list){ + + aux <- numeric(length(samples_list)) + df <- + data.frame(ex_tr = aux, ar_tr = aux, + ex_te_orig = aux, ar_te_orig = aux, + ex_te_filt = aux, ar_te_filt = aux) + + for (i in 1:length(samples_list)){ + df[i,] <- get_info(samples_list[[i]]) + } + + df + +} + +get_info <- function(samples){ + + ar_tr <- + get_artists(samples$train, unique_artist = T) + ex_filt <- + filter_excerpts(samples$test, ar_tr) + + data.frame( + ex_tr = length(samples$train), + ar_tr = length(ar_tr), + ex_te_orig = length(samples$test), + ar_te_orig = length(get_artists(samples$test, unique_artists = T)), + ex_te_filt = length(ex_filt), + ar_te_filt = length(get_artists(ex_filt, unique_artist = T)) + ) + +} + +``` + +```{r bs_no_strat, eval = F} +set.seed(seed) +max_iter <- num_simul + +bs_no_strat <- get_samples('bs', iter = max_iter) +``` +```{r bs_no_strat_info, eval = F} +bs_no_strat_info <- get_info_list(bs_no_strat) +summary(bs_no_strat_info) +``` + + +```{r cv_example, eval = F} + +set.seed(seed) + +max_iter <- num_simul / num_folds +cv_no_strat <- vector("list", max_iter) +for (i in 1:max_iter){ + cv_no_strat[[i]] <- get_samples('cv', iter = num_folds) +} + +cv_no_strat <- unlist(cv_no_strat, recursive = F) +``` +```{r cv_no_strat_info, eval = F} +cv_no_strat_info <- get_info_list(cv_no_strat) +summary(cv_no_strat_info) +``` + + +```{r compare_no_strat, eval = F} +bs_no_strat_info_plot <- + bs_no_strat_info %>% + melt %>% + mutate(mode = 'bs') +cv_no_strat_info_plot <- + cv_no_strat_info %>% + melt %>% + mutate(mode = 'cv') + +info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>% + select(mode, variable, value) +names(info_no_strat_plot) <- c("mode", "set", "num") +``` + +```{r plot_ex_no_strat} + +info_no_strat_plot_ex <- info_no_strat_plot %>% + filter(grepl('ex', set)) %>% + filter(!grepl('tr', set)) +ggplot(info_no_strat_plot_ex, + aes(x = num, y = ..count.., color = mode, linetype = set)) + + geom_density() + + scale_linetype_manual(values=c("twodash", "dotted")) + + xlim(0, 500) + +ggplot(info_no_strat_plot_ex, + aes(mode, num, color = set)) + + geom_boxplot() + + ylim(0, 500) + +``` + +```{r plot_ar_no_strat} + +info_no_strat_plot_ar <- info_no_strat_plot %>% + filter(grepl('ar', set)) + +ggplot(info_no_strat_plot_ar, + aes(x = num, y = ..count.., color = mode, linetype = set)) + + geom_density() + + scale_linetype_manual(values = c("solid", "twodash", "dotted")) + + xlim(0, 500) + +ggplot(info_no_strat_plot_ar, + aes(mode, num, color = set)) + + geom_boxplot() + + ylim(0, 500) +``` + +```{r bs_strat, eval = F} +set.seed(seed) +max_iter <- num_simul + +bs_strat <- + get_samples('bs', iter = max_iter, stratified = T) +``` +```{r bs_strat_info, eval = F} +bs_strat_info <- get_info_list(bs_strat) +summary(bs_strat_info) +``` + + +```{r cv_strat, eval = F} +set.seed(seed) + +max_iter <- num_simul / num_folds +cv_strat <- vector("list", max_iter) +for (i in 1:max_iter){ + cv_strat[[i]] <- + get_samples('cv', iter = num_folds, stratified = T) +} +cv_strat <- unlist(cv_strat, recursive = F) +``` +```{r cv_strat_info} +cv_strat_info <- get_info_list(cv_strat) +summary(cv_strat_info) +``` + +```{r compare_strat} +bs_strat_info_plot <- + bs_strat_info %>% + melt %>% + mutate(mode = 'bs') +cv_strat_info_plot <- + cv_strat_info %>% + melt %>% + mutate(mode = 'cv') + +info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>% + select(mode, variable, value) +names(info_strat_plot) <- c("mode", "set", "num") +``` + +```{r plot_ex_strat} + +info_strat_plot_ex <- info_strat_plot %>% + filter(grepl('ex', set)) %>% + filter(!grepl('tr', set)) + +ggplot(info_strat_plot_ex, + aes(mode, num, color = set)) + + geom_boxplot() + + ylim(0, 500) + +``` + +```{r plot_ar_strat} + +info_strat_plot_ar <- info_strat_plot %>% + filter(grepl('ar', set)) + +ggplot(info_strat_plot_ar, + aes(x = num, y = ..count.., color = mode, linetype = set)) + + geom_density() + + scale_linetype_manual(values = c("solid", "twodash", "dotted")) + + xlim(0, 500) + +ggplot(info_strat_plot_ar, + aes(mode, num, color = set)) + + geom_boxplot() + + ylim(0, 500) +``` + + + + + +## Full Sets + +```{r create_sets, eval = F} +create_set_df <- function(a_list, num_sets = NULL, set = 'train'){ + + if(is.null(num_sets)) num_sets <- length(a_list) + mode <- deparse(substitute(a_list)) + + df <- data.frame(stringsAsFactors = F) + + for (i in 1:num_sets){ + df <- rbind(df, + data.frame( + mode = mode, + iteration = i, + ex_id = a_list[[i]][[set]], + stringsAsFactors = F + ) + ) + } + + df %>% + separate(mode, c("mode", "strat"), + remove = T, extra = "drop") %>% + mutate(strat = !(strat == 'no')) %>% + select(mode, strat, iteration, ex_id) + +} +``` + +```{r train_sets} +train_sets <- rbind( + create_set_df(cv_no_strat, num_sets = NULL, set = 'train'), + create_set_df(bs_no_strat, num_sets = NULL, set = 'train'), + create_set_df(cv_strat, num_sets = NULL, set = 'train'), + create_set_df(bs_strat, num_sets = NULL, set = 'train') +) +``` + +```{r test_sets} +test_sets <- rbind( + create_set_df(cv_no_strat, num_sets = NULL, set = 'test'), + create_set_df(bs_no_strat, num_sets = NULL, set = 'test'), + create_set_df(cv_strat, num_sets = NULL, set = 'test'), + create_set_df(bs_strat, num_sets = NULL, set = 'test') +) +``` + + +```{r get_train_artists} +train_sets_artists <- train_sets %>% + inner_join(get_excerpts_artists(), by = c('ex_id')) %>% + group_by(mode, strat, iteration) %>% + select(mode, strat, iteration, artist_id) %>% + unique() %>% + ungroup() +``` + +```{r get_test_artists} +test_sets_artists <- test_sets %>% + inner_join(get_excerpts_artists(), by = c('ex_id')) %>% + ungroup() +``` + +```{r filter_test_sets} + +filt_test_sets <- data.frame(stringsAsFactors = F) + +combinations <- unique(train_sets_artists %>% select(mode, strat, iteration)) + + +for (row in 1:nrow(combinations)){ + + the_mode <- combinations[row, 'mode'] %>% + unlist() %>% unname() + the_strat <- combinations[row, 'strat'] %>% + unlist() %>% unname() + the_iteration <- combinations[row, 'iteration'] %>% + unlist() %>% unname() + + unique_artists <- train_sets_artists %>% + filter(mode == the_mode, strat == the_strat, + iteration == the_iteration) %>% + select(artist_id) %>% + unlist() %>% + unname() + + filt_test_ex <- test_sets_artists %>% + filter(mode == the_mode, strat == the_strat, + iteration == the_iteration, !(artist_id %in% unique_artists)) %>% + select(ex_id) %>% + unlist() + + filt_test_sets <- rbind(filt_test_sets, + test_sets %>% + filter(mode == the_mode, strat == the_strat, + iteration == the_iteration, + ex_id %in% filt_test_ex)) + +} +``` + + +### Class-wise analysis + +```{r bs_strat_class} +info_test <- data.frame() +info_filt <- data.frame() + +for (i in 1:2){ + info_test[i,] <- table(get_classes( + bs_strat[[i]]$test, unique_classes = F) + ) + info_filt[i,] <- table(get_classes( + filter_excerpts(bs_strat[[i]]$test, + get_artists(bs_strat[[i]]$train)), + unique_classes = F)) +} +``` + + +## Create Sets for Prediction + +```{r num_sets, eval = F} +num_sets <- 40 +``` + +### Concatenate "Arbitrary" Sets + + +```{r train_sets_pred, eval = F} +train_sets_pred <- train_sets %>% + filter(iteration <= 40) +``` + +```{r store_train_sets, eval = F} +write.csv(x = train_sets_pred, file = '../sets/train.csv', + row.names = F) +``` + +### Concatenate Test Set + +```{r test_sets_pred, eval = F} +test_sets_pred <- test_sets %>% + filter(iteration <= 40) +``` + + +```{r store_test_sets, eval = F} +write.csv(x = test_sets_pred, file = '../sets/test.csv', + row.names = F) +```