diff sampling/sampling.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sampling/sampling.Rmd	Sat Jun 29 18:45:50 2019 +0100
@@ -0,0 +1,372 @@
+---
+title: "Sampling (Amendment 7/3/2018)"
+output: html_notebook
+---
+
+```{r initialisation}
+library(tidyverse)
+if(!require("RSQLite")){
+  install.packages("RSQLite")
+  library("RSQLite")
+}
+
+source('../db/access_db.R')
+source('../sampling/strategies.R')
+```
+
+## Examples
+
+```{r param_simul}
+num_simul <- 10000
+num_folds <- 4
+seed <- 1986
+```
+
+```{r get_info}
+
+get_info_list <- function(samples_list){
+  
+  aux <- numeric(length(samples_list))
+  df <- 
+    data.frame(ex_tr = aux, ar_tr = aux,
+               ex_te_orig = aux, ar_te_orig = aux,
+               ex_te_filt = aux, ar_te_filt = aux)
+  
+  for (i in 1:length(samples_list)){
+    df[i,] <- get_info(samples_list[[i]])
+  }
+  
+  df
+  
+}
+
+get_info <- function(samples){
+  
+  ar_tr <- 
+    get_artists(samples$train, unique_artist = T)
+  ex_filt <- 
+    filter_excerpts(samples$test, ar_tr)
+  
+  data.frame(
+    ex_tr = length(samples$train),
+    ar_tr = length(ar_tr),
+    ex_te_orig = length(samples$test),
+    ar_te_orig = length(get_artists(samples$test, unique_artists = T)),
+    ex_te_filt = length(ex_filt),
+    ar_te_filt = length(get_artists(ex_filt, unique_artist = T))
+  )
+
+}
+
+```
+
+```{r bs_no_strat, eval = F}
+set.seed(seed)
+max_iter <- num_simul
+
+bs_no_strat <- get_samples('bs', iter = max_iter)
+```
+```{r bs_no_strat_info, eval = F}
+bs_no_strat_info <- get_info_list(bs_no_strat)
+summary(bs_no_strat_info)
+```
+
+
+```{r cv_example, eval = F}
+
+set.seed(seed)
+
+max_iter <- num_simul / num_folds
+cv_no_strat <- vector("list", max_iter)
+for (i in 1:max_iter){
+  cv_no_strat[[i]] <- get_samples('cv', iter = num_folds)
+}
+
+cv_no_strat <- unlist(cv_no_strat, recursive = F)
+```
+```{r cv_no_strat_info, eval = F}
+cv_no_strat_info <- get_info_list(cv_no_strat)
+summary(cv_no_strat_info)
+```
+
+
+```{r compare_no_strat, eval = F}
+bs_no_strat_info_plot <- 
+  bs_no_strat_info %>%
+  melt %>%
+  mutate(mode = 'bs')
+cv_no_strat_info_plot <-
+  cv_no_strat_info %>%
+  melt %>%
+  mutate(mode = 'cv')
+
+info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>%
+  select(mode, variable, value)
+names(info_no_strat_plot) <- c("mode", "set", "num")
+```
+
+```{r plot_ex_no_strat}
+
+info_no_strat_plot_ex <- info_no_strat_plot %>%
+  filter(grepl('ex', set)) %>%
+  filter(!grepl('tr', set))
+ggplot(info_no_strat_plot_ex, 
+       aes(x = num, y = ..count.., color = mode, linetype = set)) +
+  geom_density() +
+  scale_linetype_manual(values=c("twodash", "dotted")) +
+  xlim(0, 500)
+
+ggplot(info_no_strat_plot_ex, 
+       aes(mode, num, color = set)) +
+  geom_boxplot() +
+  ylim(0, 500)
+
+```
+
+```{r plot_ar_no_strat}
+
+info_no_strat_plot_ar <- info_no_strat_plot %>%
+  filter(grepl('ar', set))
+
+ggplot(info_no_strat_plot_ar, 
+       aes(x = num, y = ..count.., color = mode, linetype = set)) +
+  geom_density() +
+  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
+  xlim(0, 500)
+
+ggplot(info_no_strat_plot_ar, 
+       aes(mode, num, color = set)) +
+  geom_boxplot() +
+  ylim(0, 500)
+```
+
+```{r bs_strat, eval = F}
+set.seed(seed)
+max_iter <- num_simul
+
+bs_strat <- 
+  get_samples('bs', iter = max_iter, stratified = T)
+```
+```{r bs_strat_info, eval = F}
+bs_strat_info <- get_info_list(bs_strat)
+summary(bs_strat_info)
+```
+
+
+```{r cv_strat, eval = F}
+set.seed(seed)
+
+max_iter <- num_simul / num_folds
+cv_strat <- vector("list", max_iter)
+for (i in 1:max_iter){
+  cv_strat[[i]] <- 
+    get_samples('cv', iter = num_folds, stratified =  T)
+}
+cv_strat <- unlist(cv_strat, recursive = F)
+```
+```{r cv_strat_info}
+cv_strat_info <- get_info_list(cv_strat)
+summary(cv_strat_info)
+```
+
+```{r compare_strat}
+bs_strat_info_plot <- 
+  bs_strat_info %>%
+  melt %>%
+  mutate(mode = 'bs')
+cv_strat_info_plot <-
+  cv_strat_info %>%
+  melt %>%
+  mutate(mode = 'cv')
+
+info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>%
+  select(mode, variable, value)
+names(info_strat_plot) <- c("mode", "set", "num")
+```
+
+```{r plot_ex_strat}
+
+info_strat_plot_ex <- info_strat_plot %>%
+  filter(grepl('ex', set)) %>%
+  filter(!grepl('tr', set))
+
+ggplot(info_strat_plot_ex, 
+       aes(mode, num, color = set)) +
+  geom_boxplot() +
+  ylim(0, 500)
+
+```
+
+```{r plot_ar_strat}
+
+info_strat_plot_ar <- info_strat_plot %>%
+  filter(grepl('ar', set))
+
+ggplot(info_strat_plot_ar, 
+       aes(x = num, y = ..count.., color = mode, linetype = set)) +
+  geom_density() +
+  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
+  xlim(0, 500)
+
+ggplot(info_strat_plot_ar, 
+       aes(mode, num, color = set)) +
+  geom_boxplot() +
+  ylim(0, 500)
+```
+
+
+
+
+
+## Full Sets
+
+```{r create_sets, eval = F}
+create_set_df <- function(a_list, num_sets = NULL, set = 'train'){
+  
+  if(is.null(num_sets)) num_sets <- length(a_list)
+  mode <- deparse(substitute(a_list))
+  
+  df <- data.frame(stringsAsFactors = F)
+  
+  for (i  in 1:num_sets){
+    df <- rbind(df,
+                data.frame(
+                  mode = mode,
+                  iteration = i,
+                  ex_id = a_list[[i]][[set]],
+                  stringsAsFactors = F
+                  )
+    )
+  }
+  
+  df %>%
+    separate(mode, c("mode", "strat"),
+             remove = T, extra = "drop") %>%
+    mutate(strat = !(strat == 'no')) %>%
+    select(mode, strat, iteration, ex_id)
+  
+}
+```
+
+```{r train_sets}
+train_sets <- rbind(
+  create_set_df(cv_no_strat, num_sets = NULL, set = 'train'),
+  create_set_df(bs_no_strat, num_sets = NULL, set = 'train'),
+  create_set_df(cv_strat, num_sets = NULL, set = 'train'),
+  create_set_df(bs_strat, num_sets = NULL, set = 'train')
+)
+```
+
+```{r test_sets}
+test_sets <- rbind(
+  create_set_df(cv_no_strat, num_sets = NULL, set = 'test'),
+  create_set_df(bs_no_strat, num_sets = NULL, set = 'test'),
+  create_set_df(cv_strat, num_sets = NULL, set = 'test'),
+  create_set_df(bs_strat, num_sets = NULL, set = 'test')
+)
+```
+
+
+```{r get_train_artists}
+train_sets_artists <- train_sets %>%
+  inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
+  group_by(mode, strat, iteration) %>%
+  select(mode, strat, iteration, artist_id) %>%
+  unique() %>%
+  ungroup()
+```
+
+```{r get_test_artists}
+test_sets_artists <- test_sets %>%
+  inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
+  ungroup()
+```
+
+```{r filter_test_sets}
+
+filt_test_sets <- data.frame(stringsAsFactors = F)
+
+combinations <- unique(train_sets_artists %>% select(mode, strat, iteration))
+
+
+for (row in 1:nrow(combinations)){
+  
+  the_mode <- combinations[row, 'mode'] %>%
+    unlist() %>% unname()
+  the_strat <- combinations[row, 'strat'] %>%
+    unlist() %>% unname()
+  the_iteration <- combinations[row, 'iteration'] %>%
+    unlist() %>% unname()
+  
+  unique_artists <- train_sets_artists %>%
+    filter(mode == the_mode, strat == the_strat,
+           iteration == the_iteration) %>%
+    select(artist_id) %>%
+    unlist() %>%
+    unname()
+  
+  filt_test_ex <- test_sets_artists %>%
+    filter(mode == the_mode, strat == the_strat, 
+           iteration == the_iteration, !(artist_id %in% unique_artists)) %>%
+    select(ex_id) %>%
+    unlist()
+  
+  filt_test_sets <- rbind(filt_test_sets,
+                          test_sets %>%
+                            filter(mode == the_mode, strat == the_strat,
+                                   iteration == the_iteration,
+                                   ex_id %in% filt_test_ex))
+  
+}
+```
+
+
+### Class-wise analysis
+
+```{r bs_strat_class}
+info_test <- data.frame()
+info_filt <- data.frame()
+
+for (i in 1:2){
+  info_test[i,] <- table(get_classes(
+    bs_strat[[i]]$test, unique_classes = F)
+    )
+  info_filt[i,] <- table(get_classes(
+    filter_excerpts(bs_strat[[i]]$test,
+                    get_artists(bs_strat[[i]]$train)),
+    unique_classes = F))
+}
+```
+
+
+## Create Sets for Prediction
+
+```{r num_sets, eval = F}
+num_sets <- 40
+```
+
+### Concatenate "Arbitrary" Sets
+
+
+```{r train_sets_pred, eval = F}
+train_sets_pred <- train_sets %>%
+  filter(iteration <= 40)
+```
+
+```{r store_train_sets, eval = F}
+write.csv(x = train_sets_pred, file = '../sets/train.csv',
+          row.names = F)
+```
+
+### Concatenate Test Set
+
+```{r test_sets_pred, eval = F}
+test_sets_pred <- test_sets %>%
+  filter(iteration <= 40)
+```
+
+
+```{r store_test_sets, eval = F}
+write.csv(x = test_sets_pred, file = '../sets/test.csv',
+          row.names = F)
+```