view sampling/sampling.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line source
---
title: "Sampling (Amendment 7/3/2018)"
output: html_notebook
---

```{r initialisation}
library(tidyverse)
if(!require("RSQLite")){
  install.packages("RSQLite")
  library("RSQLite")
}

source('../db/access_db.R')
source('../sampling/strategies.R')
```

## Examples

```{r param_simul}
num_simul <- 10000
num_folds <- 4
seed <- 1986
```

```{r get_info}

get_info_list <- function(samples_list){
  
  aux <- numeric(length(samples_list))
  df <- 
    data.frame(ex_tr = aux, ar_tr = aux,
               ex_te_orig = aux, ar_te_orig = aux,
               ex_te_filt = aux, ar_te_filt = aux)
  
  for (i in 1:length(samples_list)){
    df[i,] <- get_info(samples_list[[i]])
  }
  
  df
  
}

get_info <- function(samples){
  
  ar_tr <- 
    get_artists(samples$train, unique_artist = T)
  ex_filt <- 
    filter_excerpts(samples$test, ar_tr)
  
  data.frame(
    ex_tr = length(samples$train),
    ar_tr = length(ar_tr),
    ex_te_orig = length(samples$test),
    ar_te_orig = length(get_artists(samples$test, unique_artists = T)),
    ex_te_filt = length(ex_filt),
    ar_te_filt = length(get_artists(ex_filt, unique_artist = T))
  )

}

```

```{r bs_no_strat, eval = F}
set.seed(seed)
max_iter <- num_simul

bs_no_strat <- get_samples('bs', iter = max_iter)
```
```{r bs_no_strat_info, eval = F}
bs_no_strat_info <- get_info_list(bs_no_strat)
summary(bs_no_strat_info)
```


```{r cv_example, eval = F}

set.seed(seed)

max_iter <- num_simul / num_folds
cv_no_strat <- vector("list", max_iter)
for (i in 1:max_iter){
  cv_no_strat[[i]] <- get_samples('cv', iter = num_folds)
}

cv_no_strat <- unlist(cv_no_strat, recursive = F)
```
```{r cv_no_strat_info, eval = F}
cv_no_strat_info <- get_info_list(cv_no_strat)
summary(cv_no_strat_info)
```


```{r compare_no_strat, eval = F}
bs_no_strat_info_plot <- 
  bs_no_strat_info %>%
  melt %>%
  mutate(mode = 'bs')
cv_no_strat_info_plot <-
  cv_no_strat_info %>%
  melt %>%
  mutate(mode = 'cv')

info_no_strat_plot <- rbind(bs_no_strat_info_plot, cv_no_strat_info_plot) %>%
  select(mode, variable, value)
names(info_no_strat_plot) <- c("mode", "set", "num")
```

```{r plot_ex_no_strat}

info_no_strat_plot_ex <- info_no_strat_plot %>%
  filter(grepl('ex', set)) %>%
  filter(!grepl('tr', set))
ggplot(info_no_strat_plot_ex, 
       aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values=c("twodash", "dotted")) +
  xlim(0, 500)

ggplot(info_no_strat_plot_ex, 
       aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)

```

```{r plot_ar_no_strat}

info_no_strat_plot_ar <- info_no_strat_plot %>%
  filter(grepl('ar', set))

ggplot(info_no_strat_plot_ar, 
       aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

ggplot(info_no_strat_plot_ar, 
       aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
```

```{r bs_strat, eval = F}
set.seed(seed)
max_iter <- num_simul

bs_strat <- 
  get_samples('bs', iter = max_iter, stratified = T)
```
```{r bs_strat_info, eval = F}
bs_strat_info <- get_info_list(bs_strat)
summary(bs_strat_info)
```


```{r cv_strat, eval = F}
set.seed(seed)

max_iter <- num_simul / num_folds
cv_strat <- vector("list", max_iter)
for (i in 1:max_iter){
  cv_strat[[i]] <- 
    get_samples('cv', iter = num_folds, stratified =  T)
}
cv_strat <- unlist(cv_strat, recursive = F)
```
```{r cv_strat_info}
cv_strat_info <- get_info_list(cv_strat)
summary(cv_strat_info)
```

```{r compare_strat}
bs_strat_info_plot <- 
  bs_strat_info %>%
  melt %>%
  mutate(mode = 'bs')
cv_strat_info_plot <-
  cv_strat_info %>%
  melt %>%
  mutate(mode = 'cv')

info_strat_plot <- rbind(bs_strat_info_plot, cv_strat_info_plot) %>%
  select(mode, variable, value)
names(info_strat_plot) <- c("mode", "set", "num")
```

```{r plot_ex_strat}

info_strat_plot_ex <- info_strat_plot %>%
  filter(grepl('ex', set)) %>%
  filter(!grepl('tr', set))

ggplot(info_strat_plot_ex, 
       aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)

```

```{r plot_ar_strat}

info_strat_plot_ar <- info_strat_plot %>%
  filter(grepl('ar', set))

ggplot(info_strat_plot_ar, 
       aes(x = num, y = ..count.., color = mode, linetype = set)) +
  geom_density() +
  scale_linetype_manual(values = c("solid", "twodash", "dotted")) +
  xlim(0, 500)

ggplot(info_strat_plot_ar, 
       aes(mode, num, color = set)) +
  geom_boxplot() +
  ylim(0, 500)
```





## Full Sets

```{r create_sets, eval = F}
create_set_df <- function(a_list, num_sets = NULL, set = 'train'){
  
  if(is.null(num_sets)) num_sets <- length(a_list)
  mode <- deparse(substitute(a_list))
  
  df <- data.frame(stringsAsFactors = F)
  
  for (i  in 1:num_sets){
    df <- rbind(df,
                data.frame(
                  mode = mode,
                  iteration = i,
                  ex_id = a_list[[i]][[set]],
                  stringsAsFactors = F
                  )
    )
  }
  
  df %>%
    separate(mode, c("mode", "strat"),
             remove = T, extra = "drop") %>%
    mutate(strat = !(strat == 'no')) %>%
    select(mode, strat, iteration, ex_id)
  
}
```

```{r train_sets}
train_sets <- rbind(
  create_set_df(cv_no_strat, num_sets = NULL, set = 'train'),
  create_set_df(bs_no_strat, num_sets = NULL, set = 'train'),
  create_set_df(cv_strat, num_sets = NULL, set = 'train'),
  create_set_df(bs_strat, num_sets = NULL, set = 'train')
)
```

```{r test_sets}
test_sets <- rbind(
  create_set_df(cv_no_strat, num_sets = NULL, set = 'test'),
  create_set_df(bs_no_strat, num_sets = NULL, set = 'test'),
  create_set_df(cv_strat, num_sets = NULL, set = 'test'),
  create_set_df(bs_strat, num_sets = NULL, set = 'test')
)
```


```{r get_train_artists}
train_sets_artists <- train_sets %>%
  inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
  group_by(mode, strat, iteration) %>%
  select(mode, strat, iteration, artist_id) %>%
  unique() %>%
  ungroup()
```

```{r get_test_artists}
test_sets_artists <- test_sets %>%
  inner_join(get_excerpts_artists(), by = c('ex_id')) %>%
  ungroup()
```

```{r filter_test_sets}

filt_test_sets <- data.frame(stringsAsFactors = F)

combinations <- unique(train_sets_artists %>% select(mode, strat, iteration))


for (row in 1:nrow(combinations)){
  
  the_mode <- combinations[row, 'mode'] %>%
    unlist() %>% unname()
  the_strat <- combinations[row, 'strat'] %>%
    unlist() %>% unname()
  the_iteration <- combinations[row, 'iteration'] %>%
    unlist() %>% unname()
  
  unique_artists <- train_sets_artists %>%
    filter(mode == the_mode, strat == the_strat,
           iteration == the_iteration) %>%
    select(artist_id) %>%
    unlist() %>%
    unname()
  
  filt_test_ex <- test_sets_artists %>%
    filter(mode == the_mode, strat == the_strat, 
           iteration == the_iteration, !(artist_id %in% unique_artists)) %>%
    select(ex_id) %>%
    unlist()
  
  filt_test_sets <- rbind(filt_test_sets,
                          test_sets %>%
                            filter(mode == the_mode, strat == the_strat,
                                   iteration == the_iteration,
                                   ex_id %in% filt_test_ex))
  
}
```


### Class-wise analysis

```{r bs_strat_class}
info_test <- data.frame()
info_filt <- data.frame()

for (i in 1:2){
  info_test[i,] <- table(get_classes(
    bs_strat[[i]]$test, unique_classes = F)
    )
  info_filt[i,] <- table(get_classes(
    filter_excerpts(bs_strat[[i]]$test,
                    get_artists(bs_strat[[i]]$train)),
    unique_classes = F))
}
```


## Create Sets for Prediction

```{r num_sets, eval = F}
num_sets <- 40
```

### Concatenate "Arbitrary" Sets


```{r train_sets_pred, eval = F}
train_sets_pred <- train_sets %>%
  filter(iteration <= 40)
```

```{r store_train_sets, eval = F}
write.csv(x = train_sets_pred, file = '../sets/train.csv',
          row.names = F)
```

### Concatenate Test Set

```{r test_sets_pred, eval = F}
test_sets_pred <- test_sets %>%
  filter(iteration <= 40)
```


```{r store_test_sets, eval = F}
write.csv(x = test_sets_pred, file = '../sets/test.csv',
          row.names = F)
```