annotate sampling/strategies.R @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
rev   line source
f@0 1 ## Generic sampling function
f@0 2 get_samples <- function(num_iter = 1, mode = 'cv',
f@0 3 N = 0, num_folds = 4,
f@0 4 stratified = F, keep_prop = F,
f@0 5 db = "../db/gtzan.db", classes = NULL){
f@0 6
f@0 7 if (mode == 'cv')
f@0 8 get_cv_samples(
f@0 9 num_iter = num_iter, num_folds = num_folds,
f@0 10 stratified = stratified, db = db, classes = classes)
f@0 11 else
f@0 12 get_bs_samples(
f@0 13 N = N, num_iter = num_iter, stratified = stratified,
f@0 14 keep_prop = keep_prop, db = db, classes = classes)
f@0 15
f@0 16 }
f@0 17
f@0 18 # CROSS-VALIDATION
f@0 19
f@0 20 ## Cross-Validation sampling
f@0 21 get_cv_samples <- function(num_iter = 1, num_folds = 4, stratified = F,
f@0 22 db = "../db/gtzan.db", classes = NULL){
f@0 23
f@0 24 train <- data.frame()
f@0 25 test <- data.frame()
f@0 26
f@0 27 for (i in 1:num_iter){
f@0 28 folds <-
f@0 29 get_folds(num_folds = num_folds, stratified = stratified,
f@0 30 db = db, classes = classes)
f@0 31
f@0 32 for (j in 1:num_folds){
f@0 33
f@0 34 train <- rbind(train,
f@0 35 data.frame(ex_id = folds[[j]]$train,
f@0 36 iter = i, fold = j,
f@0 37 mode = 'cv',
f@0 38 stratified = stratified))
f@0 39 test <- rbind(test,
f@0 40 data.frame(ex_id = folds[[j]]$test,
f@0 41 iter = i, fold = j,
f@0 42 mode = 'cv',
f@0 43 stratified = stratified))
f@0 44 }
f@0 45
f@0 46 }
f@0 47
f@0 48 return(list(train = train, test = test))
f@0 49
f@0 50 }
f@0 51
f@0 52
f@0 53 # Random assignment into folds
f@0 54 .get_assignment <- function(N, num_folds){
f@0 55
f@0 56 sample(
f@0 57 as.numeric(
f@0 58 sapply(1:num_folds,
f@0 59 function(x) rep(x, N/num_folds))))
f@0 60
f@0 61 }
f@0 62
f@0 63 # Obtain folds for Cross-Validation
f@0 64 get_folds <- function(num_folds = 4, stratified = F,
f@0 65 db = "../db/gtzan.db", classes = NULL){
f@0 66
f@0 67 res <- get_excerpts_classes(db = db, classes) %>%
f@0 68 arrange(class_id, ex_id_class)
f@0 69
f@0 70 if(stratified){
f@0 71 assignment <- numeric(0)
f@0 72 for(i in unique(res$class_id)){
f@0 73 N_class <-
f@0 74 filter(res, class_id == i) %>%
f@0 75 nrow()
f@0 76 assignment <-
f@0 77 c(assignment, .get_assignment(N_class, num_folds))
f@0 78 }
f@0 79 }
f@0 80 else
f@0 81 assignment <- .get_assignment(nrow(res), num_folds)
f@0 82
f@0 83 folds <- vector("list", num_folds)
f@0 84 aux <- res$ex_id
f@0 85
f@0 86 for (i in 1:num_folds){
f@0 87 folds[[i]][['test']] <-
f@0 88 aux[which(assignment == i)]
f@0 89 folds[[i]][['train']] <-
f@0 90 aux[which(assignment != i)]
f@0 91 }
f@0 92
f@0 93 folds
f@0 94
f@0 95 }
f@0 96
f@0 97
f@0 98 # BOOTSTRAP
f@0 99
f@0 100 ## Boostrap sampling
f@0 101 get_bs_samples <- function(N, num_iter, stratified = F, keep_prop = F,
f@0 102 db = "../db/gtzan.db", classes = NULL,
f@0 103 hold_out_ex_ids = NULL){
f@0 104
f@0 105 res <- get_excerpts_classes(db, classes)
f@0 106
f@0 107 if (N == 0 | is.null(N))
f@0 108 N <- nrow(res)
f@0 109
f@0 110 train <- data.frame()
f@0 111 test <- data.frame()
f@0 112
f@0 113 for (i in 1:num_iter){
f@0 114 train_test <-
f@0 115 get_bs_sample(N, stratified, keep_prop, db, classes, res,
f@0 116 hold_out_ex_ids)
f@0 117 train <- rbind(train,
f@0 118 data.frame(ex_id = train_test$train,
f@0 119 iter = i,
f@0 120 mode = 'bs',
f@0 121 stratified = stratified,
f@0 122 keep_prop = keep_prop))
f@0 123 test <- rbind(test,
f@0 124 data.frame(ex_id = train_test$test,
f@0 125 iter = i,
f@0 126 mode = 'bs',
f@0 127 stratified = stratified,
f@0 128 keep_prop = keep_prop))
f@0 129 }
f@0 130
f@0 131 return(list(train = train, test = test))
f@0 132
f@0 133 }
f@0 134
f@0 135 ## Correct proportions for test set stratification
f@0 136 .correct_prop <- function(samples, reference_samples,
f@0 137 db = "../db/gtzan.db"){
f@0 138
f@0 139 ref <- as.factor(
f@0 140 get_classes(reference_samples, unique_classes = F, db = db))
f@0 141 classes <- factor(
f@0 142 get_classes(samples, unique_classes = F, db = db),
f@0 143 levels = levels(ref))
f@0 144
f@0 145 ref_counts <- table(ref)
f@0 146 ref_prop <- ref_counts / sum(ref_counts)
f@0 147
f@0 148 prior_counts <- table(classes)
f@0 149 post_counts <- round(ref_prop * min(table(classes) / ref_prop))
f@0 150
f@0 151 post_samples <- numeric(0)
f@0 152 for (class in levels(classes)){
f@0 153 post_samples <-
f@0 154 c(post_samples,
f@0 155 sample(samples[which(classes == class)],
f@0 156 size = post_counts[[class]]))
f@0 157 }
f@0 158
f@0 159 post_samples
f@0 160
f@0 161 }
f@0 162
f@0 163 ## Bootstrap single train/test pair
f@0 164 get_bs_sample <- function(N, stratified = F, keep_prop = F,
f@0 165 db = "../db/gtzan.db", classes = NULL,
f@0 166 df = NULL,
f@0 167 hold_out_ex_ids = NULL
f@0 168 ){
f@0 169
f@0 170 if(is.null(df))
f@0 171 df <- get_excerpts_classes(db, classes)
f@0 172
f@0 173 n_ex_class <- table(df$class_id)
f@0 174 print(n_ex_class)
f@0 175
f@0 176 if(!is.null(hold_out_ex_ids))
f@0 177 df <- df %>% filter(! (ex_id %in% hold_out_ex_ids))
f@0 178
f@0 179 train <- numeric(0)
f@0 180
f@0 181 if(stratified){
f@0 182 for(j in unique(df$class_id)){
f@0 183 ex_class <- filter(df, class_id == j) %>%
f@0 184 select(ex_id) %>%
f@0 185 unlist() %>%
f@0 186 unname()
f@0 187 train <- c(train,
f@0 188 sample(ex_class,
f@0 189 size = n_ex_class[which(names(n_ex_class) == j)],
f@0 190 replace = T))
f@0 191 }
f@0 192 }
f@0 193 else{
f@0 194 train <- sample(df$ex_id, size = N, replace = T)
f@0 195 }
f@0 196
f@0 197 test <- sort(c(df$ex_id[!(df$ex_id %in% train)], hold_out_ex_ids))
f@0 198 if(keep_prop){
f@0 199 test <- .correct_prop(test, reference = train)
f@0 200 }
f@0 201
f@0 202 return(list(train = train, test = test))
f@0 203
f@0 204 }