Mercurial > hg > confint
view sampling/strategies.R @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
line wrap: on
line source
## Generic sampling function get_samples <- function(num_iter = 1, mode = 'cv', N = 0, num_folds = 4, stratified = F, keep_prop = F, db = "../db/gtzan.db", classes = NULL){ if (mode == 'cv') get_cv_samples( num_iter = num_iter, num_folds = num_folds, stratified = stratified, db = db, classes = classes) else get_bs_samples( N = N, num_iter = num_iter, stratified = stratified, keep_prop = keep_prop, db = db, classes = classes) } # CROSS-VALIDATION ## Cross-Validation sampling get_cv_samples <- function(num_iter = 1, num_folds = 4, stratified = F, db = "../db/gtzan.db", classes = NULL){ train <- data.frame() test <- data.frame() for (i in 1:num_iter){ folds <- get_folds(num_folds = num_folds, stratified = stratified, db = db, classes = classes) for (j in 1:num_folds){ train <- rbind(train, data.frame(ex_id = folds[[j]]$train, iter = i, fold = j, mode = 'cv', stratified = stratified)) test <- rbind(test, data.frame(ex_id = folds[[j]]$test, iter = i, fold = j, mode = 'cv', stratified = stratified)) } } return(list(train = train, test = test)) } # Random assignment into folds .get_assignment <- function(N, num_folds){ sample( as.numeric( sapply(1:num_folds, function(x) rep(x, N/num_folds)))) } # Obtain folds for Cross-Validation get_folds <- function(num_folds = 4, stratified = F, db = "../db/gtzan.db", classes = NULL){ res <- get_excerpts_classes(db = db, classes) %>% arrange(class_id, ex_id_class) if(stratified){ assignment <- numeric(0) for(i in unique(res$class_id)){ N_class <- filter(res, class_id == i) %>% nrow() assignment <- c(assignment, .get_assignment(N_class, num_folds)) } } else assignment <- .get_assignment(nrow(res), num_folds) folds <- vector("list", num_folds) aux <- res$ex_id for (i in 1:num_folds){ folds[[i]][['test']] <- aux[which(assignment == i)] folds[[i]][['train']] <- aux[which(assignment != i)] } folds } # BOOTSTRAP ## Boostrap sampling get_bs_samples <- function(N, num_iter, stratified = F, keep_prop = F, db = "../db/gtzan.db", classes = NULL, hold_out_ex_ids = NULL){ res <- get_excerpts_classes(db, classes) if (N == 0 | is.null(N)) N <- nrow(res) train <- data.frame() test <- data.frame() for (i in 1:num_iter){ train_test <- get_bs_sample(N, stratified, keep_prop, db, classes, res, hold_out_ex_ids) train <- rbind(train, data.frame(ex_id = train_test$train, iter = i, mode = 'bs', stratified = stratified, keep_prop = keep_prop)) test <- rbind(test, data.frame(ex_id = train_test$test, iter = i, mode = 'bs', stratified = stratified, keep_prop = keep_prop)) } return(list(train = train, test = test)) } ## Correct proportions for test set stratification .correct_prop <- function(samples, reference_samples, db = "../db/gtzan.db"){ ref <- as.factor( get_classes(reference_samples, unique_classes = F, db = db)) classes <- factor( get_classes(samples, unique_classes = F, db = db), levels = levels(ref)) ref_counts <- table(ref) ref_prop <- ref_counts / sum(ref_counts) prior_counts <- table(classes) post_counts <- round(ref_prop * min(table(classes) / ref_prop)) post_samples <- numeric(0) for (class in levels(classes)){ post_samples <- c(post_samples, sample(samples[which(classes == class)], size = post_counts[[class]])) } post_samples } ## Bootstrap single train/test pair get_bs_sample <- function(N, stratified = F, keep_prop = F, db = "../db/gtzan.db", classes = NULL, df = NULL, hold_out_ex_ids = NULL ){ if(is.null(df)) df <- get_excerpts_classes(db, classes) n_ex_class <- table(df$class_id) print(n_ex_class) if(!is.null(hold_out_ex_ids)) df <- df %>% filter(! (ex_id %in% hold_out_ex_ids)) train <- numeric(0) if(stratified){ for(j in unique(df$class_id)){ ex_class <- filter(df, class_id == j) %>% select(ex_id) %>% unlist() %>% unname() train <- c(train, sample(ex_class, size = n_ex_class[which(names(n_ex_class) == j)], replace = T)) } } else{ train <- sample(df$ex_id, size = N, replace = T) } test <- sort(c(df$ex_id[!(df$ex_id %in% train)], hold_out_ex_ids)) if(keep_prop){ test <- .correct_prop(test, reference = train) } return(list(train = train, test = test)) }