view analysis/simulate_sets.R @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line source



extract_sizes <- function(preds, sets) {
  preds %>%
    inner_join(sets, by = c('iteration', 'ex_id')) %>%
    select(iteration, ex_id, gt) %>% unique() %>%
    group_by(iteration, gt) %>%
    summarise(num_ex = n())
}

class_names <- function(preds, sets) {
  preds %>%
    inner_join(sets, by = c('iteration', 'ex_id')) %>%
    select(ex_id, gt) %>% unique()
}

names <- class_names(preds_orig, test_sets)

sizes <- extract_sizes(preds_orig, reg_sets)

simulate_set <- function(x, sizes) {
  y <- data.frame()
  for (i in 1:nrow(sizes)) {
    y <- rbind(
      y,
      x %>%
        filter(gt == sizes[i,]$gt) %>%
        sample_n(sizes[i,]$num_ex)
    )
  }
  y
}

simulate_sets <- function(x, names, sizes, num_simul = 10) {
  y <- data.frame()
  for (s in 1:num_simul) {
    for (it in unique(x$iteration)) {
      y <- rbind(
        y,
        simulate_set(x %>%
                       filter(iteration == it) %>%
                       inner_join(names, by = c('ex_id')),
                     sizes %>%
                       filter(iteration == it)) %>%
          mutate(iteration = it, simul = s))
    }}
  y
}

set.seed(1986)

simul_sets <- simulate_sets(test_sets, names, sizes, num_simul = 100)


simulate_foms <- function(preds, simul_sets) {
  foms <- data.frame()
  for (i in unique(simul_sets$simul)) {
    print(i)
    foms <- rbind(
      foms,
      get_fom_table(preds %>%
                      inner_join(simul_sets %>%
                                   filter(simul == i) %>%
                                   select(-simul),
                                 by = c('iteration', 'ex_id', 'gt'))) %>%
        ungroup() %>%
        mutate(simul = i) 
    )
  }
  foms
  
}

simul_foms <- simulate_foms(preds_orig, simul_sets)

compare <- function(x, y) {
  x %>% inner_join(
    y, by = c('iteration', 'alg', 'feat_col', 'feat_set')) %>%
    ungroup() %>%
    mutate(loe = mean_recall.x >= mean_recall.y) %>%
    summarise(sum(loe) / n()) %>% pull()
}

simulate_comparisons <- function(foms, simul_foms, test=F) {
  if (!test) {
      sapply(X = unique(simul_foms$simul),
         FUN = function(i) {
           compare(foms, simul_foms %>% filter(simul == i))}) 
  } else {
      sapply(X = unique(simul_foms$simul),
           FUN = function(i) {
             compare(simul_foms %>% filter(simul == i), foms)}) 
  }
}


reg_comparisons <- simulate_comparisons(reg_fom_iter_orig, simul_foms)

reg_test <- compare(reg_fom_iter_orig, test_fom_iter_orig)


test_comparisons <- simulate_comparisons(test_fom_iter_orig, simul_foms, test=T)

simulate_all_comparisons <- function(simul_foms) {
  all_comp <- data.frame()
  for (i in unique(simul_foms$simul)) {
    all_comp <- rbind(
      all_comp,
      data.frame(
        simul = i,
        simul_comp = unique(simul_foms$simul)[-i],
        prop = simulate_comparisons(
          simul_foms %>% filter(simul == i),
          simul_foms %>% filter(simul != i))))
  }
  all_comp
}

all_comparisons <- simulate_all_comparisons(simul_foms)

ggplot() +
  geom_density(data=all_comparisons,
               aes(x=prop, group=simul),
               color='grey85') +
  geom_density(data=all_comparisons %>%
                 group_by(simul) %>%
                 summarise(av_prop = mean(prop)),
               aes(x=av_prop), linetype='dashed') +
  geom_density(data=data.frame(prop=reg_comparisons),
               aes(x=prop)) +
  xlim(c(0, 1)) +
  theme_bw() +
  xlab('Proportion of measurements') +
  ylab('Density')