diff analysis/analyse_sets.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/analysis/analyse_sets.Rmd	Sat Jun 29 18:45:50 2019 +0100
@@ -0,0 +1,123 @@
+---
+title: "Analyse Sets"
+output: html_notebook
+---
+
+```{r}
+source('../db/access_db.R')
+library(tidyverse)
+```
+
+
+```{r}
+# Excerpt lookups (get_* helpers presumably come from ../db/access_db.R).
+excerpts_classes <- get_excerpts_classes()
+excerpts_artist <- get_excerpts_artists()
+
+# Pair classes with artists on ex_id, then add class names via class_id.
+classes_artists <- inner_join(excerpts_classes, excerpts_artist,
+                              by = 'ex_id') %>%
+  inner_join(get_class_names(), by = 'class_id')
+```
+
+## Excerpts
+
+```{r}
+# Stack the three splits, tag each row with its set, attach class_id.
+sets_excerpts <- rbind(
+  read.csv('../sets/train_fixed.csv') %>% mutate(set = 'train'),
+  read.csv('../sets/test_fixed.csv') %>% mutate(set = 'test'),
+  read.csv('../sets/filt_fixed.csv') %>% mutate(set = 'filt')) %>%
+  inner_join(excerpts_classes, by = c('ex_id'))
+
+# Distinct ex_id per class_id/set/iter (extra CSV columns cannot inflate it).
+(sets_excerpts_table <- sets_excerpts %>%
+  group_by(class_id, set, iter) %>%
+  summarise(num_excerpts = n_distinct(ex_id)) %>%
+  ungroup())
+```
+
+
+```{r }
+
+# Order the sets for plotting and give them display labels in one factor()
+# call. Labels pair with levels positionally, so 'filt' is shown as
+# 'Pr. Test'.
+sets_excerpts_table$set <- factor(sets_excerpts_table$set,
+                                  levels = c('train', 'test', 'filt'),
+                                  labels = c('Train', 'Test', 'Pr. Test'))
+
+# Attach class names by class_id; the plot below facets on `class`.
+sets_excerpts_table <- sets_excerpts_table %>%
+  inner_join(get_class_names(), by = c('class_id'))
+```
+
+```{r fig.height = 3, fig.width = 6}
+# Violin plot of the per-iter unique-excerpt counts for each set, one panel
+# per class, with the 0.5 quantile drawn inside each violin.
+# The assignment is wrapped in parentheses (like the tables above) so the
+# figure is rendered in this chunk; a bare assignment would draw nothing.
+# NOTE: the y limits make the scale discard any count outside [0, 80].
+(excerpts_p <- ggplot(sets_excerpts_table) +
+  geom_violin(aes(x = set, y = num_excerpts, color = set, fill = set),
+              alpha = 0.6, size = 1, draw_quantiles = c(0.5)) +
+  theme_bw() +
+  scale_y_continuous(name = 'Unique Excerpts', limits = c(0, 80)) +
+  xlab('Collection') +
+  theme(axis.title.x = element_text(size = 12),
+        axis.title.y = element_text(size = 11),
+        axis.text = element_text(size = 11),
+        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
+        legend.position = 'none',
+        strip.text.x = element_text(size = 11)) +
+  facet_grid(. ~ class))
+```
+
+## Artists
+
+```{r}
+# Attach artist information to every set row via ex_id.
+sets_artists <- inner_join(sets_excerpts, excerpts_artist, by = 'ex_id')
+
+# Distinct artist_id values per class_id/set/iter; parentheses print it.
+(sets_artists_table <- sets_artists %>%
+  group_by(class_id, set, iter) %>%
+  summarise(num_artists = n_distinct(artist_id)) %>%
+  ungroup())
+```
+
+
+```{r }
+
+# Order the sets for plotting and give them display labels in one factor()
+# call. Labels pair with levels positionally, so 'filt' is shown as
+# 'Pr. Test'.
+sets_artists_table$set <- factor(sets_artists_table$set,
+                                 levels = c('train', 'test', 'filt'),
+                                 labels = c('Train', 'Test', 'Pr. Test'))
+
+# Attach class names by class_id; the plot below facets on `class`.
+sets_artists_table <- sets_artists_table %>%
+  inner_join(get_class_names(), by = c('class_id'))
+```
+
+```{r fig.height = 3, fig.width = 6}
+# Violin plot of the per-iter unique-artist counts for each set, one panel
+# per class, with the 0.5 quantile drawn inside each violin.
+# The assignment is wrapped in parentheses (like the tables above) so the
+# figure is rendered in this chunk; a bare assignment would draw nothing.
+# NOTE: the y limits make the scale discard any count outside [0, 60].
+(artists_p <- ggplot(sets_artists_table) +
+  geom_violin(aes(x = set, y = num_artists, color = set, fill = set),
+              alpha = 0.6, size = 1, draw_quantiles = c(0.5)) +
+  theme_bw() +
+  scale_y_continuous(name = 'Unique Artists', limits = c(0, 60)) +
+  xlab('Collection') +
+  theme(axis.title.x = element_text(size = 12),
+        axis.title.y = element_text(size = 11),
+        axis.text = element_text(size = 11),
+        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
+        legend.position = 'none',
+        strip.text.x = element_text(size = 11)) +
+  facet_grid(. ~ class))
+```