annotate db/create_db.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
rev   line source
f@0 1 ---
f@0 2 title: "Create Database"
f@0 3 output: html_notebook
f@0 4 ---
f@0 5
f@0 6 ```{r}
f@0 7 library(tidyverse)
f@0 8 if(!require("RSQLite")){
f@0 9 install.packages("RSQLite")
f@0 10 library("RSQLite")
f@0 11 }
f@0 12 ```
f@0 13
f@0 14 ```{r read_csv}
f@0 15 filename <- '../metadata/GTZANindex.csv'
f@0 16 data <- read.csv(filename, stringsAsFactors = F)
f@0 17 ```
f@0 18
f@0 19 ```{r fill_empty}
f@0 20 data <-
f@0 21 data %>%
f@0 22 mutate(
f@0 23 artist_list = ifelse(
f@0 24 artist_list == '',
f@0 25 paste0("Unknown Artist (", class, "_", ex_id_class, ")"),
f@0 26 artist_list),
f@0 27 track_name = ifelse(
f@0 28 track_name == '',
f@0 29 paste0("Unknown Track (", class, "_", ex_id_class, ")"),
f@0 30 track_name)
f@0 31 )
f@0 32 ```
f@0 33
f@0 34 ```{r reconstruct_file}
f@0 35 ext <- 'au'
f@0 36
f@0 37 get_filename <- function(class, ex_id_class, ext = 'wav'){
f@0 38 paste0(class, '.',
f@0 39 ifelse(ex_id_class < 10, "0000", "000"),
f@0 40 ex_id_class, '.', ext)
f@0 41 }
f@0 42 ```
f@0 43
f@0 44 ```{r create_rec_id}
f@0 45 rep <- read.csv(file = 'rep.csv')
f@0 46
f@0 47 data$rec_id <-
f@0 48 sapply(1:max(data$ex_id),
f@0 49 function(x) ifelse(x %in% rep$ex_id, {
f@0 50 i <- rep[which(x == rep$ex_id), 'rep_id']
f@0 51 min(rep[which(rep$rep_id == i), 'ex_id'])},
f@0 52 x))
f@0 53 ```
f@0 54
f@0 55
f@0 56 ## Create Tables
f@0 57
f@0 58 ```{r excerpts}
f@0 59 excerpts <- data %>% select(ex_id, rec_id)
f@0 60 ```
f@0 61 ```{r recordings}
f@0 62 recordings <- data %>%
f@0 63 select(rec_id, track_name) %>%
f@0 64 rename(title = track_name) %>%
f@0 65 unique()
f@0 66 ```
f@0 67 ```{r artists}
f@0 68 # Find and split individual artists from artist lists
f@0 69 artist_list <- data %>% select(artist_list) %>% unlist()
f@0 70 artists <- data.frame(artist_id = numeric(),
f@0 71 artist_name = character(),
f@0 72 stringsAsFactors = F)
f@0 73 recordings_artists <- data.frame(rec_id = numeric(),
f@0 74 artist_id = numeric())
f@0 75
f@0 76 for (i in 1:nrow(data)){
f@0 77 aux <- unname(
f@0 78 sapply(
f@0 79 unlist(strsplit(data[i, 'artist_list'], split = ';')), trimws))
f@0 80
f@0 81 for (j in 1:length(aux)){
f@0 82 if (any(aux[j] %in% artists$artist_name)){
f@0 83 artist_id <- as.numeric(
f@0 84 artists[artists$artist_name == aux[j], 'artist_id'])
f@0 85 }
f@0 86 else {
f@0 87 artist_id <- ifelse(nrow(artists) < 1, 1,
f@0 88 max(artists$artist_id) + 1)
f@0 89 artists <-
f@0 90 rbind(artists,
f@0 91 data.frame(artist_id = artist_id,
f@0 92 artist_name = unname(aux[j]),
f@0 93 stringsAsFactors = F))
f@0 94 }
f@0 95 recordings_artists <-
f@0 96 rbind(recordings_artists,
f@0 97 data.frame(rec_id = as.numeric(data[i, 'rec_id']),
f@0 98 artist_id = artist_id))
f@0 99 }
f@0 100
f@0 101 }
f@0 102
f@0 103 recordings_artists <- unique(recordings_artists)
f@0 104 ```
f@0 105
f@0 106 ```{r classes}
f@0 107 class_names <- unique(data$class)
f@0 108 classes <- data.frame(class_id = 1:length(class_names),
f@0 109 class = class_names, stringsAsFactors = F)
f@0 110 excerpts_classes <- data %>%
f@0 111 select(ex_id, class, ex_id_class) %>%
f@0 112 inner_join(classes, by = c('class')) %>%
f@0 113 select(ex_id, class_id, ex_id_class)
f@0 114 ```
f@0 115
f@0 116
f@0 117
f@0 118 ```{r create_db}
f@0 119
f@0 120 tables <- c(
f@0 121 "excerpts", "recordings", "artists", "recordings_artists",
f@0 122 "classes", "excerpts_classes"
f@0 123 )
f@0 124
f@0 125 con <- dbConnect(SQLite(), "gtzan.db")
f@0 126
f@0 127 sapply(
f@0 128 X = tables,
f@0 129 FUN = function(x)
f@0 130 dbWriteTable(con, name = x, value = get(x), overwrite = T)
f@0 131 )
f@0 132
f@0 133 dbDisconnect(con)
f@0 134 ```
f@0 135