view db/create_db.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line source
---
title: "Create Database"
output: html_notebook
---

```{r}
library(tidyverse)
if(!require("RSQLite")){
  install.packages("RSQLite")
  library("RSQLite")
}
```

```{r read_csv}
filename <- '../metadata/GTZANindex.csv'
data <- read.csv(filename, stringsAsFactors = F)
```

```{r fill_empty}
data <- 
  data %>%
  mutate(
    artist_list = ifelse(
      artist_list == '',
      paste0("Unknown Artist (", class, "_", ex_id_class, ")"), 
      artist_list),
    track_name = ifelse(
      track_name == '',
      paste0("Unknown Track (", class, "_", ex_id_class, ")"), 
      track_name)    
    )
```

```{r reconstruct_file}
ext <- 'au'

get_filename <- function(class, ex_id_class, ext = 'wav'){
  paste0(class, '.',
         ifelse(ex_id_class < 10, "0000", "000"),
         ex_id_class, '.', ext)
}
```

```{r create_rec_id}
rep <- read.csv(file = 'rep.csv')

data$rec_id <- 
  sapply(1:max(data$ex_id),
         function(x) ifelse(x %in% rep$ex_id, {
           i <- rep[which(x == rep$ex_id), 'rep_id']
           min(rep[which(rep$rep_id == i), 'ex_id'])},
           x))
```


## Create Tables

```{r excerpts}
excerpts <- data %>% select(ex_id, rec_id)
```
```{r recordings}
recordings <- data %>%
  select(rec_id, track_name) %>%
  rename(title = track_name) %>%
  unique()
```
```{r artists}
# Find and split individual artists from artist lists
artist_list <- data %>% select(artist_list) %>% unlist()
artists <- data.frame(artist_id = numeric(), 
                      artist_name = character(),
                      stringsAsFactors = F)
recordings_artists <- data.frame(rec_id = numeric(), 
                                 artist_id = numeric())

for (i in 1:nrow(data)){
  aux <- unname(
    sapply(
    unlist(strsplit(data[i, 'artist_list'], split = ';')), trimws))

  for (j in 1:length(aux)){
   if (any(aux[j] %in% artists$artist_name)){
      artist_id <- as.numeric(
        artists[artists$artist_name == aux[j], 'artist_id'])
    }
    else {
      artist_id <- ifelse(nrow(artists) < 1, 1, 
                          max(artists$artist_id) + 1)
      artists <- 
        rbind(artists,
              data.frame(artist_id = artist_id, 
                         artist_name = unname(aux[j]),
                         stringsAsFactors = F))
    }
    recordings_artists <-
      rbind(recordings_artists,
            data.frame(rec_id = as.numeric(data[i, 'rec_id']),
                       artist_id = artist_id))
  }
  
}

recordings_artists <- unique(recordings_artists)
```

```{r classes}
class_names <- unique(data$class)
classes <- data.frame(class_id = 1:length(class_names),
                      class = class_names, stringsAsFactors = F)
excerpts_classes <- data %>%
  select(ex_id, class, ex_id_class) %>%
  inner_join(classes, by = c('class')) %>% 
    select(ex_id, class_id, ex_id_class)
```



```{r create_db}

tables <- c(
  "excerpts", "recordings", "artists", "recordings_artists",
  "classes", "excerpts_classes"
)

con <- dbConnect(SQLite(), "gtzan.db")

sapply(
  X = tables, 
  FUN = function(x) 
    dbWriteTable(con, name = x,  value = get(x),  overwrite = T)
)

dbDisconnect(con)
```