Mercurial > hg > confint
view db/create_db.Rmd @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
line wrap: on
line source
--- title: "Create Database" output: html_notebook --- ```{r} library(tidyverse) if(!require("RSQLite")){ install.packages("RSQLite") library("RSQLite") } ``` ```{r read_csv} filename <- '../metadata/GTZANindex.csv' data <- read.csv(filename, stringsAsFactors = F) ``` ```{r fill_empty} data <- data %>% mutate( artist_list = ifelse( artist_list == '', paste0("Unknown Artist (", class, "_", ex_id_class, ")"), artist_list), track_name = ifelse( track_name == '', paste0("Unknown Track (", class, "_", ex_id_class, ")"), track_name) ) ``` ```{r reconstruct_file} ext <- 'au' get_filename <- function(class, ex_id_class, ext = 'wav'){ paste0(class, '.', ifelse(ex_id_class < 10, "0000", "000"), ex_id_class, '.', ext) } ``` ```{r create_rec_id} rep <- read.csv(file = 'rep.csv') data$rec_id <- sapply(1:max(data$ex_id), function(x) ifelse(x %in% rep$ex_id, { i <- rep[which(x == rep$ex_id), 'rep_id'] min(rep[which(rep$rep_id == i), 'ex_id'])}, x)) ``` ## Create Tables ```{r excerpts} excerpts <- data %>% select(ex_id, rec_id) ``` ```{r recordings} recordings <- data %>% select(rec_id, track_name) %>% rename(title = track_name) %>% unique() ``` ```{r artists} # Find and split individual artists from artist lists artist_list <- data %>% select(artist_list) %>% unlist() artists <- data.frame(artist_id = numeric(), artist_name = character(), stringsAsFactors = F) recordings_artists <- data.frame(rec_id = numeric(), artist_id = numeric()) for (i in 1:nrow(data)){ aux <- unname( sapply( unlist(strsplit(data[i, 'artist_list'], split = ';')), trimws)) for (j in 1:length(aux)){ if (any(aux[j] %in% artists$artist_name)){ artist_id <- as.numeric( artists[artists$artist_name == aux[j], 'artist_id']) } else { artist_id <- ifelse(nrow(artists) < 1, 1, max(artists$artist_id) + 1) artists <- rbind(artists, data.frame(artist_id = artist_id, artist_name = unname(aux[j]), stringsAsFactors = F)) } recordings_artists <- rbind(recordings_artists, data.frame(rec_id = as.numeric(data[i, 'rec_id']), artist_id = artist_id)) } } recordings_artists <- unique(recordings_artists) ``` ```{r classes} class_names <- unique(data$class) classes <- data.frame(class_id = 1:length(class_names), class = class_names, stringsAsFactors = F) excerpts_classes <- data %>% select(ex_id, class, ex_id_class) %>% inner_join(classes, by = c('class')) %>% select(ex_id, class_id, ex_id_class) ``` ```{r create_db} tables <- c( "excerpts", "recordings", "artists", "recordings_artists", "classes", "excerpts_classes" ) con <- dbConnect(SQLite(), "gtzan.db") sapply( X = tables, FUN = function(x) dbWriteTable(con, name = x, value = get(x), overwrite = T) ) dbDisconnect(con) ```