comparison db/create_db.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:205974c9568c
1 ---
2 title: "Create Database"
3 output: html_notebook
4 ---
5
6 ```{r}
7 library(tidyverse)
8 if(!require("RSQLite")){
9 install.packages("RSQLite")
10 library("RSQLite")
11 }
12 ```
13
14 ```{r read_csv}
15 filename <- '../metadata/GTZANindex.csv'
16 data <- read.csv(filename, stringsAsFactors = F)
17 ```
18
19 ```{r fill_empty}
20 data <-
21 data %>%
22 mutate(
23 artist_list = ifelse(
24 artist_list == '',
25 paste0("Unknown Artist (", class, "_", ex_id_class, ")"),
26 artist_list),
27 track_name = ifelse(
28 track_name == '',
29 paste0("Unknown Track (", class, "_", ex_id_class, ")"),
30 track_name)
31 )
32 ```
33
34 ```{r reconstruct_file}
35 ext <- 'au'
36
37 get_filename <- function(class, ex_id_class, ext = 'wav'){
38 paste0(class, '.',
39 ifelse(ex_id_class < 10, "0000", "000"),
40 ex_id_class, '.', ext)
41 }
42 ```
43
44 ```{r create_rec_id}
45 rep <- read.csv(file = 'rep.csv')
46
47 data$rec_id <-
48 sapply(1:max(data$ex_id),
49 function(x) ifelse(x %in% rep$ex_id, {
50 i <- rep[which(x == rep$ex_id), 'rep_id']
51 min(rep[which(rep$rep_id == i), 'ex_id'])},
52 x))
53 ```
54
55
56 ## Create Tables
57
58 ```{r excerpts}
59 excerpts <- data %>% select(ex_id, rec_id)
60 ```
61 ```{r recordings}
62 recordings <- data %>%
63 select(rec_id, track_name) %>%
64 rename(title = track_name) %>%
65 unique()
66 ```
67 ```{r artists}
68 # Find and split individual artists from artist lists
69 artist_list <- data %>% select(artist_list) %>% unlist()
70 artists <- data.frame(artist_id = numeric(),
71 artist_name = character(),
72 stringsAsFactors = F)
73 recordings_artists <- data.frame(rec_id = numeric(),
74 artist_id = numeric())
75
76 for (i in 1:nrow(data)){
77 aux <- unname(
78 sapply(
79 unlist(strsplit(data[i, 'artist_list'], split = ';')), trimws))
80
81 for (j in 1:length(aux)){
82 if (any(aux[j] %in% artists$artist_name)){
83 artist_id <- as.numeric(
84 artists[artists$artist_name == aux[j], 'artist_id'])
85 }
86 else {
87 artist_id <- ifelse(nrow(artists) < 1, 1,
88 max(artists$artist_id) + 1)
89 artists <-
90 rbind(artists,
91 data.frame(artist_id = artist_id,
92 artist_name = unname(aux[j]),
93 stringsAsFactors = F))
94 }
95 recordings_artists <-
96 rbind(recordings_artists,
97 data.frame(rec_id = as.numeric(data[i, 'rec_id']),
98 artist_id = artist_id))
99 }
100
101 }
102
103 recordings_artists <- unique(recordings_artists)
104 ```
105
106 ```{r classes}
107 class_names <- unique(data$class)
108 classes <- data.frame(class_id = 1:length(class_names),
109 class = class_names, stringsAsFactors = F)
110 excerpts_classes <- data %>%
111 select(ex_id, class, ex_id_class) %>%
112 inner_join(classes, by = c('class')) %>%
113 select(ex_id, class_id, ex_id_class)
114 ```
115
116
117
118 ```{r create_db}
119
120 tables <- c(
121 "excerpts", "recordings", "artists", "recordings_artists",
122 "classes", "excerpts_classes"
123 )
124
125 con <- dbConnect(SQLite(), "gtzan.db")
126
127 sapply(
128 X = tables,
129 FUN = function(x)
130 dbWriteTable(con, name = x, value = get(x), overwrite = T)
131 )
132
133 dbDisconnect(con)
134 ```
135