diff metadata/parse_file.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metadata/parse_file.Rmd	Sat Jun 29 18:45:50 2019 +0100
@@ -0,0 +1,70 @@
+---
+title: "GTZAN index parsing"
+output: html_notebook
+---
+
+```{r libraries}
+library(tidyverse)
+```
+
+## Reading the file
+
+```{r read_file_functions}
+# Read every line of a text file into a character vector.
+#
+# filename: description of the file, passed unchanged to file().
+# Returns a character vector with one element per line of the file.
+read_text <- function(filename) {
+  con <- file(filename)
+  # Release the connection even when readLines() fails (e.g. missing file);
+  # previously close() was only reached on the success path.
+  on.exit(close(con), add = TRUE)
+  readLines(con)
+}
+
+# Drop the lines of `text` that contain the comment marker.
+#
+# text: character vector of lines.
+# comment_char: marker; it is pasted into a regular expression as-is.
+# Returns the elements of `text` that the pattern does not match. Because of
+# the '^.*' alternative, a line is removed when the marker occurs anywhere in
+# it, not only at the start.
+remove_comments <- function(text, comment_char) {
+  pattern <- paste0("^", comment_char, "|^.*", comment_char)
+  is_comment <- grepl(pattern, text)
+  text[!is_comment]
+}
+
+# Split each line on `separator` and arrange the trimmed fields in a data frame.
+#
+# lines: character vector, one record per element.
+# separator: pattern handed to strsplit() as `split`.
+# Returns a data frame of character columns (default names X1, X2, ...) with
+# one row per line and one column per field; records with fewer fields than
+# the longest one are padded with NA. Empty input gives an empty data frame.
+parse_long_sep <- function(lines, separator) {
+  if (length(lines) == 0) {
+    return(data.frame())
+  }
+  fields <- lapply(strsplit(lines, split = separator), trimws)
+  n_fields <- max(lengths(fields))
+  # Pad every record to the same width so the records stack into a rectangle.
+  padded <- lapply(fields, `length<-`, n_fields)
+  # Build the matrix explicitly: relying on sapply() simplification produced
+  # a different shape when all records had the same number of fields (or
+  # when there was a single record) than for ragged input.
+  cells <- matrix(
+    as.character(unlist(padded)),
+    nrow = length(padded), ncol = n_fields, byrow = TRUE)
+  df <- data.frame(cells, stringsAsFactors = FALSE)
+  rownames(df) <- seq_len(nrow(df))
+  df
+}
+
+# Read a delimited text file into a data frame of strings.
+#
+# filename: file to read (via read_text()).
+# separator: field separator passed to parse_long_sep().
+# comment_char: comment marker passed to remove_comments().
+# Returns the data frame built by parse_long_sep(), with every NA cell
+# replaced by the empty string.
+parse_file <- function(filename, separator, comment_char) {
+  kept <- remove_comments(read_text(filename), comment_char)
+  parsed <- parse_long_sep(kept, separator)
+  parsed[is.na(parsed)] <- ""
+  parsed
+}
+```
+
+```{r read_file}
+
+# Build the index data frame from '<file>.txt'.
+#
+# file: path of the index file without its '.txt' extension.
+# Returns a data frame with columns ex_id (row number of the record in the
+# parsed file), class and ex_id_class (the first two pieces of the file-name
+# field, the latter converted to numeric), artist_list and track_name.
+create_data_frame <- function(file = 'GTZANindex') {
+  index_file <- paste0(file, '.txt')
+
+  data <- parse_file(
+    index_file, separator = ':::', comment_char = '#')
+  # NOTE(review): assumes the parsed index has exactly three fields per record.
+  colnames(data) <- c('file_name', 'artist_list', 'track_name')
+
+  # seq_len() stays valid for an empty index, whereas 1:nrow(data) would
+  # produce c(1, 0).
+  data$ex_id <- seq_len(nrow(data))
+  # file_name is split with separate()'s default separator; pieces beyond the
+  # first two are discarded (extra = "drop").
+  data <- data %>%
+    select(ex_id, everything()) %>%
+    separate(file_name, c("class", "ex_id_class"),
+             remove = TRUE, extra = "drop")
+  data$ex_id_class <- as.numeric(data$ex_id_class)
+  data
+}
+
+```
+
+
+```{r bob}
+# Parse 'GTZANindex.txt' and export the resulting table as 'GTZANindex.csv'.
+filename <- 'GTZANindex'
+data_1 <- create_data_frame(filename)
+# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
+write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
+```