f@0: ---
f@0: title: "GTZAN index parsing"
f@0: output: html_notebook
f@0: ---
f@0: 
f@0: ```{r libraries}
f@0: library(tidyverse)
f@0: ```
f@0: 
f@0: ## Reading the file
f@0: 
f@0: ```{r read_file_functions}
# Read a text file into a character vector, one element per line.
#
# filename: path of the file to read.
#
# Returns the lines of the file as a character vector.
read_text <- function(filename) {
  con <- file(filename)
  # Release the connection even when readLines() fails (e.g. missing file);
  # otherwise it stays registered until the garbage collector reclaims it.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
f@0: 
# Drop every line that contains the comment marker.
#
# text:         character vector of lines.
# comment_char: comment marker, matched literally (regex metacharacters such
#               as "." or "*" carry no special meaning).
#
# Returns the elements of `text` that do not contain `comment_char`. A line is
# removed as a whole when the marker appears anywhere in it, not only at the
# start, so a trailing comment discards the entire line.
remove_comments <- function(text, comment_char) {
  # "starts with the marker OR contains the marker" reduces to "contains the
  # marker", so a single literal search covers both cases.
  has_comment <- grepl(comment_char, text, fixed = TRUE)
  text[!has_comment]
}
f@0: 
# Split delimited lines into a data frame of character columns.
#
# lines:     character vector, one record per element.
# separator: field delimiter, passed to strsplit() as a regular expression.
#
# Returns a data frame with one row per line and as many columns (X1, X2, ...)
# as the longest record has fields. Fields are whitespace-trimmed; records
# with fewer fields are padded with NA on the right. Empty input yields a
# data frame with zero rows.
parse_long_sep <- function(lines, separator) {
  # Keep the result a list at every step: letting sapply() simplify equal-length
  # records into a matrix would make the padding step iterate over single cells.
  fields <- lapply(strsplit(lines, split = separator), trimws)
  n_cols <- max(lengths(fields), 0L)
  padded <- lapply(fields, `length<-`, n_cols)
  # as.character() keeps the empty case a zero-length character vector
  # instead of NULL, which matrix() rejects.
  cells <- matrix(as.character(unlist(padded)),
                  nrow = length(padded), ncol = n_cols, byrow = TRUE)
  df <- data.frame(cells, stringsAsFactors = FALSE)
  rownames(df) <- seq_len(nrow(df))
  df
}
f@0: 
# Load a delimited text file into a data frame of strings.
#
# filename:     path of the file to read.
# separator:    field delimiter handed to parse_long_sep().
# comment_char: comment marker handed to remove_comments().
#
# Returns the data frame built by parse_long_sep() from the lines that
# survive remove_comments(), with every NA cell replaced by "".
parse_file <- function(filename, separator, comment_char) {
  parsed <- parse_long_sep(
    remove_comments(read_text(filename), comment_char),
    separator
  )
  # Missing fields come back as NA; callers get empty strings instead.
  replace(parsed, is.na(parsed), "")
}
f@0: ```
f@0: 
f@0: ```{r read_file}
f@0: 
# Build the index table from '<file>.txt'.
#
# file: path of the index file without its '.txt' extension.
#
# The file is parsed with ':::' as field separator and '#' as comment marker,
# and its three columns are named file_name, artist_list and track_name.
# A running row number is added as ex_id and moved to the front. file_name is
# then split with separate()'s default separator: the first two pieces become
# class and ex_id_class (converted to numeric), any further pieces are
# dropped, and file_name itself is removed.
#
# Returns the resulting data frame.
create_data_frame <- function(file = 'GTZANindex') {
  index_file <- paste0(file, '.txt')

  data <- parse_file(index_file, separator = ':::', comment_char = '#')
  colnames(data) <- c('file_name', 'artist_list', 'track_name')

  # seq_len() stays empty for zero rows, where 1:nrow(data) would give c(1, 0).
  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}
f@0: 
f@0: ```
f@0: 
f@0: 
f@0: ```{r bob}
# Parse 'GTZANindex.txt' and write the resulting table to 'GTZANindex.csv'
# without a row-name column.
filename <- 'GTZANindex'
data_1 <- create_data_frame(filename)
write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
f@0: ```