---
title: "GTZAN index parsing"
output: html_notebook
---

```{r libraries}
library(tidyverse)
```

## Reading the file

```{r read_file_functions}
# Read every line of `filename` into a character vector.
#
# The connection is opened explicitly and closed through `on.exit()`, so it
# is released even when `readLines()` fails part-way through.
read_text <- function(filename) {
  con <- file(filename, open = "r")
  on.exit(close(con), add = TRUE)
  readLines(con)
}

# Drop every element of `text` that contains `comment_char`.
#
# A line is removed whether it starts with the comment character or merely
# contains it further along, so a data line carrying a trailing comment is
# dropped as a whole. `comment_char` is pasted into a regular expression
# without escaping.
remove_comments <- function(text, comment_char) {
  comment_regex <- paste0("^", comment_char, "|^.*", comment_char)
  text[!grepl(comment_regex, text)]
}

# Split each element of `lines` on `separator` (a regular expression, as
# understood by `strsplit()`) and return a data frame with one row per line.
#
# Fields are trimmed of surrounding whitespace. Lines with fewer fields than
# the longest line are padded with `NA`. Columns get the default names
# `X1`, `X2`, ... and are character vectors. An empty `lines` yields a data
# frame with zero rows and zero columns.
parse_long_sep <- function(lines, separator) {
  # `strsplit()` + `lapply()` always return a list, one element per line.
  # `sapply()` would collapse to a matrix whenever every line has the same
  # number of fields, which breaks the row-wise reshaping below.
  fields <- lapply(strsplit(unname(lines), split = separator), trimws)

  n_cols <- if (length(fields) > 0L) max(lengths(fields)) else 0L
  padded <- lapply(fields, `length<-`, n_cols)

  cells <- matrix(
    unlist(padded, use.names = FALSE),
    nrow = length(padded),
    ncol = n_cols,
    byrow = TRUE
  )
  data.frame(cells, stringsAsFactors = FALSE)
}

# Read `filename`, discard lines containing `comment_char`, split the rest on
# `separator`, and return the fields as a data frame in which missing
# trailing fields are empty strings rather than `NA`.
parse_file <- function(filename, separator, comment_char) {
  lines <- read_text(filename)
  lines <- remove_comments(lines, comment_char)
  data <- parse_long_sep(lines, separator)
  data[is.na(data)] <- ""
  data
}
```

```{r read_file}

# Build the index data frame from `<file>.txt`.
#
# The file is parsed with ":::" as field separator and "#" as comment
# marker, and must yield exactly three fields per record (file name, artist
# list, track name). The result has the columns `ex_id` (running row
# number), `class` and `ex_id_class` (the first two pieces of the file name,
# the latter converted to numeric), `artist_list` and `track_name`.
create_data_frame <- function(file = "GTZANindex") {

  index_file <- paste0(file, ".txt")

  data <- parse_file(
    index_file, separator = ":::", comment_char = "#")

  # Fail early with a readable message instead of a cryptic error from
  # `colnames<-` or dplyr further down.
  if (ncol(data) != 3L) {
    stop(
      "Expected 3 fields per record in ", index_file,
      ", found ", ncol(data), ".",
      call. = FALSE
    )
  }
  colnames(data) <- c("file_name", "artist_list", "track_name")

  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # The default `sep` splits on runs of non-alphanumeric characters; any
    # pieces after the second are discarded by `extra = "drop"`.
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}

```


```{r bob}
filename <- "GTZANindex"
data_1 <- create_data_frame(filename)
write.csv(data_1, file = paste0(filename, ".csv"), row.names = FALSE)
```