annotate metadata/parse_file.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
rev   line source
f@0 1 ---
f@0 2 title: "GTZAN index parsing"
f@0 3 output: html_notebook
f@0 4 ---
f@0 5
f@0 6 ```{r libraries}
f@0 7 library(tidyverse)
f@0 8 ```
f@0 9
f@0 10 ## Reading the file
f@0 11
f@0 12 ```{r read_file_functions}
# Read every line of a text file into a character vector.
#
# filename: path of the file to read.
# Returns a character vector with one element per line of the file.
read_text <- function(filename) {
  con <- file(filename, open = "r")
  # Release the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
f@0 18
# Drop every line that contains the comment marker.
#
# text:         character vector of lines.
# comment_char: comment marker, matched literally (not as a regular
#               expression) so markers such as "." or "*" are safe.
# Returns the elements of `text` that do not contain `comment_char`.
# Note that a line with a trailing comment is removed as a whole, not
# truncated at the marker.
remove_comments <- function(text, comment_char) {
  has_comment <- grepl(comment_char, text, fixed = TRUE)
  text[!has_comment]
}
f@0 24
# Split delimited lines into a data frame of character columns.
#
# lines:     character vector, one record per element.
# separator: field separator, interpreted by strsplit() as a regular
#            expression.
# Returns a data frame with one row per line and one column per field.
# Fields are whitespace-trimmed; records with fewer fields than the longest
# one are padded on the right with NA.
parse_long_sep <- function(lines, separator) {
  if (length(lines) == 0) {
    return(data.frame())
  }

  # Keep the records as a list throughout: sapply() would collapse them to a
  # matrix whenever every line happens to have the same number of fields,
  # which scrambles the row/column layout.
  records <- lapply(strsplit(lines, split = separator), trimws)
  n_fields <- max(lengths(records))
  records <- lapply(records, `length<-`, n_fields)

  cells <- matrix(
    as.character(unlist(records, use.names = FALSE)),
    nrow = length(records),
    ncol = n_fields,
    byrow = TRUE
  )
  df <- data.frame(cells, stringsAsFactors = FALSE)
  rownames(df) <- seq_len(nrow(df))
  df
}
f@0 34
# Load a delimited text file as a data frame of strings.
#
# filename:     path of the file to read.
# separator:    field separator handed to parse_long_sep().
# comment_char: comment marker handed to remove_comments().
# Returns the parsed data frame with every missing cell replaced by "".
parse_file <- function(filename, separator, comment_char) {
  kept <- remove_comments(read_text(filename), comment_char)
  parsed <- parse_long_sep(kept, separator)
  # Short records were padded with NA; expose them as empty strings instead.
  replace(parsed, is.na(parsed), "")
}
f@0 42 ```
f@0 43
f@0 44 ```{r read_file}
f@0 45
# Build the index data frame from '<file>.txt'.
#
# file: path of the index file without its '.txt' extension.
# Returns a data frame with the columns ex_id (running row number), class and
# ex_id_class (both split out of the file name), artist_list and track_name.
create_data_frame <- function(file = 'GTZANindex') {

  index_file <- paste0(file, '.txt')

  data <- parse_file(
    index_file, separator = ':::', comment_char = '#')
  colnames(data) <- c('file_name', 'artist_list', 'track_name')

  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # separate() splits on runs of non-alphanumeric characters by default;
    # only the first two pieces are kept and the rest is dropped.
    # NOTE(review): presumably file names look like "<class>.<number>.<ext>"
    # -- confirm against the index file.
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}
f@0 62
f@0 63 ```
f@0 64
f@0 65
f@0 66 ```{r bob}
# Parse 'GTZANindex.txt' and save the result next to it as 'GTZANindex.csv'.
filename <- 'GTZANindex'
data_1 <- create_data_frame(filename)
# Row names are only running numbers, so leave them out of the CSV.
write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
f@0 70 ```