view metadata/parse_file.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
line wrap: on
line source
---
title: "GTZAN index parsing"
output: html_notebook
---

```{r libraries}
library(tidyverse)
```

## Reading the file

```{r read_file_functions}
# Read every line of a text file into a character vector.
#
# filename: path of the file to read.
# Returns a character vector with one element per line of the file.
read_text <- function(filename) {
  con <- file(filename)
  # Close the connection on every exit path; without this, a failed read
  # (for example a missing file) would leave the connection behind.
  on.exit(close(con), add = TRUE)
  readLines(con)
}

# Drop commented lines from a character vector.
#
# text:         character vector of lines.
# comment_char: comment marker; it is pasted into a regular expression, so
#               regex metacharacters in it are interpreted as such.
# Returns the elements of `text` that do not match the pattern. Note that a
# line is discarded whenever the marker appears anywhere in it, not only at
# the start of the line.
remove_comments <- function(text, comment_char) {
  pattern <- paste0('^', comment_char, '|^.*', comment_char)
  is_comment <- grepl(pattern, text)
  text[!is_comment]
}

# Split delimited lines into a character data frame, one row per line.
#
# lines:     character vector, one record per element.
# separator: field separator, passed to strsplit() as a regular expression.
# Returns a data frame with one row per line and one character column per
# field (X1, X2, ...). Fields are whitespace-trimmed; records with fewer
# fields than the widest record are padded with NA.
parse_long_sep <- function(lines, separator) {
  # strsplit() is already vectorised and always returns a list, so the shape
  # of the result does not depend on whether the records are ragged.
  fields <- lapply(strsplit(lines, split = separator), trimws)
  # max(0L, ...) keeps the width well defined when there are no lines at all.
  width <- max(0L, lengths(fields))
  padded <- lapply(fields, `length<-`, width)
  cells <- matrix(as.character(unlist(padded)),
                  nrow = length(padded), ncol = width, byrow = TRUE)
  if (width > 0L) {
    colnames(cells) <- sprintf("X%d", seq_len(width))
  }
  df <- data.frame(cells, stringsAsFactors = FALSE)
  # Reset to the automatic 1..n row names.
  rownames(df) <- NULL
  df
}

# Read a delimited text file into a data frame of strings.
#
# filename:     path of the file to read.
# separator:    field separator handed to parse_long_sep().
# comment_char: comment marker handed to remove_comments().
# Returns the parsed data frame with every NA cell replaced by "".
parse_file <- function(filename, separator, comment_char) {
  kept_lines <- remove_comments(read_text(filename), comment_char)
  parsed <- parse_long_sep(kept_lines, separator)
  # Short records are NA-padded by the parser; expose them as empty strings.
  replace(parsed, is.na(parsed), "")
}
```

```{r read_file}

# Build the index data frame from '<file>.txt'.
#
# file: base name (without the '.txt' extension) of the index file.
# Returns a data frame with columns ex_id, class, ex_id_class, artist_list
# and track_name. ex_id is the running row number; class and ex_id_class are
# the first two pieces of the original file_name column.
create_data_frame <- function(file = 'GTZANindex') {

  index_file <- paste0(file, '.txt')

  data <- parse_file(
    index_file, separator = ':::', comment_char = '#')
  colnames(data) <- c('file_name', 'artist_list', 'track_name')

  # seq_len() stays correct for zero rows, where 1:nrow(data) gives c(1, 0).
  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # No `sep` is given, so separate() applies its default separator; pieces
    # beyond the first two are discarded by extra = "drop".
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}

```


```{r bob}
# Parse '<filename>.txt' and save the result next to it as '<filename>.csv'.
filename <- 'GTZANindex'
data_1 <- create_data_frame(filename)
# row.names = FALSE: the running index is already stored in the ex_id column.
write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
```