comparison metadata/parse_file.Rmd @ 0:205974c9568c tip

Initial commit. Predictions not included for lack of space.
author franrodalg <f.rodriguezalgarra@qmul.ac.uk>
date Sat, 29 Jun 2019 18:45:50 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:205974c9568c
1 ---
2 title: "GTZAN index parsing"
3 output: html_notebook
4 ---
5
6 ```{r libraries}
7 library(tidyverse)
8 ```
9
10 ## Reading the file
11
12 ```{r read_file_functions}
# Read every line of a text file into a character vector.
#
# filename: path passed to file().
# Returns the character vector produced by readLines().
read_text <- function(filename) {
  con <- file(filename)
  # Release the connection even when readLines() fails (e.g. the file does
  # not exist); without this the connection object would be leaked.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
18
# Drop every line that contains the comment marker.
#
# text: character vector of lines.
# comment_char: comment marker; it is matched literally.
# Returns the elements of `text` in which `comment_char` does not occur.
# A line is removed whole wherever the marker appears in it (a trailing
# comment is not stripped from the line; the line itself is discarded).
remove_comments <- function(text, comment_char) {
  # fixed = TRUE keeps markers that are regex metacharacters (".", "$", "|",
  # ...) from being interpreted as patterns that match every line.
  text[!grepl(comment_char, text, fixed = TRUE)]
}
24
# Split delimited lines into a data frame of character columns.
#
# lines: character vector, one record per element.
# separator: field delimiter, passed to strsplit() as `split` (so it is
#   interpreted as a regular expression).
# Returns a data frame with one row per line and columns X1, X2, ...;
# fields are whitespace-trimmed and records with fewer fields than the
# widest one are padded with NA.
parse_long_sep <- function(lines, separator) {
  # strsplit() is vectorised and always yields a list with one element per
  # line, so the shape of the result does not depend on the data (nested
  # sapply() collapsed equally wide records into a matrix and produced a
  # single-row data frame).
  fields <- lapply(strsplit(lines, split = separator), trimws)
  n_cols <- if (length(fields) > 0L) max(lengths(fields)) else 0L

  # Pad short records with NA so every record has n_cols fields.
  padded <- lapply(fields, `length<-`, n_cols)
  cells <- as.character(unlist(padded, use.names = FALSE))
  grid <- matrix(cells, nrow = length(padded), ncol = n_cols, byrow = TRUE)
  if (n_cols > 0L) {
    colnames(grid) <- paste0("X", seq_len(n_cols))
  }
  data.frame(grid, stringsAsFactors = FALSE)
}
34
# Load a delimited text file as a data frame of strings.
#
# filename: path of the file to read.
# separator: field delimiter forwarded to parse_long_sep().
# comment_char: comment marker forwarded to remove_comments().
# Returns the parsed data frame with every NA cell replaced by "".
parse_file <- function(filename, separator, comment_char) {
  kept_lines <- remove_comments(read_text(filename), comment_char)
  parsed <- parse_long_sep(kept_lines, separator)
  # Missing trailing fields come back as NA; expose them as empty strings.
  replace(parsed, is.na(parsed), "")
}
42 ```
43
44 ```{r read_file}
45
# Build the example table from an index file.
#
# file: index file path without its '.txt' extension.
# Returns a data frame with columns ex_id (running row number), class and
# ex_id_class (both split out of the file name; ex_id_class is converted to
# numeric), artist_list and track_name.
create_data_frame <- function(file = 'GTZANindex') {
  index_file <- paste0(file, '.txt')

  data <- parse_file(
    index_file, separator = ':::', comment_char = '#')
  colnames(data) <- c('file_name', 'artist_list', 'track_name')

  # seq_len() stays empty for a zero-row table, where 1:nrow() gives c(1, 0).
  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # file_name is split with separate()'s default separator; pieces beyond
    # the first two are discarded (extra = "drop").
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}
62
63 ```
64
65
66 ```{r bob}
# Parse GTZANindex.txt and store the resulting table as GTZANindex.csv.
filename <- 'GTZANindex'
data_1 <- create_data_frame(filename)
# Spell out FALSE: the shorthand F is an ordinary, reassignable variable.
write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
70 ```