---
title: "GTZAN index parsing"
output: html_notebook
---
f@0
|
5
|
f@0
|
6 ```{r libraries}
|
f@0
|
7 library(tidyverse)
|
f@0
|
8 ```
|
f@0
|
9
|
f@0
|
## Reading the file
|
f@0
|
11
|
f@0
|
12 ```{r read_file_functions}
|
f@0
|
# Read every line of a text file into a character vector.
#
# filename: path of the file to read.
# Returns a character vector with one element per line.
read_text <- function(filename) {
  con <- file(filename)
  # Register the cleanup before reading so the connection is released even
  # when readLines() fails (e.g. the file does not exist).
  on.exit(close(con), add = TRUE)
  readLines(con)
}
|
f@0
|
18
|
f@0
|
# Drop every line that contains the comment marker.
#
# text:         character vector of lines.
# comment_char: comment marker, matched literally (not as a regex), so
#               markers such as "." or "|" behave as expected.
# Returns the elements of `text` that do not contain `comment_char`.
#
# Note: a line is removed when the marker appears anywhere in it, not only
# at the start, so a record with a trailing comment is dropped entirely.
remove_comments <- function(text, comment_char) {
  has_comment <- grepl(comment_char, text, fixed = TRUE)
  text[!has_comment]
}
|
f@0
|
24
|
f@0
|
# Split delimited lines into a data frame of character columns.
#
# lines:     character vector, one record per element.
# separator: field separator, passed to strsplit() (interpreted as a
#            regular expression).
# Returns a data frame with one row per line and one column per field
# (default names X1, X2, ...). Fields are whitespace-trimmed, and records
# with fewer fields than the widest one are padded with NA.
parse_long_sep <- function(lines, separator) {
  # Nothing to parse: return an empty data frame instead of failing on
  # max() of an empty vector.
  if (length(lines) == 0L) {
    return(data.frame())
  }

  # lapply() always yields a list; sapply() would simplify to a matrix when
  # every line has the same number of fields and scramble the result.
  fields <- lapply(strsplit(lines, split = separator), trimws)

  # Pad every record to the width of the widest one (missing cells -> NA).
  width <- max(lengths(fields))
  padded <- lapply(fields, `length<-`, width)

  cells <- matrix(
    as.character(unlist(padded)),
    nrow = length(padded),
    ncol = width,
    byrow = TRUE
  )
  df <- data.frame(cells, stringsAsFactors = FALSE)
  rownames(df) <- seq_len(nrow(df))
  df
}
|
f@0
|
34
|
f@0
|
# Read a delimited text file into a data frame of strings.
#
# filename:     path of the file to read.
# separator:    field separator handed to parse_long_sep().
# comment_char: comment marker handed to remove_comments().
# Returns the parsed data frame with every NA cell replaced by "".
parse_file <- function(filename, separator, comment_char) {
  kept_lines <- remove_comments(read_text(filename), comment_char)
  parsed <- parse_long_sep(kept_lines, separator)
  # Missing cells become empty strings.
  parsed[is.na(parsed)] <- ""
  parsed
}
|
f@0
|
42 ```
|
f@0
|
43
|
f@0
|
44 ```{r read_file}
|
f@0
|
45
|
f@0
|
# Build the index data frame from "<file>.txt".
#
# file: base name of the index file, without the ".txt" extension.
# Returns a data frame with the columns ex_id (running row number), class,
# ex_id_class (numeric), artist_list and track_name.
#
# The index file is parsed with ":::" as field separator and "#" as comment
# marker, and is expected to yield exactly three columns.
create_data_frame <- function(file = "GTZANindex") {
  index_file <- paste0(file, ".txt")

  data <- parse_file(index_file, separator = ":::", comment_char = "#")
  colnames(data) <- c("file_name", "artist_list", "track_name")

  # seq_len() stays valid for an empty table, where 1:nrow(data) would
  # produce c(1, 0).
  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # separate() splits on its default separator; the first two pieces
    # become class and ex_id_class, any further pieces are dropped.
    # NOTE(review): presumably file names look like "<class>.<number>.<ext>"
    # -- confirm against the index file.
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}
|
f@0
|
62
|
f@0
|
63 ```
|
f@0
|
64
|
f@0
|
65
|
f@0
|
66 ```{r bob}
|
f@0
|
# Parse "GTZANindex.txt" and export the resulting table as "GTZANindex.csv".
filename <- "GTZANindex"
data_1 <- create_data_frame(filename)
# FALSE rather than F: F is an ordinary variable that can be reassigned.
write.csv(data_1, file = paste0(filename, ".csv"), row.names = FALSE)
|
f@0
|
70 ```
|