Mercurial > hg > confint
comparison metadata/parse_file.Rmd @ 0:205974c9568c tip
Initial commit. Predictions not included for lack of space.
author | franrodalg <f.rodriguezalgarra@qmul.ac.uk> |
---|---|
date | Sat, 29 Jun 2019 18:45:50 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:205974c9568c |
---|---|
1 --- | |
2 title: "GTZAN index parsing" | |
3 output: html_notebook | |
4 --- | |
5 | |
6 ```{r libraries} | |
7 library(tidyverse) | |
8 ``` | |
9 | |
10 ## Reading the file | |
11 | |
12 ```{r read_file_functions} | |
# Read a text file and return its contents as a character vector,
# one element per line.
#
# filename: path of the file to read.
#
# Returns the lines of the file, in order.
read_text <- function(filename) {
  con <- file(filename)
  # Register the cleanup before reading so the connection is released
  # even when readLines() fails (e.g. the file does not exist).
  on.exit(close(con), add = TRUE)
  readLines(con)
}
18 | |
# Drop every line that contains the comment marker.
#
# A line is removed when the marker occurs anywhere in it, so lines with a
# trailing comment are discarded as a whole (not truncated).
#
# text:         character vector of lines.
# comment_char: comment marker, matched literally.
#
# Returns the elements of `text` that do not contain `comment_char`.
remove_comments <- function(text, comment_char) {
  # fixed = TRUE matches the marker as a plain string, so markers that are
  # regex metacharacters (".", "*", "%", ...) do not match unrelated lines.
  has_comment <- grepl(comment_char, text, fixed = TRUE)
  text[!has_comment]
}
24 | |
# Split delimited lines into a data frame with one row per line.
#
# Each line is split on `separator`, the resulting fields are stripped of
# surrounding whitespace, and shorter records are padded with NA so that
# all rows have as many fields as the longest record.
#
# lines:     character vector, one record per element.
# separator: field separator, passed to strsplit() as a regular expression.
#
# Returns a data frame of character columns named X1, X2, ... with row
# names 1..n. An empty `lines` yields an empty data frame.
parse_long_sep <- function(lines, separator) {
  if (length(lines) == 0L) {
    return(data.frame())
  }
  # strsplit() is vectorised and always returns a list, so the result has
  # the same shape whether or not all lines have the same number of fields.
  records <- lapply(strsplit(lines, split = separator), trimws)
  n_fields <- max(lengths(records))
  # `length<-` pads shorter records with NA up to n_fields.
  records <- lapply(records, `length<-`, n_fields)
  fields <- do.call(rbind, records)
  # Name the columns explicitly so a one-column result is also called X1.
  colnames(fields) <- paste0("X", seq_len(ncol(fields)))
  df <- data.frame(fields, stringsAsFactors = FALSE)
  rownames(df) <- seq_len(nrow(df))
  df
}
34 | |
# Read a delimited text file into a data frame of fields.
#
# The file is read line by line, lines containing `comment_char` are
# discarded, the remaining lines are split on `separator`, and any missing
# (NA) field is replaced by the empty string.
#
# filename:     path of the file to parse.
# separator:    field separator handed to parse_long_sep().
# comment_char: comment marker handed to remove_comments().
#
# Returns the data frame produced by parse_long_sep(), with NA replaced
# by "".
parse_file <- function(filename, separator, comment_char) {
  kept_lines <- remove_comments(read_text(filename), comment_char)
  parsed <- parse_long_sep(kept_lines, separator)
  missing_fields <- is.na(parsed)
  parsed[missing_fields] <- ""
  parsed
}
42 ``` | |
43 | |
44 ```{r read_file} | |
45 | |
# Build the index data frame from '<file>.txt'.
#
# The index file is parsed with ':::' as field separator and '#' as comment
# marker, and its columns are named file_name, artist_list and track_name.
# A running example id (ex_id) is added as the first column, and file_name
# is split into `class` and `ex_id_class` (numeric); any further pieces of
# file_name are dropped.
# NOTE(review): presumably file names look like '<genre>.<number>.<ext>' --
# confirm against the index file.
#
# file: base name of the index file, without the '.txt' extension.
#
# Returns a data frame with columns ex_id, class, ex_id_class, artist_list
# and track_name.
create_data_frame <- function(file = 'GTZANindex') {

  index_file <- paste0(file, '.txt')

  data <- parse_file(
    index_file, separator = ':::', comment_char = '#')
  colnames(data) <- c('file_name', 'artist_list', 'track_name')

  # seq_len() stays empty for a zero-row frame, unlike 1:nrow(data).
  data$ex_id <- seq_len(nrow(data))
  data <- data %>%
    select(ex_id, everything()) %>%
    # separate() splits on runs of non-alphanumeric characters by default;
    # extra = "drop" silently discards pieces beyond the two named ones.
    separate(file_name, c("class", "ex_id_class"),
             remove = TRUE, extra = "drop")
  data$ex_id_class <- as.numeric(data$ex_id_class)
  data
}
62 | |
63 ``` | |
64 | |
65 | |
66 ```{r bob} | |
# Parse 'GTZANindex.txt' and export the resulting table as 'GTZANindex.csv'
# (without a row-name column).
filename <- 'GTZANindex'
data_1 <- create_data_frame(filename)
write.csv(data_1, file = paste0(filename, '.csv'), row.names = FALSE)
70 ``` |