view latex/scripts/data_import.R @ 4:3e666d0329b5 tip

updated code
author Jiajie Dai <daijiajie1@gmail.com>
date Sat, 06 Jan 2018 12:20:49 +0000
parents 6531169e6866
children
line wrap: on
line source
cortable <- function(df)
{
    nm <- names(df)
    # cm <- cor(df, method="spearman")
    for (ii in 1:ncol(df))
    {
        cat("  ")
        if (ii>1) cat(rep("&", ii-1))
        cat(nm[ii])
        if (ii < ncol(df))
        {
            for (jj in (ii+1):ncol(df))
            {
                test <- cor.test(df[,ii], df[,jj], method="spearman")
                significant <- test$p.value < 0.01
                # cat(sprintf("\n\n%i %i -- %s -- %s -- %0.3f -- %g -- %s\n\n", ii, jj, nm[ii], nm[jj], test$estimate, test$p.value, significant))
                if (significant) cat("&", sprintf("\\textbf{%5.2f}",test$estimate))
                else cat("&", sprintf("%5.2f",test$estimate))
            }
        }
        cat("\\\\\n")
    }
}

makeDriftGraph <- function(data, singermode) {
	
	subdata <- data[data$singermode == singermode, ]
	
	meanpitch <- median(subdata$pitchmedian-subdata$semitone)
    # print(meanpitch)
	
	plot(I(subdata$pitchmedian - subdata$semitone) ~ I((subdata$run-1) * 25 + subdata$noteid), 
	     pch = "+",
         ylim = meanpitch + c(-2,2),
         xlab = "note number",
         ylab = "pitch after score subtraction")
	
	fit <- lm(I(subdata$pitchmedian - subdata$semitone) ~ I((subdata$run-1) * 25 + subdata$noteid))
	abline(fit)
}

hz2pitch <- function(f)
{
    69 + 12 * log2(f / 440)
}

diff_1_3 <- function(sgr, md)
{
    singermodeind <- data$singer == sgr & data$mode == md
    firstrun <- singermodeind & data$run == 1
    thirdrun <- singermodeind & data$run == 3
    firstdata <- data.frame(noteid = data$noteid[firstrun], newmedian=data$newmedian[firstrun])
    thirddata <- data.frame(noteid = data$noteid[thirdrun], newmedian=data$newmedian[thirdrun])
    comparedata <- merge(firstdata, thirddata, by="noteid")
    return(comparedata)
}

# estimate.pitch <- function(x)
# {
#     if (length(x) > 0)
#     {
#         w <- 1./rowSums(rdist(x))
#         w <- w/sum(w)
#         return(sum(w*x))
#     } else {
#         return(NA)
#     }
# }


datadir <- '../data/filtered_pitchtracks/'

##############################################################################
# LOAD MAIN DATA AND SOME HELPER STUFF
##############################################################################

cat("\nMAIN PAPER EVALUATION\n\n", file = stderr())

cat("\nLoading note data...", file = stderr())

data <- read.csv(file = "../data/all_final_note_tracks_klaus.csv", header = TRUE, stringsAsFactors = T)
singers <- levels(data$singer)
poor.singers <- c("mf", "mw", "sr", "hl")
nSinger <- length(singers)
conditions <- c("normal", "masked", "imagined")
nCondition <- 3

# flag poor singers
data$ispoor <- data$singer %in% poor.singers
data$singer <- factor(data$singer)

# make a singermode variable to uniquely identify all recordings
data$singermode <- as.factor(sprintf("%s%s", data$singer, data$mode))

# make a serial note number (essentially from 1 to 75)
data$notenumber <- data$noteid + 25 * (data$run-1)

cat(" done.\n\n", file = stderr())

##############################################################################
# EXTRACT PITCH ESTIMATES FROM PITCH TRACKS
##############################################################################

cat("Extracting pitches etc. from pitch tracks (patience!) ...", file = stderr())

# get pitch data and extract medians
data$newmedian <- data$pitchmedian * NA
data$newsd <- data$pitchmedian * NA
data$newn <- data$pitchmedian * 0
data$slope <- data$pitchmedian * NA

count <- 0
cat("          ", file = stderr())

for (iSinger in 1:nSinger)
{
    for (iCondition in 1:nCondition)
    {
        count <- count + 1
        cat(sprintf("\b\b\b\b\b\b\b\b%2i of %2i", count, nSinger*nCondition), file = stderr())
        yindata <- read.table(
            sprintf('%s/happy_birthday_%s_%s_vamp_yintony_yintony_f0.csv', datadir, singers[iSinger], conditions[iCondition]), 
                header=F, sep=",")
        dataind <- which(data$singer == singers[iSinger] 
            & data$mode == conditions[iCondition])
        nNote <- length(dataind)
        for (iNote in 1:nNote)
        {
            yinind <- yindata[,1] >= data$onset[dataind[iNote]]  & yindata[,1] <= (data$onset[dataind[iNote]] + data$duration[dataind[iNote]])
            curryindata <- hz2pitch(yindata[yinind,2])
            currmedian <- median(curryindata)
            data$newmedian[dataind[iNote]] <- currmedian 
            around.median <- which(abs(curryindata-currmedian) < 3)
            currn <- length(around.median)
            if (currn > 0)
            {
                data$newsd[dataind[iNote]] <- sd(curryindata[around.median])
                data$slope[dataind[iNote]] <- lm(curryindata[around.median]~around.median)$coefficients[2]
                data$newn[dataind[iNote]] <- sum(currn)
            }
        }
    }
}

# find which notes are grosslly erroneous in the newly extracted medians
toCorrect <- is.na(data$newmedian) | abs(data$newmedian-data$pitchmedian) >= 1
# and set them to NA
data$newmedian[toCorrect] <- NA

cat(" done.\n\n", file = stderr())

##############################################################################
# LOAD MELODY GROUND TRUTH DATA
##############################################################################

cat("Loading melody ground truth...", file = stderr())

gtdata <- read.csv2(file = "../data/happy_birthday_GT.csv", header = TRUE, stringsAsFactors = FALSE)
# and weave it into the main data frame
for (iNote in 1:25)
{
	data$semitone[data$noteid == iNote] <- gtdata$pitch_ion_cents_et[iNote] / 100
}

cat(" done.\n\n", file = stderr())

##############################################################################
# LOADING SURVEY DATA
##############################################################################

cat("Loading survey data...", file = stderr())

meta <- read.csv(file = "../data/metadata.csv", header = TRUE, stringsAsFactors = T)

# re-order ordinal variables
meta$sing.ability <- relevel(meta$sing.ability, ref="veryhigh")
meta$sing.ability <- relevel(meta$sing.ability, ref="high")
meta$sing.ability <- relevel(meta$sing.ability, ref="medium")
meta$sing.ability <- relevel(meta$sing.ability, ref="low")
meta$sing.ability <- relevel(meta$sing.ability, ref="poor")
meta$sing.experience <- relevel(meta$sing.experience, ref="professional")
meta$sing.experience <- relevel(meta$sing.experience, ref="alot")
meta$sing.experience <- relevel(meta$sing.experience, ref="some")
meta$sing.experience <- relevel(meta$sing.experience, ref="none")
meta$sing.experience <- relevel(meta$sing.experience, ref="notanswered")
meta$musical.background <- relevel(meta$musical.background, ref="professional")
meta$musical.background <- relevel(meta$musical.background, ref="semiprofessional")
meta$musical.background <- relevel(meta$musical.background, ref="amateur")
meta$musical.background <- relevel(meta$musical.background, ref="none")
meta$choir.experience <- relevel(meta$choir.experience, ref="active")
meta$choir.experience <- relevel(meta$choir.experience, ref="sangbutnolongeractive")
meta$choir.experience <- relevel(meta$choir.experience, ref="childneverafter")
meta$choir.experience <- relevel(meta$choir.experience, ref="none")

meta$instudy <- meta$id %in% singers & !(meta$id %in% poor.singers)

cat(" done.\n\n", file = stderr())

##############################################################################
# DRIFT CALCULATIONS
##############################################################################

cat("Calculating drift...", file = stderr())

drifts <- data.frame(singer=c(), mode=c(), singermode =c(), noteid=c(), diffs=c())
for (iSinger in 1:nSinger)
{
    singerind <- data$singer == singers[iSinger]
    for (iCondition in 1:nCondition)
    {
        recordingind <- singerind & data$mode == conditions[iCondition]
        diffs <- rep(NA, 25)
        for (iNote in 1:25)
        {
            ind1 <- recordingind & data$notenumber == iNote
            ind2 <- recordingind & data$notenumber == iNote + 50
            if (sum(ind1) > 0  & sum(ind2) > 0)
            {
                diffs[iNote] <- data$newmedian[ind2] - data$newmedian[ind1]
            }
        }
        # cat(sum(is.na(diffs)))
        tempdf = data.frame(singer = rep(singers[iSinger], 25),
                            mode   = rep(conditions[iCondition], 25),
                            singermode = rep(sprintf("%s%s", singers[iSinger], conditions[iCondition]), 25),
                            noteid = 1:25,
                            diffs  = diffs)
        drifts = rbind(drifts, tempdf)
    }
}

cat(" done.\n\n", file = stderr())

##############################################################################
# PITCH ERROR CALCULATIONS
##############################################################################

cat("Calculating pitch errors...", file = stderr())

justintonation_scale <- c(0,11.7,3.9,15.6,-13.7,-2,-9.8,2,13.7,15.6,17.6,-11.7)
data$justintonation <- data$newmedian * 0

data$jipitcherror <- data$pitcherror
for (i in 0:11)
{
    ind <- data$semitone %% 12 == i
    data$justintonation[ind] <- justintonation_scale[i+1]/100
}

data$pitcherror <- data$newmedian * 0
data$jipitcherror <- data$newmedian * 0
for (sm in unique(data$singermode))
{
	for (r in 1:3)
	{
		logicalind <- data$singermode == sm & data$run == r & !is.na(data$newmedian)
		if (sum(logicalind) > 0)
		{
			data$pitcherror[logicalind] <- lm(I(newmedian-semitone) ~ noteid, data = data, subset = logicalind)$residuals
			data$jipitcherror[logicalind] <- lm(I(newmedian-semitone-justintonation) ~ noteid, data = data, subset = logicalind)$residuals
		}
	}
}

cat(" done.\n\n", file = stderr())

##############################################################################
# INTERVAL ERROR CALCULATIONS
##############################################################################

cat("Calculating interval errors (patience!) ... ", file = stderr())

data$expectedinterval <- data$semitone * NA
data$expectedfutureinterval <- data$semitone * NA
data$realisedinterval <- data$semitone * NA
count <- 0
cat("   of   ", file = stderr())
for (singermode in sort(unique(data$singermode)))
{
    count <- count + 1
    cat(sprintf("\b\b\b\b\b\b\b\b%2i of %2i", count, nSinger*nCondition), file = stderr())
	singermodeIdx <- which(data$singermode == singermode)
	notenumbers <- data$notenumber[singermodeIdx]
	for (notenumber in notenumbers)
	{
	    currNoteIdx <- singermodeIdx[which(notenumbers==notenumber)]
		previousNoteIdx <- singermodeIdx[which(notenumbers==(notenumber-1))]
		nextNoteIdx <- singermodeIdx[which(notenumbers==(notenumber+1))]
		if (length(previousNoteIdx) == 1) 
		{
			data$expectedinterval[currNoteIdx] <- data$semitone[currNoteIdx] - data$semitone[previousNoteIdx]
			data$realisedinterval[currNoteIdx] <- data$newmedian[currNoteIdx] - data$newmedian[previousNoteIdx]
		}
		if (length(nextNoteIdx) == 1) 
		{
			data$expectedfutureinterval[currNoteIdx] <- data$semitone[nextNoteIdx] - data$semitone[currNoteIdx]
		}
	}
}
data$intervalerror <- data$realisedinterval - data$expectedinterval

cat(" done.\n\n", file=stderr())

##############################################################################
# MEMORY MODEL STUFF
##############################################################################

cat("Calculating different memory models (patience!) ... \n\n", file = stderr())

# mus <- 1 - seq(1,0,length.out=nmu)^2
mus <- seq(0, 1, by = 0.01)
nmu <- length(mus)
burn.in <- 6
md <- with(data, data.frame(singer=singer, 
                            mode=mode, 
                            noteid=noteid, 
                            ispoor=ispoor, 
                            singermode=singermode, 
                            notenumber=notenumber, 
                            p=newmedian,
                            s=semitone))

mr <- data.frame(with(md, aggregate(data.frame(ispoor=ispoor), list(singer=singer,mode=mode), min)))
mr$singermode <- as.character(mr$singer)
mr$tstart <- mat.or.vec(nrow(mr),1)
for (iRow in 1:nrow(mr))
{
    mr$singermode[iRow]<-sprintf("%s%s", mr$singer[iRow], mr$mode[iRow])
    ind <- mr$singermode[iRow] == md$singermode & md$notenumber <= burn.in
    mr$tstart[iRow] <- mean(md$p[ind] - md$s[ind], na.rm=T)
}

cat("         ", file = stderr())
mdmatrix <- mat.or.vec(nrow(md), nmu) * NA
colnames(mdmatrix) <- sprintf("err%04.0f", mus*1000)
for (imu in 1:nmu) {
    mu <- mus[imu]
    muname <- sprintf("err%04.0f", mu*1000)
    # md[[muname]] <- md$p * NA
    # cat(sprintf("\b\b\b\b\b\b\b\b\b\bmu = %0.3f", mu), file = stderr())
    ptm <- proc.time()
    for (sm in levels(md$singermode)) 
    {
        mrind <- mr$singermode == sm
        mdind <- which(md$singermode == sm & md$notenumber > burn.in)
        r <- mr$tstart[mrind]
        indices <- c(mdind, mdind[1]-md$notenumber[mdind[1]]+76)
        # cat(length(indices), "\n")
        for (ii in indices) {
            if (!is.na(md$p[ii-1]))
            {
                eminus1 <- md$p[ii-1] - md$s[ii-1] - r
                # if (abs(eminus1)<100) # how hacky is this?
                # {
                r <- r + (1-mu) * eminus1
                mdmatrix[ii-1,imu] <- eminus1
                # }
            }
        }
    }
    cat(proc.time() - ptm, "\n")
}
md <- cbind(md, mdmatrix)
cat(" done.\n\n", file = stderr())

##############################################################################
# AGGREGATED METRICS
##############################################################################

cat("Calculating aggregated metrics ... ", file = stderr())

agg.drifts <- with(drifts[!(drifts$singer %in% poor.singers),],
    aggregate(data.frame(diffs=diffs), list(singer = singer, mode = mode), function(x){mean(x, na.rm=T)}))
singeragg.drifts <- with(drifts[!(drifts$singer %in% poor.singers),],
    aggregate(data.frame(diffs=diffs), list(singer = singer), function(x){mean(x, na.rm=T)}))
agg.drifts.p <- with(drifts[!(drifts$singer %in% poor.singers),], 
    aggregate(data.frame(p=diffs), list(singer = singer, mode = mode), function(x) {t.test(x)$p.value}))
agg.drifts.sd <- with(drifts[!(drifts$singer %in% poor.singers),], 
    aggregate(data.frame(sd=diffs), list(singer = singer, mode = mode), sd))
agg.drifts.all <- with(drifts,
    aggregate(data.frame(diffs=diffs), list(singer = singer, mode = mode), function(x){mean(x, na.rm=T)}))

agg.mape <- with(data[!data$ispoor,], 
    aggregate(data.frame(mape=abs(pitcherror)), list(singer = singer, mode = mode), function(x){mean(x, na.rm=T)}))
singeragg.mape <- with(data[!data$ispoor,], 
    aggregate(data.frame(mape=abs(pitcherror)), list(singer = singer), function(x){mean(x, na.rm=T)}))

agg.ib <- with(data[!data$ispoor,], 
    aggregate(data.frame(ib=intervalerror), list(singer = singer, mode = mode), function(x) {mean(x, na.rm=T)}))
agg.maie <- with(data[!data$ispoor,], 
    aggregate(data.frame(maie=abs(intervalerror)), list(singer = singer, mode = mode), function(x) {mean(x, na.rm=T)}))
agg.rise <- with(data[!data$ispoor,], 
    aggregate(data.frame(rise=duration * abs(slope) * 44100/1024), list(singer = singer, mode = mode), function(x) {mean(x, na.rm=T)}))
singeragg.maie <- with(data[!data$ispoor,], 
    aggregate(data.frame(maie=abs(intervalerror)), list(singer = singer), function(x) {mean(x, na.rm=T)}))
singeragg.ratioout <- with(data[!data$ispoor,], 
    aggregate(data.frame(ratioout=abs(intervalerror)), list(singer = singer), function(x) {mean(x>1, na.rm=T)}))
singeragg.ib <- with(data[!data$ispoor,], 
    aggregate(data.frame(ib=intervalerror), list(singer = singer), function(x) {mean(x, na.rm=T)}))

# linear.drift.all <- with(data, 
#     aggregate(data.frame(lineardrift=1:nrow(data)), list(singer=data$singer, mode=data$mode), function(ind) {lm(newmedian-semitone~notenumber, data=data[ind,])$coefficients[2]}))
linear.drift.all <- with(data, 
    aggregate(data.frame(est=1:nrow(data)), list(singer=data$singer, mode=data$mode), function(ind) {summary(lm(newmedian-semitone~notenumber, data=data[ind,]))$coefficients[2,c(1,4)]}))
linear.drift.all <- data.frame(singer=linear.drift.all$singer, mode=linear.drift.all$mode, lineardrift=linear.drift.all[,3][,1], p.value=linear.drift.all[,3][,2])
linear.drift <- linear.drift.all[!linear.drift.all$singer %in% poor.singers,]

# metadata versus stats
merged.singeragg <- merge(singeragg.mape, meta, by.x = "singer", by.y = "id", all.y = F)
merged.singeragg <- merge(merged.singeragg, singeragg.drifts, by.x = "singer", by.y = "singer", all.y = F)
merged.singeragg <- merge(merged.singeragg, singeragg.maie, by.x = "singer", by.y = "singer", all.y = F)
# merged.singeragg <- merge(merged.singeragg, singeragg.ib, by.x = "singer", by.y = "singer", all.y = F)

merged.agg <- merge(agg.mape, meta, by.x = c("singer"), by.y = "id", all.y = F)
merged.agg <- merge(merged.agg, agg.drifts, by = c("singer","mode"))
merged.agg <- merge(merged.agg, agg.maie, by = c("singer","mode"))
merged.agg <- merge(merged.agg, agg.ib, by = c("singer","mode"))
merged.agg <- merge(merged.agg, agg.rise, by = c("singer","mode"))
merged.agg <- merge(merged.agg, linear.drift, by = c("singer","mode"))

cat(" done.\n\n", file = stderr())

## VARIANCE CALCULATION
library(MASS)
burnin <- 6
data$relativetonic <- data$newmedian * NA
for (iRow in 1:nrow(mr))
{
    refind <- data$singermode == mr$singermode[iRow] & data$notenumber <= burnin
    ind <- data$singermode == mr$singermode[iRow] & data$notenumber > burnin
    data$relativetonic[ind] <- data$newmedian[ind] - data$semitone[ind] - median(data$newmedian[refind] - data$semitone[refind])
}