view scripts_R/MetadataPlots.R @ 70:cc028157502a branch-tests

scripts R
author Maria Panteli
date Fri, 22 Sep 2017 16:29:32 +0100
parents
children bde45ce0eeab
line wrap: on
line source
library(rworldmap)
library(ggplot2)

PlotBarChart<- function(df, cat="Language", ordercat="REGION", mincount=10, legend=T, color_plt="Paired"){
  idx_cat = which(colnames(df)==cat)
  idx_ordercat = which(colnames(df)==ordercat)
  dfsub <- subset(df, df[,idx_cat]!="")
  dfsub <- dfsub[ dfsub[,idx_cat] %in%  names(table(dfsub[,idx_cat]))[table(dfsub[,idx_cat]) >mincount] , ]
  #dfsub <- dfsub[order(dfsub$REGION.y),]
  dfsub <- dfsub[order(dfsub[,idx_ordercat]),]
  dfsub[,idx_cat] <- factor(dfsub[,idx_cat], levels=unique(dfsub[,idx_cat]))
  g = ggplot(dfsub,aes(dfsub[,idx_cat], fill=dfsub[,idx_ordercat], order=-as.numeric(dfsub[,idx_ordercat])))+geom_bar()
  #g = g+ylim("0", "100")#+scale_y_discrete(breaks=c("100"),labels=c("100+"))
  g=g+scale_y_continuous(limits=c(0, 200), breaks=seq(0,200,40))
  g=g+scale_fill_brewer(palette=color_plt)
  g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
  g=g+labs(y="Counts",x=cat)+coord_flip()+theme_bw()
  if (legend){
    g=g+guides(fill = guide_legend(title = ordercat))}
  else{
    g=g+guides(fill="none")
  }
  return(g)
}

PlotCountryNCounts <- function(df, mincount=10){
  countrycounts = table(df$Country)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Counts")
  cols <- rep(2, dim(dd)[1])
  cols[dd$Counts<mincount]=1
  dd$NCounts = cols
  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
  mapParams <- mapCountryData(spdf, nameColumnToPlot="NCounts",catMethod='categorical',missingCountryCol="grey",oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, n_recordings>",mincount),addLegend=F)
  mapParams$legendText <- c(paste('<',mincount),paste('>=',mincount),'na')  
  do.call(addMapLegendBoxes, c(mapParams,x='bottomleft'))
}

PlotCountryCounts <- function(df, output=F){
  countrycounts = table(df$Country)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Counts")
  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
  spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),]
  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette='heat', mapTitle="",addLegend=F)
  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F)
  #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
  legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
  if (output){
    pdf(file="countrycounts.pdf")
    mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F)
    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="lightblue",colourPalette='heat', mapTitle="",addLegend=F)
    #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
    legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
    dev.off()
  }
}

PlotYearDistribution <- function(df, output=F){
  df$Year<-as.numeric(as.character(df$Year))
  g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1895, 2015, by = 1))
  #g = ggplot(df,aes(x=Year,y=..count..))+geom_bar()+geom_density(alpha=.3, fill="grey")
  #g = g+scale_x_continuous(breaks = pretty(df$Year, n=10))
  g = g+theme_bw()+labs(x ='Year', y ='Count') 
  #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram()+theme_bw()
  print(g)
  if (output){
    ggsave('yeardistribution.pdf',plot=g)
  }
}

PlotCountryDistribution <- function(df){
  #countrycounts = table(df$Country)
  #dd=data.frame(countrycounts)
  #names(dd)=c("Country","Counts")
  g = ggplot(df,aes(x=Country))+geom_bar()
  g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
}

#PlotCultureDistribution <- function(df){
#  g = ggplot(df,aes(x=Culture))+geom_bar()
#  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
#}

PlotLanguageDistribution <- function(df, mincount=1){
  dfsubset <- subset(df, df$Language!="") # ignore the recordings culture info
  culturecounts = table(dfsubset$Culture)
  culturecounts = culturecounts[culturecounts>=mincount]
  barplot(culturecounts, las=2, cex.names=0.2)
  #g = ggplot(df,aes(x=Language))+geom_bar()
  #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
}

PlotBarForCategory <- function(df, cat="Language", mincount=1){
  idx_cat = which(colnames(df)==cat)
  dfsubset <- subset(df, df[,idx_cat]!="")
  counts = table(dfsubset[,idx_cat])
  counts = counts[counts>=mincount]
  barplot(counts, las=2, cex.names=0.2)  
  #g = ggplot(df,aes(x=Language))+geom_bar()
  #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
}

PlotCountryYearCutoff <- function(df, cutoffyear=1960){
  df$BeforeYear = df$Year<cutoffyear
  dfsubset <- subset(df, BeforeYear==T)
  countrycounts = table(dfsubset$Country)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Counts")
  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, year<",cutoffyear),addLegend=F)
  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page"))
}

#dftemp$Decade<-floor(df$Year/10)*10
#yearcounts = sapply(levels(dftemp$Country),function(x)table(subset(dftemp,Country==x)$Before1960))
#decadecounts = sapply(levels(df$Country),function(x)table(subset(dftemp,Country==x & Before1960=="TRUE")$Before1960))
#print(yearcounts)

#metadata we are interested in:
#Artist,AlbumTitle,Culture, Language/Language_Album, Subject_Album
PlotCountryCulture <- function(df){
  dfsubset <- subset(df, df$Culture!="")
  countrycounts = table(dfsubset$Country)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Counts")
  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Recordings with culture information",addLegend=F)
  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod="logFixedWidth",missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Number of recordings with culture information",addLegend=F)
  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page"))
}

PlotCultureDistribution <- function(df){
  dfsubset <- subset(df, df$Culture!="") # ignore the recordings culture info
  culturecounts = table(dfsubset$Culture)
  barplot(culturecounts, las=2, cex.names=0.2)
  #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1875, 2015, by = 1))
  #print(g)
}

PlotCountryNCulture <- function(df){
  dfsubset <- subset(df, df$Culture!="")
  #culturecounts = table(dfsubset$Culture)
  countrycounts = table(dfsubset$Country, dfsubset$Culture)
  aa <- addmargins(countrycounts, FUN = list(Total = sum), quiet = TRUE)
  print(paste(">1000",row.names(aa)[aa[,dim(aa)[2]]>1000]))
  print(paste(">500",row.names(aa)[aa[,dim(aa)[2]]>500]))
  print(paste(">100",row.names(aa)[aa[,dim(aa)[2]]>100]))
  inds = which(aa[,dim(aa)[2]]>500 & row.names(aa)!="Total")
  mosaicplot(aa[inds,1:5])
  dd=data.frame(countrycounts[inds,])
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Culture","Counts")
  ddsub <- subset(dd, (Country=="Canada"| Country=="United Kingdom" | Country=="United States of America") & Counts>20)
  g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
}

PlotCountryCultureNcounts <- function(df,mincount=50){
  dfsubset <- subset(df, df$Culture!="")
  #culturecounts = table(dfsubset$Culture)
  countrycounts = table(dfsubset$Country, dfsubset$Culture)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Culture","Counts")
  ddsub <- subset(dd, Counts>mincount)
  g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount))
}

PlotCountryLanguageNcounts <- function(df,mincount=50){
  dfsubset <- subset(df, df$Language!="" & df$Language!=" ")
  countrycounts = table(dfsubset$Country, dfsubset$Language)
  dd=data.frame(countrycounts)
  names(dd)=c("Country","Language","Counts")
  ddsub <- subset(dd, Counts>mincount)
  g=ggplot(ddsub, aes(x=Language,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount))
}

PlotNxNcounts <- function(df, cat1="Country", cat2="Culture", mincount=50, figname=""){
  indcat1 = which(colnames(df)==cat1)
  indcat2 = which(colnames(df)==cat2)
  dfsubset <- subset(df, df[,indcat1]!="" & df[,indcat2]!="")  # avoid nan values
  NNcounts <- table(dfsubset[,indcat1], dfsubset[,indcat2])
  dd=data.frame(NNcounts)
  names(dd) <- c("Cat1","Cat2","Counts")
  ddsub <- subset(dd, Counts>mincount)
  if (figname==""){
    g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1)
    g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2)
  }else{
    g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1)
    g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2)
    ggsave(figname)
  }
}

Wordcloud<- function(df, cat="Language", output=F){
  require(wordcloud)
  require(RColorBrewer)
  ind_cat = which(colnames(df)==cat)
  counts <- table(df[,ind_cat])
  dd=data.frame(counts)
  names(dd) <- c("words","freq")
  pal2 <- brewer.pal(8,"Dark2")
  wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2)
  if (output){
    pdf("wordcloud.pdf")
    wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2)
    dev.off()
  }
}
PlotCountryOutliers <- function(df, output=''){
  par(mar = rep(2, 4))
  spdf<-joinCountryData2Map(df,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
  spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),]
  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
  mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
  # avoid antarctica
  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
  #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
  legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
  if (output!=''){
    pdf(output)
    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
    mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
    #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar=7, legendLabels="all",horizontal=T, legendIntervals="page"))
    legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
    dev.off()
  }
  else {
    return(mapParams)
  }
}