diff scripts_R/MetadataPlots.R @ 76:d17833be50ca branch-tests

merged
author Maria Panteli <m.x.panteli@gmail.com>
date Fri, 22 Sep 2017 16:30:36 +0100
parents cc028157502a
children bde45ce0eeab
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts_R/MetadataPlots.R	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,235 @@
+library(rworldmap)
+library(ggplot2)
+
+PlotBarChart<- function(df, cat="Language", ordercat="REGION", mincount=10, legend=T, color_plt="Paired"){
+  idx_cat = which(colnames(df)==cat)
+  idx_ordercat = which(colnames(df)==ordercat)
+  dfsub <- subset(df, df[,idx_cat]!="")
+  dfsub <- dfsub[ dfsub[,idx_cat] %in%  names(table(dfsub[,idx_cat]))[table(dfsub[,idx_cat]) >mincount] , ]
+  #dfsub <- dfsub[order(dfsub$REGION.y),]
+  dfsub <- dfsub[order(dfsub[,idx_ordercat]),]
+  dfsub[,idx_cat] <- factor(dfsub[,idx_cat], levels=unique(dfsub[,idx_cat]))
+  g = ggplot(dfsub,aes(dfsub[,idx_cat], fill=dfsub[,idx_ordercat], order=-as.numeric(dfsub[,idx_ordercat])))+geom_bar()
+  #g = g+ylim("0", "100")#+scale_y_discrete(breaks=c("100"),labels=c("100+"))
+  g=g+scale_y_continuous(limits=c(0, 200), breaks=seq(0,200,40))
+  g=g+scale_fill_brewer(palette=color_plt)
+  g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+  g=g+labs(y="Counts",x=cat)+coord_flip()+theme_bw()
+  if (legend){
+    g=g+guides(fill = guide_legend(title = ordercat))}
+  else{
+    g=g+guides(fill="none")
+  }
+  return(g)
+}
+
+PlotCountryNCounts <- function(df, mincount=10){
+  countrycounts = table(df$Country)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Counts")
+  cols <- rep(2, dim(dd)[1])
+  cols[dd$Counts<mincount]=1
+  dd$NCounts = cols
+  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="NCounts",catMethod='categorical',missingCountryCol="grey",oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, n_recordings>",mincount),addLegend=F)
+  mapParams$legendText <- c(paste('<',mincount),paste('>=',mincount),'na')  
+  do.call(addMapLegendBoxes, c(mapParams,x='bottomleft'))
+}
+
+PlotCountryCounts <- function(df, output=F){
+  countrycounts = table(df$Country)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Counts")
+  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),]
+  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette='heat', mapTitle="",addLegend=F)
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F)
+  #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
+  legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+  if (output){
+    pdf(file="countrycounts.pdf")
+    mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F)
+    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="lightblue",colourPalette='heat', mapTitle="",addLegend=F)
+    #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
+    legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+    dev.off()
+  }
+}
+
+PlotYearDistribution <- function(df, output=F){
+  df$Year<-as.numeric(as.character(df$Year))
+  g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1895, 2015, by = 1))
+  #g = ggplot(df,aes(x=Year,y=..count..))+geom_bar()+geom_density(alpha=.3, fill="grey")
+  #g = g+scale_x_continuous(breaks = pretty(df$Year, n=10))
+  g = g+theme_bw()+labs(x ='Year', y ='Count') 
+  #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram()+theme_bw()
+  print(g)
+  if (output){
+    ggsave('yeardistribution.pdf',plot=g)
+  }
+}
+
+PlotCountryDistribution <- function(df){
+  #countrycounts = table(df$Country)
+  #dd=data.frame(countrycounts)
+  #names(dd)=c("Country","Counts")
+  g = ggplot(df,aes(x=Country))+geom_bar()
+  g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+#PlotCultureDistribution <- function(df){
+#  g = ggplot(df,aes(x=Culture))+geom_bar()
+#  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+#}
+
+PlotLanguageDistribution <- function(df, mincount=1){
+  dfsubset <- subset(df, df$Language!="") # ignore the recordings culture info
+  culturecounts = table(dfsubset$Culture)
+  culturecounts = culturecounts[culturecounts>=mincount]
+  barplot(culturecounts, las=2, cex.names=0.2)
+  #g = ggplot(df,aes(x=Language))+geom_bar()
+  #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+PlotBarForCategory <- function(df, cat="Language", mincount=1){
+  idx_cat = which(colnames(df)==cat)
+  dfsubset <- subset(df, df[,idx_cat]!="")
+  counts = table(dfsubset[,idx_cat])
+  counts = counts[counts>=mincount]
+  barplot(counts, las=2, cex.names=0.2)  
+  #g = ggplot(df,aes(x=Language))+geom_bar()
+  #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+PlotCountryYearCutoff <- function(df, cutoffyear=1960){
+  df$BeforeYear = df$Year<cutoffyear
+  dfsubset <- subset(df, BeforeYear==T)
+  countrycounts = table(dfsubset$Country)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Counts")
+  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, year<",cutoffyear),addLegend=F)
+  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page"))
+}
+
+#dftemp$Decade<-floor(df$Year/10)*10
+#yearcounts = sapply(levels(dftemp$Country),function(x)table(subset(dftemp,Country==x)$Before1960))
+#decadecounts = sapply(levels(df$Country),function(x)table(subset(dftemp,Country==x & Before1960=="TRUE")$Before1960))
+#print(yearcounts)
+
+#metadata we are interested in:
+#Artist,AlbumTitle,Culture, Language/Language_Album, Subject_Album
+PlotCountryCulture <- function(df){
+  dfsubset <- subset(df, df$Culture!="")
+  countrycounts = table(dfsubset$Country)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Counts")
+  spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Recordings with culture information",addLegend=F)
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod="logFixedWidth",missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Number of recordings with culture information",addLegend=F)
+  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page"))
+}
+
+PlotCultureDistribution <- function(df){
+  dfsubset <- subset(df, df$Culture!="") # ignore the recordings culture info
+  culturecounts = table(dfsubset$Culture)
+  barplot(culturecounts, las=2, cex.names=0.2)
+  #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1875, 2015, by = 1))
+  #print(g)
+}
+
+PlotCountryNCulture <- function(df){
+  dfsubset <- subset(df, df$Culture!="")
+  #culturecounts = table(dfsubset$Culture)
+  countrycounts = table(dfsubset$Country, dfsubset$Culture)
+  aa <- addmargins(countrycounts, FUN = list(Total = sum), quiet = TRUE)
+  print(paste(">1000",row.names(aa)[aa[,dim(aa)[2]]>1000]))
+  print(paste(">500",row.names(aa)[aa[,dim(aa)[2]]>500]))
+  print(paste(">100",row.names(aa)[aa[,dim(aa)[2]]>100]))
+  inds = which(aa[,dim(aa)[2]]>500 & row.names(aa)!="Total")
+  mosaicplot(aa[inds,1:5])
+  dd=data.frame(countrycounts[inds,])
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Culture","Counts")
+  ddsub <- subset(dd, (Country=="Canada"| Country=="United Kingdom" | Country=="United States of America") & Counts>20)
+  g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+PlotCountryCultureNcounts <- function(df,mincount=50){
+  dfsubset <- subset(df, df$Culture!="")
+  #culturecounts = table(dfsubset$Culture)
+  countrycounts = table(dfsubset$Country, dfsubset$Culture)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Culture","Counts")
+  ddsub <- subset(dd, Counts>mincount)
+  g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount))
+}
+
+PlotCountryLanguageNcounts <- function(df,mincount=50){
+  dfsubset <- subset(df, df$Language!="" & df$Language!=" ")
+  countrycounts = table(dfsubset$Country, dfsubset$Language)
+  dd=data.frame(countrycounts)
+  names(dd)=c("Country","Language","Counts")
+  ddsub <- subset(dd, Counts>mincount)
+  g=ggplot(ddsub, aes(x=Language,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount))
+}
+
+PlotNxNcounts <- function(df, cat1="Country", cat2="Culture", mincount=50, figname=""){
+  indcat1 = which(colnames(df)==cat1)
+  indcat2 = which(colnames(df)==cat2)
+  dfsubset <- subset(df, df[,indcat1]!="" & df[,indcat2]!="")  # avoid nan values
+  NNcounts <- table(dfsubset[,indcat1], dfsubset[,indcat2])
+  dd=data.frame(NNcounts)
+  names(dd) <- c("Cat1","Cat2","Counts")
+  ddsub <- subset(dd, Counts>mincount)
+  if (figname==""){
+    g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1)
+    g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2)
+  }else{
+    g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1)
+    g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2)
+    ggsave(figname)
+  }
+}
+
+Wordcloud<- function(df, cat="Language", output=F){
+  require(wordcloud)
+  require(RColorBrewer)
+  ind_cat = which(colnames(df)==cat)
+  counts <- table(df[,ind_cat])
+  dd=data.frame(counts)
+  names(dd) <- c("words","freq")
+  pal2 <- brewer.pal(8,"Dark2")
+  wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2)
+  if (output){
+    pdf("wordcloud.pdf")
+    wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2)
+    dev.off()
+  }
+}
+PlotCountryOutliers <- function(df, output=''){
+  par(mar = rep(2, 4))
+  spdf<-joinCountryData2Map(df,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),]
+  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+  # avoid antarctica
+  #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+  #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page"))
+  legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+  if (output!=''){
+    pdf(output)
+    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+    mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+    #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+    #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar=7, legendLabels="all",horizontal=T, legendIntervals="page"))
+    legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+    dev.off()
+  }
+  else {
+    return(mapParams)
+  }
+}
\ No newline at end of file