Mercurial > hg > plosone_underreview
diff scripts_R/MetadataPlots.R @ 76:d17833be50ca branch-tests
merged
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 22 Sep 2017 16:30:36 +0100 |
parents | cc028157502a |
children | bde45ce0eeab |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts_R/MetadataPlots.R Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,235 @@ +library(rworldmap) +library(ggplot2) + +PlotBarChart<- function(df, cat="Language", ordercat="REGION", mincount=10, legend=T, color_plt="Paired"){ + idx_cat = which(colnames(df)==cat) + idx_ordercat = which(colnames(df)==ordercat) + dfsub <- subset(df, df[,idx_cat]!="") + dfsub <- dfsub[ dfsub[,idx_cat] %in% names(table(dfsub[,idx_cat]))[table(dfsub[,idx_cat]) >mincount] , ] + #dfsub <- dfsub[order(dfsub$REGION.y),] + dfsub <- dfsub[order(dfsub[,idx_ordercat]),] + dfsub[,idx_cat] <- factor(dfsub[,idx_cat], levels=unique(dfsub[,idx_cat])) + g = ggplot(dfsub,aes(dfsub[,idx_cat], fill=dfsub[,idx_ordercat], order=-as.numeric(dfsub[,idx_ordercat])))+geom_bar() + #g = g+ylim("0", "100")#+scale_y_discrete(breaks=c("100"),labels=c("100+")) + g=g+scale_y_continuous(limits=c(0, 200), breaks=seq(0,200,40)) + g=g+scale_fill_brewer(palette=color_plt) + g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + g=g+labs(y="Counts",x=cat)+coord_flip()+theme_bw() + if (legend){ + g=g+guides(fill = guide_legend(title = ordercat))} + else{ + g=g+guides(fill="none") + } + return(g) +} + +PlotCountryNCounts <- function(df, mincount=10){ + countrycounts = table(df$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + cols <- rep(2, dim(dd)[1]) + cols[dd$Counts<mincount]=1 + dd$NCounts = cols + spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + mapParams <- mapCountryData(spdf, nameColumnToPlot="NCounts",catMethod='categorical',missingCountryCol="grey",oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, n_recordings>",mincount),addLegend=F) + mapParams$legendText <- c(paste('<',mincount),paste('>=',mincount),'na') + do.call(addMapLegendBoxes, c(mapParams,x='bottomleft')) +} + +PlotCountryCounts <- function(df, output=F){ + countrycounts = table(df$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),] + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette='heat', mapTitle="",addLegend=F) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + if (output){ + pdf(file="countrycounts.pdf") + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="lightblue",colourPalette='heat', mapTitle="",addLegend=F) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + dev.off() + } +} + +PlotYearDistribution <- function(df, output=F){ + df$Year<-as.numeric(as.character(df$Year)) + g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1895, 2015, by = 1)) + #g = ggplot(df,aes(x=Year,y=..count..))+geom_bar()+geom_density(alpha=.3, fill="grey") + #g = g+scale_x_continuous(breaks = pretty(df$Year, n=10)) + g = g+theme_bw()+labs(x ='Year', y ='Count') + #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram()+theme_bw() + print(g) + if (output){ + ggsave('yeardistribution.pdf',plot=g) + } +} + +PlotCountryDistribution <- function(df){ + #countrycounts = table(df$Country) + #dd=data.frame(countrycounts) + #names(dd)=c("Country","Counts") + g = ggplot(df,aes(x=Country))+geom_bar() + g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +#PlotCultureDistribution <- function(df){ +# g = ggplot(df,aes(x=Culture))+geom_bar() +# g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +#} + +PlotLanguageDistribution <- function(df, mincount=1){ + dfsubset <- subset(df, df$Language!="") # ignore the recordings culture info + culturecounts = table(dfsubset$Culture) + culturecounts = culturecounts[culturecounts>=mincount] + barplot(culturecounts, las=2, cex.names=0.2) + #g = ggplot(df,aes(x=Language))+geom_bar() + #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotBarForCategory <- function(df, cat="Language", mincount=1){ + idx_cat = which(colnames(df)==cat) + dfsubset <- subset(df, df[,idx_cat]!="") + counts = table(dfsubset[,idx_cat]) + counts = counts[counts>=mincount] + barplot(counts, las=2, cex.names=0.2) + #g = ggplot(df,aes(x=Language))+geom_bar() + #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotCountryYearCutoff <- function(df, cutoffyear=1960){ + df$BeforeYear = df$Year<cutoffyear + dfsubset <- subset(df, BeforeYear==T) + countrycounts = table(dfsubset$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, year<",cutoffyear),addLegend=F) + do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page")) +} + +#dftemp$Decade<-floor(df$Year/10)*10 +#yearcounts = sapply(levels(dftemp$Country),function(x)table(subset(dftemp,Country==x)$Before1960)) +#decadecounts = sapply(levels(df$Country),function(x)table(subset(dftemp,Country==x & Before1960=="TRUE")$Before1960)) +#print(yearcounts) + +#metadata we are interested in: +#Artist,AlbumTitle,Culture, Language/Language_Album, Subject_Album +PlotCountryCulture <- function(df){ + dfsubset <- subset(df, df$Culture!="") + countrycounts = table(dfsubset$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Recordings with culture information",addLegend=F) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod="logFixedWidth",missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Number of recordings with culture information",addLegend=F) + do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page")) +} + +PlotCultureDistribution <- function(df){ + dfsubset <- subset(df, df$Culture!="") # ignore the recordings culture info + culturecounts = table(dfsubset$Culture) + barplot(culturecounts, las=2, cex.names=0.2) + #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1875, 2015, by = 1)) + #print(g) +} + +PlotCountryNCulture <- function(df){ + dfsubset <- subset(df, df$Culture!="") + #culturecounts = table(dfsubset$Culture) + countrycounts = table(dfsubset$Country, dfsubset$Culture) + aa <- addmargins(countrycounts, FUN = list(Total = sum), quiet = TRUE) + print(paste(">1000",row.names(aa)[aa[,dim(aa)[2]]>1000])) + print(paste(">500",row.names(aa)[aa[,dim(aa)[2]]>500])) + print(paste(">100",row.names(aa)[aa[,dim(aa)[2]]>100])) + inds = which(aa[,dim(aa)[2]]>500 & row.names(aa)!="Total") + mosaicplot(aa[inds,1:5]) + dd=data.frame(countrycounts[inds,]) + dd=data.frame(countrycounts) + names(dd)=c("Country","Culture","Counts") + ddsub <- subset(dd, (Country=="Canada"| Country=="United Kingdom" | Country=="United States of America") & Counts>20) + g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotCountryCultureNcounts <- function(df,mincount=50){ + dfsubset <- subset(df, df$Culture!="") + #culturecounts = table(dfsubset$Culture) + countrycounts = table(dfsubset$Country, dfsubset$Culture) + dd=data.frame(countrycounts) + names(dd)=c("Country","Culture","Counts") + ddsub <- subset(dd, Counts>mincount) + g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount)) +} + +PlotCountryLanguageNcounts <- function(df,mincount=50){ + dfsubset <- subset(df, df$Language!="" & df$Language!=" ") + countrycounts = table(dfsubset$Country, dfsubset$Language) + dd=data.frame(countrycounts) + names(dd)=c("Country","Language","Counts") + ddsub <- subset(dd, Counts>mincount) + g=ggplot(ddsub, aes(x=Language,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount)) +} + +PlotNxNcounts <- function(df, cat1="Country", cat2="Culture", mincount=50, figname=""){ + indcat1 = which(colnames(df)==cat1) + indcat2 = which(colnames(df)==cat2) + dfsubset <- subset(df, df[,indcat1]!="" & df[,indcat2]!="") # avoid nan values + NNcounts <- table(dfsubset[,indcat1], dfsubset[,indcat2]) + dd=data.frame(NNcounts) + names(dd) <- c("Cat1","Cat2","Counts") + ddsub <- subset(dd, Counts>mincount) + if (figname==""){ + g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1) + g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2) + }else{ + g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1) + g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2) + ggsave(figname) + } +} + +Wordcloud<- function(df, cat="Language", output=F){ + require(wordcloud) + require(RColorBrewer) + ind_cat = which(colnames(df)==cat) + counts <- table(df[,ind_cat]) + dd=data.frame(counts) + names(dd) <- c("words","freq") + pal2 <- brewer.pal(8,"Dark2") + wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2) + if (output){ + pdf("wordcloud.pdf") + wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2) + dev.off() + } +} +PlotCountryOutliers <- function(df, output=''){ + par(mar = rep(2, 4)) + spdf<-joinCountryData2Map(df,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),] + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + # avoid antarctica + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + if (output!=''){ + pdf(output) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar=7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + dev.off() + } + else { + return(mapParams) + } +} \ No newline at end of file