Mercurial > hg > plosone_underreview

--- a/notebooks/correlation_samples_outliers.ipynb	Fri Sep 22 16:30:28 2017 +0100
+++ b/notebooks/correlation_samples_outliers.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -520,7 +520,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,
--- a/notebooks/explain_components.ipynb	Fri Sep 22 16:30:28 2017 +0100
+++ b/notebooks/explain_components.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -32,7 +32,9 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -64,6 +66,7 @@
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {
+    "collapsed": false,
     "scrolled": false
    },
    "outputs": [
@@ -230,7 +233,9 @@
   {
    "cell_type": "code",
    "execution_count": 54,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -268,7 +273,9 @@
   {
    "cell_type": "code",
    "execution_count": 62,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -300,7 +307,9 @@
   {
    "cell_type": "code",
    "execution_count": 63,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -363,7 +372,9 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -406,7 +417,9 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -424,7 +437,9 @@
   {
    "cell_type": "code",
    "execution_count": 65,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -494,7 +509,9 @@
   {
    "cell_type": "code",
    "execution_count": 69,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -529,7 +546,9 @@
   {
    "cell_type": "code",
    "execution_count": 67,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -569,7 +588,9 @@
   {
    "cell_type": "code",
    "execution_count": 51,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -628,7 +649,9 @@
   {
    "cell_type": "code",
    "execution_count": 26,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -660,7 +683,9 @@
   {
    "cell_type": "code",
    "execution_count": 31,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -693,7 +718,9 @@
   {
    "cell_type": "code",
    "execution_count": 33,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -736,7 +763,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,
--- a/notebooks/sensitivity_experiment.ipynb	Fri Sep 22 16:30:28 2017 +0100
+++ b/notebooks/sensitivity_experiment.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -3,8 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
    "outputs": [],
+   },
    "source": [
     "import numpy as np\n",
     "import pandas as pd\n",
@@ -38,7 +39,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -59,7 +62,9 @@
   {
    "cell_type": "code",
    "execution_count": 48,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -269,7 +274,9 @@
   {
    "cell_type": "code",
    "execution_count": 52,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -438,7 +445,9 @@
   {
    "cell_type": "code",
    "execution_count": 56,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -461,7 +470,9 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -720,7 +731,9 @@
   {
    "cell_type": "code",
    "execution_count": 47,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -819,7 +832,9 @@
   {
    "cell_type": "code",
    "execution_count": 59,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -4566,7 +4581,9 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -5029,7 +5046,9 @@
   {
    "cell_type": "code",
    "execution_count": 54,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -5093,7 +5112,9 @@
   {
    "cell_type": "code",
    "execution_count": 53,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -5207,7 +5228,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,
--- a/notebooks/test_hubness.ipynb	Fri Sep 22 16:30:28 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Fri Sep 22 16:30:36 2017 +0100
@@ -47,7 +47,9 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -85,7 +87,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -107,7 +111,9 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -151,7 +157,9 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -230,6 +238,8 @@
    ]
   },
   {
+    "collapsed": false
+   },
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {
@@ -245,7 +255,9 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -271,7 +283,9 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -469,7 +483,9 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -496,7 +512,9 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -523,7 +541,9 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -543,7 +563,9 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -570,7 +592,9 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -595,7 +619,9 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -765,7 +791,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts_R/MetadataPlots.R	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,235 @@
+library(rworldmap)
+library(ggplot2)
+
+# PlotBarChart: flipped bar chart of record counts per `cat`, with bars filled by `ordercat`.
+# Rows with an empty `cat` are ignored and only categories with more than `mincount` rows are kept.
+# Stops with an explicit message when `cat` or `ordercat` is not a column of df; returns the ggplot object.
+# NOTE(review): the count axis is fixed to 0-200 - confirm how categories with more than 200 rows should be shown.
+PlotBarChart<- function(df, cat="Language", ordercat="REGION", mincount=10, legend=TRUE, color_plt="Paired"){
+  absent <- setdiff(c(cat, ordercat), colnames(df))
+  if (length(absent) > 0) stop("PlotBarChart: column(s) not in df: ", paste(absent, collapse=", "), call.=FALSE)
+  dfsub <- df[which(df[[cat]]!=""), ]
+  cat_counts <- table(dfsub[[cat]])
+  dfsub <- dfsub[dfsub[[cat]] %in% names(cat_counts)[cat_counts>mincount], ]
+  # sort by the grouping column, then freeze that row order in the factor levels so bars of one group sit together
+  dfsub <- dfsub[order(dfsub[[ordercat]]),]
+  dfsub[[cat]] <- factor(dfsub[[cat]], levels=unique(dfsub[[cat]]))
+  g <- ggplot(dfsub,aes(dfsub[[cat]], fill=dfsub[[ordercat]], order=-as.numeric(dfsub[[ordercat]])))+geom_bar()
+  g <- g+scale_y_continuous(limits=c(0, 200), breaks=seq(0,200,40))
+  g <- g+scale_fill_brewer(palette=color_plt)
+  # NOTE(review): theme_bw() is added after this theme() call and, being a complete theme, presumably resets it - confirm
+  g <- g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+  g <- g+labs(y="Counts",x=cat)+coord_flip()+theme_bw()
+  g+guides(fill = if (legend) guide_legend(title = ordercat) else "none")
+}
+
+# PlotCountryNCounts: two-colour world map separating countries with fewer than `mincount`
+# recordings from those with at least `mincount`; recordings are counted per df$Country.
+PlotCountryNCounts <- function(df, mincount=10){
+  tally <- data.frame(table(df$Country))
+  names(tally) <- c("Country","Counts")
+  # category 1 = below the threshold, category 2 = at or above it
+  tally$NCounts <- 2 - (tally$Counts<mincount)
+  spdf <- joinCountryData2Map(tally,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="NCounts",catMethod='categorical',missingCountryCol="grey",oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, n_recordings>",mincount),addLegend=F)
+  mapParams$legendText <- c(paste('<',mincount),paste('>=',mincount),'na')
+  do.call(addMapLegendBoxes, c(mapParams,x='bottomleft'))
+}
+
+# PlotCountryCounts: world map of the number of recordings per country (df$Country) with fixed breaks
+# seq(10,100,10) and a hand-made legend, Antarctica left out; output=TRUE also writes "countrycounts.pdf".
+PlotCountryCounts <- function(df, output=FALSE){
+  dd <- data.frame(table(df$Country))
+  names(dd) <- c("Country","Counts")
+  spdf <- joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  # logical filter instead of spdf[-which(...),]: the negative-index form returns zero rows when nothing matches
+  spdf <- spdf[!(spdf$ADMIN %in% 'Antarctica'),]
+  draw_map <- function(){
+    mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=FALSE)
+    legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+  }
+  draw_map()
+  if (output){
+    pdf(file="countrycounts.pdf")
+    on.exit(dev.off(), add=TRUE)  # close the file device even if drawing fails
+    draw_map()
+  }
+}
+
+# PlotYearDistribution: histogram of df$Year with one bin per year between 1895 and 2015.
+# The plot is always printed to the current device; when `output` is true it is also saved as 'yeardistribution.pdf'.
+PlotYearDistribution <- function(df, output=F){
+  # go through character first so that a factor Year yields its labels rather than its integer codes
+  df$Year <- as.numeric(as.character(df$Year))
+  year_hist <- ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1895, 2015, by = 1))+
+    theme_bw()+labs(x ='Year', y ='Count')
+  print(year_hist)
+  if (output){
+    ggsave('yeardistribution.pdf',plot=year_hist)
+  }
+}
+
+# PlotCountryDistribution: bar chart of the number of rows per df$Country with vertical x-axis labels.
+# Returns the ggplot object visibly, so a bare call at the prompt draws the chart.
+PlotCountryDistribution <- function(df){
+  g <- ggplot(df,aes(x=Country))+geom_bar()
+  # end on the expression itself rather than an assignment: the value of an assignment is invisible
+  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+#PlotCultureDistribution <- function(df){
+#  g = ggplot(df,aes(x=Culture))+geom_bar()
+#  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+#}
+
+# PlotLanguageDistribution: base-graphics bar plot of the number of recordings per language.
+# Recordings with an empty Language are ignored; languages with fewer than `mincount` recordings are dropped.
+PlotLanguageDistribution <- function(df, mincount=1){
+  dfsubset <- subset(df, df$Language!="")
+  # count the column this function is named after and filters on (Language), not df$Culture
+  languagecounts <- table(dfsubset$Language)
+  barplot(languagecounts[languagecounts>=mincount], las=2, cex.names=0.2)
+}
+
+# PlotBarForCategory: base-graphics bar plot of the number of rows per value of column `cat`.
+# Rows whose value is the empty string are skipped and values seen fewer than `mincount` times are dropped.
+PlotBarForCategory <- function(df, cat="Language", mincount=1){
+  col_pos <- which(colnames(df)==cat)
+  filled <- subset(df, df[,col_pos]!="")
+  tally <- table(filled[,col_pos])
+  # las=2 turns the labels perpendicular to the axis; cex.names=0.2 shrinks them
+  barplot(tally[tally>=mincount], las=2, cex.names=0.2)
+}
+
+# PlotCountryYearCutoff: world map of the number of recordings per country dated before `cutoffyear`.
+PlotCountryYearCutoff <- function(df, cutoffyear=1960){
+  # convert Year the same way PlotYearDistribution does: comparing an unordered factor with `<` only yields NA
+  years <- as.numeric(as.character(df$Year))
+  dfsubset <- df[which(years<cutoffyear), ]
+  dd <- setNames(data.frame(table(dfsubset$Country)), c("Country","Counts"))
+  spdf <- joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle=paste("Country sample size, year<",cutoffyear),addLegend=FALSE)
+  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=FALSE, legendIntervals="page"))
+}
+
+#dftemp$Decade<-floor(df$Year/10)*10
+#yearcounts = sapply(levels(dftemp$Country),function(x)table(subset(dftemp,Country==x)$Before1960))
+#decadecounts = sapply(levels(df$Country),function(x)table(subset(dftemp,Country==x & Before1960=="TRUE")$Before1960))
+#print(yearcounts)
+
+#metadata we are interested in:
+#Artist,AlbumTitle,Culture, Language/Language_Album, Subject_Album
+# PlotCountryCulture: world map of how many recordings per country carry a non-empty Culture value.
+PlotCountryCulture <- function(df){
+  with_culture <- subset(df, df$Culture!="")
+  per_country <- data.frame(table(with_culture$Country))
+  names(per_country) <- c("Country","Counts")
+  spdf <- joinCountryData2Map(per_country,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  # colour classes are chosen by rworldmap's "logFixedWidth" method; the legend is added separately below
+  mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod="logFixedWidth",missingCountryCol='grey',oceanCol="lightblue",colourPalette='heat', mapTitle="Number of recordings with culture information",addLegend=F)
+  do.call( addMapLegend, c(mapParams, labelFontSize=0.3, legendWidth=0.5, tcl=-0.3, legendMar = 4, legendLabels="all",horizontal=F, legendIntervals="page"))
+}
+
+# PlotCultureDistribution: base-graphics bar plot of the number of recordings per Culture value.
+PlotCultureDistribution <- function(df){
+  # leave out the recordings that have no culture information
+  has_culture <- subset(df, df$Culture!="")
+  # las=2 turns the labels perpendicular to the axis; cex.names=0.2 shrinks them
+  barplot(table(has_culture$Culture), las=2, cex.names=0.2)
+}
+
+# PlotCountryNCulture: prints countries with >1000/>500/>100 culture-tagged recordings, draws a mosaic plot for those above 500 and returns a faceted Culture/Counts plot for three fixed countries.
+PlotCountryNCulture <- function(df){
+  dfsubset <- subset(df, df$Culture!="")
+  countrycounts <- table(dfsubset$Country, dfsubset$Culture)
+  aa <- addmargins(countrycounts, FUN = list(Total = sum), quiet = TRUE)
+  totals <- aa[,ncol(aa)]  # the last column of aa holds the per-country totals
+  for (thr in c(1000, 500, 100)) print(paste(paste0(">", thr), row.names(aa)[totals>thr]))
+  inds <- which(totals>500 & row.names(aa)!="Total")
+  # show at most the first five cultures (a fixed 1:5 ran into the Total column with fewer); skip when no country qualifies
+  cols <- seq_len(min(5, ncol(countrycounts)))
+  if (length(inds) > 0) mosaicplot(aa[inds, cols])
+  dd <- data.frame(countrycounts)
+  names(dd) <- c("Country","Culture","Counts")
+  ddsub <- subset(dd, (Country=="Canada"| Country=="United Kingdom" | Country=="United States of America") & Counts>20)
+  g <- ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+# PlotCountryCultureNcounts: one panel per country showing, as points, the cultures with more than
+# `mincount` recordings there; recordings with an empty Culture are ignored. Returns the ggplot object.
+PlotCountryCultureNcounts <- function(df,mincount=50){
+  with_culture <- subset(df, df$Culture!="")
+  combos <- data.frame(table(with_culture$Country, with_culture$Culture))
+  names(combos) <- c("Country","Culture","Counts")
+  frequent <- subset(combos, Counts>mincount)
+  panels <- ggplot(frequent, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  panels+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+# PlotCountryLanguageNcounts: one panel per country showing, as points, the languages with more than
+# `mincount` recordings there; a Language of "" or " " is treated as missing. Returns the ggplot object.
+PlotCountryLanguageNcounts <- function(df,mincount=50){
+  spoken <- subset(df, df$Language!="" & df$Language!=" ")
+  combos <- data.frame(table(spoken$Country, spoken$Language))
+  names(combos) <- c("Country","Language","Counts")
+  panels <- ggplot(subset(combos, Counts>mincount), aes(x=Language,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free")
+  panels+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
+}
+
+# PlotNxNcounts: faceted dot plot of the co-occurrence counts of two categorical columns.
+# One panel per value of `cat1`, the values of `cat2` along the flipped axis; only pairs seen more than
+# `mincount` times are shown and rows with an empty value in either column are ignored.
+# When `figname` is not "", the plot is written to that file and returned invisibly; otherwise it is returned.
+PlotNxNcounts <- function(df, cat1="Country", cat2="Culture", mincount=50, figname=""){
+  indcat1 <- which(colnames(df)==cat1)
+  indcat2 <- which(colnames(df)==cat2)
+  dfsubset <- subset(df, df[,indcat1]!="" & df[,indcat2]!="")  # skip rows with an empty value
+  dd <- data.frame(table(dfsubset[,indcat1], dfsubset[,indcat2]))
+  names(dd) <- c("Cat1","Cat2","Counts")
+  g <- ggplot(subset(dd, Counts>mincount), aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1)
+  g <- g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2)
+  if (figname=="") return(g)
+  # pass the plot explicitly: ggsave() otherwise saves the last plot that was displayed, which is not this one
+  ggsave(figname, plot=g)
+  invisible(g)
+}
+
+# Wordcloud: draws a word cloud of the values of column `cat`, sized by frequency; output=TRUE also writes "wordcloud.pdf".
+Wordcloud<- function(df, cat="Language", output=FALSE){
+  # require() only returns FALSE (with a warning) when a package is missing, so turn that into an explicit error
+  if (!require(wordcloud) || !require(RColorBrewer)) stop("Wordcloud: packages 'wordcloud' and 'RColorBrewer' are needed", call.=FALSE)
+  dd <- data.frame(table(df[,which(colnames(df)==cat)]))
+  names(dd) <- c("words","freq")
+  pal2 <- brewer.pal(8,"Dark2")
+  draw_cloud <- function() wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2)
+  draw_cloud()
+  if (output){
+    pdf("wordcloud.pdf")
+    on.exit(dev.off(), add=TRUE)  # close the file device even if drawing fails
+    draw_cloud()
+  }
+}
+# PlotCountryOutliers: world map of df$Outliers per country in ten classes of width 0.1 (labelled as
+# percentages in the legend), Antarctica left out. `df` needs the columns Country and Outliers.
+# With output='' the map is drawn on the current device and the mapCountryData() result is returned;
+# otherwise the map is additionally written to the PDF file `output` and the result is returned invisibly.
+PlotCountryOutliers <- function(df, output=''){
+  par(mar = rep(2, 4))  # NOTE: changes the margins of the current device and does not restore them
+  spdf <- joinCountryData2Map(df,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country")
+  # logical filter instead of spdf[-which(...),]: the negative-index form returns zero rows when nothing matches
+  spdf <- spdf[!(spdf$ADMIN %in% 'Antarctica'),]
+  draw_map <- function(){
+    params <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE)
+    legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
+    params
+  }
+  mapParams <- draw_map()
+  if (output!=''){
+    pdf(output)
+    on.exit(dev.off(), add=TRUE)  # close the file device even if drawing fails
+    mapParams <- draw_map()
+    return(invisible(mapParams))
+  }
+  mapParams
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts_R/Metadata_subsetBLSM.R	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,168 @@
+#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/df_BLSM.csv",header=TRUE)
+#df = read.csv("data/df_subset_remove.csv",header=TRUE)
+#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/CodeForBL/data/metadataBL_new.csv",header=TRUE)
+#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/metadata_BLSM.csv",header=TRUE)
+#df = df[1:29182,] # BL data
+df = read.csv('data/df_and_clusters.csv', header=T)  # data frame passed to the plotting calls below
+
+source("MetadataPlots.R")  # defines the Plot* helpers; the path is relative to the working directory
+
+## Use Arial for the PLOS figures
+#install.packages("extrafont")
+library(extrafont)
+font_import()  # NOTE(review): runs on every execution of the script - confirm whether a one-off import is enough
+loadfonts()
+Arial <- Type1Font(family="Arial", metrics=c("ArialMT.afm","arial-BoldMT.afm","Arial-ItalicMT.afm", "Arial-BoldItalicMT.afm"))
+postscriptFonts(Arial=Arial)
+par(family="Arial")  # NOTE(review): par() is per device, so this presumably does not reach the pdf()/postscript() devices opened later - confirm
+
+#pdf(file="data/country_distribution_BL.pdf")
+pdf(file="data/country_distribution.pdf")  # country map as PDF ...
+PlotCountryCounts(df)  # draws with base graphics onto the device opened on the previous line
+dev.off()
+postscript(file="data/country_distribution.eps")  # ... and the same map as EPS
+PlotCountryCounts(df)
+dev.off()
+
+pdf(file="data/year_distribution.pdf", width=6, height=4)  # year histogram as PDF ...
+PlotYearDistribution(df)  # print()s its ggplot itself, so the plot lands in the open device
+dev.off()
+postscript("data/year_distribution.eps", width=10)  # ... and as EPS
+PlotYearDistribution(df)
+dev.off()
+#PlotBarChart(df, cat="Year", ordercat="REGION", mincount=10)
+
+# Language bar charts. PlotBarChart() only returns a ggplot object, so print() it: without print() nothing is drawn when the script is run through source().
+levels(df$Language)[which(levels(df$Language)=="Southwestern Caribbean Creole English")]="SouthW Carib. Creole English"
+df$Language[which(df$Language=="Southwestern Caribbean Creole English")] = "SouthW Carib. Creole English"
+levels(df$Language)[which(levels(df$Language)=="Lesser Antillean Creole French")]="Lesser Antil. Creole French"
+df$Language[which(df$Language=="Lesser Antillean Creole French")] = "Lesser Antil. Creole French"
+df$REGION[which(df$Country=="French Guiana")] = "South America"  # NOTE(review): edits REGION, but the charts below group by "Region" - confirm the intended column
+pdf(file="data/language_distribution.pdf")
+print(PlotBarChart(df, cat="Language", ordercat="Region", mincount=10))
+dev.off()
+postscript("data/language_distribution.eps", width=8, height=10)
+print(PlotBarChart(df, cat="Language", ordercat="Region", mincount=10))
+dev.off()
+
+#language phylogeny
+df = read.csv('data/metadata_BLSM_language.csv', header=T)
+pdf(file="data/language_iso3_iso1.pdf")
+print(PlotBarChart(df, cat="Language_iso3", ordercat="Language_iso1", mincount=10))
+dev.off()
+
+# PlotCountryCounts(df)
+# PlotCountryCultureNcounts(df, mincount=20)
+# PlotCountryLanguageNcounts(df, mincount=20)
+# PlotYearDistribution(df)
+# PlotLanguageDistribution(df)
+# PlotCultureDistribution(df)
+# PlotNxNcounts(df, cat1="Country", cat2="Genre_Album", mincount=20)
+
+df = read.csv('data/df_and_clusters.csv', header=T)  # reload: df was replaced by the language table above
+#PlotBarChart(df, cat="Clusters", ordercat="CountryLang", mincount=1,legend=F)
+df$REGION[which(df$Country=="French Guiana")] = "South America"
+g = ggplot(df,aes(df$Clusters, fill=df$REGION))+geom_bar()  # NOTE(review): g is reassigned by the later ggplot() call before it is drawn or saved
+levels(df$REGION)[which(levels(df$REGION)=="South America")]="S. America"  # shorter legend labels
+levels(df$REGION)[which(levels(df$REGION)=="North America")]="N. America"  # NOTE(review): the final plot fills by df$Region, not df$REGION - confirm
+
+#library(rworldmap)
+#wrld = getMap()
+#regiondata<-wrld@data[,c("ADMIN","GEO3", "Stern")]
+#df<-merge(df,regiondata,by.x="Country",by.y="ADMIN",all.x=T)
+
+#cluster_labels_df = read.csv('data/clusters_top3_labels.csv')
+cluster_labels_df = read.csv('data/clusters_top3_countries.csv')  # one row per label; the first three columns are used
+cluster_labels = paste(cluster_labels_df[,1],cluster_labels_df[,2],cluster_labels_df[,3],sep="")  # joined without a separator
+#df$CountryLang = as.factor(paste(df$Country, df$Language, sep="-"))
+
+countrycounts = table(df$Clusters,df$Country)  # cluster x country contingency table
+library(cluster)
+library(ape)
+library(gridExtra)
+library(ggdendro)
+library(dendextend)
+hc = hclust(dist(countrycounts), method="average")  # average linkage on dist() between the rows (clusters) of the table
+hc2=hc  # copy used for the dendrogram; hc itself keeps its labels for the ordering further down
+#hc2$labels = as.character(1:length(cluster_labels))
+hc2$labels = ""  # blank out the leaf labels in the plotted copy
+#dhc <- as.dendrogram(hc2)
+# library(dynamicTreeCut)
+# clusters <- cutreeDynamic(hc2, minClusterSize = k_clust,method = "tree")
+# clusters <- clusters[order.dendrogram(dhc)]
+# clusters_numbers <- unique(clusters) - (0 %in% clusters)
+# n_clusters <- length(clusters_numbers)
+# library(colorspace)
+# cols <- rainbow_hcl(n_clusters)
+# dhc <- hc2 %>% as.dendrogram %>%
+#   set("branches_k_color", k=k_clust) %>% branches_attr_by_clusters(clusters, values = cols)
+k_clust = 5  # number of groups used to colour branches and labels
+dhc <- hc2 %>% as.dendrogram %>%
+  set("branches_k_color", k=k_clust) %>% set("branches_lwd", 0.7) %>%
+  set("labels_cex", 0.6) %>% set("labels_colors", k=k_clust) %>%
+  set("leaves_pch", 19) %>% set("leaves_cex", 0.5)
+#ddata <- dendro_data(dhc, type = "rectangle")
+ddata <- as.ggdend(dhc)  # ggplot-ready version of the styled dendrogram
+p <- ggplot(ddata)+coord_flip()  # NOTE(review): p is only referenced from commented-out code in this script
+#p <- ggplot(segment(ddata)) +
+#  geom_segment(aes(x = x, y = y, xend = xend, yend = yend, colour=ddata$segments$col)) +
+#  coord_flip() + theme_dendro() + theme(legend.position="none") +
+#  geom_text(aes(x = x, y = y, label = label, angle = -90, hjust = 0.5, vjust=1.3, colour=ddata$labels$col), data= label(ddata))
+
+#dend <- hc2 %>% as.dendrogram %>%
+#  set("branches_k_color", k = 5) %>% set("branches_lwd", 0.7) %>%
+#  set("labels_cex", 0.6) %>% set("labels_colors", k = 5) %>%
+#  set("leaves_pch", 19) %>% set("leaves_cex", 0.5)
+#ggd1 <- as.ggdend(dend)
+#pp <- ggplot(ggd1, horiz = TRUE)
+
+library(stringr)
+# Tidy the raw cluster labels. str_replace_all() is vectorised, so the whole vector is cleaned at once;
+# an index loop over 1:length(cluster_labels) would also run for i = 1 and i = 0 on an empty vector.
+# Steps, in this order: drop "(" and "'"; turn "|" into "-"; turn ", " into " (";
+# turn ")" into "), "; turn "nan" into "NA".
+# The order matters: the ", " produced by the ")" step must not be seen by the ", " step.
+cluster_labels <- str_replace_all(cluster_labels, "[(']", "")
+cluster_labels <- str_replace_all(cluster_labels, "[|]", "-")
+cluster_labels <- str_replace_all(cluster_labels, ", ", " (")
+cluster_labels <- str_replace_all(cluster_labels, "[)]", "), ")
+cluster_labels <- str_replace_all(cluster_labels, "nan", "NA")
+#cluster_idx = paste("cluster",1:length(cluster_labels))
+#df$Clusters = as.factor(df$Clusters)
+df$Clusters = factor(x=df$Clusters,levels=hc$labels[hc$order])  # order the clusters like the leaves of the hc dendrogram
+cluster_labels = cluster_labels[hc$order]  # NOTE(review): assumes cluster_labels is in the same order as hc$labels - confirm
+#g = ggplot(df,aes(as.factor(df$Clusters), fill=df$CountryLang))+geom_bar()
+#g = ggplot(df,aes(Clusters, fill=REGION))+geom_bar()+facet_grid(~REGION,space="free",scales="free")#,scales="free")
+g = ggplot(df,aes(as.factor(df$Clusters), fill=df$Region))+geom_bar()  # NOTE(review): fills by df$Region although the recoding above edits df$REGION - confirm
+#g = ggplot(df,aes(as.factor(df$Clusters), fill=df$REGION))+geom_bar()
+g = g+scale_x_discrete(labels=cluster_labels)  # show the cleaned text labels in place of the cluster ids
+#g = g+scale_y_continuous(position="right")
+#g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey()
+g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey()
+#g = g+labs(y="Counts", x="Top 3 country-language tags in each cluster")+coord_flip()+theme_bw()#+guides(fill="none")
+g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none")
+#g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none")
+#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position=c(.9,.8),legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=10),legend.text = element_text(size=10))
+#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="left",legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=9),legend.text = element_text(size=9))
+g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="top",legend.title = element_text(size=9),legend.text = element_text(size=9))
+g = g+theme(panel.border = element_rect(colour = "white"),strip.background=element_rect(fill="white"),strip.text.x = element_blank())
+#g = g+theme(axis.text.y = element_text(colour = ddata$labels$col))
+ggsave('data/clusters_top3.pdf',plot=g)  # the same figure is written as PDF ...
+ggsave('data/clusters_top3.eps',plot=g)  # ... and as EPS
+
+#g_legend<-function(a.gplot){
+#  tmp <- ggplot_gtable(ggplot_build(a.gplot))
+#  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
+#  legend <- tmp$grobs[[leg]]
+#  return(legend)}
+#mylegend<-g_legend(g)
+
+#pdf(file="data/clusters_top3_hclust.pdf", width=12, height=5)
+#grid.arrange(arrangeGrob(g + theme(legend.position="none"),p + theme(legend.position="none"),nrow=1, widths=c(4,1)),mylegend, nrow=2,heights=c(10, 1))
+#dev.off()
+
+#grid.arrange(arrangeGrob(g,p,nrow=1, ncol=2))
+#ggsave('data/clusters_top3_hclust.pdf',plot=g_comb)
+#g=g+annotate(x=20, y=1:18, label=cluster_idx)+geom_text(aes(x=20,y=1:18,label=cluster_idx))
+#+guides(fill = guide_legend(title = "Region"))
+#grid.draw(cbind(ggplotGrob(g), ggplotGrob(pp), size = "last"))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts_R/PlotOutliersCountry.R	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,19 @@
+source("MetadataPlots.R")
+
+# One map per outlier table: each call draws on the current device and also writes the named PDF.
+MapOutliersToPdf <- function(csv_path, pdf_path) PlotCountryOutliers(df=read.csv(csv_path,header=TRUE), output=pdf_path)
+MapOutliersToPdf("data/global_outliers.csv", "data/global_outliers.pdf")
+MapOutliersToPdf("data/global_outliers_rhy.csv", "data/global_outliers_rhy.pdf")
+MapOutliersToPdf("data/global_outliers_mel.csv", "data/global_outliers_mel.pdf")
+MapOutliersToPdf("data/global_outliers_mfc.csv", "data/global_outliers_mfc.pdf")
+MapOutliersToPdf("data/global_outliers_chr.csv", "data/global_outliers_chr.pdf")
+MapOutliersToPdf("data/spatial_outliers.csv", "data/spatial_outliers.pdf")
+
+require(graphics)
+par(mfrow=c(2,2))
+g1 <- PlotCountryOutliers(df=read.csv("data/global_outliers_rhy.csv",header=TRUE))
+g2 <- PlotCountryOutliers(df=read.csv("data/global_outliers_mel.csv",header=TRUE))
+g3 <- PlotCountryOutliers(df=read.csv("data/global_outliers_mfc.csv",header=TRUE))
+g4 <- PlotCountryOutliers(df=read.csv("data/global_outliers_chr.csv",header=TRUE))
+# 2x2 panel of the rhy/mel/mfc/chr maps; the extra legend below is added after the last map has been drawn
+legend("right", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts_R/radial_dendro.R	Fri Sep 22 16:30:36 2017 +0100
@@ -0,0 +1,14 @@
+library(ape)
+library(cluster)
+df <- read.csv("data/cluster_freq.csv")
+data <- df[, -1, drop=FALSE]  # every column except the first; drop=FALSE keeps a data frame even with one column left
+rownames(data) <- df$labels
+data_cov <- cov(data)  # identical for every row, so compute it once instead of inside the apply() callback
+distMahal <- as.dist(apply(data, 1, function(i) mahalanobis(data, i, cov = data_cov, tol=1e-18)))
+hc <- hclust(distMahal, method="average")  # NOTE(review): mahalanobis() returns squared distances - confirm that clustering on them is intended
+mypal <- c("#000000", "#9B0000", "#9B0000", "#9B0000", "#9B0000")
+clus5 <- cutree(hc, 5)
+pdf('data/hierarchical_cluster_R.pdf')  # fan-shaped dendrogram, tip labels coloured by the 5-group cut
+par(mar=c(1,1,1,1))
+plot(as.phylo(hc),type="fan",tip.color=mypal[clus5], cex=.5, label.offset=.5)
+dev.off()