# HG changeset patch # User Maria Panteli # Date 1506094236 -3600 # Node ID d17833be50cae50f3cb1977b63066fcd0cae24c7 # Parent 02faad4a996bae797cf36f692ab76f0a5abcdb1f# Parent cc028157502ab312b6a7d22b48ec6f29188bf021 merged diff -r 02faad4a996b -r d17833be50ca notebooks/correlation_samples_outliers.ipynb --- a/notebooks/correlation_samples_outliers.ipynb Fri Sep 22 16:30:28 2017 +0100 +++ b/notebooks/correlation_samples_outliers.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -520,7 +520,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.11" } }, "nbformat": 4, diff -r 02faad4a996b -r d17833be50ca notebooks/explain_components.ipynb --- a/notebooks/explain_components.ipynb Fri Sep 22 16:30:28 2017 +0100 +++ b/notebooks/explain_components.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -32,7 +32,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -64,6 +66,7 @@ "cell_type": "code", "execution_count": 14, "metadata": { + "collapsed": false, "scrolled": false }, "outputs": [ @@ -230,7 +233,9 @@ { "cell_type": "code", "execution_count": 54, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -268,7 +273,9 @@ { "cell_type": "code", "execution_count": 62, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -300,7 +307,9 @@ { "cell_type": "code", "execution_count": 63, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -363,7 +372,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -406,7 +417,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -424,7 +437,9 @@ { "cell_type": "code", "execution_count": 65, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -494,7 +509,9 @@ { "cell_type": "code", "execution_count": 69, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -529,7 +546,9 @@ { "cell_type": "code", "execution_count": 67, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -569,7 +588,9 @@ { "cell_type": "code", "execution_count": 51, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -628,7 +649,9 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -660,7 +683,9 @@ { "cell_type": "code", "execution_count": 31, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -693,7 +718,9 @@ { "cell_type": "code", "execution_count": 33, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -736,7 +763,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.11" } }, "nbformat": 4, diff -r 02faad4a996b -r d17833be50ca notebooks/sensitivity_experiment.ipynb --- a/notebooks/sensitivity_experiment.ipynb Fri Sep 22 16:30:28 2017 +0100 +++ b/notebooks/sensitivity_experiment.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -3,8 +3,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { "outputs": [], + }, "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -38,7 +39,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -59,7 +62,9 @@ { "cell_type": "code", "execution_count": 48, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -269,7 +274,9 @@ { "cell_type": "code", "execution_count": 52, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -438,7 +445,9 @@ { "cell_type": "code", "execution_count": 56, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -461,7 +470,9 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -720,7 +731,9 @@ { "cell_type": "code", "execution_count": 47, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -819,7 +832,9 @@ { "cell_type": "code", "execution_count": 59, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -4566,7 +4581,9 @@ { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -5029,7 +5046,9 @@ { "cell_type": "code", "execution_count": 54, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -5093,7 +5112,9 @@ { "cell_type": "code", "execution_count": 53, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -5207,7 +5228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.11" } }, "nbformat": 4, diff -r 02faad4a996b -r d17833be50ca notebooks/test_hubness.ipynb --- a/notebooks/test_hubness.ipynb Fri Sep 22 16:30:28 2017 +0100 +++ b/notebooks/test_hubness.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -47,7 +47,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -85,7 +87,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -107,7 +111,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -151,7 +157,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -230,6 +238,8 @@ ] }, { + "collapsed": false + }, "cell_type": "code", "execution_count": 16, "metadata": { @@ -245,7 +255,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -271,7 +283,9 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -469,7 +483,9 @@ { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -496,7 +512,9 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -523,7 +541,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -543,7 +563,9 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -570,7 +592,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -595,7 +619,9 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -765,7 +791,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.11" } }, "nbformat": 4, diff -r 02faad4a996b -r d17833be50ca scripts_R/MetadataPlots.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts_R/MetadataPlots.R Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,235 @@ +library(rworldmap) +library(ggplot2) + +PlotBarChart<- function(df, cat="Language", ordercat="REGION", mincount=10, legend=T, color_plt="Paired"){ + idx_cat = which(colnames(df)==cat) + idx_ordercat = which(colnames(df)==ordercat) + dfsub <- subset(df, df[,idx_cat]!="") + dfsub <- dfsub[ dfsub[,idx_cat] %in% names(table(dfsub[,idx_cat]))[table(dfsub[,idx_cat]) >mincount] , ] + #dfsub <- dfsub[order(dfsub$REGION.y),] + dfsub <- dfsub[order(dfsub[,idx_ordercat]),] + dfsub[,idx_cat] <- factor(dfsub[,idx_cat], levels=unique(dfsub[,idx_cat])) + g = ggplot(dfsub,aes(dfsub[,idx_cat], fill=dfsub[,idx_ordercat], order=-as.numeric(dfsub[,idx_ordercat])))+geom_bar() + #g = g+ylim("0", "100")#+scale_y_discrete(breaks=c("100"),labels=c("100+")) + g=g+scale_y_continuous(limits=c(0, 200), breaks=seq(0,200,40)) + g=g+scale_fill_brewer(palette=color_plt) + g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + g=g+labs(y="Counts",x=cat)+coord_flip()+theme_bw() + if (legend){ + g=g+guides(fill = guide_legend(title = ordercat))} + else{ + g=g+guides(fill="none") + } + return(g) +} + +PlotCountryNCounts <- function(df, mincount=10){ + countrycounts = table(df$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + cols <- rep(2, dim(dd)[1]) + cols[dd$Counts",mincount),addLegend=F) + mapParams$legendText <- c(paste('<',mincount),paste('>=',mincount),'na') + do.call(addMapLegendBoxes, c(mapParams,x='bottomleft')) +} + +PlotCountryCounts <- function(df, output=F){ + countrycounts = table(df$Country) + dd=data.frame(countrycounts) + names(dd)=c("Country","Counts") + spdf<-joinCountryData2Map(dd,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),] + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette='heat', mapTitle="",addLegend=F) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + if (output){ + pdf(file="countrycounts.pdf") + mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=seq(10,100,10),missingCountryCol='grey',borderCol='black',oceanCol="white",colourPalette="heat", mapTitle="",addLegend=F) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Counts",catMethod=as.numeric(levels(as.factor(spdf$Counts))),missingCountryCol='grey',borderCol='black',oceanCol="lightblue",colourPalette='heat', mapTitle="",addLegend=F) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendShrink=0.5,legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,1,-10),'-',seq(100,11,-10)), 'NA'), fill = c(heat.colors(9, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + dev.off() + } +} + +PlotYearDistribution <- function(df, output=F){ + df$Year<-as.numeric(as.character(df$Year)) + g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram(breaks=seq(1895, 2015, by = 1)) + #g = ggplot(df,aes(x=Year,y=..count..))+geom_bar()+geom_density(alpha=.3, fill="grey") + #g = g+scale_x_continuous(breaks = pretty(df$Year, n=10)) + g = g+theme_bw()+labs(x ='Year', y ='Count') + #g = ggplot(df,aes(x=Year,y=..count..))+geom_histogram()+theme_bw() + print(g) + if (output){ + ggsave('yeardistribution.pdf',plot=g) + } +} + +PlotCountryDistribution <- function(df){ + #countrycounts = table(df$Country) + #dd=data.frame(countrycounts) + #names(dd)=c("Country","Counts") + g = ggplot(df,aes(x=Country))+geom_bar() + g=g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +#PlotCultureDistribution <- function(df){ +# g = ggplot(df,aes(x=Culture))+geom_bar() +# g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +#} + +PlotLanguageDistribution <- function(df, mincount=1){ + dfsubset <- subset(df, df$Language!="") # ignore the recordings culture info + culturecounts = table(dfsubset$Culture) + culturecounts = culturecounts[culturecounts>=mincount] + barplot(culturecounts, las=2, cex.names=0.2) + #g = ggplot(df,aes(x=Language))+geom_bar() + #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotBarForCategory <- function(df, cat="Language", mincount=1){ + idx_cat = which(colnames(df)==cat) + dfsubset <- subset(df, df[,idx_cat]!="") + counts = table(dfsubset[,idx_cat]) + counts = counts[counts>=mincount] + barplot(counts, las=2, cex.names=0.2) + #g = ggplot(df,aes(x=Language))+geom_bar() + #g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotCountryYearCutoff <- function(df, cutoffyear=1960){ + df$BeforeYear = df$Year1000",row.names(aa)[aa[,dim(aa)[2]]>1000])) + print(paste(">500",row.names(aa)[aa[,dim(aa)[2]]>500])) + print(paste(">100",row.names(aa)[aa[,dim(aa)[2]]>100])) + inds = which(aa[,dim(aa)[2]]>500 & row.names(aa)!="Total") + mosaicplot(aa[inds,1:5]) + dd=data.frame(countrycounts[inds,]) + dd=data.frame(countrycounts) + names(dd)=c("Country","Culture","Counts") + ddsub <- subset(dd, (Country=="Canada"| Country=="United Kingdom" | Country=="United States of America") & Counts>20) + g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +} + +PlotCountryCultureNcounts <- function(df,mincount=50){ + dfsubset <- subset(df, df$Culture!="") + #culturecounts = table(dfsubset$Culture) + countrycounts = table(dfsubset$Country, dfsubset$Culture) + dd=data.frame(countrycounts) + names(dd)=c("Country","Culture","Counts") + ddsub <- subset(dd, Counts>mincount) + g=ggplot(ddsub, aes(x=Culture,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount)) +} + +PlotCountryLanguageNcounts <- function(df,mincount=50){ + dfsubset <- subset(df, df$Language!="" & df$Language!=" ") + countrycounts = table(dfsubset$Country, dfsubset$Language) + dd=data.frame(countrycounts) + names(dd)=c("Country","Language","Counts") + ddsub <- subset(dd, Counts>mincount) + g=ggplot(ddsub, aes(x=Language,y=Counts))+geom_point()+facet_wrap(~Country, scales = "free") + g+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))#+title(main=paste("Cultures per country, count>",mincount)) +} + +PlotNxNcounts <- function(df, cat1="Country", cat2="Culture", mincount=50, figname=""){ + indcat1 = which(colnames(df)==cat1) + indcat2 = which(colnames(df)==cat2) + dfsubset <- subset(df, df[,indcat1]!="" & df[,indcat2]!="") # avoid nan values + NNcounts <- table(dfsubset[,indcat1], dfsubset[,indcat2]) + dd=data.frame(NNcounts) + names(dd) <- c("Cat1","Cat2","Counts") + ddsub <- subset(dd, Counts>mincount) + if (figname==""){ + g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1) + g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2) + }else{ + g=ggplot(ddsub, aes(x=Cat2,y=Counts))+geom_point()+facet_wrap(~Cat1) + g+coord_flip()+theme(axis.text.y=element_text(hjust=1,vjust=0.5,size=5))+labs(y="Counts",x=cat2) + ggsave(figname) + } +} + +Wordcloud<- function(df, cat="Language", output=F){ + require(wordcloud) + require(RColorBrewer) + ind_cat = which(colnames(df)==cat) + counts <- table(df[,ind_cat]) + dd=data.frame(counts) + names(dd) <- c("words","freq") + pal2 <- brewer.pal(8,"Dark2") + wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2) + if (output){ + pdf("wordcloud.pdf") + wordcloud(dd$words,dd$freq,random.order=FALSE, colors=pal2) + dev.off() + } +} +PlotCountryOutliers <- function(df, output=''){ + par(mar = rep(2, 4)) + spdf<-joinCountryData2Map(df,joinCode="NAME",nameCountryColumn="Country",nameJoinColumn="Country") + spdf<-spdf[-which(spdf$ADMIN=='Antarctica'),] + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + # avoid antarctica + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + if (output!=''){ + pdf(output) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers",catMethod=seq(0,70,5),missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #mapParams <- mapCountryData(spdf, nameColumnToPlot="Outliers", ylim=c(-60,90), catMethod=seq(0,1,0.1), missingCountryCol='grey',colourPalette='heat', mapTitle="", addLegend=FALSE) + #do.call( addMapLegend, c(mapParams, labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar=7, legendLabels="all",horizontal=T, legendIntervals="page")) + legend("left", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") + dev.off() + } + else { + return(mapParams) + } +} \ No newline at end of file diff -r 02faad4a996b -r d17833be50ca scripts_R/Metadata_subsetBLSM.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts_R/Metadata_subsetBLSM.R Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,168 @@ +#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/df_BLSM.csv",header=TRUE) +#df = read.csv("data/df_subset_remove.csv",header=TRUE) +#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/CodeForBL/data/metadataBL_new.csv",header=TRUE) +#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/metadata_BLSM.csv",header=TRUE) +#df = df[1:29182,] # BL data +df = read.csv('data/df_and_clusters.csv', header=T) + +source("MetadataPlots.R") + +## for plos use arial +#install.packages("extrafont") +library(extrafont) +font_import() +loadfonts() +Arial <- Type1Font(family="Arial", metrics=c("ArialMT.afm","arial-BoldMT.afm","Arial-ItalicMT.afm", "Arial-BoldItalicMT.afm")) +postscriptFonts(Arial=Arial) +par(family="Arial") + +#pdf(file="data/country_distribution_BL.pdf") +pdf(file="data/country_distribution.pdf") +PlotCountryCounts(df) +dev.off() +postscript(file="data/country_distribution.eps") +PlotCountryCounts(df) +dev.off() + +pdf(file="data/year_distribution.pdf", width=6, height=4) +PlotYearDistribution(df) +dev.off() +postscript("data/year_distribution.eps", width=10) +PlotYearDistribution(df) +dev.off() +#PlotBarChart(df, cat="Year", ordercat="REGION", mincount=10) + +#pdf(file="data/language_distribution_BL.pdf") +levels(df$Language)[which(levels(df$Language)=="Southwestern Caribbean Creole English")]="SouthW Carib. Creole English" +df$Language[which(df$Language=="Southwestern Caribbean Creole English")] = "SouthW Carib. Creole English" +levels(df$Language)[which(levels(df$Language)=="Lesser Antillean Creole French")]="Lesser Antil. Creole French" +df$Language[which(df$Language=="Lesser Antillean Creole French")] = "Lesser Antil. Creole French" +df$REGION[which(df$Country=="French Guiana")] = "South America" +pdf(file="data/language_distribution.pdf") +PlotBarChart(df, cat="Language", ordercat="Region", mincount=10) +dev.off() +postscript("data/language_distribution.eps", width=8, height=10) +PlotBarChart(df, cat="Language", ordercat="Region", mincount=10) +dev.off() + +#language phylogeny +df = read.csv('data/metadata_BLSM_language.csv', header=T) +pdf(file="data/language_iso3_iso1.pdf") +PlotBarChart(df, cat="Language_iso3", ordercat="Language_iso1", mincount=10) +dev.off() + +# PlotCountryCounts(df) +# PlotCountryCultureNcounts(df, mincount=20) +# PlotCountryLanguageNcounts(df, mincount=20) +# PlotYearDistribution(df) +# PlotLanguageDistribution(df) +# PlotCultureDistribution(df) +# PlotNxNcounts(df, cat1="Country", cat2="Genre_Album", mincount=20) + +df = read.csv('data/df_and_clusters.csv', header=T) +#PlotBarChart(df, cat="Clusters", ordercat="CountryLang", mincount=1,legend=F) +df$REGION[which(df$Country=="French Guiana")] = "South America" +g = ggplot(df,aes(df$Clusters, fill=df$REGION))+geom_bar() +levels(df$REGION)[which(levels(df$REGION)=="South America")]="S. America" +levels(df$REGION)[which(levels(df$REGION)=="North America")]="N. America" + +#library(rworldmap) +#wrld = getMap() +#regiondata<-wrld@data[,c("ADMIN","GEO3", "Stern")] +#df<-merge(df,regiondata,by.x="Country",by.y="ADMIN",all.x=T) + +#cluster_labels_df = read.csv('data/clusters_top3_labels.csv') +cluster_labels_df = read.csv('data/clusters_top3_countries.csv') +cluster_labels = paste(cluster_labels_df[,1],cluster_labels_df[,2],cluster_labels_df[,3],sep="") +#df$CountryLang = as.factor(paste(df$Country, df$Language, sep="-")) + +countrycounts = table(df$Clusters,df$Country) +library(cluster) +library(ape) +library(gridExtra) +library(ggdendro) +library(dendextend) +hc = hclust(dist(countrycounts), method="average") +hc2=hc +#hc2$labels = as.character(1:length(cluster_labels)) +hc2$labels = "" +#dhc <- as.dendrogram(hc2) +# library(dynamicTreeCut) +# clusters <- cutreeDynamic(hc2, minClusterSize = k_clust,method = "tree") +# clusters <- clusters[order.dendrogram(dhc)] +# clusters_numbers <- unique(clusters) - (0 %in% clusters) +# n_clusters <- length(clusters_numbers) +# library(colorspace) +# cols <- rainbow_hcl(n_clusters) +# dhc <- hc2 %>% as.dendrogram %>% +# set("branches_k_color", k=k_clust) %>% branches_attr_by_clusters(clusters, values = cols) +k_clust = 5 +dhc <- hc2 %>% as.dendrogram %>% + set("branches_k_color", k=k_clust) %>% set("branches_lwd", 0.7) %>% + set("labels_cex", 0.6) %>% set("labels_colors", k=k_clust) %>% + set("leaves_pch", 19) %>% set("leaves_cex", 0.5) +#ddata <- dendro_data(dhc, type = "rectangle") +ddata <- as.ggdend(dhc) +p <- ggplot(ddata)+coord_flip() +#p <- ggplot(segment(ddata)) + +# geom_segment(aes(x = x, y = y, xend = xend, yend = yend, colour=ddata$segments$col)) + +# coord_flip() + theme_dendro() + theme(legend.position="none") + +# geom_text(aes(x = x, y = y, label = label, angle = -90, hjust = 0.5, vjust=1.3, colour=ddata$labels$col), data= label(ddata)) + +#dend <- hc2 %>% as.dendrogram %>% +# set("branches_k_color", k = 5) %>% set("branches_lwd", 0.7) %>% +# set("labels_cex", 0.6) %>% set("labels_colors", k = 5) %>% +# set("leaves_pch", 19) %>% set("leaves_cex", 0.5) +#ggd1 <- as.ggdend(dend) +#pp <- ggplot(ggd1, horiz = TRUE) + +library(stringr) +for (i in 1:length(cluster_labels)){ + cl = cluster_labels[i] + cl = str_replace_all(cl, "[(']", "") + cl = str_replace_all(cl, "[|]", "-") + cl = str_replace_all(cl, ", ", " (") + cl = str_replace_all(cl, "[)]", "), ") + cl = str_replace_all(cl, "nan", "NA") + #cl = paste(cl, "cluster",i) + cluster_labels[i] = cl +} +#cluster_idx = paste("cluster",1:length(cluster_labels)) +#df$Clusters = as.factor(df$Clusters) +df$Clusters = factor(x=df$Clusters,levels=hc$labels[hc$order]) +cluster_labels = cluster_labels[hc$order] +#g = ggplot(df,aes(as.factor(df$Clusters), fill=df$CountryLang))+geom_bar() +#g = ggplot(df,aes(Clusters, fill=REGION))+geom_bar()+facet_grid(~REGION,space="free",scales="free")#,scales="free") +g = ggplot(df,aes(as.factor(df$Clusters), fill=df$Region))+geom_bar() +#g = ggplot(df,aes(as.factor(df$Clusters), fill=df$REGION))+geom_bar() +g = g+scale_x_discrete(labels=cluster_labels) +#g = g+scale_y_continuous(position="right") +#g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey() +g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey() +#g = g+labs(y="Counts", x="Top 3 country-language tags in each cluster")+coord_flip()+theme_bw()#+guides(fill="none") +g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none") +#g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none") +#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position=c(.9,.8),legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=10),legend.text = element_text(size=10)) +#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="left",legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=9),legend.text = element_text(size=9)) +g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="top",legend.title = element_text(size=9),legend.text = element_text(size=9)) +g = g+theme(panel.border = element_rect(colour = "white"),strip.background=element_rect(fill="white"),strip.text.x = element_blank()) +#g = g+theme(axis.text.y = element_text(colour = ddata$labels$col)) +ggsave('data/clusters_top3.pdf',plot=g) +ggsave('data/clusters_top3.eps',plot=g) + +#g_legend<-function(a.gplot){ +# tmp <- ggplot_gtable(ggplot_build(a.gplot)) +# leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") +# legend <- tmp$grobs[[leg]] +# return(legend)} +#mylegend<-g_legend(g) + +#pdf(file="data/clusters_top3_hclust.pdf", width=12, height=5) +#grid.arrange(arrangeGrob(g + theme(legend.position="none"),p + theme(legend.position="none"),nrow=1, widths=c(4,1)),mylegend, nrow=2,heights=c(10, 1)) +#dev.off() + +#grid.arrange(arrangeGrob(g,p,nrow=1, ncol=2)) +#ggsave('data/clusters_top3_hclust.pdf',plot=g_comb) +#g=g+annotate(x=20, y=1:18, label=cluster_idx)+geom_text(aes(x=20,y=1:18,label=cluster_idx)) +#+guides(fill = guide_legend(title = "Region")) +#grid.draw(cbind(ggplotGrob(g), ggplotGrob(pp), size = "last")) diff -r 02faad4a996b -r d17833be50ca scripts_R/PlotOutliersCountry.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts_R/PlotOutliersCountry.R Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,19 @@ +source("MetadataPlots.R") + +PlotCountryOutliers(df=read.csv("data/global_outliers.csv",header=TRUE), output="data/global_outliers.pdf") +PlotCountryOutliers(df=read.csv("data/global_outliers_rhy.csv",header=TRUE), output="data/global_outliers_rhy.pdf") +PlotCountryOutliers(df=read.csv("data/global_outliers_mel.csv",header=TRUE), output="data/global_outliers_mel.pdf") +PlotCountryOutliers(df=read.csv("data/global_outliers_mfc.csv",header=TRUE), output="data/global_outliers_mfc.pdf") +PlotCountryOutliers(df=read.csv("data/global_outliers_chr.csv",header=TRUE), output="data/global_outliers_chr.pdf") +PlotCountryOutliers(df=read.csv("data/spatial_outliers.csv",header=TRUE), output="data/spatial_outliers.pdf") +#PlotCountryOutliers(df=read.csv("data/global_outliers_rhy_1band.csv",header=TRUE)) + +require(graphics) +par(mfrow=c(2,2)) +g1<-PlotCountryOutliers(df=read.csv("data/global_outliers_rhy.csv",header=TRUE)) +g2<-PlotCountryOutliers(df=read.csv("data/global_outliers_mel.csv",header=TRUE)) +g3<-PlotCountryOutliers(df=read.csv("data/global_outliers_mfc.csv",header=TRUE)) +g4<-PlotCountryOutliers(df=read.csv("data/global_outliers_chr.csv",header=TRUE)) +#do.call(addMapLegend, c(g3,labelFontSize=0.7, legendWidth=0.5, tcl=0.3, legendMar = 7, legendLabels="all",horizontal=T, legendIntervals="page")) +#legend("bottomleft", legend = c(paste(seq(100,1,-10),'%'), 'missing countries'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "n") +legend("right", legend = c(paste(seq(90,0,-10),'-',seq(100,10,-10),'%'), 'NA'), fill = c(heat.colors(10, alpha = 1), 'grey'), cex = 0.56, bty = "o",bg="white",box.lwd=0,box.col="white") diff -r 02faad4a996b -r d17833be50ca scripts_R/radial_dendro.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts_R/radial_dendro.R Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,14 @@ +library(ape) +library(cluster) + +df = read.csv("data/cluster_freq.csv") +data = df[,2:dim(df)[2]] +rownames(data) <- df$labels +distMahal = as.dist(apply(data, 1, function(i) mahalanobis(data, i, cov = cov(data),tol=1e-18))) +hc=hclust(distMahal, method="average") +mypal = c("#000000", "#9B0000", "#9B0000", "#9B0000", "#9B0000") +clus5 = cutree(hc, 5) +pdf('data/hierarchical_cluster_R.pdf') +par(mar=c(1,1,1,1)) +plot(as.phylo(hc),type="fan",tip.color=mypal[clus5], cex=.5, label.offset=.5) +dev.off()