Mercurial > hg > plosone_underreview
view scripts_R/Metadata_subsetBLSM.R @ 70:cc028157502a branch-tests
scripts R
author | Maria Panteli |
---|---|
date | Fri, 22 Sep 2017 16:29:32 +0100 |
parents | |
children | bde45ce0eeab |
line wrap: on
line source
#df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/df_BLSM.csv",header=TRUE) #df = read.csv("data/df_subset_remove.csv",header=TRUE) #df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/CodeForBL/data/metadataBL_new.csv",header=TRUE) #df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/metadata_BLSM.csv",header=TRUE) #df = df[1:29182,] # BL data df = read.csv('data/df_and_clusters.csv', header=T) source("MetadataPlots.R") ## for plos use arial #install.packages("extrafont") library(extrafont) font_import() loadfonts() Arial <- Type1Font(family="Arial", metrics=c("ArialMT.afm","arial-BoldMT.afm","Arial-ItalicMT.afm", "Arial-BoldItalicMT.afm")) postscriptFonts(Arial=Arial) par(family="Arial") #pdf(file="data/country_distribution_BL.pdf") pdf(file="data/country_distribution.pdf") PlotCountryCounts(df) dev.off() postscript(file="data/country_distribution.eps") PlotCountryCounts(df) dev.off() pdf(file="data/year_distribution.pdf", width=6, height=4) PlotYearDistribution(df) dev.off() postscript("data/year_distribution.eps", width=10) PlotYearDistribution(df) dev.off() #PlotBarChart(df, cat="Year", ordercat="REGION", mincount=10) #pdf(file="data/language_distribution_BL.pdf") levels(df$Language)[which(levels(df$Language)=="Southwestern Caribbean Creole English")]="SouthW Carib. Creole English" df$Language[which(df$Language=="Southwestern Caribbean Creole English")] = "SouthW Carib. Creole English" levels(df$Language)[which(levels(df$Language)=="Lesser Antillean Creole French")]="Lesser Antil. Creole French" df$Language[which(df$Language=="Lesser Antillean Creole French")] = "Lesser Antil. Creole French" df$REGION[which(df$Country=="French Guiana")] = "South America" pdf(file="data/language_distribution.pdf") PlotBarChart(df, cat="Language", ordercat="Region", mincount=10) dev.off() postscript("data/language_distribution.eps", width=8, height=10) PlotBarChart(df, cat="Language", ordercat="Region", mincount=10) dev.off() #language phylogeny df = read.csv('data/metadata_BLSM_language.csv', header=T) pdf(file="data/language_iso3_iso1.pdf") PlotBarChart(df, cat="Language_iso3", ordercat="Language_iso1", mincount=10) dev.off() # PlotCountryCounts(df) # PlotCountryCultureNcounts(df, mincount=20) # PlotCountryLanguageNcounts(df, mincount=20) # PlotYearDistribution(df) # PlotLanguageDistribution(df) # PlotCultureDistribution(df) # PlotNxNcounts(df, cat1="Country", cat2="Genre_Album", mincount=20) df = read.csv('data/df_and_clusters.csv', header=T) #PlotBarChart(df, cat="Clusters", ordercat="CountryLang", mincount=1,legend=F) df$REGION[which(df$Country=="French Guiana")] = "South America" g = ggplot(df,aes(df$Clusters, fill=df$REGION))+geom_bar() levels(df$REGION)[which(levels(df$REGION)=="South America")]="S. America" levels(df$REGION)[which(levels(df$REGION)=="North America")]="N. America" #library(rworldmap) #wrld = getMap() #regiondata<-wrld@data[,c("ADMIN","GEO3", "Stern")] #df<-merge(df,regiondata,by.x="Country",by.y="ADMIN",all.x=T) #cluster_labels_df = read.csv('data/clusters_top3_labels.csv') cluster_labels_df = read.csv('data/clusters_top3_countries.csv') cluster_labels = paste(cluster_labels_df[,1],cluster_labels_df[,2],cluster_labels_df[,3],sep="") #df$CountryLang = as.factor(paste(df$Country, df$Language, sep="-")) countrycounts = table(df$Clusters,df$Country) library(cluster) library(ape) library(gridExtra) library(ggdendro) library(dendextend) hc = hclust(dist(countrycounts), method="average") hc2=hc #hc2$labels = as.character(1:length(cluster_labels)) hc2$labels = "" #dhc <- as.dendrogram(hc2) # library(dynamicTreeCut) # clusters <- cutreeDynamic(hc2, minClusterSize = k_clust,method = "tree") # clusters <- clusters[order.dendrogram(dhc)] # clusters_numbers <- unique(clusters) - (0 %in% clusters) # n_clusters <- length(clusters_numbers) # library(colorspace) # cols <- rainbow_hcl(n_clusters) # dhc <- hc2 %>% as.dendrogram %>% # set("branches_k_color", k=k_clust) %>% branches_attr_by_clusters(clusters, values = cols) k_clust = 5 dhc <- hc2 %>% as.dendrogram %>% set("branches_k_color", k=k_clust) %>% set("branches_lwd", 0.7) %>% set("labels_cex", 0.6) %>% set("labels_colors", k=k_clust) %>% set("leaves_pch", 19) %>% set("leaves_cex", 0.5) #ddata <- dendro_data(dhc, type = "rectangle") ddata <- as.ggdend(dhc) p <- ggplot(ddata)+coord_flip() #p <- ggplot(segment(ddata)) + # geom_segment(aes(x = x, y = y, xend = xend, yend = yend, colour=ddata$segments$col)) + # coord_flip() + theme_dendro() + theme(legend.position="none") + # geom_text(aes(x = x, y = y, label = label, angle = -90, hjust = 0.5, vjust=1.3, colour=ddata$labels$col), data= label(ddata)) #dend <- hc2 %>% as.dendrogram %>% # set("branches_k_color", k = 5) %>% set("branches_lwd", 0.7) %>% # set("labels_cex", 0.6) %>% set("labels_colors", k = 5) %>% # set("leaves_pch", 19) %>% set("leaves_cex", 0.5) #ggd1 <- as.ggdend(dend) #pp <- ggplot(ggd1, horiz = TRUE) library(stringr) for (i in 1:length(cluster_labels)){ cl = cluster_labels[i] cl = str_replace_all(cl, "[(']", "") cl = str_replace_all(cl, "[|]", "-") cl = str_replace_all(cl, ", ", " (") cl = str_replace_all(cl, "[)]", "), ") cl = str_replace_all(cl, "nan", "NA") #cl = paste(cl, "cluster",i) cluster_labels[i] = cl } #cluster_idx = paste("cluster",1:length(cluster_labels)) #df$Clusters = as.factor(df$Clusters) df$Clusters = factor(x=df$Clusters,levels=hc$labels[hc$order]) cluster_labels = cluster_labels[hc$order] #g = ggplot(df,aes(as.factor(df$Clusters), fill=df$CountryLang))+geom_bar() #g = ggplot(df,aes(Clusters, fill=REGION))+geom_bar()+facet_grid(~REGION,space="free",scales="free")#,scales="free") g = ggplot(df,aes(as.factor(df$Clusters), fill=df$Region))+geom_bar() #g = ggplot(df,aes(as.factor(df$Clusters), fill=df$REGION))+geom_bar() g = g+scale_x_discrete(labels=cluster_labels) #g = g+scale_y_continuous(position="right") #g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey() g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey() #g = g+labs(y="Counts", x="Top 3 country-language tags in each cluster")+coord_flip()+theme_bw()#+guides(fill="none") g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none") #g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none") #g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position=c(.9,.8),legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=10),legend.text = element_text(size=10)) #g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="left",legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=9),legend.text = element_text(size=9)) g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="top",legend.title = element_text(size=9),legend.text = element_text(size=9)) g = g+theme(panel.border = element_rect(colour = "white"),strip.background=element_rect(fill="white"),strip.text.x = element_blank()) #g = g+theme(axis.text.y = element_text(colour = ddata$labels$col)) ggsave('data/clusters_top3.pdf',plot=g) ggsave('data/clusters_top3.eps',plot=g) #g_legend<-function(a.gplot){ # tmp <- ggplot_gtable(ggplot_build(a.gplot)) # leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") # legend <- tmp$grobs[[leg]] # return(legend)} #mylegend<-g_legend(g) #pdf(file="data/clusters_top3_hclust.pdf", width=12, height=5) #grid.arrange(arrangeGrob(g + theme(legend.position="none"),p + theme(legend.position="none"),nrow=1, widths=c(4,1)),mylegend, nrow=2,heights=c(10, 1)) #dev.off() #grid.arrange(arrangeGrob(g,p,nrow=1, ncol=2)) #ggsave('data/clusters_top3_hclust.pdf',plot=g_comb) #g=g+annotate(x=20, y=1:18, label=cluster_idx)+geom_text(aes(x=20,y=1:18,label=cluster_idx)) #+guides(fill = guide_legend(title = "Region")) #grid.draw(cbind(ggplotGrob(g), ggplotGrob(pp), size = "last"))