annotate scripts_R/Metadata_subsetBLSM.R @ 70:cc028157502a branch-tests

scripts R
author Maria Panteli
date Fri, 22 Sep 2017 16:29:32 +0100
parents
children bde45ce0eeab
rev   line source
Maria@70 1 #df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/df_BLSM.csv",header=TRUE)
Maria@70 2 #df = read.csv("data/df_subset_remove.csv",header=TRUE)
Maria@70 3 #df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/CodeForBL/data/metadataBL_new.csv",header=TRUE)
Maria@70 4 #df = read.csv("/Users/mariapanteli/Documents/2014-2015/Python/pythoncode/MergeBL-Smith/data/metadata_BLSM.csv",header=TRUE)
Maria@70 5 #df = df[1:29182,] # BL data
# Load the data frame that the plots below are drawn from.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
df <- read.csv("data/df_and_clusters.csv", header = TRUE)

# The plotting helpers called below (PlotCountryCounts, PlotYearDistribution,
# PlotBarChart) are not defined in this file; presumably they come from here.
source("MetadataPlots.R")
Maria@70 9
## For PLOS figures use Arial.
#install.packages("extrafont")
library(extrafont)
# font_import() scans all system fonts (slow) and by default asks for
# confirmation before importing, so it is meant to be run once per machine.
# Skip it when Arial is already in extrafont's font table, and disable the
# prompt so the import also works when the script is run unattended.
if (!("Arial" %in% fonts())) {
  font_import(prompt = FALSE)
}
loadfonts()
# Register an "Arial" family for the PostScript device from explicit AFM
# metric files (regular, bold, italic, bold-italic).
# NOTE(review): "arial-BoldMT.afm" is lower-case unlike the other three file
# names -- confirm it matches the file on disk.
Arial <- Type1Font(
  family = "Arial",
  metrics = c("ArialMT.afm", "arial-BoldMT.afm", "Arial-ItalicMT.afm",
              "Arial-BoldItalicMT.afm")
)
postscriptFonts(Arial = Arial)
# NOTE(review): graphical parameters are per device, and no device is open
# yet, so this call opens a default device and sets the family there; it
# probably does not carry over to the pdf()/postscript() devices opened
# later. Left unchanged -- confirm whether Arial is actually wanted in the
# saved figures before moving this call after each device is opened.
par(family = "Arial")
Maria@70 18
Maria@70 19 #pdf(file="data/country_distribution_BL.pdf")
# Country distribution, written twice: once as PDF, once as EPS.
# The plot calls are kept at top level on purpose: if the helper returns a
# plot object instead of drawing it, only a top-level call gets auto-printed.
pdf(file = "data/country_distribution.pdf")
PlotCountryCounts(df)
dev.off()

postscript(file = "data/country_distribution.eps")
PlotCountryCounts(df)
dev.off()

# Year distribution, again as PDF and EPS (the two devices use different
# page sizes).
pdf(file = "data/year_distribution.pdf", width = 6, height = 4)
PlotYearDistribution(df)
dev.off()

postscript(file = "data/year_distribution.eps", width = 10)
PlotYearDistribution(df)
dev.off()
Maria@70 33 #PlotBarChart(df, cat="Year", ordercat="REGION", mincount=10)
Maria@70 34
Maria@70 35 #pdf(file="data/language_distribution_BL.pdf")
# Shorten two long language names so they fit on the plot axis.
# Each name is replaced both in the factor levels (takes effect when
# Language is a factor) and in the values themselves (takes effect when it
# is a character column).
short_language_names <- c(
  "Southwestern Caribbean Creole English" = "SouthW Carib. Creole English",
  "Lesser Antillean Creole French" = "Lesser Antil. Creole French"
)
for (full_name in names(short_language_names)) {
  abbreviated <- short_language_names[[full_name]]
  levels(df$Language)[which(levels(df$Language) == full_name)] <- abbreviated
  df$Language[which(df$Language == full_name)] <- abbreviated
}

# Re-assign French Guiana to South America.
# NOTE(review): this edits the REGION column, while the plots below order by
# "Region" -- confirm which column is intended.
df$REGION[which(df$Country == "French Guiana")] <- "South America"

# Language distribution as PDF and EPS. Plot calls stay at top level so that
# a returned plot object is still auto-printed.
pdf(file = "data/language_distribution.pdf")
PlotBarChart(df, cat = "Language", ordercat = "Region", mincount = 10)
dev.off()

postscript(file = "data/language_distribution.eps", width = 8, height = 10)
PlotBarChart(df, cat = "Language", ordercat = "Region", mincount = 10)
dev.off()
Maria@70 47
# Language phylogeny: bar chart of Language_iso3 ordered by Language_iso1.
# Note that `df` is replaced here by a different metadata table.
df <- read.csv("data/metadata_BLSM_language.csv", header = TRUE)
pdf(file = "data/language_iso3_iso1.pdf")
PlotBarChart(df, cat = "Language_iso3", ordercat = "Language_iso1", mincount = 10)
dev.off()
Maria@70 53
Maria@70 54 # PlotCountryCounts(df)
Maria@70 55 # PlotCountryCultureNcounts(df, mincount=20)
Maria@70 56 # PlotCountryLanguageNcounts(df, mincount=20)
Maria@70 57 # PlotYearDistribution(df)
Maria@70 58 # PlotLanguageDistribution(df)
Maria@70 59 # PlotCultureDistribution(df)
Maria@70 60 # PlotNxNcounts(df, cat1="Country", cat2="Genre_Album", mincount=20)
Maria@70 61
# Reload the clustered data; `df` was overwritten by the language table.
df <- read.csv("data/df_and_clusters.csv", header = TRUE)
#PlotBarChart(df, cat="Clusters", ordercat="CountryLang", mincount=1,legend=F)

# Re-assign French Guiana to South America.
df$REGION[which(df$Country == "French Guiana")] <- "South America"

# A ggplot object `g` used to be built here from Clusters/REGION, but it was
# never drawn or saved and `g` is rebuilt from scratch further down, so that
# dead assignment has been removed.

# Abbreviate two REGION level names (only takes effect when REGION is a
# factor).
# NOTE(review): the bar chart built later fills by `Region`, not `REGION`, so
# these REGION edits may not reach the saved figure -- confirm.
levels(df$REGION)[which(levels(df$REGION) == "South America")] <- "S. America"
levels(df$REGION)[which(levels(df$REGION) == "North America")] <- "N. America"
Maria@70 68
Maria@70 69 #library(rworldmap)
Maria@70 70 #wrld = getMap()
Maria@70 71 #regiondata<-wrld@data[,c("ADMIN","GEO3", "Stern")]
Maria@70 72 #df<-merge(df,regiondata,by.x="Country",by.y="ADMIN",all.x=T)
Maria@70 73
Maria@70 74 #cluster_labels_df = read.csv('data/clusters_top3_labels.csv')
# Build one label string per row of the CSV by concatenating its first three
# columns with no separator (presumably one row per cluster -- confirm
# against the file).
cluster_labels_df <- read.csv("data/clusters_top3_countries.csv")
cluster_labels <- paste0(
  cluster_labels_df[, 1],
  cluster_labels_df[, 2],
  cluster_labels_df[, 3]
)
#df$CountryLang = as.factor(paste(df$Country, df$Language, sep="-"))

# Contingency table of counts: one row per cluster, one column per country.
countrycounts <- table(df$Clusters, df$Country)
Maria@70 80 library(cluster)
Maria@70 81 library(ape)
Maria@70 82 library(gridExtra)
Maria@70 83 library(ggdendro)
Maria@70 84 library(dendextend)
# Average-linkage hierarchical clustering of the rows of `countrycounts`
# (one row per cluster), using dist()'s default distance.
hc <- hclust(dist(countrycounts), method = "average")

# Work on a copy for drawing; `hc` keeps its labels, which are needed later
# to reorder the clusters.
hc2 <- hc
#hc2$labels = as.character(1:length(cluster_labels))
# NOTE(review): this assigns a single empty string rather than one per leaf;
# confirm the dendrogram leaves end up blank as intended.
hc2$labels <- ""
#dhc <- as.dendrogram(hc2)
# library(dynamicTreeCut)
# clusters <- cutreeDynamic(hc2, minClusterSize = k_clust,method = "tree")
# clusters <- clusters[order.dendrogram(dhc)]
# clusters_numbers <- unique(clusters) - (0 %in% clusters)
# n_clusters <- length(clusters_numbers)
# library(colorspace)
# cols <- rainbow_hcl(n_clusters)
# dhc <- hc2 %>% as.dendrogram %>%
# set("branches_k_color", k=k_clust) %>% branches_attr_by_clusters(clusters, values = cols)

# Number of groups used to colour branches and labels.
k_clust <- 5

# Style the dendrogram one attribute at a time (same calls, in the same
# order, as the equivalent pipe chain).
dhc <- as.dendrogram(hc2)
dhc <- set(dhc, "branches_k_color", k = k_clust)
dhc <- set(dhc, "branches_lwd", 0.7)
dhc <- set(dhc, "labels_cex", 0.6)
dhc <- set(dhc, "labels_colors", k = k_clust)
dhc <- set(dhc, "leaves_pch", 19)
dhc <- set(dhc, "leaves_cex", 0.5)

#ddata <- dendro_data(dhc, type = "rectangle")
# Convert to a ggplot-compatible object and build a flipped dendrogram plot.
ddata <- as.ggdend(dhc)
p <- ggplot(ddata) + coord_flip()
Maria@70 107 #p <- ggplot(segment(ddata)) +
Maria@70 108 # geom_segment(aes(x = x, y = y, xend = xend, yend = yend, colour=ddata$segments$col)) +
Maria@70 109 # coord_flip() + theme_dendro() + theme(legend.position="none") +
Maria@70 110 # geom_text(aes(x = x, y = y, label = label, angle = -90, hjust = 0.5, vjust=1.3, colour=ddata$labels$col), data= label(ddata))
Maria@70 111
Maria@70 112 #dend <- hc2 %>% as.dendrogram %>%
Maria@70 113 # set("branches_k_color", k = 5) %>% set("branches_lwd", 0.7) %>%
Maria@70 114 # set("labels_cex", 0.6) %>% set("labels_colors", k = 5) %>%
Maria@70 115 # set("leaves_pch", 19) %>% set("leaves_cex", 0.5)
Maria@70 116 #ggd1 <- as.ggdend(dend)
Maria@70 117 #pp <- ggplot(ggd1, horiz = TRUE)
Maria@70 118
library(stringr)

# Tidy the raw label strings. str_replace_all() is vectorised over its
# input, so the whole vector is processed at once; this also avoids the
# `1:length(x)` pitfall, which iterates over c(1, 0) when `x` is empty.
# The substitutions are applied in this order:
#   "(" and "'"  -> removed
#   "|"          -> "-"
#   ", "         -> " ("
#   ")"          -> "), "
#   "nan"        -> "NA"   (unanchored: also matches inside longer words)
cluster_labels <- str_replace_all(cluster_labels, "[(']", "")
cluster_labels <- str_replace_all(cluster_labels, "[|]", "-")
cluster_labels <- str_replace_all(cluster_labels, ", ", " (")
cluster_labels <- str_replace_all(cluster_labels, "[)]", "), ")
cluster_labels <- str_replace_all(cluster_labels, "nan", "NA")
#cluster_labels = paste(cluster_labels, "cluster", seq_along(cluster_labels))
#cluster_idx = paste("cluster",1:length(cluster_labels))
#df$Clusters = as.factor(df$Clusters)

# Put the clusters in dendrogram leaf order: the factor levels follow
# hc$order, and the labels are permuted the same way.
# NOTE(review): this assumes the i-th label belongs to hc$labels[i] --
# confirm the CSV row order matches the row order of the count table.
df$Clusters <- factor(x = df$Clusters, levels = hc$labels[hc$order])
cluster_labels <- cluster_labels[hc$order]
Maria@70 134 #g = ggplot(df,aes(as.factor(df$Clusters), fill=df$CountryLang))+geom_bar()
Maria@70 135 #g = ggplot(df,aes(Clusters, fill=REGION))+geom_bar()+facet_grid(~REGION,space="free",scales="free")#,scales="free")
# Horizontal stacked bar chart: one bar per cluster (already a factor in
# dendrogram order), filled by Region, with the cleaned-up strings as axis
# tick labels.
# Columns are referenced by bare name inside aes() so they are looked up in
# `df`; using `df$col` inside aes() is discouraged by ggplot2.
#
# Alternatives kept from earlier iterations:
#g = ggplot(df,aes(as.factor(df$Clusters), fill=df$REGION))+geom_bar()
#g = g+scale_y_continuous(position="right")
#g = g+scale_fill_brewer(palette="Paired")#+scale_fill_grey()
#g = g+labs(y="Counts", x="Top 3 country-language tags in each cluster")+coord_flip()+theme_bw()#+guides(fill="none")
#g = g+labs(y="Counts", x="Clusters")+coord_flip()+theme_bw()#+guides(fill="none")
#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position=c(.9,.8),legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=10),legend.text = element_text(size=10))
#g = g+guides(fill = guide_legend(title = "Region"))+theme(legend.position="left",legend.margin = unit(0, "cm"),legend.key.size = unit(0.3, "cm"),legend.title = element_text(size=9),legend.text = element_text(size=9))
#g = g+theme(axis.text.y = element_text(colour = ddata$labels$col))
g <- ggplot(df, aes(x = Clusters, fill = Region)) +
  geom_bar() +
  scale_x_discrete(labels = cluster_labels) +
  scale_fill_brewer(palette = "Paired") +
  labs(y = "Counts", x = "Clusters") +
  coord_flip() +
  theme_bw() +
  guides(fill = guide_legend(title = "Region")) +
  theme(
    legend.position = "top",
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9)
  ) +
  theme(
    panel.border = element_rect(colour = "white"),
    strip.background = element_rect(fill = "white"),
    strip.text.x = element_blank()
  )

# Save the same figure as PDF and EPS.
ggsave("data/clusters_top3.pdf", plot = g)
ggsave("data/clusters_top3.eps", plot = g)
Maria@70 152
Maria@70 153 #g_legend<-function(a.gplot){
Maria@70 154 # tmp <- ggplot_gtable(ggplot_build(a.gplot))
Maria@70 155 # leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
Maria@70 156 # legend <- tmp$grobs[[leg]]
Maria@70 157 # return(legend)}
Maria@70 158 #mylegend<-g_legend(g)
Maria@70 159
Maria@70 160 #pdf(file="data/clusters_top3_hclust.pdf", width=12, height=5)
Maria@70 161 #grid.arrange(arrangeGrob(g + theme(legend.position="none"),p + theme(legend.position="none"),nrow=1, widths=c(4,1)),mylegend, nrow=2,heights=c(10, 1))
Maria@70 162 #dev.off()
Maria@70 163
Maria@70 164 #grid.arrange(arrangeGrob(g,p,nrow=1, ncol=2))
Maria@70 165 #ggsave('data/clusters_top3_hclust.pdf',plot=g_comb)
Maria@70 166 #g=g+annotate(x=20, y=1:18, label=cluster_idx)+geom_text(aes(x=20,y=1:18,label=cluster_idx))
Maria@70 167 #+guides(fill = guide_legend(title = "Region"))
Maria@70 168 #grid.draw(cbind(ggplotGrob(g), ggplotGrob(pp), size = "last"))