annotate makeblobplot.R @ 0:0101d7d1211f

initial upload
author Eduardo <eduardoalves@abdn.ac.uk>
date Wed, 25 Nov 2015 11:26:59 +0000
parents
children b45fc20c7acd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
1 #!/usr/bin/env Rscript
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
2
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
3 # 2013-09-16
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
4 # works with R 2.15.2 and ggplot 0.9.3.1
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
5 # Check ggplot2 help forums or contact sujai.kumar@gmail.com if something doesn't run
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
6 # because of updated programs/packages
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
7
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
8 #Function to ignore low frequency annotations:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
9
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
10 clean.blobs<-function(d,threshold,taxlevel) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
11 annotated<-d[d[,taxlevel]!="Not annotated",]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
12 total<-dim(annotated)[1]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
13 levels(d[,taxlevel])[which(table(d[,taxlevel])<threshold*total)]<-"Not annotated"
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
14 return(d)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
15 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
16
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
17 #########################################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
18
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
19 #Load data from file and generate plot:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
20
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
21 library(ggplot2)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
22 library(reshape2)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
23
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
24 subplotwidth=1000;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
25 subplotheight=1000;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
26
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
27 args <- commandArgs(trailingOnly = TRUE)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
28 arg_input_file <- args[1]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
29 arg_ignore_below_prop=as.numeric(args[2])
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
30 arg_taxlevel=args[3]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
31
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
32 orig <- read.delim(arg_input_file,header=TRUE,sep="\t")
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
33 orig <- orig[orig$len>=200,]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
34
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
35 # if cov_colnames >1, then create a new column cov_total = sum of all cov columns
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
36 if (length(grep("^cov_",colnames(orig))) >1) orig$cov_total = rowSums(orig[,grep("^cov_",colnames(orig))])
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
37
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
38 cov_colnames=colnames(orig)[grep("^cov_",colnames(orig))]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
39 tax_colnames=colnames(orig)[grep("^taxlevel_",colnames(orig))]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
40
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
41 numcols=length(cov_colnames)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
42
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
43 taxlevel=arg_taxlevel;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
44
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
45 m<-melt(orig,id.vars=c("seqid","len","gc",taxlevel),measure.vars=cov_colnames, variable.name="read_set", value.name="cov")
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
46
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
47 # there aren't many colours available, so to restrict the plot to only 13 colours:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
48 # (thanks to https://github.com/hobrien for the fix)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
49 if (length(levels(m[,taxlevel])) > 14) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
50 levels(m[,taxlevel])[which(table(m[,taxlevel])<=sort(as.numeric(table(m[,taxlevel])), decreasing=T)[13])]<-"other"
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
51 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
52
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
53 mfilt<-clean.blobs(m,arg_ignore_below_prop,taxlevel)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
54 taxnames=names(sort(table(mfilt[,taxlevel]),decreasing=TRUE))
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
55 taxnames=c("Not annotated", taxnames[taxnames != "Not annotated"])
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
56 mfilt[,taxlevel] <- factor(mfilt[,taxlevel], levels = taxnames)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
57
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
58 png(paste(arg_input_file,taxlevel,"png",sep="."), (numcols * subplotwidth), (1 * subplotheight) + 300, units="px",res=100)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
59
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
60 theme_set(theme_bw())
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
61 # Paul Tol scheme is well documented at http://www.sron.nl/~pault/colourschemes.pdf - Thank you Paul! Added DDDDDD and 777777 to it
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
62 paultol=list(c("#DDDDDD") ,c("#DDDDDD","#4477AA"), c("#DDDDDD","#4477AA","#CC6677"), c("#DDDDDD","#4477AA","#DDCC77","#CC6677"), c("#DDDDDD","#4477AA","#117733","#DDCC77","#CC6677"), c("#DDDDDD","#332288","#88CCEE","#117733","#DDCC77","#CC6677"), c("#DDDDDD","#332288","#88CCEE","#117733","#DDCC77","#CC6677","#AA4499"), c("#DDDDDD","#332288","#88CCEE","#44AA99","#117733","#DDCC77","#CC6677","#AA4499"), c("#DDDDDD","#332288","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#CC6677","#AA4499"), c("#DDDDDD","#332288","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#CC6677","#882255","#AA4499"), c("#DDDDDD","#332288","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#882255","#AA4499"), c("#DDDDDD","#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#882255","#AA4499"), c("#DDDDDD","#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"), c("#DDDDDD","#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499","#777777"))
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
63
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
64 g<-ggplot() + scale_colour_manual(values=paultol[[length(levels(mfilt[,taxlevel]))]], name="Taxonomic\nClassification", limits=levels(mfilt[,taxlevel]) )
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
65
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
66 for (t in levels(mfilt[,taxlevel])) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
67 g <- g + geom_point(data=mfilt[mfilt[,taxlevel]==t,],aes_string(x="gc", y="cov", colour=taxlevel), size=2, alpha=I(1/3))
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
68 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
69 #y_axis_breaks = c(1,2,5,10,20,50,100,200,500,1000);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
70 g<-g +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
71 facet_wrap(~read_set, ncol=numcols) +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
72 #scale_y_log10() +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
73 scale_x_continuous(limits=c(1e-307, 1),breaks = seq(0,1,.1)) +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
74 labs(x="GC content", y="Read coverage") +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
75 guides(colour = guide_legend(nrow=3, override.aes = list(alpha = 1,size=10))) +
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
76 theme (
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
77 strip.text.x = element_text(colour = "black", size = 25, vjust = 0.5),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
78 axis.text.x = element_text(colour = "black", size = 25, vjust = 1),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
79 axis.text.y = element_text(colour = "black", size = 25, vjust = 0.5),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
80 axis.title.x = element_text(colour = "black", size = 25, vjust = 0),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
81 axis.title.y = element_text(colour = "black", size = 25, hjust = 0.5, vjust = 0.5, angle=90),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
82 legend.text = element_text(colour = "black", size = 25, vjust = 0),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
83 legend.title = element_text(colour = "black", size = 25, vjust = 0, hjust = 0, lineheight=1),
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
84 legend.justification=c(1,1), legend.position="bottom", legend.direction="horizontal"
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
85 )
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
86 g
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
87 dev.off()