comparison ngs.plot.r @ 0:3ca58369469c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
author artbio
date Wed, 06 Dec 2017 19:01:53 -0500
parents
children de27d4172d19
comparison
equal deleted inserted replaced
-1:000000000000 0:3ca58369469c
1 #!/usr/bin/env Rscript
2 #
3 # Program: ngs.plot.r
4 # Purpose: Plot sequencing coverages at different genomic regions.
5 # Allow overlaying various coverages with gene lists.
6 #
7 # -- by Li Shen, MSSM
8 # Created: Nov 2011.
9 # -- by Christophe Antoniewski
10 # recoded for Galaxy: Dec 2017
11
12 ngsplot.version <- '2.63_artbio'
13 # Program environment variable.
14 progpath <- Sys.getenv('NGSPLOT')
15 if(progpath == "") {
16 stop("Set environment variable NGSPLOT before run the program. See README
17 for details.\n")
18 }
19
20 source(file.path(progpath, 'lib', 'parse.args.r'))
21 source(file.path(progpath, 'lib', 'genedb.r'))
22 source(file.path(progpath, 'lib', 'plotlib.r'))
23 source(file.path(progpath, 'lib', 'coverage.r'))
24
25 # Deal with command line arguments.
26 cmd.help <- function(){
27 cat("\nVisit https://github.com/shenlab-sinai/ngsplot/wiki/ProgramArguments101 for details\n")
28 cat(paste("Version:", ngsplot.version, sep=" "))
29 cat("\nUsage: ngs.plot.r -G genome -R region -C [cov|config]file\n")
30 cat(" -O name [Options]\n")
31 cat("\n## Mandatory parameters:\n")
32 cat(" -G Genome name. Use ngsplotdb.py list to show available genomes.\n")
33 cat(" -R Genomic regions to plot: tss, tes, genebody, exon, cgi, enhancer, dhs or bed\n")
34 cat(" -C Indexed bam file or a configuration file for multiplot\n")
35 cat(" -O Name for output: multiple files will be generated\n")
36 cat("## Optional parameters related to configuration file:\n")
37 cat(" -E Gene list to subset regions OR bed file for custom region\n")
38 cat(" -T Image title\n")
39 EchoCoverageArgs()
40 EchoPlotArgs()
41 cat("\n")
42 }
43
44
45 ###########################################################################
46 #################### Deal with program input arguments ####################
47 args <- commandArgs(T)
48 # args <- unlist(strsplit('-G mm10 -R tss -C K27M_no_stim_3_GATCAG_L006_R1_001Aligned.out.srt.rmdup.bam -O test -Debug 1', ' '))
49
50 # Input argument parser.
51 args.tbl <- parseArgs(args, c('-G', '-C', '-R', '-O')) # see lib/parse.args.r
52 if(is.null(args.tbl)){
53 cmd.help()
54 stop('Error in parsing command line arguments. Stop.\n')
55 }
56 genome <- args.tbl['-G']
57 reg2plot <- args.tbl['-R']
58 oname <- args.tbl['-O']
59 warning(sprintf("%s", args.tbl))
60
61 cat("Configuring variables...")
62 # Load tables of database: default.tbl, dbfile.tbl
63 default.tbl <- read.delim(file.path(progpath, 'database', 'default.tbl'))
64 dbfile.tbl <- read.delim(file.path(progpath, 'database', 'dbfile.tbl'))
65 CheckRegionAllowed(reg2plot, default.tbl)
66
67 # Setup variables from arguments.
68 cov.args <- CoverageVars(args.tbl, reg2plot)
69 attach(cov.args) #
70 plot.args <- PlotVars(args.tbl)
71 attach(plot.args)
72
73 # Configuration: coverage-genelist-title relationships.
74 ctg.tbl <- ConfigTbl(args.tbl, fraglen)
75
76 # Setup plot-related coordinates and variables.
77 plotvar.list <- SetupPlotCoord(args.tbl, ctg.tbl, default.tbl, dbfile.tbl,
78 progpath, genome, reg2plot, inttag, flanksize,
79 samprate, galaxy)
80 attach(plotvar.list)
81
82 # Setup data points for plot.
83 pts.list <- SetPtsSpline(pint, lgint)
84 pts <- pts.list$pts # data points for avg. profile and standard errors.
85 m.pts <- pts.list$m.pts # middle data points. For pint, m.pts=1.
86 f.pts <- pts.list$f.pts # flanking region data points.
87
88 # Setup matrix for avg. profiles.
89 regcovMat <- CreatePlotMat(pts, ctg.tbl)
90 # Setup matrix for standard errors.
91 confiMat <- CreateConfiMat(se, pts, ctg.tbl)
92
93 # Genomic enrichment for all profiles in the config. Use this for heatmaps.
94 enrichList <- vector('list', nrow(ctg.tbl))
95 cat("Done\n")
96
97 # Load required libraries.
98 cat("Loading R libraries")
99 if(!suppressMessages(require(ShortRead, warn.conflicts=F))) {
100 source("http://bioconductor.org/biocLite.R")
101 biocLite(ShortRead)
102 if(!suppressMessages(require(ShortRead, warn.conflicts=F))) {
103 stop('Loading package ShortRead failed!')
104 }
105 }
106 cat('.')
107 if(!suppressMessages(require(BSgenome, warn.conflicts=F))) {
108 source("http://bioconductor.org/biocLite.R")
109 biocLite(BSgenome)
110 if(!suppressMessages(require(BSgenome, warn.conflicts=F))) {
111 stop('Loading package BSgenome failed!')
112 }
113 }
114 cat('.')
115 if(!suppressMessages(require(doMC, warn.conflicts=F))) {
116 install.packages('doMC')
117 if(!suppressMessages(require(doMC, warn.conflicts=F))) {
118 stop('Loading package doMC failed!')
119 }
120 }
121 # Register doMC with CPU number.
122 if(cores.number == 0){
123 registerDoMC()
124 } else {
125 registerDoMC(cores.number)
126 }
127 cat('.')
128 if(!suppressMessages(require(caTools, warn.conflicts=F))) {
129 install.packages('caTools')
130 if(!suppressMessages(require(caTools, warn.conflicts=F))) {
131 stop('Loading package caTools failed!')
132 }
133 }
134 cat('.')
135 if(!suppressMessages(require(utils, warn.conflicts=F))) {
136 install.packages('utils')
137 if(!suppressMessages(require(utils, warn.conflicts=F))) {
138 stop('Loading package utils failed!')
139 }
140 }
141 cat('.')
142 cat("Done\n")
143
144 #######################################################################
145 # Here start to extract coverages for all genomic regions and calculate
146 # data for plotting.
147
148 cat("Analyze bam files and calculate coverage")
149 # Extract bam file names from configuration and determine if bam-pair is used.
150 bfl.res <- bamFileList(ctg.tbl)
151 bam.pair <- bfl.res$bbp # boolean for bam-pair.
152 bam.list <- bfl.res$bam.list # bam file list.
153 CheckHMColorConfig(hm.color, bam.pair)
154
155 # Determine if bowtie is used to generate the bam files.
156 # Index bam files if not done yet.
157 v.map.bowtie <- headerIndexBam(bam.list)
158
159 # Retrieve chromosome names for each bam file.
160 sn.list <- seqnamesBam(bam.list)
161
162 # Calculate library size from bam files for normalization.
163 v.lib.size <- libSizeBam(bam.list)
164
165 v.low.cutoff <- vector("integer", nrow(ctg.tbl)) # low count cutoffs.
166 # Process the config file row by row.
167 # browser()
168 for(r in 1:nrow(ctg.tbl)) { # r: index of plots/profiles.
169
170 reg <- ctg.tbl$glist[r] # retrieve gene list names.
171 # Create coordinate chunk indices.
172 chkidx.list <- chunkIndex(nrow(coord.list[[reg]]), gcs)
173
174 # Do coverage for each bam file.
175 bam.files <- unlist(strsplit(ctg.tbl$cov[r], ':'))
176
177 # Obtain fraglen for each bam file.
178 fraglens <- as.integer(unlist(strsplit(ctg.tbl$fraglen[r], ':')))
179
180 # Obtain bam file basic info.
181 libsize <- v.lib.size[bam.files[1]]
182 v.low.cutoff[r] <- low.count / libsize * 1e6
183 result.pseudo.rpm <- 1e6 / libsize
184 sn.inbam <- sn.list[[bam.files[1]]]
185 # chr.tag <- chrTag(sn.inbam)
186 chr.tag <- NA # do NOT modify the chromosome names.
187 is.bowtie <- v.map.bowtie[bam.files[1]]
188 cat("\nreport reg2plot\n")
189 cat(reg2plot)
190 cat("\n")
191 result.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb,
192 exonmodel, libsize, TRUE, chr.tag, pint,
193 reg2plot, flanksize, flankfactor, m.pts, f.pts,
194 bufsize, cov.algo, bam.files[1], sn.inbam,
195 fraglens[1], map.qual, is.bowtie,
196 strand.spec=strand.spec)
197 # Rprof(NULL)
198 if(bam.pair) { # calculate background.
199 fraglen2 <- ifelse(length(fraglens) > 1, fraglens[2], fraglens[1])
200 libsize <- v.lib.size[bam.files[2]]
201 bkg.pseudo.rpm <- 1e6 / libsize
202 sn.inbam <- sn.list[[bam.files[2]]]
203 # chr.tag <- chrTag(sn.inbam)
204 chr.tag <- NA
205 is.bowtie <- v.map.bowtie[bam.files[2]]
206 # if(class(chr.tag) == 'character') {
207 # stop(sprintf("Read %s error: %s", bam.files[2], chr.tag))
208 # }
209 bkg.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb,
210 exonmodel, libsize, TRUE, chr.tag, pint,
211 reg2plot, flanksize, flankfactor, m.pts, f.pts,
212 bufsize, cov.algo, bam.files[2], sn.inbam,
213 fraglen2, map.qual, is.bowtie,
214 strand.spec=strand.spec)
215 # browser()
216 result.matrix <- log2((result.matrix + result.pseudo.rpm) /
217 (bkg.matrix + bkg.pseudo.rpm))
218 }
219
220 # Calculate SEM if needed. Shut off SEM in single gene case.
221 if(nrow(result.matrix) > 1 && se){
222 confiMat[, r] <- apply(result.matrix, 2, function(x) CalcSem(x, robust))
223 }
224
225 # Book-keep this matrix for heatmap.
226 enrichList[[r]] <- result.matrix
227
228 # Return avg. profile.
229 regcovMat[, r] <- apply(result.matrix, 2, function(x) mean(x, trim=robust,
230 na.rm=T))
231 }
232 # browser()
233 cat("Done\n")
234
235 ########################################
236 # Add row names to heatmap data.
237 for(i in 1:length(enrichList)) {
238 reg <- ctg.tbl$glist[i] # gene list name.
239 rownames(enrichList[[i]]) <- paste(coord.list[[reg]]$gname,
240 coord.list[[reg]]$tid, sep=':')
241 }
242 # Some basic parameters.
243 xticks <- genXticks(reg2plot, pint, lgint, pts, flanksize, flankfactor, Labs)
244 unit.width <- 4
245 ng.list <- sapply(enrichList, nrow) # number of genes per heatmap.
246
247 # Create image file and plot data into it.
248 if(!fi_tag){
249 cat("Plotting figures...")
250 #### Average profile plot. ####
251 out.plot <- avgname
252 pdf(out.plot, width=plot.width, height=plot.height, pointsize=font.size)
253 plotmat(regcovMat, ctg.tbl$title, ctg.tbl$color, bam.pair, xticks, pts,
254 m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc)
255 out.dev <- dev.off()
256
257 #### Heatmap. ####
258 # Setup output device.
259 hd <- SetupHeatmapDevice(reg.list, uniq.reg, ng.list, pts, font.size,
260 unit.width, rr)
261 reg.hei <- hd$reg.hei # list of image heights for unique regions.
262 hm.width <- hd$hm.width # image width.
263 hm.height <- hd$hm.height # image height.
264 lay.mat <- hd$lay.mat # matrix for heatmap layout.
265 heatmap.mar <- hd$heatmap.mar # heatmap margins in inches.
266
267 out.hm <- heatmapname
268 pdf(out.hm, width=hm.width, height=hm.height, pointsize=font.size)
269 par(mai=heatmap.mar)
270 layout(lay.mat, heights=reg.hei)
271
272 # Do heatmap plotting.
273 go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo,
274 go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac,
275 do.plot=T, hm.color=hm.color, color.distr=color.distr,
276 color.scale=color.scale)
277 out.dev <- dev.off()
278 cat("Done\n")
279 } else {
280 go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo,
281 go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac,
282 do.plot=F, hm.color=hm.color, color.distr=color.distr,
283 color.scale=color.scale)
284 }
285
286 # Save plotting data.
287 oname1="data"
288 cat("Saving results...")
289 dir.create(oname1, showWarnings=F)
290
291 # Average profiles.
292 out.prof <- file.path(oname1, 'avgprof.txt')
293 write.table(regcovMat, file=out.prof, row.names=F, sep="\t", quote=F)
294
295 # Standard errors of mean.
296 if(!is.null(confiMat)){
297 out.confi <- file.path(oname1, 'sem.txt')
298 write.table(confiMat, file=out.confi, row.names=F, sep="\t", quote=F)
299 }
300
301 # Heatmap density values.
302 for(i in 1:length(enrichList)) {
303 reg <- ctg.tbl$glist[i] # gene list name.
304 out.heat <- file.path(oname1, paste('hm', i, '.txt', sep=''))
305 write.table(cbind(coord.list[[reg]][, c('gid', 'gname', 'tid', 'strand')],
306 enrichList[[i]]),
307 file=out.heat, row.names=F, sep="\t", quote=F)
308 }
309
310 # Avg. profile R data.
311 prof.dat <- file.path(oname1, 'avgprof.RData')
312 save(plot.width, plot.height, regcovMat, ctg.tbl, bam.pair, xticks, pts,
313 m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc, se, v.lib.size,
314 font.size,
315 file=prof.dat)
316
317 # Heatmap R data.
318 heat.dat <- file.path(oname1, 'heatmap.RData')
319 save(reg.list, uniq.reg, ng.list, pts, enrichList, v.low.cutoff, go.algo,
320 ctg.tbl, bam.pair, xticks, flood.frac, hm.color, unit.width, rr,
321 go.list, color.scale, v.lib.size, font.size, go.paras, low.count,
322 color.distr,
323 file=heat.dat)
324 cat("Done\n")
325
326 # Wrap results up.
327 cat("Wrapping results up...")
328 cur.dir <- getwd()
329 out.dir <- dirname(oname1)
330 out.zip <- basename(oname1)
331 setwd(out.dir)
332 if(!zip(paste(out.zip, '.zip', sep=''), out.zip, extras='-q')) {
333 if(unlink(oname, recursive=T)) {
334 warning(sprintf("Unable to delete intermediate result folder: %s",
335 oname))
336 }
337 }
338 setwd(cur.dir)
339 cat("Done\n")
340 cat("All done. Cheers!\n")
341