Mercurial > repos > artbio > ngsplot
diff ngs.plot.r @ 0:3ca58369469c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
| author | artbio |
|---|---|
| date | Wed, 06 Dec 2017 19:01:53 -0500 |
| parents | |
| children | de27d4172d19 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngs.plot.r Wed Dec 06 19:01:53 2017 -0500 @@ -0,0 +1,341 @@ +#!/usr/bin/env Rscript +# +# Program: ngs.plot.r +# Purpose: Plot sequencing coverages at different genomic regions. +# Allow overlaying various coverages with gene lists. +# +# -- by Li Shen, MSSM +# Created: Nov 2011. +# -- by Christophe Antoniewski +# recoded for Galaxy: Dec 2017 + +ngsplot.version <- '2.63_artbio' +# Program environment variable. +progpath <- Sys.getenv('NGSPLOT') +if(progpath == "") { + stop("Set environment variable NGSPLOT before run the program. See README + for details.\n") +} + +source(file.path(progpath, 'lib', 'parse.args.r')) +source(file.path(progpath, 'lib', 'genedb.r')) +source(file.path(progpath, 'lib', 'plotlib.r')) +source(file.path(progpath, 'lib', 'coverage.r')) + +# Deal with command line arguments. +cmd.help <- function(){ + cat("\nVisit https://github.com/shenlab-sinai/ngsplot/wiki/ProgramArguments101 for details\n") + cat(paste("Version:", ngsplot.version, sep=" ")) + cat("\nUsage: ngs.plot.r -G genome -R region -C [cov|config]file\n") + cat(" -O name [Options]\n") + cat("\n## Mandatory parameters:\n") + cat(" -G Genome name. Use ngsplotdb.py list to show available genomes.\n") + cat(" -R Genomic regions to plot: tss, tes, genebody, exon, cgi, enhancer, dhs or bed\n") + cat(" -C Indexed bam file or a configuration file for multiplot\n") + cat(" -O Name for output: multiple files will be generated\n") + cat("## Optional parameters related to configuration file:\n") + cat(" -E Gene list to subset regions OR bed file for custom region\n") + cat(" -T Image title\n") + EchoCoverageArgs() + EchoPlotArgs() + cat("\n") +} + + +########################################################################### +#################### Deal with program input arguments #################### +args <- commandArgs(T) +# args <- unlist(strsplit('-G mm10 -R tss -C K27M_no_stim_3_GATCAG_L006_R1_001Aligned.out.srt.rmdup.bam -O test -Debug 1', ' ')) + +# Input argument parser. +args.tbl <- parseArgs(args, c('-G', '-C', '-R', '-O')) # see lib/parse.args.r +if(is.null(args.tbl)){ + cmd.help() + stop('Error in parsing command line arguments. Stop.\n') +} +genome <- args.tbl['-G'] +reg2plot <- args.tbl['-R'] +oname <- args.tbl['-O'] +warning(sprintf("%s", args.tbl)) + +cat("Configuring variables...") +# Load tables of database: default.tbl, dbfile.tbl +default.tbl <- read.delim(file.path(progpath, 'database', 'default.tbl')) +dbfile.tbl <- read.delim(file.path(progpath, 'database', 'dbfile.tbl')) +CheckRegionAllowed(reg2plot, default.tbl) + +# Setup variables from arguments. +cov.args <- CoverageVars(args.tbl, reg2plot) +attach(cov.args) # +plot.args <- PlotVars(args.tbl) +attach(plot.args) + +# Configuration: coverage-genelist-title relationships. +ctg.tbl <- ConfigTbl(args.tbl, fraglen) + +# Setup plot-related coordinates and variables. +plotvar.list <- SetupPlotCoord(args.tbl, ctg.tbl, default.tbl, dbfile.tbl, + progpath, genome, reg2plot, inttag, flanksize, + samprate, galaxy) +attach(plotvar.list) + +# Setup data points for plot. +pts.list <- SetPtsSpline(pint, lgint) +pts <- pts.list$pts # data points for avg. profile and standard errors. +m.pts <- pts.list$m.pts # middle data points. For pint, m.pts=1. +f.pts <- pts.list$f.pts # flanking region data points. + +# Setup matrix for avg. profiles. +regcovMat <- CreatePlotMat(pts, ctg.tbl) +# Setup matrix for standard errors. +confiMat <- CreateConfiMat(se, pts, ctg.tbl) + +# Genomic enrichment for all profiles in the config. Use this for heatmaps. +enrichList <- vector('list', nrow(ctg.tbl)) +cat("Done\n") + +# Load required libraries. +cat("Loading R libraries") +if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { + source("http://bioconductor.org/biocLite.R") + biocLite(ShortRead) + if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { + stop('Loading package ShortRead failed!') + } +} +cat('.') +if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { + source("http://bioconductor.org/biocLite.R") + biocLite(BSgenome) + if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { + stop('Loading package BSgenome failed!') + } +} +cat('.') +if(!suppressMessages(require(doMC, warn.conflicts=F))) { + install.packages('doMC') + if(!suppressMessages(require(doMC, warn.conflicts=F))) { + stop('Loading package doMC failed!') + } +} +# Register doMC with CPU number. +if(cores.number == 0){ + registerDoMC() +} else { + registerDoMC(cores.number) +} +cat('.') +if(!suppressMessages(require(caTools, warn.conflicts=F))) { + install.packages('caTools') + if(!suppressMessages(require(caTools, warn.conflicts=F))) { + stop('Loading package caTools failed!') + } +} +cat('.') +if(!suppressMessages(require(utils, warn.conflicts=F))) { + install.packages('utils') + if(!suppressMessages(require(utils, warn.conflicts=F))) { + stop('Loading package utils failed!') + } +} +cat('.') +cat("Done\n") + +####################################################################### +# Here start to extract coverages for all genomic regions and calculate +# data for plotting. + +cat("Analyze bam files and calculate coverage") +# Extract bam file names from configuration and determine if bam-pair is used. +bfl.res <- bamFileList(ctg.tbl) +bam.pair <- bfl.res$bbp # boolean for bam-pair. +bam.list <- bfl.res$bam.list # bam file list. +CheckHMColorConfig(hm.color, bam.pair) + +# Determine if bowtie is used to generate the bam files. +# Index bam files if not done yet. +v.map.bowtie <- headerIndexBam(bam.list) + +# Retrieve chromosome names for each bam file. +sn.list <- seqnamesBam(bam.list) + +# Calculate library size from bam files for normalization. +v.lib.size <- libSizeBam(bam.list) + +v.low.cutoff <- vector("integer", nrow(ctg.tbl)) # low count cutoffs. +# Process the config file row by row. +# browser() +for(r in 1:nrow(ctg.tbl)) { # r: index of plots/profiles. + + reg <- ctg.tbl$glist[r] # retrieve gene list names. + # Create coordinate chunk indices. + chkidx.list <- chunkIndex(nrow(coord.list[[reg]]), gcs) + + # Do coverage for each bam file. + bam.files <- unlist(strsplit(ctg.tbl$cov[r], ':')) + + # Obtain fraglen for each bam file. + fraglens <- as.integer(unlist(strsplit(ctg.tbl$fraglen[r], ':'))) + + # Obtain bam file basic info. + libsize <- v.lib.size[bam.files[1]] + v.low.cutoff[r] <- low.count / libsize * 1e6 + result.pseudo.rpm <- 1e6 / libsize + sn.inbam <- sn.list[[bam.files[1]]] + # chr.tag <- chrTag(sn.inbam) + chr.tag <- NA # do NOT modify the chromosome names. + is.bowtie <- v.map.bowtie[bam.files[1]] + cat("\nreport reg2plot\n") + cat(reg2plot) + cat("\n") + result.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, + exonmodel, libsize, TRUE, chr.tag, pint, + reg2plot, flanksize, flankfactor, m.pts, f.pts, + bufsize, cov.algo, bam.files[1], sn.inbam, + fraglens[1], map.qual, is.bowtie, + strand.spec=strand.spec) + # Rprof(NULL) + if(bam.pair) { # calculate background. + fraglen2 <- ifelse(length(fraglens) > 1, fraglens[2], fraglens[1]) + libsize <- v.lib.size[bam.files[2]] + bkg.pseudo.rpm <- 1e6 / libsize + sn.inbam <- sn.list[[bam.files[2]]] + # chr.tag <- chrTag(sn.inbam) + chr.tag <- NA + is.bowtie <- v.map.bowtie[bam.files[2]] + # if(class(chr.tag) == 'character') { + # stop(sprintf("Read %s error: %s", bam.files[2], chr.tag)) + # } + bkg.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, + exonmodel, libsize, TRUE, chr.tag, pint, + reg2plot, flanksize, flankfactor, m.pts, f.pts, + bufsize, cov.algo, bam.files[2], sn.inbam, + fraglen2, map.qual, is.bowtie, + strand.spec=strand.spec) + # browser() + result.matrix <- log2((result.matrix + result.pseudo.rpm) / + (bkg.matrix + bkg.pseudo.rpm)) + } + + # Calculate SEM if needed. Shut off SEM in single gene case. + if(nrow(result.matrix) > 1 && se){ + confiMat[, r] <- apply(result.matrix, 2, function(x) CalcSem(x, robust)) + } + + # Book-keep this matrix for heatmap. + enrichList[[r]] <- result.matrix + + # Return avg. profile. + regcovMat[, r] <- apply(result.matrix, 2, function(x) mean(x, trim=robust, + na.rm=T)) +} +# browser() +cat("Done\n") + +######################################## +# Add row names to heatmap data. +for(i in 1:length(enrichList)) { + reg <- ctg.tbl$glist[i] # gene list name. + rownames(enrichList[[i]]) <- paste(coord.list[[reg]]$gname, + coord.list[[reg]]$tid, sep=':') +} +# Some basic parameters. +xticks <- genXticks(reg2plot, pint, lgint, pts, flanksize, flankfactor, Labs) +unit.width <- 4 +ng.list <- sapply(enrichList, nrow) # number of genes per heatmap. + +# Create image file and plot data into it. +if(!fi_tag){ + cat("Plotting figures...") + #### Average profile plot. #### + out.plot <- avgname + pdf(out.plot, width=plot.width, height=plot.height, pointsize=font.size) + plotmat(regcovMat, ctg.tbl$title, ctg.tbl$color, bam.pair, xticks, pts, + m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc) + out.dev <- dev.off() + + #### Heatmap. #### + # Setup output device. + hd <- SetupHeatmapDevice(reg.list, uniq.reg, ng.list, pts, font.size, + unit.width, rr) + reg.hei <- hd$reg.hei # list of image heights for unique regions. + hm.width <- hd$hm.width # image width. + hm.height <- hd$hm.height # image height. + lay.mat <- hd$lay.mat # matrix for heatmap layout. + heatmap.mar <- hd$heatmap.mar # heatmap margins in inches. + + out.hm <- heatmapname + pdf(out.hm, width=hm.width, height=hm.height, pointsize=font.size) + par(mai=heatmap.mar) + layout(lay.mat, heights=reg.hei) + + # Do heatmap plotting. + go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, + go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, + do.plot=T, hm.color=hm.color, color.distr=color.distr, + color.scale=color.scale) + out.dev <- dev.off() + cat("Done\n") +} else { + go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, + go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, + do.plot=F, hm.color=hm.color, color.distr=color.distr, + color.scale=color.scale) +} + +# Save plotting data. +oname1="data" +cat("Saving results...") +dir.create(oname1, showWarnings=F) + +# Average profiles. +out.prof <- file.path(oname1, 'avgprof.txt') +write.table(regcovMat, file=out.prof, row.names=F, sep="\t", quote=F) + +# Standard errors of mean. +if(!is.null(confiMat)){ + out.confi <- file.path(oname1, 'sem.txt') + write.table(confiMat, file=out.confi, row.names=F, sep="\t", quote=F) +} + +# Heatmap density values. +for(i in 1:length(enrichList)) { + reg <- ctg.tbl$glist[i] # gene list name. + out.heat <- file.path(oname1, paste('hm', i, '.txt', sep='')) + write.table(cbind(coord.list[[reg]][, c('gid', 'gname', 'tid', 'strand')], + enrichList[[i]]), + file=out.heat, row.names=F, sep="\t", quote=F) +} + +# Avg. profile R data. +prof.dat <- file.path(oname1, 'avgprof.RData') +save(plot.width, plot.height, regcovMat, ctg.tbl, bam.pair, xticks, pts, + m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc, se, v.lib.size, + font.size, + file=prof.dat) + +# Heatmap R data. +heat.dat <- file.path(oname1, 'heatmap.RData') +save(reg.list, uniq.reg, ng.list, pts, enrichList, v.low.cutoff, go.algo, + ctg.tbl, bam.pair, xticks, flood.frac, hm.color, unit.width, rr, + go.list, color.scale, v.lib.size, font.size, go.paras, low.count, + color.distr, + file=heat.dat) +cat("Done\n") + +# Wrap results up. +cat("Wrapping results up...") +cur.dir <- getwd() +out.dir <- dirname(oname1) +out.zip <- basename(oname1) +setwd(out.dir) +if(!zip(paste(out.zip, '.zip', sep=''), out.zip, extras='-q')) { + if(unlink(oname, recursive=T)) { + warning(sprintf("Unable to delete intermediate result folder: %s", + oname)) + } +} +setwd(cur.dir) +cat("Done\n") +cat("All done. Cheers!\n") +
