Mercurial > repos > artbio > ngsplot
view ngs.plot.r @ 4:9652d08d8441 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit 73a8ef46dddb0a1b224720bcdfe46050ee9a1b8b
author | artbio |
---|---|
date | Tue, 26 Dec 2017 08:07:56 -0500 |
parents | de27d4172d19 |
children |
line wrap: on
line source
#!/usr/bin/env Rscript # # Program: ngs.plot.r # Purpose: Plot sequencing coverages at different genomic regions. # Allow overlaying various coverages with gene lists. # # -- by Li Shen, MSSM # Created: Nov 2011. # -- by Christophe Antoniewski # recoded for Galaxy: Dec 2017 ngsplot.version <- '2.63_artbio' # Program environment variable. progpath <- Sys.getenv('NGSPLOT') if(progpath == "") { stop("Set environment variable NGSPLOT before run the program. See README for details.\n") } source(file.path(progpath, 'lib', 'parse.args.r')) source(file.path(progpath, 'lib', 'genedb.r')) source(file.path(progpath, 'lib', 'plotlib.r')) source(file.path(progpath, 'lib', 'coverage.r')) # Deal with command line arguments. cmd.help <- function(){ cat("\nVisit https://github.com/shenlab-sinai/ngsplot/wiki/ProgramArguments101 for details\n") cat(paste("Version:", ngsplot.version, sep=" ")) cat("\nUsage: ngs.plot.r -G genome -R region -C [cov|config]file\n") cat(" -O name [Options]\n") cat("\n## Mandatory parameters:\n") cat(" -G Genome name. Use ngsplotdb.py list to show available genomes.\n") cat(" -R Genomic regions to plot: tss, tes, genebody, exon, cgi, enhancer, dhs or bed\n") cat(" -C Indexed bam file or a configuration file for multiplot\n") cat(" -O Name for output: multiple files will be generated\n") cat("## Optional parameters related to configuration file:\n") cat(" -E Gene list to subset regions OR bed file for custom region\n") cat(" -T Image title\n") EchoCoverageArgs() EchoPlotArgs() cat("\n") } ########################################################################### #################### Deal with program input arguments #################### args <- commandArgs(T) # args <- unlist(strsplit('-G mm10 -R tss -C K27M_no_stim_3_GATCAG_L006_R1_001Aligned.out.srt.rmdup.bam -O test -Debug 1', ' ')) # Input argument parser. args.tbl <- parseArgs(args, c('-G', '-C', '-R')) # see lib/parse.args.r if(is.null(args.tbl)){ cmd.help() stop('Error in parsing command line arguments. Stop.\n') } genome <- args.tbl['-G'] reg2plot <- args.tbl['-R'] warning(sprintf("%s", args.tbl)) cat("Configuring variables...") # Load tables of database: default.tbl, dbfile.tbl default.tbl <- read.delim(file.path(progpath, 'database', 'default.tbl')) dbfile.tbl <- read.delim(file.path(progpath, 'database', 'dbfile.tbl')) CheckRegionAllowed(reg2plot, default.tbl) # Setup variables from arguments. cov.args <- CoverageVars(args.tbl, reg2plot) attach(cov.args) # plot.args <- PlotVars(args.tbl) attach(plot.args) # Configuration: coverage-genelist-title relationships. ctg.tbl <- ConfigTbl(args.tbl, fraglen) # Setup plot-related coordinates and variables. plotvar.list <- SetupPlotCoord(args.tbl, ctg.tbl, default.tbl, dbfile.tbl, progpath, genome, reg2plot, inttag, flanksize, samprate, galaxy) attach(plotvar.list) # Setup data points for plot. pts.list <- SetPtsSpline(pint, lgint) pts <- pts.list$pts # data points for avg. profile and standard errors. m.pts <- pts.list$m.pts # middle data points. For pint, m.pts=1. f.pts <- pts.list$f.pts # flanking region data points. # Setup matrix for avg. profiles. regcovMat <- CreatePlotMat(pts, ctg.tbl) # Setup matrix for standard errors. confiMat <- CreateConfiMat(se, pts, ctg.tbl) # Genomic enrichment for all profiles in the config. Use this for heatmaps. enrichList <- vector('list', nrow(ctg.tbl)) cat("Done\n") # Load required libraries. cat("Loading R libraries") if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { source("http://bioconductor.org/biocLite.R") biocLite(ShortRead) if(!suppressMessages(require(ShortRead, warn.conflicts=F))) { stop('Loading package ShortRead failed!') } } cat('.') if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { source("http://bioconductor.org/biocLite.R") biocLite(BSgenome) if(!suppressMessages(require(BSgenome, warn.conflicts=F))) { stop('Loading package BSgenome failed!') } } cat('.') if(!suppressMessages(require(doMC, warn.conflicts=F))) { install.packages('doMC') if(!suppressMessages(require(doMC, warn.conflicts=F))) { stop('Loading package doMC failed!') } } # Register doMC with CPU number. if(cores.number == 0){ registerDoMC() } else { registerDoMC(cores.number) } cat('.') if(!suppressMessages(require(caTools, warn.conflicts=F))) { install.packages('caTools') if(!suppressMessages(require(caTools, warn.conflicts=F))) { stop('Loading package caTools failed!') } } cat('.') if(!suppressMessages(require(utils, warn.conflicts=F))) { install.packages('utils') if(!suppressMessages(require(utils, warn.conflicts=F))) { stop('Loading package utils failed!') } } cat('.') cat("Done\n") ####################################################################### # Here start to extract coverages for all genomic regions and calculate # data for plotting. cat("Analyze bam files and calculate coverage") # Extract bam file names from configuration and determine if bam-pair is used. bfl.res <- bamFileList(ctg.tbl) bam.pair <- bfl.res$bbp # boolean for bam-pair. bam.list <- bfl.res$bam.list # bam file list. CheckHMColorConfig(hm.color, bam.pair) # Determine if bowtie is used to generate the bam files. # Index bam files if not done yet. v.map.bowtie <- headerIndexBam(bam.list) # Retrieve chromosome names for each bam file. sn.list <- seqnamesBam(bam.list) # Calculate library size from bam files for normalization. v.lib.size <- libSizeBam(bam.list) v.low.cutoff <- vector("integer", nrow(ctg.tbl)) # low count cutoffs. # Process the config file row by row. # browser() for(r in 1:nrow(ctg.tbl)) { # r: index of plots/profiles. reg <- ctg.tbl$glist[r] # retrieve gene list names. # Create coordinate chunk indices. chkidx.list <- chunkIndex(nrow(coord.list[[reg]]), gcs) # Do coverage for each bam file. bam.files <- unlist(strsplit(ctg.tbl$cov[r], ':')) # Obtain fraglen for each bam file. fraglens <- as.integer(unlist(strsplit(ctg.tbl$fraglen[r], ':'))) # Obtain bam file basic info. libsize <- v.lib.size[bam.files[1]] v.low.cutoff[r] <- low.count / libsize * 1e6 result.pseudo.rpm <- 1e6 / libsize sn.inbam <- sn.list[[bam.files[1]]] # chr.tag <- chrTag(sn.inbam) chr.tag <- NA # do NOT modify the chromosome names. is.bowtie <- v.map.bowtie[bam.files[1]] cat("\nreport reg2plot\n") cat(reg2plot) cat("\n") result.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, exonmodel, libsize, TRUE, chr.tag, pint, reg2plot, flanksize, flankfactor, m.pts, f.pts, bufsize, cov.algo, bam.files[1], sn.inbam, fraglens[1], map.qual, is.bowtie, strand.spec=strand.spec) # Rprof(NULL) if(bam.pair) { # calculate background. fraglen2 <- ifelse(length(fraglens) > 1, fraglens[2], fraglens[1]) libsize <- v.lib.size[bam.files[2]] bkg.pseudo.rpm <- 1e6 / libsize sn.inbam <- sn.list[[bam.files[2]]] # chr.tag <- chrTag(sn.inbam) chr.tag <- NA is.bowtie <- v.map.bowtie[bam.files[2]] # if(class(chr.tag) == 'character') { # stop(sprintf("Read %s error: %s", bam.files[2], chr.tag)) # } bkg.matrix <- covMatrix(debug, chkidx.list, coord.list[[reg]], rnaseq.gb, exonmodel, libsize, TRUE, chr.tag, pint, reg2plot, flanksize, flankfactor, m.pts, f.pts, bufsize, cov.algo, bam.files[2], sn.inbam, fraglen2, map.qual, is.bowtie, strand.spec=strand.spec) # browser() result.matrix <- log2((result.matrix + result.pseudo.rpm) / (bkg.matrix + bkg.pseudo.rpm)) } # Calculate SEM if needed. Shut off SEM in single gene case. if(nrow(result.matrix) > 1 && se){ confiMat[, r] <- apply(result.matrix, 2, function(x) CalcSem(x, robust)) } # Book-keep this matrix for heatmap. enrichList[[r]] <- result.matrix # Return avg. profile. regcovMat[, r] <- apply(result.matrix, 2, function(x) mean(x, trim=robust, na.rm=T)) } # browser() cat("Done\n") ######################################## # Add row names to heatmap data. for(i in 1:length(enrichList)) { reg <- ctg.tbl$glist[i] # gene list name. rownames(enrichList[[i]]) <- paste(coord.list[[reg]]$gname, coord.list[[reg]]$tid, sep=':') } # Some basic parameters. xticks <- genXticks(reg2plot, pint, lgint, pts, flanksize, flankfactor, Labs) unit.width <- 4 ng.list <- sapply(enrichList, nrow) # number of genes per heatmap. # Create image file and plot data into it. if(!fi_tag){ cat("Plotting figures...") #### Average profile plot. #### out.plot <- avgname pdf(out.plot, width=plot.width, height=plot.height, pointsize=font.size) plotmat(regcovMat, ctg.tbl$title, ctg.tbl$color, bam.pair, xticks, pts, m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc) out.dev <- dev.off() #### Heatmap. #### # Setup output device. hd <- SetupHeatmapDevice(reg.list, uniq.reg, ng.list, pts, font.size, unit.width, rr) reg.hei <- hd$reg.hei # list of image heights for unique regions. hm.width <- hd$hm.width # image width. hm.height <- hd$hm.height # image height. lay.mat <- hd$lay.mat # matrix for heatmap layout. heatmap.mar <- hd$heatmap.mar # heatmap margins in inches. out.hm <- heatmapname pdf(out.hm, width=hm.width, height=hm.height, pointsize=font.size) par(mai=heatmap.mar) layout(lay.mat, heights=reg.hei) # Do heatmap plotting. go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, do.plot=T, hm.color=hm.color, color.distr=color.distr, color.scale=color.scale) out.dev <- dev.off() cat("Done\n") } else { go.list <- plotheat(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, go.paras, ctg.tbl$title, bam.pair, xticks, flood.frac, do.plot=F, hm.color=hm.color, color.distr=color.distr, color.scale=color.scale) } # Save plotting data. oname1="data" cat("Saving results...") dir.create(oname1, showWarnings=F) # Average profiles. out.prof <- file.path(oname1, 'avgprof.txt') write.table(regcovMat, file=out.prof, row.names=F, sep="\t", quote=F) # Standard errors of mean. if(!is.null(confiMat)){ out.confi <- file.path(oname1, 'sem.txt') write.table(confiMat, file=out.confi, row.names=F, sep="\t", quote=F) } # Heatmap density values. for(i in 1:length(enrichList)) { reg <- ctg.tbl$glist[i] # gene list name. out.heat <- file.path(oname1, paste('hm', i, '.txt', sep='')) write.table(cbind(coord.list[[reg]][, c('gid', 'gname', 'tid', 'strand')], enrichList[[i]]), file=out.heat, row.names=F, sep="\t", quote=F) } # Avg. profile R data. prof.dat <- file.path(oname1, 'avgprof.RData') save(plot.width, plot.height, regcovMat, ctg.tbl, bam.pair, xticks, pts, m.pts, f.pts, pint, shade.alp, confiMat, mw, prof.misc, se, v.lib.size, font.size, file=prof.dat) # Heatmap R data. heat.dat <- file.path(oname1, 'heatmap.RData') save(reg.list, uniq.reg, ng.list, pts, enrichList, v.low.cutoff, go.algo, ctg.tbl, bam.pair, xticks, flood.frac, hm.color, unit.width, rr, go.list, color.scale, v.lib.size, font.size, go.paras, low.count, color.distr, file=heat.dat) cat("Done\n") # Wrap results up. cat("Wrapping results up...") cur.dir <- getwd() cat("cur.dir") cat(cur.dir) out.dir <- dirname(oname1) cat("out.dir") cat(out.dir) out.zip <- basename(oname1) cat("out.zip") cat(out.zip) #setwd(out.dir) #cat("Reaching this point without failure") #zip(paste(out.zip, '.zip', sep=''), out.zip, extras='-q') zip("data.zip", "data", extras='-q') #if(!zip(paste(out.zip, '.zip', sep=''), out.zip, extras='-q')) { # if(unlink(oname, recursive=T)) { # warning(sprintf("Unable to delete intermediate result folder: %s", # oname)) # } #} #setwd(cur.dir) cat("Done\n") cat("All done. Cheers!\n")