Mercurial > repos > artbio > ngsplot
view lib/plotlib.r @ 0:3ca58369469c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
author | artbio |
---|---|
date | Wed, 06 Dec 2017 19:01:53 -0500 |
parents | |
children |
line wrap: on
line source
#### plotlib.r #### # This contains the library for plotting related functions. # # Authors: Li Shen, Ningyi Shao # # Created: Feb 19, 2013 # Last updated: May 21, 2013 # SetupHeatmapDevice <- function(reg.list, uniq.reg, ng.list, pts, font.size=12, unit.width=4, reduce.ratio=30) { # Configure parameters for heatmap output device. The output is used by # external procedures to setup pdf device ready for heatmap plotting. # Args: # reg.list: region list as in config file. # uniq.reg: unique region list. # ng.list: number of genes per heatmap in the order as config file. # pts: data points (number of columns of heatmaps). # font.size: font size. # unit.width: image width per heatmap. # reduce.ratio: how compressed are genes in comparison to data points? This # controls image height. # Number of plots per region. reg.np <- sapply(uniq.reg, function(r) sum(reg.list==r)) # Number of genes per region. reg.ng <- sapply(uniq.reg, function(r) { ri <- which(reg.list==r)[1] ng.list[ri] }) # Adjustment ratio. origin.fs <- 12 # default font size. fs.adj.ratio <- font.size / origin.fs # Margin size (in lines) adjusted by ratio. m.bot <- fs.adj.ratio * 2 m.lef <- fs.adj.ratio * 1.5 m.top <- fs.adj.ratio * 2 m.rig <- fs.adj.ratio * 1.5 key.in <- fs.adj.ratio * 1.0 # colorkey in inches. m.lef.diff <- (fs.adj.ratio - 1) * 1.5 m.rig.diff <- (fs.adj.ratio - 1) * 1.5 # Setup image size. hm.width <- (unit.width + m.lef.diff + m.rig.diff) * max(reg.np) ipl <- .2 # inches per line. Obtained from par->'mai', 'mar'. # Convert #gene to image height. reg.hei <- sapply(reg.ng, function(r) { c(key.in, # colorkey + margin. r * unit.width / pts / reduce.ratio + m.bot * ipl + m.top * ipl) # heatmap + margin. }) reg.hei <- c(reg.hei) hm.height <- sum(reg.hei) # Setup layout of the heatmaps. lay.mat <- matrix(0, ncol=max(reg.np), nrow=length(reg.np) * 2) fig.n <- 1 # figure plotting number. for(i in 1:length(reg.np)) { row.upper <- i * 2 - 1 row.lower <- i * 2 for(j in 1:reg.np[i]) { lay.mat[row.upper, j] <- fig.n; fig.n <- fig.n + 1 lay.mat[row.lower, j] <- fig.n; fig.n <- fig.n + 1 } } list(reg.hei=reg.hei, hm.width=hm.width, hm.height=hm.height, lay.mat=lay.mat, heatmap.mar=c(m.bot, m.lef, m.top, m.rig) * ipl) } SetPtsSpline <- function(pint, lgint) { # Set data points for spline function. # Args: # pint: tag for point interval. # Return: list of data points, middle data points, flanking data points. pts <- 100 # data points to plot: 0...pts if(pint){ # point interval. m.pts <- 1 # middle part points. f.pts <- pts / 2 # flanking part points. } else { if(lgint) { m.pts <- pts / 5 * 3 + 1 f.pts <- pts / 5 + 1 } else { m.pts <- pts / 5 + 1 f.pts <- pts / 5 * 2 + 1 } } list(pts=pts, m.pts=m.pts, f.pts=f.pts) } CreatePlotMat <- function(pts, ctg.tbl) { # Create matrix for avg. profiles. # Args: # pts: data points. # ctg.tbl: configuration table. # Return: avg. profile matrix initialized to zero. regcovMat <- matrix(0, nrow=pts + 1, ncol=nrow(ctg.tbl)) colnames(regcovMat) <- ctg.tbl$title regcovMat } CreateConfiMat <- function(se, pts, ctg.tbl){ # Create matrix for standard errors. # Args: # se: tag for standard error plotting. # pts: data points. # ctg.tbl: configuration table. # Return: standard error matrix initialized to zero or null. if(se){ confiMat <- matrix(0, nrow=pts + 1, ncol=nrow(ctg.tbl)) colnames(confiMat) <- ctg.tbl$title } else { confiMat <- NULL } confiMat } col2alpha <- function(col2use, alpha){ # Convert a vector of solid colors to semi-transparent colors. # Args: # col2use: vector of colors. # alpha: represents degree of opacity - [0,1] # Return: vector of transformed colors. apply(col2rgb(col2use), 2, function(x){ rgb(x[1], x[2], x[3], alpha=alpha*255, maxColorValue=255) }) } smoothvec <- function(v, radius, method=c('mean', 'median')){ # Given a vector of coverage, return smoothed version of coverage. # Args: # v: vector of coverage # radius: fraction of org. vector size. # method: smooth method # Return: vector of smoothed coverage. stopifnot(is.vector(v)) stopifnot(length(v) > 0) stopifnot(radius > 0 && radius < 1) halfwin <- ceiling(length(v) * radius) s <- rep(NA, length(v)) for(i in 1:length(v)){ winpos <- (i - halfwin) : (i + halfwin) winpos <- winpos[winpos > 0 & winpos <= length(v)] if(method == 'mean'){ s[i] <- mean(v[winpos]) }else if(method == 'median'){ s[i] <- median(v[winpos]) } } s } smoothplot <- function(m, radius, method=c('mean', 'median')){ # Smooth the entire avg. profile matrix using smoothvec. # Args: # m: avg. profile matrix # radius: fraction of org. vector size. # method: smooth method. # Return: smoothed matrix. stopifnot(is.matrix(m) || is.vector(m)) if(is.matrix(m)) { for(i in 1:ncol(m)) { m[, i] <- smoothvec(m[, i], radius, method) } } else { m <- smoothvec(m, radius, method) } m } genXticks <- function(reg2plot, pint, lgint, pts, flanksize, flankfactor, Labs) { # Generate X-ticks for plotting. # Args: # reg2plot: string representation of region. # pint: point interval. # lgint: tag for large interval. # pts: data points. # flanksize: flanking region size in bps. # flankfactor: flanking region factor. # Labs: character vector of labels of the genomic region. # Return: list of x-tick position and label. if(pint){ # point interval. mid.lab <- Labs[1] tick.pos <- c(0, pts / 4, pts / 2, pts / 4 * 3, pts) tick.lab <- as.character(c(-flanksize, -flanksize/2, mid.lab, flanksize/2, flanksize)) }else{ left.lab <- Labs[1] right.lab <- Labs[2] tick.pos <- c(0, pts / 5, pts / 5 * 2, pts / 5 * 3, pts / 5 * 4, pts) if(lgint){ # large interval: fla int int int fla if(flankfactor > 0){ # show percentage at x-tick. tick.lab <- c(sprintf("%d%%", -flankfactor*100), left.lab, '33%', '66%', right.lab, sprintf("%d%%", flankfactor*100)) } else{ # show bps at x-tick. tick.lab <- c(as.character(-flanksize), left.lab, '33%', '66%', right.lab, as.character(flanksize)) } } else { # small interval: fla fla int fla fla. if(flankfactor > 0){ tick.lab <- c(sprintf("%d%%", -flankfactor*100), sprintf("%d%%", -flankfactor*50), left.lab, right.lab, sprintf("%d%%", flankfactor*50), sprintf("%d%%", flankfactor*100)) } else { tick.lab <- c(as.character(-flanksize), as.character(-flanksize/2), left.lab, right.lab, as.character(flanksize/2), as.character(flanksize)) } } } list(pos=tick.pos, lab=tick.lab) } plotmat <- function(regcovMat, title2plot, plot.colors, bam.pair, xticks, pts, m.pts, f.pts, pint, shade.alp=0, confiMat=NULL, mw=1, misc.options=list(yscale='auto', legend=T, box=T, vline=T, xylab=T, line.wd=3)) { # Plot avg. profiles and standard errors around them. # Args: # regcovMat: matrix for avg. profiles. # title2plot: profile names, will be shown in figure legend. # plot.colors: vector of color specifications for all curves. # bam.pair: boolean for bam-pair data. # xticks: X-axis ticks. # pts: data points # m.pts: middle part data points # f.pts: flanking part data points # pint: tag for point interval # shade.alp: shading area alpha # confiMat: matrix for standard errors. # mw: moving window size for smoothing function. # misc.options: list of misc. options - y-axis scale, legend, box around plot, # verticle lines, X- and Y-axis labels, line width. # Smooth avg. profiles if specified. if(mw > 1){ regcovMat <- as.matrix(runmean(regcovMat, k=mw, alg='C', endrule='mean')) } # Choose colors. if(any(is.na(plot.colors))) { ncurve <- ncol(regcovMat) if(ncurve <= 8) { suppressMessages(require(RColorBrewer, warn.conflicts=F)) col2use <- brewer.pal(ifelse(ncurve >= 3, ncurve, 3), 'Dark2') col2use <- col2use[1:ncurve] } else { col2use <- rainbow(ncurve) } } else { col2use <- plot.colors } col2use <- col2alpha(col2use, 0.8) # Plot profiles. ytext <- ifelse(bam.pair, "log2(Fold change vs. control)", "Read count Per Million mapped reads") xrange <- 0:pts y.lim <- NULL if(length(misc.options$yscale) == 2) { y.lim <- misc.options$yscale } matplot(xrange, regcovMat, xaxt='n', type="l", col=col2use, ylim=y.lim, lty="solid", lwd=misc.options$line.wd, frame.plot=F, ann=F) if(misc.options$xylab) { title(xlab="Genomic Region (5' -> 3')", ylab=ytext) } axis(1, at=xticks$pos, labels=xticks$lab, lwd=3, lwd.ticks=3) if(misc.options$box) { # box around plot. box() } # Add shade area. if(shade.alp > 0){ for(i in 1:ncol(regcovMat)){ v.x <- c(xrange[1], xrange, xrange[length(xrange)]) v.y <- regcovMat[, i] v.y <- c(0, v.y, 0) col.rgb <- col2rgb(col2use[i]) p.col <- rgb(col.rgb[1, 1], col.rgb[2, 1], col.rgb[3, 1], alpha=shade.alp * 255, maxColorValue=255) polygon(v.x, v.y, density=-1, border=NA, col=p.col) } } # Add standard errors. if(!is.null(confiMat)){ v.x <- c(xrange, rev(xrange)) for(i in 1:ncol(confiMat)){ v.y <- c(regcovMat[, i] + confiMat[, i], rev(regcovMat[, i] - confiMat[, i])) col.rgb <- col2rgb(col2use[i]) p.col <- rgb(col.rgb[1, 1], col.rgb[2, 1], col.rgb[3, 1], alpha=0.2 * 255, maxColorValue=255) polygon(v.x, v.y, density=-1, border=NA, col=p.col) } } if(misc.options$vline) { # Add gray lines indicating feature boundaries. if(pint) { abline(v=f.pts, col="gray", lwd=2) } else { abline(v=f.pts - 1, col="gray", lwd=2) abline(v=f.pts + m.pts - 2, col="gray", lwd=2) } } if(misc.options$legend) { # Legend. legend("topright", title2plot, text.col=col2use) } } spline_mat <- function(mat, n=100){ # Calculate splined coverage for a matrix. # Args: # mat: each column represents a profile to be interpolated. # n: number of data points to be interpolated. foreach(r=iter(mat, by='row'), .combine='rbind', .multicombine=T) %dopar% { spline(1:length(r), r, n)$y } } RankNormalizeMatrix <- function(mat, low.cutoff) { # Rank-based normalization for a data matrix. # Args: # mat: data matrix. # low.cutoff: low value cutoff. # Return: rank normalized matrix. stopifnot(is.matrix(mat)) concat.dat <- c(mat) low.mask <- concat.dat < low.cutoff concat.r <- rank(concat.dat) concat.r[low.mask] <- 0 matrix(concat.r, nrow=nrow(mat)) } OrderGenesHeatmap <- function(enrichList, lowCutoffs, method=c('total', 'max', 'prod', 'diff', 'hc', 'none', 'km'), go.paras=list(knc=5, max.iter=20, nrs=30)) { # Order genes with a list of heatmap data. # Args: # enrichList: heatmap data in a list. # lowCutoffs: low count cutoff for normalized count data. # method: algorithm used to order genes. # go.paras: gene ordering parameters. # Returns: a vector of REVERSED gene orders. # NOTE: due to the design of image function, the order needs to be reversed # so that the genes will be shown correctly. rankList <- mapply(RankNormalizeMatrix, mat=enrichList, low.cutoff=lowCutoffs, SIMPLIFY=F) np <- length(enrichList) if(method == 'hc') { rankCombined <- do.call('cbind', rankList) # Clustering and order genes. hc <- hclust(dist(rankCombined, method='euclidean'), method='complete') memb <- cutree(hc, k = go.paras$knc) list(rev(hc$order), memb) # reversed! } else if(method == 'km') { rankCombined <- do.call('cbind', rankList) km <- kmeans(rankCombined, centers=go.paras$knc, iter.max=go.paras$max.iter, nstart=go.paras$nrs) list(rev(order(km$cluster)), km$cluster) # reversed! } else if(method == 'total' || method == 'diff' && np == 1) { list(order(rowSums(rankList[[1]])), NULL) } else if(method == 'max') { # peak enrichment value of the 1st profile. list(order(apply(rankList[[1]], 1, max)), NULL) } else if(method == 'prod') { # product of all profiles. rs.mat <- sapply(rankList, rowSums) g.prod <- apply(rs.mat, 1, prod) list(order(g.prod), NULL) } else if(method == 'diff' && np > 1) { # difference between 2 profiles. list(order(rowSums(rankList[[1]]) - rowSums(rankList[[2]])), NULL) } else if(method == 'none') { # according to the order of input gene list. # Because the image function draws from bottom to top, the rows are # reversed to give a more natural look. list(rev(1:nrow(enrichList[[1]])),NULL) } else { # pass. } } plotheat <- function(reg.list, uniq.reg, enrichList, v.low.cutoff, go.algo, go.paras, title2plot, bam.pair, xticks, flood.q=.02, do.plot=T, hm.color="default", color.distr=.6, color.scale='local') { # Plot heatmaps with genes ordered according to some algorithm. # Args: # reg.list: factor vector of regions as in configuration. # uniq.reg: character vector of unique regions. # enrichList: list of heatmap data. # v.low.cutoff: low count cutoff for normalized count data. # go.algo: gene order algorithm. # go.paras: gene ordering parameters. # title2plot: title for each heatmap. Same as the legends in avgprof. # bam.pair: boolean tag for bam-pair. # xticks: info for X-axis ticks. # flood.q: flooding percentage. # do.plot: boolean tag for plotting heatmaps. # hm.color: string for heatmap colors. # color.distr: positive number for color distribution. # color.scale: string for the method to adjust color scale. # Returns: ordered gene names for each unique region as a list. # Setup basic parameters. ncolor <- 256 if(bam.pair) { if(hm.color != "default") { tri.colors <- unlist(strsplit(hm.color, ':')) neg.color <- tri.colors[1] if(length(tri.colors) == 2) { neu.color <- 'black' pos.color <- tri.colors[2] } else { neu.color <- tri.colors[2] pos.color <- tri.colors[3] } enrich.palette <- colorRampPalette(c(neg.color, neu.color, pos.color), bias=color.distr, interpolate='spline') } else { enrich.palette <- colorRampPalette(c('blue', 'black', 'yellow'), bias=color.distr, interpolate='spline') } } else { if(hm.color != "default") { enrich.palette <- colorRampPalette(c('snow', hm.color)) } else { enrich.palette <- colorRampPalette(c('snow', 'red2')) } } hm_cols <- ncol(enrichList[[1]]) # Adjust X-axis tick position. In a heatmap, X-axis is [0, 1]. # Assume xticks$pos is from 0 to N(>0). xticks$pos <- xticks$pos / tail(xticks$pos, n=1) # scale to the same size. # Define a function to calculate color breaks. ColorBreaks <- function(max.e, min.e, bam.pair, ncolor) { # Args: # max.e: maximum enrichment value to be mapped to color. # min.e: minimum enrichment value to be mapped to color. # bam.pair: boolean tag for bam-pair. # ncolor: number of colors to use. # Returns: vector of color breaks. # If bam-pair is used, create breaks for positives and negatives # separately. If log2 ratios are all positive or negative, use only # half of the color space. if(bam.pair) { max.e <- ifelse(max.e > 0, max.e, 1) min.e <- ifelse(min.e < 0, min.e, -1) c(seq(min.e, 0, length.out=ncolor / 2 + 1), seq(0, max.e, length.out=ncolor / 2 + 1)[-1]) } else { seq(min.e, max.e, length.out=ncolor + 1) } } if(grepl(",", color.scale)) { scale.pair <- unlist(strsplit(color.scale, ",")) scale.min <- as.numeric(scale.pair[1]) scale.max <- as.numeric(scale.pair[2]) if(scale.min >= scale.max) { warning("Color scale min value is >= max value.\n") } flood.pts <- c(scale.min, scale.max) brk.use <- ColorBreaks(scale.max, scale.min, bam.pair, ncolor) } # If color scale is global, calculate breaks and quantile here. if(color.scale == 'global') { flood.pts <- quantile(c(enrichList, recursive=T), c(flood.q, 1-flood.q)) brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, ncolor) } # Go through each unique region. # Do NOT use "dopar" in the "foreach" loops here because this will disturb # the image order. go.list <- vector('list', length(uniq.reg)) go.cluster <- vector('list', length(uniq.reg)) names(go.list) <- uniq.reg names(go.cluster) <- uniq.reg for(ur in uniq.reg) { # ur <- uniq.reg[i] plist <- which(reg.list==ur) # get indices in the config file. # Combine all profiles into one. # enrichCombined <- do.call('cbind', enrichList[plist]) enrichSelected <- enrichList[plist] # If color scale is region, calculate breaks and quantile here. if(color.scale == 'region') { flood.pts <- quantile(c(enrichSelected, recursive=T), c(flood.q, 1-flood.q)) brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, ncolor) } # Order genes. if(is.matrix(enrichSelected[[1]]) && nrow(enrichSelected[[1]]) > 1) { if(bam.pair) { lowCutoffs <- 0 } else { lowCutoffs <- v.low.cutoff[plist] } g.order.list <- OrderGenesHeatmap(enrichSelected, lowCutoffs, go.algo, go.paras) g.order <- g.order.list[[1]] g.cluster <- g.order.list[[2]] if(is.null(g.cluster)) { go.cluster[[ur]] <- NA } else{ go.cluster[[ur]] <- rev(g.cluster[g.order]) } go.list[[ur]] <- rev(rownames(enrichSelected[[1]][g.order, ])) } else { go.cluster[[ur]] <- NULL go.list[[ur]] <- NULL } if(!do.plot) { next } # Go through each sample and do plot. for(pj in plist) { if(!is.null(g.order)) { enrichList[[pj]] <- enrichList[[pj]][g.order, ] } # If color scale is local, calculate breaks and quantiles here. if(color.scale == 'local') { flood.pts <- quantile(c(enrichList[[pj]], recursive=T), c(flood.q, 1-flood.q)) brk.use <- ColorBreaks(flood.pts[2], flood.pts[1], bam.pair, ncolor) } # Flooding extreme values. enrichList[[pj]][ enrichList[[pj]] < flood.pts[1] ] <- flood.pts[1] enrichList[[pj]][ enrichList[[pj]] > flood.pts[2] ] <- flood.pts[2] # Draw colorkey. image(z=matrix(brk.use, ncol=1), col=enrich.palette(ncolor), breaks=brk.use, axes=F, useRaster=T, main='Colorkey') axis(1, at=seq(0, 1, length.out=5), labels=format(brk.use[seq(1, ncolor + 1, length.out=5)], digits=1), lwd=1, lwd.ticks=1) # Draw heatmap. image(z=t(enrichList[[pj]]), col=enrich.palette(ncolor), breaks=brk.use, axes=F, useRaster=T, main=title2plot[pj]) axis(1, at=xticks$pos, labels=xticks$lab, lwd=1, lwd.ticks=1) } } list(go.list,go.cluster) } trim <- function(x, p){ # Trim a numeric vector on both ends. # Args: # x: numeric vector. # p: percentage of data to trim. # Return: trimmed vector. low <- quantile(x, p) hig <- quantile(x, 1 - p) x[x > low & x < hig] } CalcSem <- function(x, rb=.05){ # Calculate standard error of mean for a numeric vector. # Args: # x: numeric vector # rb: fraction of data to trim before calculating sem. # Return: a scalar of the standard error of mean if(rb > 0){ x <- trim(x, rb) } sem <- sd(x) / sqrt(length(x)) ifelse(is.na(sem), 0, sem) # NOTE: this should be improved to handle exception that "sd" calculation # emits errors. } ## Leave for future reference. # # Set the antialiasing. # type <- NULL # if (capabilities()["aqua"]) { # type <- "quartz" # } else if (capabilities()["cairo"]) { # type <- "cairo" # } else if (capabilities()["X11"]) { # type <- "Xlib" # } # Set the output type based on capabilities. # if (is.null(type)){ # png(plot.name, width, height, pointsize=pointsize) # } else { # png(plot.name, width, height, pointsize=pointsize, type=type) # }