Mercurial > repos > artbio > ngsplot
comparison lib/coverage.r @ 0:3ca58369469c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
| author | artbio |
|---|---|
| date | Wed, 06 Dec 2017 19:01:53 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3ca58369469c |
|---|---|
| 1 # Function to check if the range exceeds coverage vector boundaries. | |
| 2 checkBound <- function(start, end, range, chrlen){ | |
| 3 if(end + range > chrlen || start - range < 1) | |
| 4 return(FALSE) # out of boundary. | |
| 5 else | |
| 6 return(TRUE) | |
| 7 } | |
| 8 | |
| 9 Bin <- function(v, n) { | |
| 10 # Function to bin a coverage vector. | |
| 11 # Args: | |
| 12 # v: coverage vector. | |
| 13 # n: number of bins. | |
| 14 # Return: vector of binned values. | |
| 15 | |
| 16 bin.bound <- seq(1, length(v), length.out=n + 1) | |
| 17 sapply(1:n, function(i) { | |
| 18 mean(v[bin.bound[i]:bin.bound[i + 1]]) | |
| 19 }) | |
| 20 } | |
| 21 | |
| 22 SplineRev3Sec <- function(cov.list, v.fls, pts.list, v.strand, algo="spline") { | |
| 23 # For 3 sections of continuous coverage, spline each according to specified | |
| 24 # data points and return concatenated, interpolated curves. | |
| 25 # Args: | |
| 26 # cov.list: a list of coverage vectors. Each vector represents a gene. | |
| 27 # v.fls: vector of flanking region size. | |
| 28 # pts.list: a list of three integers for data points. | |
| 29 # v.strand: factor vector of strands. | |
| 30 # algo: algorithm used to normalize coverage vectors (spline, bin). | |
| 31 # Return: matrix of interpolated coverage: each row represents a gene; each | |
| 32 # column represents a data point. Coverage from exons are concatenated. | |
| 33 # Notes: the names of cov.list and pts.list must be the same for index purpose. | |
| 34 # Suggested: use 'l', 'm', and 'r' for the names. | |
| 35 | |
| 36 # Create an empty coveage matrix first. | |
| 37 tot.pts <- sum(unlist(pts.list)) - 2 | |
| 38 cov.mat <- matrix(nrow=length(cov.list), ncol=tot.pts) | |
| 39 | |
| 40 for(i in 1:length(cov.list)) { | |
| 41 left.cov <- head(cov.list[[i]], n=v.fls[i]) | |
| 42 right.cov <- tail(cov.list[[i]], n=v.fls[i]) | |
| 43 mid.cov <- window(cov.list[[i]], start=v.fls[i] + 1, | |
| 44 width=length(cov.list[[i]]) - 2*v.fls[i]) | |
| 45 if(algo == "spline") { | |
| 46 left.cov <- spline(1:length(left.cov), left.cov, n=pts.list$l)$y | |
| 47 right.cov <- spline(1:length(right.cov), right.cov, n=pts.list$r)$y | |
| 48 mid.cov <- spline(1:length(mid.cov), mid.cov, n=pts.list$m)$y | |
| 49 } else if(algo == "bin") { | |
| 50 left.cov <- Bin(left.cov, n=pts.list$l) | |
| 51 right.cov <- Bin(right.cov, n=pts.list$r) | |
| 52 mid.cov <- Bin(mid.cov, n=pts.list$m) | |
| 53 } else { | |
| 54 # pass. | |
| 55 } | |
| 56 # Fuse the two points at the boundary and concatenate. | |
| 57 f1 <- (tail(left.cov, n=1) + head(mid.cov, n=1)) / 2 | |
| 58 f2 <- (tail(mid.cov, n=1) + head(right.cov, n=1)) / 2 | |
| 59 con.cov <- c(left.cov[-length(left.cov)], f1, | |
| 60 mid.cov[-c(1, length(mid.cov))], | |
| 61 f2, right.cov[-1]) | |
| 62 # browser() | |
| 63 if(v.strand[i] == '+') { | |
| 64 cov.mat[i, ] <- con.cov | |
| 65 } else { | |
| 66 cov.mat[i, ] <- rev(con.cov) | |
| 67 } | |
| 68 } | |
| 69 | |
| 70 cov.mat | |
| 71 } | |
| 72 | |
| 73 extrCov3Sec <- function(v.chrom, v.start, v.end, v.fls, v.strand, m.pts, f.pts, | |
| 74 bufsize, algo, ...) { | |
| 75 # Extract and interpolate coverage vectors from genomic regions with 3 sections. | |
| 76 # Args: | |
| 77 # v.chrom: factor vector of chromosome names. | |
| 78 # v.start: integer vector of region start. | |
| 79 # v.end: integer vector of region end. | |
| 80 # v.fls: integer vector of flanking region size in bps. | |
| 81 # v.strand: factor vector of gene strands. | |
| 82 # m.pts: data points for middle interval. | |
| 83 # f.pts: data points for flanking region. | |
| 84 # bufsize: integer; buffers are added to both ends of each region. | |
| 85 # algo: algorithm used to normalize coverage vectors. | |
| 86 # Return: matrix of interpolated coverage: each row represents a gene; each | |
| 87 # column represents a data point. | |
| 88 | |
| 89 interflank.gr <- GRanges(seqnames=v.chrom, ranges=IRanges( | |
| 90 start=v.start - v.fls - bufsize, | |
| 91 end=v.end + v.fls + bufsize)) | |
| 92 interflank.cov <- covBamExons(interflank.gr, v.strand, ...) | |
| 93 | |
| 94 # Trim buffers from coverage vectors. | |
| 95 interflank.cov <- lapply(interflank.cov, function(v) { | |
| 96 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
| 97 }) | |
| 98 | |
| 99 # Interpolate and reverse coverage vectors. | |
| 100 SplineRev3Sec(interflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), | |
| 101 v.strand, algo) | |
| 102 } | |
| 103 | |
| 104 sub3CovList <- function(all.cov, v.left, v.right) { | |
| 105 # For a list of coverage vectors, separate them into three lists. | |
| 106 # Args: | |
| 107 # all.cov: list of coverage vectors. | |
| 108 # v.left: integer vector of left flanking size. | |
| 109 # v.right: integer vector of right flanking size. | |
| 110 # Return: list of lists of coverage vectors. The outer list is named as: | |
| 111 # "l", "m", "r". | |
| 112 | |
| 113 left.cov <- foreach(i=icount(length(all.cov))) %do% { | |
| 114 head(all.cov[[i]], n=v.left[i]) | |
| 115 } | |
| 116 mid.cov <- foreach(i=icount(length(all.cov))) %do% { | |
| 117 len <- length(all.cov[[i]]) | |
| 118 all.cov[[i]][ (v.left[i] + 1) : (len - v.right[i]) ] | |
| 119 } | |
| 120 right.cov <- foreach(i=icount(length(all.cov))) %do% { | |
| 121 tail(all.cov[[i]], n=v.right[i]) | |
| 122 } | |
| 123 list(l=left.cov, m=mid.cov, r=right.cov) | |
| 124 } | |
| 125 | |
| 126 extrCovExons <- function(v.chrom, exonranges.list, v.fls, v.strand, | |
| 127 m.pts, f.pts, bufsize, algo, ...) { | |
| 128 # Extract coverage vectors for transcripts with exon models. | |
| 129 # Args: | |
| 130 # v.chrom: factor vector of chromosome names. | |
| 131 # exonranges.list: list of IRanges objects for exon coordinates. | |
| 132 # v.fls: integer vector of flanking region size in bps. | |
| 133 # v.strand: factor vector of gene strands. | |
| 134 # m.pts: data points for middle interval. | |
| 135 # f.pts: data points for flanking region. | |
| 136 # bufsize: integer; buffers are added to both ends of each region. | |
| 137 # algo: algorithm used to normalize coverage vectors. | |
| 138 # Return: matrix of interpolated coverage: each row represents a gene; each | |
| 139 # column represents a data point. Coverage from exons are concatenated. | |
| 140 | |
| 141 # Construct ranges including exon and flanking regions. | |
| 142 exonflank.list <- vector('list', length=length(exonranges.list)) | |
| 143 for(i in 1:length(exonranges.list)) { | |
| 144 r.mod <- exonranges.list[[i]] # IRanges object to be modified. | |
| 145 n <- length(r.mod) | |
| 146 # Add flanking and buffer regions. | |
| 147 start(r.mod)[1] <- start(r.mod)[1] - v.fls[i] - bufsize | |
| 148 end(r.mod)[n] <- end(r.mod)[n] + v.fls[i] + bufsize | |
| 149 exonflank.list[[i]] <- GRanges(seqnames=v.chrom[i], ranges=r.mod) | |
| 150 } | |
| 151 | |
| 152 exonflank.cov <- covBamExons(exonflank.list, v.strand, ...) | |
| 153 | |
| 154 # Trim buffers from coverage vectors. | |
| 155 exonflank.cov <- lapply(exonflank.cov, function(v) { | |
| 156 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
| 157 }) | |
| 158 | |
| 159 SplineRev3Sec(exonflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), | |
| 160 v.strand, algo) | |
| 161 } | |
| 162 | |
| 163 | |
| 164 extrCovMidp <- function(v.chrom, v.midp, flanksize, v.strand, pts, bufsize, | |
| 165 algo, ...) { | |
| 166 # Extract coverage vectors with a middle point and symmetric flanking regions. | |
| 167 # Args: | |
| 168 # v.chrom: factor vector of chromosome names. | |
| 169 # v.midp: integer vector of middle points. | |
| 170 # flanksize: integer of flanking region size in bps. | |
| 171 # v.strand: factor vector of gene strands. | |
| 172 # pts: data points to spline into. | |
| 173 # bufsize: integer; buffers are added to both ends of each region. | |
| 174 # algo: algorithm used to normalize coverage vectors. | |
| 175 # Return: matrix of interpolated coverage: each row represents a gene; each | |
| 176 # column represents a data point. | |
| 177 | |
| 178 granges <- GRanges(seqnames=v.chrom, ranges=IRanges( | |
| 179 start=v.midp - flanksize - bufsize, | |
| 180 end=v.midp + flanksize + bufsize)) | |
| 181 cov.list <- covBamExons(granges, v.strand, ...) | |
| 182 | |
| 183 # Trim buffers from coverage vectors. | |
| 184 cov.list <- lapply(cov.list, function(v) { | |
| 185 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
| 186 }) | |
| 187 | |
| 188 # Interpolate and reverse coverage vectors and assemble into a matrix. | |
| 189 cov.mat <- matrix(nrow=length(cov.list), ncol=pts) | |
| 190 for(i in 1:length(cov.list)) { | |
| 191 if(is.null(cov.list[[i]])) { | |
| 192 cov.mat[i, ] <- vector('integer', length=pts) | |
| 193 } else { | |
| 194 if(algo == "spline") { | |
| 195 cov.mat[i, ] <- spline(1:length(cov.list[[i]]), cov.list[[i]], | |
| 196 n=pts)$y | |
| 197 } else if(algo == "bin") { | |
| 198 cov.mat[i, ] <- Bin(cov.list[[i]], n=pts) | |
| 199 } else { | |
| 200 # pass. | |
| 201 } | |
| 202 if(v.strand[i] == '-') { | |
| 203 cov.mat[i, ] <- rev(cov.mat[i, ]) | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 cov.mat | |
| 208 } | |
| 209 | |
| 210 scanBamRevOrder <- function(org.gr, sbp) { | |
| 211 # ScanBamParam re-arranges the input genomic ranges. Use range info to | |
| 212 # construct a string vector to find the order to reverse it. | |
| 213 org.grnames <- with(org.gr, paste(seqnames, start, end, sep=':')) | |
| 214 sbw.gr <- as.data.frame(bamWhich(sbp)) # scan-bam-ed | |
| 215 if('space' %in% names(sbw.gr)) { | |
| 216 sbw.grnames <- with(sbw.gr, paste(space, start, end, sep=':')) | |
| 217 } else if('group_name' %in% names(sbw.gr)) { | |
| 218 sbw.grnames <- with(sbw.gr, paste(group_name, start, end, sep=':')) | |
| 219 } else { | |
| 220 stop("Cannot locate chromosome names in extracted short reads. Report | |
| 221 this problem using issue tracking or discussion forum.\n") | |
| 222 } | |
| 223 | |
| 224 match(org.grnames, sbw.grnames) | |
| 225 } | |
| 226 | |
| 227 genZeroList <- function(llen, v.vlen) { | |
| 228 # Generate a list of specific length with each element being an Rle vector of | |
| 229 # zeros with specific length. | |
| 230 # Args: | |
| 231 # llen: list length | |
| 232 # v.vlen: vector of vector lengths. | |
| 233 | |
| 234 llen <- as.integer(llen) | |
| 235 stopifnot(llen > 0) | |
| 236 res <- vector('list', length=llen) | |
| 237 for(i in 1:llen) { | |
| 238 res[[i]] <- Rle(0, v.vlen[i]) | |
| 239 } | |
| 240 res | |
| 241 } | |
| 242 | |
| 243 covBamExons <- function(granges.dat, v.strand, bam.file, sn.inbam, fraglen, | |
| 244 map.qual=20, bowtie=F, | |
| 245 strand.spec=c('both', 'same', 'opposite')) { | |
| 246 # Extract coverage vectors from bam file for a list of transcripts of multiple | |
| 247 # exons. | |
| 248 # Args: | |
| 249 # granges.dat: a GRanges object representing a set of genomic ranges or a | |
| 250 # list of GRanges objects each representing a set of exonic | |
| 251 # ranges. | |
| 252 # v.strand: vector of strand info. | |
| 253 # bam.file: character string refers to the path of a bam file. | |
| 254 # sn.inbam: vector of chromosome names in the bam file. | |
| 255 # fraglen: fragment length. | |
| 256 # map.qual: mapping quality to filter reads. | |
| 257 # bowtie: boolean to indicate whether the aligner was Bowtie-like or not. | |
| 258 # strand.spec: string desc. for strand-specific coverage calculation. | |
| 259 # Return: list of coverage vectors, each vector represents a transcript. | |
| 260 | |
| 261 strand.spec <- match.arg(strand.spec) | |
| 262 | |
| 263 if(class(granges.dat) == 'list') { | |
| 264 # Construct a GRanges object representing DNA sequences. | |
| 265 v.seqnames <- sapply(granges.dat, | |
| 266 function(x) as.character(seqnames(x)[1])) | |
| 267 v.start <- sapply(granges.dat, function(x) start(ranges(x))[1]) | |
| 268 v.end <- sapply(granges.dat, function(x) tail(end(ranges(x)), n=1)) | |
| 269 granges.dna <- GRanges(seqnames=v.seqnames, | |
| 270 ranges=IRanges(start=v.start, end=v.end)) | |
| 271 # Obtain mRNA(including flanking) length for each gene. | |
| 272 repr.lens <- sapply(granges.dat, function(g) { | |
| 273 sum(end(g) - start(g) + 1) | |
| 274 }) | |
| 275 } else { | |
| 276 v.seqnames <- as.character(seqnames(granges.dat)) | |
| 277 v.start <- start(granges.dat) | |
| 278 v.end <- end(granges.dat) | |
| 279 granges.dna <- granges.dat | |
| 280 repr.lens <- v.end - v.start + 1 | |
| 281 granges.dat <- vector('list', length(granges.dna)) # set null tags. | |
| 282 } | |
| 283 | |
| 284 # Filter transcripts whose chromosomes do not match bam file. | |
| 285 inbam.mask <- as.character(seqnames(granges.dna)) %in% sn.inbam | |
| 286 if(!any(inbam.mask)) { # none matches. | |
| 287 return(genZeroList(length(granges.dna), repr.lens)) | |
| 288 } | |
| 289 | |
| 290 # scanBamWhat: the info that need to be extracted from a bam file. | |
| 291 sbw <- c('pos', 'qwidth', 'mapq', 'strand', 'rname', | |
| 292 'mrnm', 'mpos', 'isize') | |
| 293 sbp <- ScanBamParam(what=sbw, which=granges.dna[inbam.mask], | |
| 294 flag=scanBamFlag(isUnmappedQuery=F, | |
| 295 isNotPassingQualityControls=F, | |
| 296 isDuplicate=F)) | |
| 297 | |
| 298 # Scan bam file to retrieve short reads. | |
| 299 sr.in.ranges <- tryCatch(scanBam(bam.file, param=sbp), error=function(e) e) | |
| 300 if(class(sr.in.ranges)[1] == 'simpleError') { | |
| 301 # This is not supposed to happen after those unmatched seqnames are | |
| 302 # removed. I keep it for safty. | |
| 303 return(genZeroList(length(granges.dna), repr.lens)) | |
| 304 } | |
| 305 | |
| 306 # Restore the original order. | |
| 307 sr.in.ranges <- sr.in.ranges[scanBamRevOrder( | |
| 308 as.data.frame(granges.dna[inbam.mask]), sbp)] | |
| 309 | |
| 310 CalcReadsCov <- function(srg, start, end, gr.rna, repr.len, strand) { | |
| 311 # Calculate short read coverage for each gene/region. | |
| 312 # Args: | |
| 313 # srg: extracted short reads in gene. | |
| 314 # start: start position of the DNA sequence. | |
| 315 # end: end position of the DNA sequence. | |
| 316 # gr.rna: GRanges object (multiple ranges) representing exon sequences. | |
| 317 # This can be NULL indicating the input ranges are DNAs. | |
| 318 # repr.len: DNA or mRNA sequence length. | |
| 319 # strand: transcript strand (+/-). | |
| 320 # Returns: a coverage vector for the gene. | |
| 321 | |
| 322 # browser() | |
| 323 # Special handling for bowtie mapping. | |
| 324 if(bowtie) { | |
| 325 srg <- within(srg, mapq[is.na(mapq)] <- 254) # within! | |
| 326 } | |
| 327 # Filter short reads by mapping quality. | |
| 328 all.mask <- srg$mapq >= map.qual | |
| 329 | |
| 330 # Subset by strand info. | |
| 331 if(strand.spec != 'both') { | |
| 332 if(strand.spec == 'same') { | |
| 333 s.mask <- srg$strand == as.character(strand) | |
| 334 } else { | |
| 335 s.mask <- srg$strand != as.character(strand) | |
| 336 } | |
| 337 all.mask <- all.mask & s.mask | |
| 338 } | |
| 339 | |
| 340 # If paired, filter reads that are not properly paired. | |
| 341 paired <- all(with(srg, is.na(isize) | isize != 0)) | |
| 342 if(paired) { | |
| 343 p.mask <- with(srg, rname == mrnm & xor(strand == '+', isize < 0)) | |
| 344 all.mask <- all.mask & p.mask | |
| 345 } | |
| 346 | |
| 347 # Apply all the filters on short reads. | |
| 348 srg <- lapply(srg, `[`, which(all.mask)) | |
| 349 | |
| 350 # Calculate coverage. | |
| 351 if(length(srg[[1]]) > 0) { | |
| 352 if(paired) { | |
| 353 cov.pos <- with(srg, ifelse(isize < 0, mpos, pos)) | |
| 354 cov.wd <- abs(srg$isize) | |
| 355 } else { | |
| 356 # Adjust negative read positions for physical coverage. | |
| 357 cov.pos <- with(srg, ifelse(strand == '-', | |
| 358 pos - fraglen + qwidth, pos)) | |
| 359 cov.wd <- fraglen | |
| 360 } | |
| 361 # Shift reads by subtracting start positions. | |
| 362 cov.pos <- cov.pos - start + 1 | |
| 363 # Calculate physical coverage on the whole genebody. | |
| 364 covg <- coverage(IRanges(start=cov.pos, width=cov.wd), | |
| 365 width=end - start + 1, method='sort') | |
| 366 | |
| 367 if(!is.null(gr.rna)) { # RNA-seq. | |
| 368 # Shift exonic ranges by subtracting start positions. | |
| 369 # BE careful with negative start positions! Need to adjust end | |
| 370 # positions first(or the GRanges lib will emit errors if | |
| 371 # start > end). | |
| 372 # Negative start positions happen when flanking region exceeds | |
| 373 # the chromosomal start. | |
| 374 if(start > 0) { | |
| 375 start(gr.rna) <- start(gr.rna) - start + 1 | |
| 376 end(gr.rna) <- end(gr.rna) - start + 1 | |
| 377 } else { | |
| 378 end(gr.rna) <- end(gr.rna) - start + 1 | |
| 379 start(gr.rna) <- start(gr.rna) - start + 1 | |
| 380 } | |
| 381 # Concatenate all exon coverages. | |
| 382 covg[ranges(gr.rna)] | |
| 383 } else { # ChIP-seq. | |
| 384 covg | |
| 385 } | |
| 386 } else { | |
| 387 Rle(0, repr.len) | |
| 388 } | |
| 389 } | |
| 390 | |
| 391 covg.allgenes <- mapply(CalcReadsCov, srg=sr.in.ranges, | |
| 392 start=v.start, end=v.end, gr.rna=granges.dat, | |
| 393 repr.len=repr.lens, strand=v.strand, SIMPLIFY=F) | |
| 394 } | |
| 395 | |
| 396 bamFileList <- function(ctg.tbl) { | |
| 397 # Determine the bam files involved in the configuration and whether it is a | |
| 398 # bam file pair setup. | |
| 399 # Args: | |
| 400 # ctg.tbl: coverage-genelist-title table. | |
| 401 | |
| 402 cov.uniq <- unique(ctg.tbl$cov) | |
| 403 cov.list <- strsplit(cov.uniq, ':') | |
| 404 v.nbam <- sapply(cov.list, length) | |
| 405 v.bbp <- v.nbam == 2 | |
| 406 if(all(v.bbp)) { | |
| 407 bbp <- T | |
| 408 } else if(all(!v.bbp)) { | |
| 409 bbp <- F | |
| 410 } else { | |
| 411 stop("No mix of bam and bam-pair allowed in configuration.\n") | |
| 412 } | |
| 413 | |
| 414 list(bbp=bbp, bam.list=unique(unlist(cov.list))) | |
| 415 } | |
| 416 | |
| 417 estiMapqStyle <- function(bam.file){ | |
| 418 # Estimate the mapping quality style. Return TRUE if it is SAM standard. | |
| 419 # Sample 1000 mapped reads from bam file, and if the mapq of reads | |
| 420 # over half are NA, then return FALSE, because it is quite possible that | |
| 421 # the aligner using coding style as bowtie, 255 as highest score. | |
| 422 # Args: | |
| 423 # bam.file: bam file to be sampled. | |
| 424 | |
| 425 sbw <- c('pos', 'qwidth', 'mapq', 'strand') | |
| 426 sbp <- ScanBamParam(what=sbw, flag=scanBamFlag( | |
| 427 isUnmappedQuery=F, isDuplicate=F)) | |
| 428 samp <- BamSampler(bam.file, yieldSize=500) | |
| 429 samp.reads <- scanBam(samp, param=sbp)[[1]] | |
| 430 samp.len <- length(samp.reads[["mapq"]]) | |
| 431 mapq.255 <- sum(is.na(samp.reads[["mapq"]])) | |
| 432 if(mapq.255/samp.len >= 0.5){ | |
| 433 return(FALSE) | |
| 434 }else{ | |
| 435 return(TRUE) | |
| 436 } | |
| 437 } | |
| 438 | |
| 439 headerIndexBam <- function(bam.list) { | |
| 440 # Read bam header to determine mapping method. | |
| 441 # Index bam files if not done yet. | |
| 442 # Args: | |
| 443 # ctg.tbl: coverage-genelist-title table. | |
| 444 | |
| 445 v.map.bowtie <- vector('logical', length=length(bam.list)) | |
| 446 for(i in 1:length(bam.list)) { | |
| 447 bam.file <- bam.list[i] | |
| 448 | |
| 449 # Index bam file. | |
| 450 if(!file.exists(paste(bam.file, ".bai", sep=""))) { | |
| 451 indexBam(bam.file) | |
| 452 } | |
| 453 | |
| 454 # Derive mapping program. | |
| 455 header <- scanBamHeader(bam.file) | |
| 456 map.prog <- try(strsplit(header[[1]]$text$'@PG'[[1]], ':')[[1]][2], | |
| 457 silent=T) | |
| 458 if(class(map.prog) != "try-error") { | |
| 459 map.style <- grepl('tophat|bowtie|bedtools|star', map.prog, | |
| 460 ignore.case=T) | |
| 461 if(map.style){ | |
| 462 v.map.bowtie[i] <- TRUE | |
| 463 next | |
| 464 } | |
| 465 map.style <- grepl('bwa|casava|gem', map.prog, ignore.case=T) | |
| 466 if(map.style) { | |
| 467 v.map.bowtie[i] <- FALSE | |
| 468 next | |
| 469 } | |
| 470 # if(estiMapqStyle(bam.file)){ | |
| 471 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
| 472 # standard SAM mapping score will be used.", bam.file)) | |
| 473 # v.map.bowtie[i] <- FALSE | |
| 474 # }else{ | |
| 475 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
| 476 # Bowtie-like SAM mapping score will be used. Would you mind to tell us what | |
| 477 # aligner you are using?", bam.file)) | |
| 478 # v.map.bowtie[i] <- TRUE | |
| 479 # } | |
| 480 } | |
| 481 # else { | |
| 482 # cat("\n") | |
| 483 # if(estiMapqStyle(bam.file)){ | |
| 484 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
| 485 # standard SAM mapping score will be used.", bam.file)) | |
| 486 # v.map.bowtie[i] <- FALSE | |
| 487 # }else{ | |
| 488 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
| 489 # Bowtie-like SAM mapping score will be used.", bam.file)) | |
| 490 # v.map.bowtie[i] <- TRUE | |
| 491 # } | |
| 492 # } | |
| 493 warning(sprintf("Aligner for: %s cannot be determined. Style of | |
| 494 standard SAM mapping score will be used. Would you mind submitting an issue | |
| 495 report to us on Github? This will benefit people using the same aligner.", | |
| 496 bam.file)) | |
| 497 v.map.bowtie[i] <- FALSE | |
| 498 } | |
| 499 names(v.map.bowtie) <- bam.list | |
| 500 | |
| 501 v.map.bowtie | |
| 502 } | |
| 503 | |
| 504 libSizeBam <- function(bam.list) { | |
| 505 # Obtain library sizes by counting qualified bam records. | |
| 506 # Args: | |
| 507 # ctg.tbl: coverage-genelist-title table. | |
| 508 | |
| 509 # Count only reads that are mapped, primary, passed quality control and | |
| 510 # un-duplicated. | |
| 511 sbp <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=F, isSecondaryAlignment=F, | |
| 512 isNotPassingQualityControls=F, isDuplicate=F)) | |
| 513 v.lib.size <- vector('integer', length=length(bam.list)) | |
| 514 for(i in 1:length(bam.list)) { | |
| 515 bfn <- bam.list[i] # bam file name. | |
| 516 cfn <- paste(basename(bfn), '.cnt', sep='') # count file name. | |
| 517 if(file.exists(cfn)) { | |
| 518 v.lib.size[i] <- as.integer(readLines(cfn, n=1)) | |
| 519 } else { | |
| 520 cnt.bam <- countBam(bfn, param=sbp) | |
| 521 v.lib.size[i] <- cnt.bam$records | |
| 522 writeLines(as.character(v.lib.size[i]), cfn) | |
| 523 } | |
| 524 names(v.lib.size)[i] <- bfn | |
| 525 } | |
| 526 | |
| 527 v.lib.size | |
| 528 } | |
| 529 | |
| 530 seqnamesBam <- function(bam.list) { | |
| 531 # Obtain chromosome names for each bam file. This list must be used to filter | |
| 532 # genomic regions before scanBam or it terminates immaturely. | |
| 533 # ctg.tbl: coverage-genelist-title table. | |
| 534 | |
| 535 # browser() | |
| 536 sn.list <- lapply(scanBamHeader(bam.list), function(h) { | |
| 537 names(h$targets) | |
| 538 }) | |
| 539 names(sn.list) <- bam.list | |
| 540 | |
| 541 sn.list | |
| 542 } | |
| 543 | |
| 544 chrTag <- function(sn.inbam) { | |
| 545 # Check whether the chromosome name format in the bam file contains 'chr' or not. | |
| 546 # Args: | |
| 547 # sn.inbam: seqnames in the bam file. | |
| 548 | |
| 549 n.chr <- length(grep('^chr', sn.inbam)) | |
| 550 if(n.chr == 0) { | |
| 551 chr.tag <- F | |
| 552 } else if(n.chr == length(sn.inbam)) { | |
| 553 chr.tag <- T | |
| 554 } else { | |
| 555 return("Inconsistent chromosome names in bam file. Check bam header.") | |
| 556 } | |
| 557 | |
| 558 chr.tag | |
| 559 } | |
| 560 | |
| 561 chunkIndex <- function(tot.gene, gcs) { | |
| 562 # Create chunk indices according to total number of genes and chunk size. | |
| 563 # Args: | |
| 564 # tot.gene: total number of genes. | |
| 565 # gcs: gene chunk size. | |
| 566 | |
| 567 nchk <- ceiling(tot.gene / gcs) # number of chunks. | |
| 568 chkidx.list <- vector('list', length=nchk) # chunk indices list. | |
| 569 chk.start <- 1 # chunk start. | |
| 570 i.chk <- idiv(tot.gene, chunkSize=gcs) | |
| 571 for(i in 1:nchk) { | |
| 572 chk.size <- nextElem(i.chk) | |
| 573 chkidx.list[[i]] <- c(chk.start, chk.start + chk.size - 1) | |
| 574 chk.start <- chk.start + chk.size | |
| 575 } | |
| 576 | |
| 577 chkidx.list | |
| 578 } | |
| 579 | |
| 580 covMatrix <- function(debug, chkidx.list, coord, rnaseq.gb, exonmodel, libsize, | |
| 581 spit.dot=T, ...) { | |
| 582 # Function to generate a coverage matrix for all genes. | |
| 583 # Args: | |
| 584 # debug: boolean tag for debugging. | |
| 585 # chkidx.list: list of (start, end) indices for each chunk. | |
| 586 # coord: dataframe of gene coordinates. | |
| 587 # rnaseq.gb: boolean for RNA-seq genebody plot. | |
| 588 # exonmodel: exon model data object. | |
| 589 # libsize: total read count for this bam file. | |
| 590 # spit.dot: boolean to control sptting '.' to consoles. | |
| 591 # Return: normalized coverage matrix for all genes, each row represents a gene. | |
| 592 | |
| 593 | |
| 594 if(!debug) { | |
| 595 # Extract coverage and combine into a matrix. | |
| 596 result.matrix <- foreach(chk=chkidx.list, .combine='rbind', | |
| 597 .multicombine=T) %dopar% { | |
| 598 if(spit.dot) { | |
| 599 cat(".") | |
| 600 } | |
| 601 i <- chk[1]:chk[2] # chunk: start -> end | |
| 602 # If RNA-seq, retrieve exon ranges. | |
| 603 if(rnaseq.gb) { | |
| 604 exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) | |
| 605 } else { | |
| 606 exonranges.list <- NULL | |
| 607 } | |
| 608 doCov(coord[i, ], exonranges.list, ...) | |
| 609 } | |
| 610 | |
| 611 # Floor negative values which are caused by spline. | |
| 612 result.matrix[result.matrix < 0] <- 0 | |
| 613 result.matrix / libsize * 1e6 # normalize to RPM. | |
| 614 | |
| 615 } else { | |
| 616 for(c in 1:length(chkidx.list)) { | |
| 617 chk <- chkidx.list[[c]] | |
| 618 i <- chk[1]:chk[2] # chunk: start -> end | |
| 619 cat(".") | |
| 620 # If RNA-seq, retrieve exon ranges. | |
| 621 if(rnaseq.gb) { | |
| 622 exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) | |
| 623 } else { | |
| 624 exonranges.list <- NULL | |
| 625 } | |
| 626 cov <- doCov(coord[i, ], exonranges.list, ...) | |
| 627 if(c == 1) { | |
| 628 result.matrix <- matrix(0, nrow=nrow(coord), ncol=ncol(cov)) | |
| 629 } | |
| 630 result.matrix[i, ] <- cov | |
| 631 } | |
| 632 # Floor negative values which are caused by spline. | |
| 633 # browser() | |
| 634 result.matrix[result.matrix < 0] <- 0 | |
| 635 result.matrix / libsize * 1e6 # normalize to RPM. | |
| 636 | |
| 637 } | |
| 638 } | |
| 639 | |
| 640 | |
| 641 doCov <- function(coord.mat, exonranges.list, chr.tag, pint, reg2plot, | |
| 642 flanksize, flankfactor, m.pts, f.pts, ...) { | |
| 643 # Extract coverage from bam file into a matrix. According to the parameter | |
| 644 # values, call corresponding functions. | |
| 645 # Args: | |
| 646 # coord.mat: matrix of genomic coordinates to extract coverage. | |
| 647 # exonranges.list: list of IRanges objects, each represents a group of exons. | |
| 648 # pint: boolean of point interval. | |
| 649 # reg2plot: string of region to plot. | |
| 650 # flanksize: flanking region size. | |
| 651 # flankfactor: flanking region factor. | |
| 652 # m.pts: data points for middle interval. | |
| 653 # f.pts: data points for flanking region. | |
| 654 # Return: matrix of interpolated coverage: each row represents a gene; each | |
| 655 # column represents a data point. Coverage from exons are concatenated. | |
| 656 | |
| 657 v.chrom <- coord.mat$chrom | |
| 658 # if(!chr.tag) { | |
| 659 # v.chrom <- sub('chr', '', v.chrom) | |
| 660 # } | |
| 661 v.chrom <- as.factor(v.chrom) | |
| 662 v.strand <- as.factor(coord.mat$strand) | |
| 663 | |
| 664 # Figure out interval region sizes and calculate flanking region sizes. | |
| 665 if(!pint) { # interval regions. | |
| 666 if(!is.null(exonranges.list)) { # RNA-seq | |
| 667 if(flankfactor > 0) { | |
| 668 v.fls <- sapply(exonranges.list, function(t) { | |
| 669 sum(end(t) - start(t) + 1) * flankfactor | |
| 670 }) | |
| 671 } else { | |
| 672 v.fls <- rep(flanksize, length=nrow(coord.mat)) | |
| 673 } | |
| 674 extrCovExons(v.chrom, exonranges.list, v.fls, v.strand, | |
| 675 m.pts, f.pts, ...) | |
| 676 } else { # ChIP-seq with intervals. | |
| 677 v.start <- coord.mat$start | |
| 678 v.end <- coord.mat$end | |
| 679 if(flankfactor > 0) { | |
| 680 v.fls <- round((v.end - v.start + 1) * flankfactor) | |
| 681 } else { | |
| 682 v.fls <- rep(flanksize, length=nrow(coord.mat)) | |
| 683 } | |
| 684 extrCov3Sec(v.chrom, v.start, v.end, v.fls, v.strand, | |
| 685 m.pts, f.pts, ...) | |
| 686 } | |
| 687 } else { # point center with flanking regions. | |
| 688 v.midp <- vector('integer', length=nrow(coord.mat)) | |
| 689 for(r in 1:nrow(coord.mat)) { | |
| 690 if(reg2plot == 'tss' && v.strand[r] == '+' || | |
| 691 reg2plot == 'tes' && v.strand[r] == '-') { | |
| 692 # the left site is center. | |
| 693 v.midp[r] <- coord.mat$start[r] | |
| 694 } else { # this also includes BED supplied point center. | |
| 695 v.midp[r] <- coord.mat$end[r] | |
| 696 } | |
| 697 } | |
| 698 # browser() | |
| 699 extrCovMidp(v.chrom, v.midp, flanksize, v.strand, m.pts + f.pts*2, ...) | |
| 700 } | |
| 701 } |
