comparison lib/coverage.r @ 0:3ca58369469c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
author artbio
date Wed, 06 Dec 2017 19:01:53 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3ca58369469c
1 # Function to check if the range exceeds coverage vector boundaries.
2 checkBound <- function(start, end, range, chrlen){
3 if(end + range > chrlen || start - range < 1)
4 return(FALSE) # out of boundary.
5 else
6 return(TRUE)
7 }
8
9 Bin <- function(v, n) {
10 # Function to bin a coverage vector.
11 # Args:
12 # v: coverage vector.
13 # n: number of bins.
14 # Return: vector of binned values.
15
16 bin.bound <- seq(1, length(v), length.out=n + 1)
17 sapply(1:n, function(i) {
18 mean(v[bin.bound[i]:bin.bound[i + 1]])
19 })
20 }
21
22 SplineRev3Sec <- function(cov.list, v.fls, pts.list, v.strand, algo="spline") {
23 # For 3 sections of continuous coverage, spline each according to specified
24 # data points and return concatenated, interpolated curves.
25 # Args:
26 # cov.list: a list of coverage vectors. Each vector represents a gene.
27 # v.fls: vector of flanking region size.
28 # pts.list: a list of three integers for data points.
29 # v.strand: factor vector of strands.
30 # algo: algorithm used to normalize coverage vectors (spline, bin).
31 # Return: matrix of interpolated coverage: each row represents a gene; each
32 # column represents a data point. Coverage from exons are concatenated.
33 # Notes: the names of cov.list and pts.list must be the same for index purpose.
34 # Suggested: use 'l', 'm', and 'r' for the names.
35
36 # Create an empty coveage matrix first.
37 tot.pts <- sum(unlist(pts.list)) - 2
38 cov.mat <- matrix(nrow=length(cov.list), ncol=tot.pts)
39
40 for(i in 1:length(cov.list)) {
41 left.cov <- head(cov.list[[i]], n=v.fls[i])
42 right.cov <- tail(cov.list[[i]], n=v.fls[i])
43 mid.cov <- window(cov.list[[i]], start=v.fls[i] + 1,
44 width=length(cov.list[[i]]) - 2*v.fls[i])
45 if(algo == "spline") {
46 left.cov <- spline(1:length(left.cov), left.cov, n=pts.list$l)$y
47 right.cov <- spline(1:length(right.cov), right.cov, n=pts.list$r)$y
48 mid.cov <- spline(1:length(mid.cov), mid.cov, n=pts.list$m)$y
49 } else if(algo == "bin") {
50 left.cov <- Bin(left.cov, n=pts.list$l)
51 right.cov <- Bin(right.cov, n=pts.list$r)
52 mid.cov <- Bin(mid.cov, n=pts.list$m)
53 } else {
54 # pass.
55 }
56 # Fuse the two points at the boundary and concatenate.
57 f1 <- (tail(left.cov, n=1) + head(mid.cov, n=1)) / 2
58 f2 <- (tail(mid.cov, n=1) + head(right.cov, n=1)) / 2
59 con.cov <- c(left.cov[-length(left.cov)], f1,
60 mid.cov[-c(1, length(mid.cov))],
61 f2, right.cov[-1])
62 # browser()
63 if(v.strand[i] == '+') {
64 cov.mat[i, ] <- con.cov
65 } else {
66 cov.mat[i, ] <- rev(con.cov)
67 }
68 }
69
70 cov.mat
71 }
72
73 extrCov3Sec <- function(v.chrom, v.start, v.end, v.fls, v.strand, m.pts, f.pts,
74 bufsize, algo, ...) {
75 # Extract and interpolate coverage vectors from genomic regions with 3 sections.
76 # Args:
77 # v.chrom: factor vector of chromosome names.
78 # v.start: integer vector of region start.
79 # v.end: integer vector of region end.
80 # v.fls: integer vector of flanking region size in bps.
81 # v.strand: factor vector of gene strands.
82 # m.pts: data points for middle interval.
83 # f.pts: data points for flanking region.
84 # bufsize: integer; buffers are added to both ends of each region.
85 # algo: algorithm used to normalize coverage vectors.
86 # Return: matrix of interpolated coverage: each row represents a gene; each
87 # column represents a data point.
88
89 interflank.gr <- GRanges(seqnames=v.chrom, ranges=IRanges(
90 start=v.start - v.fls - bufsize,
91 end=v.end + v.fls + bufsize))
92 interflank.cov <- covBamExons(interflank.gr, v.strand, ...)
93
94 # Trim buffers from coverage vectors.
95 interflank.cov <- lapply(interflank.cov, function(v) {
96 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize)
97 })
98
99 # Interpolate and reverse coverage vectors.
100 SplineRev3Sec(interflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts),
101 v.strand, algo)
102 }
103
104 sub3CovList <- function(all.cov, v.left, v.right) {
105 # For a list of coverage vectors, separate them into three lists.
106 # Args:
107 # all.cov: list of coverage vectors.
108 # v.left: integer vector of left flanking size.
109 # v.right: integer vector of right flanking size.
110 # Return: list of lists of coverage vectors. The outer list is named as:
111 # "l", "m", "r".
112
113 left.cov <- foreach(i=icount(length(all.cov))) %do% {
114 head(all.cov[[i]], n=v.left[i])
115 }
116 mid.cov <- foreach(i=icount(length(all.cov))) %do% {
117 len <- length(all.cov[[i]])
118 all.cov[[i]][ (v.left[i] + 1) : (len - v.right[i]) ]
119 }
120 right.cov <- foreach(i=icount(length(all.cov))) %do% {
121 tail(all.cov[[i]], n=v.right[i])
122 }
123 list(l=left.cov, m=mid.cov, r=right.cov)
124 }
125
126 extrCovExons <- function(v.chrom, exonranges.list, v.fls, v.strand,
127 m.pts, f.pts, bufsize, algo, ...) {
128 # Extract coverage vectors for transcripts with exon models.
129 # Args:
130 # v.chrom: factor vector of chromosome names.
131 # exonranges.list: list of IRanges objects for exon coordinates.
132 # v.fls: integer vector of flanking region size in bps.
133 # v.strand: factor vector of gene strands.
134 # m.pts: data points for middle interval.
135 # f.pts: data points for flanking region.
136 # bufsize: integer; buffers are added to both ends of each region.
137 # algo: algorithm used to normalize coverage vectors.
138 # Return: matrix of interpolated coverage: each row represents a gene; each
139 # column represents a data point. Coverage from exons are concatenated.
140
141 # Construct ranges including exon and flanking regions.
142 exonflank.list <- vector('list', length=length(exonranges.list))
143 for(i in 1:length(exonranges.list)) {
144 r.mod <- exonranges.list[[i]] # IRanges object to be modified.
145 n <- length(r.mod)
146 # Add flanking and buffer regions.
147 start(r.mod)[1] <- start(r.mod)[1] - v.fls[i] - bufsize
148 end(r.mod)[n] <- end(r.mod)[n] + v.fls[i] + bufsize
149 exonflank.list[[i]] <- GRanges(seqnames=v.chrom[i], ranges=r.mod)
150 }
151
152 exonflank.cov <- covBamExons(exonflank.list, v.strand, ...)
153
154 # Trim buffers from coverage vectors.
155 exonflank.cov <- lapply(exonflank.cov, function(v) {
156 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize)
157 })
158
159 SplineRev3Sec(exonflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts),
160 v.strand, algo)
161 }
162
163
164 extrCovMidp <- function(v.chrom, v.midp, flanksize, v.strand, pts, bufsize,
165 algo, ...) {
166 # Extract coverage vectors with a middle point and symmetric flanking regions.
167 # Args:
168 # v.chrom: factor vector of chromosome names.
169 # v.midp: integer vector of middle points.
170 # flanksize: integer of flanking region size in bps.
171 # v.strand: factor vector of gene strands.
172 # pts: data points to spline into.
173 # bufsize: integer; buffers are added to both ends of each region.
174 # algo: algorithm used to normalize coverage vectors.
175 # Return: matrix of interpolated coverage: each row represents a gene; each
176 # column represents a data point.
177
178 granges <- GRanges(seqnames=v.chrom, ranges=IRanges(
179 start=v.midp - flanksize - bufsize,
180 end=v.midp + flanksize + bufsize))
181 cov.list <- covBamExons(granges, v.strand, ...)
182
183 # Trim buffers from coverage vectors.
184 cov.list <- lapply(cov.list, function(v) {
185 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize)
186 })
187
188 # Interpolate and reverse coverage vectors and assemble into a matrix.
189 cov.mat <- matrix(nrow=length(cov.list), ncol=pts)
190 for(i in 1:length(cov.list)) {
191 if(is.null(cov.list[[i]])) {
192 cov.mat[i, ] <- vector('integer', length=pts)
193 } else {
194 if(algo == "spline") {
195 cov.mat[i, ] <- spline(1:length(cov.list[[i]]), cov.list[[i]],
196 n=pts)$y
197 } else if(algo == "bin") {
198 cov.mat[i, ] <- Bin(cov.list[[i]], n=pts)
199 } else {
200 # pass.
201 }
202 if(v.strand[i] == '-') {
203 cov.mat[i, ] <- rev(cov.mat[i, ])
204 }
205 }
206 }
207 cov.mat
208 }
209
210 scanBamRevOrder <- function(org.gr, sbp) {
211 # ScanBamParam re-arranges the input genomic ranges. Use range info to
212 # construct a string vector to find the order to reverse it.
213 org.grnames <- with(org.gr, paste(seqnames, start, end, sep=':'))
214 sbw.gr <- as.data.frame(bamWhich(sbp)) # scan-bam-ed
215 if('space' %in% names(sbw.gr)) {
216 sbw.grnames <- with(sbw.gr, paste(space, start, end, sep=':'))
217 } else if('group_name' %in% names(sbw.gr)) {
218 sbw.grnames <- with(sbw.gr, paste(group_name, start, end, sep=':'))
219 } else {
220 stop("Cannot locate chromosome names in extracted short reads. Report
221 this problem using issue tracking or discussion forum.\n")
222 }
223
224 match(org.grnames, sbw.grnames)
225 }
226
227 genZeroList <- function(llen, v.vlen) {
228 # Generate a list of specific length with each element being an Rle vector of
229 # zeros with specific length.
230 # Args:
231 # llen: list length
232 # v.vlen: vector of vector lengths.
233
234 llen <- as.integer(llen)
235 stopifnot(llen > 0)
236 res <- vector('list', length=llen)
237 for(i in 1:llen) {
238 res[[i]] <- Rle(0, v.vlen[i])
239 }
240 res
241 }
242
243 covBamExons <- function(granges.dat, v.strand, bam.file, sn.inbam, fraglen,
244 map.qual=20, bowtie=F,
245 strand.spec=c('both', 'same', 'opposite')) {
246 # Extract coverage vectors from bam file for a list of transcripts of multiple
247 # exons.
248 # Args:
249 # granges.dat: a GRanges object representing a set of genomic ranges or a
250 # list of GRanges objects each representing a set of exonic
251 # ranges.
252 # v.strand: vector of strand info.
253 # bam.file: character string refers to the path of a bam file.
254 # sn.inbam: vector of chromosome names in the bam file.
255 # fraglen: fragment length.
256 # map.qual: mapping quality to filter reads.
257 # bowtie: boolean to indicate whether the aligner was Bowtie-like or not.
258 # strand.spec: string desc. for strand-specific coverage calculation.
259 # Return: list of coverage vectors, each vector represents a transcript.
260
261 strand.spec <- match.arg(strand.spec)
262
263 if(class(granges.dat) == 'list') {
264 # Construct a GRanges object representing DNA sequences.
265 v.seqnames <- sapply(granges.dat,
266 function(x) as.character(seqnames(x)[1]))
267 v.start <- sapply(granges.dat, function(x) start(ranges(x))[1])
268 v.end <- sapply(granges.dat, function(x) tail(end(ranges(x)), n=1))
269 granges.dna <- GRanges(seqnames=v.seqnames,
270 ranges=IRanges(start=v.start, end=v.end))
271 # Obtain mRNA(including flanking) length for each gene.
272 repr.lens <- sapply(granges.dat, function(g) {
273 sum(end(g) - start(g) + 1)
274 })
275 } else {
276 v.seqnames <- as.character(seqnames(granges.dat))
277 v.start <- start(granges.dat)
278 v.end <- end(granges.dat)
279 granges.dna <- granges.dat
280 repr.lens <- v.end - v.start + 1
281 granges.dat <- vector('list', length(granges.dna)) # set null tags.
282 }
283
284 # Filter transcripts whose chromosomes do not match bam file.
285 inbam.mask <- as.character(seqnames(granges.dna)) %in% sn.inbam
286 if(!any(inbam.mask)) { # none matches.
287 return(genZeroList(length(granges.dna), repr.lens))
288 }
289
290 # scanBamWhat: the info that need to be extracted from a bam file.
291 sbw <- c('pos', 'qwidth', 'mapq', 'strand', 'rname',
292 'mrnm', 'mpos', 'isize')
293 sbp <- ScanBamParam(what=sbw, which=granges.dna[inbam.mask],
294 flag=scanBamFlag(isUnmappedQuery=F,
295 isNotPassingQualityControls=F,
296 isDuplicate=F))
297
298 # Scan bam file to retrieve short reads.
299 sr.in.ranges <- tryCatch(scanBam(bam.file, param=sbp), error=function(e) e)
300 if(class(sr.in.ranges)[1] == 'simpleError') {
301 # This is not supposed to happen after those unmatched seqnames are
302 # removed. I keep it for safty.
303 return(genZeroList(length(granges.dna), repr.lens))
304 }
305
306 # Restore the original order.
307 sr.in.ranges <- sr.in.ranges[scanBamRevOrder(
308 as.data.frame(granges.dna[inbam.mask]), sbp)]
309
310 CalcReadsCov <- function(srg, start, end, gr.rna, repr.len, strand) {
311 # Calculate short read coverage for each gene/region.
312 # Args:
313 # srg: extracted short reads in gene.
314 # start: start position of the DNA sequence.
315 # end: end position of the DNA sequence.
316 # gr.rna: GRanges object (multiple ranges) representing exon sequences.
317 # This can be NULL indicating the input ranges are DNAs.
318 # repr.len: DNA or mRNA sequence length.
319 # strand: transcript strand (+/-).
320 # Returns: a coverage vector for the gene.
321
322 # browser()
323 # Special handling for bowtie mapping.
324 if(bowtie) {
325 srg <- within(srg, mapq[is.na(mapq)] <- 254) # within!
326 }
327 # Filter short reads by mapping quality.
328 all.mask <- srg$mapq >= map.qual
329
330 # Subset by strand info.
331 if(strand.spec != 'both') {
332 if(strand.spec == 'same') {
333 s.mask <- srg$strand == as.character(strand)
334 } else {
335 s.mask <- srg$strand != as.character(strand)
336 }
337 all.mask <- all.mask & s.mask
338 }
339
340 # If paired, filter reads that are not properly paired.
341 paired <- all(with(srg, is.na(isize) | isize != 0))
342 if(paired) {
343 p.mask <- with(srg, rname == mrnm & xor(strand == '+', isize < 0))
344 all.mask <- all.mask & p.mask
345 }
346
347 # Apply all the filters on short reads.
348 srg <- lapply(srg, `[`, which(all.mask))
349
350 # Calculate coverage.
351 if(length(srg[[1]]) > 0) {
352 if(paired) {
353 cov.pos <- with(srg, ifelse(isize < 0, mpos, pos))
354 cov.wd <- abs(srg$isize)
355 } else {
356 # Adjust negative read positions for physical coverage.
357 cov.pos <- with(srg, ifelse(strand == '-',
358 pos - fraglen + qwidth, pos))
359 cov.wd <- fraglen
360 }
361 # Shift reads by subtracting start positions.
362 cov.pos <- cov.pos - start + 1
363 # Calculate physical coverage on the whole genebody.
364 covg <- coverage(IRanges(start=cov.pos, width=cov.wd),
365 width=end - start + 1, method='sort')
366
367 if(!is.null(gr.rna)) { # RNA-seq.
368 # Shift exonic ranges by subtracting start positions.
369 # BE careful with negative start positions! Need to adjust end
370 # positions first(or the GRanges lib will emit errors if
371 # start > end).
372 # Negative start positions happen when flanking region exceeds
373 # the chromosomal start.
374 if(start > 0) {
375 start(gr.rna) <- start(gr.rna) - start + 1
376 end(gr.rna) <- end(gr.rna) - start + 1
377 } else {
378 end(gr.rna) <- end(gr.rna) - start + 1
379 start(gr.rna) <- start(gr.rna) - start + 1
380 }
381 # Concatenate all exon coverages.
382 covg[ranges(gr.rna)]
383 } else { # ChIP-seq.
384 covg
385 }
386 } else {
387 Rle(0, repr.len)
388 }
389 }
390
391 covg.allgenes <- mapply(CalcReadsCov, srg=sr.in.ranges,
392 start=v.start, end=v.end, gr.rna=granges.dat,
393 repr.len=repr.lens, strand=v.strand, SIMPLIFY=F)
394 }
395
396 bamFileList <- function(ctg.tbl) {
397 # Determine the bam files involved in the configuration and whether it is a
398 # bam file pair setup.
399 # Args:
400 # ctg.tbl: coverage-genelist-title table.
401
402 cov.uniq <- unique(ctg.tbl$cov)
403 cov.list <- strsplit(cov.uniq, ':')
404 v.nbam <- sapply(cov.list, length)
405 v.bbp <- v.nbam == 2
406 if(all(v.bbp)) {
407 bbp <- T
408 } else if(all(!v.bbp)) {
409 bbp <- F
410 } else {
411 stop("No mix of bam and bam-pair allowed in configuration.\n")
412 }
413
414 list(bbp=bbp, bam.list=unique(unlist(cov.list)))
415 }
416
417 estiMapqStyle <- function(bam.file){
418 # Estimate the mapping quality style. Return TRUE if it is SAM standard.
419 # Sample 1000 mapped reads from bam file, and if the mapq of reads
420 # over half are NA, then return FALSE, because it is quite possible that
421 # the aligner using coding style as bowtie, 255 as highest score.
422 # Args:
423 # bam.file: bam file to be sampled.
424
425 sbw <- c('pos', 'qwidth', 'mapq', 'strand')
426 sbp <- ScanBamParam(what=sbw, flag=scanBamFlag(
427 isUnmappedQuery=F, isDuplicate=F))
428 samp <- BamSampler(bam.file, yieldSize=500)
429 samp.reads <- scanBam(samp, param=sbp)[[1]]
430 samp.len <- length(samp.reads[["mapq"]])
431 mapq.255 <- sum(is.na(samp.reads[["mapq"]]))
432 if(mapq.255/samp.len >= 0.5){
433 return(FALSE)
434 }else{
435 return(TRUE)
436 }
437 }
438
439 headerIndexBam <- function(bam.list) {
440 # Read bam header to determine mapping method.
441 # Index bam files if not done yet.
442 # Args:
443 # ctg.tbl: coverage-genelist-title table.
444
445 v.map.bowtie <- vector('logical', length=length(bam.list))
446 for(i in 1:length(bam.list)) {
447 bam.file <- bam.list[i]
448
449 # Index bam file.
450 if(!file.exists(paste(bam.file, ".bai", sep=""))) {
451 indexBam(bam.file)
452 }
453
454 # Derive mapping program.
455 header <- scanBamHeader(bam.file)
456 map.prog <- try(strsplit(header[[1]]$text$'@PG'[[1]], ':')[[1]][2],
457 silent=T)
458 if(class(map.prog) != "try-error") {
459 map.style <- grepl('tophat|bowtie|bedtools|star', map.prog,
460 ignore.case=T)
461 if(map.style){
462 v.map.bowtie[i] <- TRUE
463 next
464 }
465 map.style <- grepl('bwa|casava|gem', map.prog, ignore.case=T)
466 if(map.style) {
467 v.map.bowtie[i] <- FALSE
468 next
469 }
470 # if(estiMapqStyle(bam.file)){
471 # warning(sprintf("Aligner for: %s cannot be determined. Style of
472 # standard SAM mapping score will be used.", bam.file))
473 # v.map.bowtie[i] <- FALSE
474 # }else{
475 # warning(sprintf("Aligner for: %s cannot be determined. Style of
476 # Bowtie-like SAM mapping score will be used. Would you mind to tell us what
477 # aligner you are using?", bam.file))
478 # v.map.bowtie[i] <- TRUE
479 # }
480 }
481 # else {
482 # cat("\n")
483 # if(estiMapqStyle(bam.file)){
484 # warning(sprintf("Aligner for: %s cannot be determined. Style of
485 # standard SAM mapping score will be used.", bam.file))
486 # v.map.bowtie[i] <- FALSE
487 # }else{
488 # warning(sprintf("Aligner for: %s cannot be determined. Style of
489 # Bowtie-like SAM mapping score will be used.", bam.file))
490 # v.map.bowtie[i] <- TRUE
491 # }
492 # }
493 warning(sprintf("Aligner for: %s cannot be determined. Style of
494 standard SAM mapping score will be used. Would you mind submitting an issue
495 report to us on Github? This will benefit people using the same aligner.",
496 bam.file))
497 v.map.bowtie[i] <- FALSE
498 }
499 names(v.map.bowtie) <- bam.list
500
501 v.map.bowtie
502 }
503
504 libSizeBam <- function(bam.list) {
505 # Obtain library sizes by counting qualified bam records.
506 # Args:
507 # ctg.tbl: coverage-genelist-title table.
508
509 # Count only reads that are mapped, primary, passed quality control and
510 # un-duplicated.
511 sbp <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=F, isSecondaryAlignment=F,
512 isNotPassingQualityControls=F, isDuplicate=F))
513 v.lib.size <- vector('integer', length=length(bam.list))
514 for(i in 1:length(bam.list)) {
515 bfn <- bam.list[i] # bam file name.
516 cfn <- paste(basename(bfn), '.cnt', sep='') # count file name.
517 if(file.exists(cfn)) {
518 v.lib.size[i] <- as.integer(readLines(cfn, n=1))
519 } else {
520 cnt.bam <- countBam(bfn, param=sbp)
521 v.lib.size[i] <- cnt.bam$records
522 writeLines(as.character(v.lib.size[i]), cfn)
523 }
524 names(v.lib.size)[i] <- bfn
525 }
526
527 v.lib.size
528 }
529
530 seqnamesBam <- function(bam.list) {
531 # Obtain chromosome names for each bam file. This list must be used to filter
532 # genomic regions before scanBam or it terminates immaturely.
533 # ctg.tbl: coverage-genelist-title table.
534
535 # browser()
536 sn.list <- lapply(scanBamHeader(bam.list), function(h) {
537 names(h$targets)
538 })
539 names(sn.list) <- bam.list
540
541 sn.list
542 }
543
544 chrTag <- function(sn.inbam) {
545 # Check whether the chromosome name format in the bam file contains 'chr' or not.
546 # Args:
547 # sn.inbam: seqnames in the bam file.
548
549 n.chr <- length(grep('^chr', sn.inbam))
550 if(n.chr == 0) {
551 chr.tag <- F
552 } else if(n.chr == length(sn.inbam)) {
553 chr.tag <- T
554 } else {
555 return("Inconsistent chromosome names in bam file. Check bam header.")
556 }
557
558 chr.tag
559 }
560
561 chunkIndex <- function(tot.gene, gcs) {
562 # Create chunk indices according to total number of genes and chunk size.
563 # Args:
564 # tot.gene: total number of genes.
565 # gcs: gene chunk size.
566
567 nchk <- ceiling(tot.gene / gcs) # number of chunks.
568 chkidx.list <- vector('list', length=nchk) # chunk indices list.
569 chk.start <- 1 # chunk start.
570 i.chk <- idiv(tot.gene, chunkSize=gcs)
571 for(i in 1:nchk) {
572 chk.size <- nextElem(i.chk)
573 chkidx.list[[i]] <- c(chk.start, chk.start + chk.size - 1)
574 chk.start <- chk.start + chk.size
575 }
576
577 chkidx.list
578 }
579
580 covMatrix <- function(debug, chkidx.list, coord, rnaseq.gb, exonmodel, libsize,
581 spit.dot=T, ...) {
582 # Function to generate a coverage matrix for all genes.
583 # Args:
584 # debug: boolean tag for debugging.
585 # chkidx.list: list of (start, end) indices for each chunk.
586 # coord: dataframe of gene coordinates.
587 # rnaseq.gb: boolean for RNA-seq genebody plot.
588 # exonmodel: exon model data object.
589 # libsize: total read count for this bam file.
590 # spit.dot: boolean to control sptting '.' to consoles.
591 # Return: normalized coverage matrix for all genes, each row represents a gene.
592
593
594 if(!debug) {
595 # Extract coverage and combine into a matrix.
596 result.matrix <- foreach(chk=chkidx.list, .combine='rbind',
597 .multicombine=T) %dopar% {
598 if(spit.dot) {
599 cat(".")
600 }
601 i <- chk[1]:chk[2] # chunk: start -> end
602 # If RNA-seq, retrieve exon ranges.
603 if(rnaseq.gb) {
604 exonranges.list <- unlist(exonmodel[coord[i, ]$tid])
605 } else {
606 exonranges.list <- NULL
607 }
608 doCov(coord[i, ], exonranges.list, ...)
609 }
610
611 # Floor negative values which are caused by spline.
612 result.matrix[result.matrix < 0] <- 0
613 result.matrix / libsize * 1e6 # normalize to RPM.
614
615 } else {
616 for(c in 1:length(chkidx.list)) {
617 chk <- chkidx.list[[c]]
618 i <- chk[1]:chk[2] # chunk: start -> end
619 cat(".")
620 # If RNA-seq, retrieve exon ranges.
621 if(rnaseq.gb) {
622 exonranges.list <- unlist(exonmodel[coord[i, ]$tid])
623 } else {
624 exonranges.list <- NULL
625 }
626 cov <- doCov(coord[i, ], exonranges.list, ...)
627 if(c == 1) {
628 result.matrix <- matrix(0, nrow=nrow(coord), ncol=ncol(cov))
629 }
630 result.matrix[i, ] <- cov
631 }
632 # Floor negative values which are caused by spline.
633 # browser()
634 result.matrix[result.matrix < 0] <- 0
635 result.matrix / libsize * 1e6 # normalize to RPM.
636
637 }
638 }
639
640
641 doCov <- function(coord.mat, exonranges.list, chr.tag, pint, reg2plot,
642 flanksize, flankfactor, m.pts, f.pts, ...) {
643 # Extract coverage from bam file into a matrix. According to the parameter
644 # values, call corresponding functions.
645 # Args:
646 # coord.mat: matrix of genomic coordinates to extract coverage.
647 # exonranges.list: list of IRanges objects, each represents a group of exons.
648 # pint: boolean of point interval.
649 # reg2plot: string of region to plot.
650 # flanksize: flanking region size.
651 # flankfactor: flanking region factor.
652 # m.pts: data points for middle interval.
653 # f.pts: data points for flanking region.
654 # Return: matrix of interpolated coverage: each row represents a gene; each
655 # column represents a data point. Coverage from exons are concatenated.
656
657 v.chrom <- coord.mat$chrom
658 # if(!chr.tag) {
659 # v.chrom <- sub('chr', '', v.chrom)
660 # }
661 v.chrom <- as.factor(v.chrom)
662 v.strand <- as.factor(coord.mat$strand)
663
664 # Figure out interval region sizes and calculate flanking region sizes.
665 if(!pint) { # interval regions.
666 if(!is.null(exonranges.list)) { # RNA-seq
667 if(flankfactor > 0) {
668 v.fls <- sapply(exonranges.list, function(t) {
669 sum(end(t) - start(t) + 1) * flankfactor
670 })
671 } else {
672 v.fls <- rep(flanksize, length=nrow(coord.mat))
673 }
674 extrCovExons(v.chrom, exonranges.list, v.fls, v.strand,
675 m.pts, f.pts, ...)
676 } else { # ChIP-seq with intervals.
677 v.start <- coord.mat$start
678 v.end <- coord.mat$end
679 if(flankfactor > 0) {
680 v.fls <- round((v.end - v.start + 1) * flankfactor)
681 } else {
682 v.fls <- rep(flanksize, length=nrow(coord.mat))
683 }
684 extrCov3Sec(v.chrom, v.start, v.end, v.fls, v.strand,
685 m.pts, f.pts, ...)
686 }
687 } else { # point center with flanking regions.
688 v.midp <- vector('integer', length=nrow(coord.mat))
689 for(r in 1:nrow(coord.mat)) {
690 if(reg2plot == 'tss' && v.strand[r] == '+' ||
691 reg2plot == 'tes' && v.strand[r] == '-') {
692 # the left site is center.
693 v.midp[r] <- coord.mat$start[r]
694 } else { # this also includes BED supplied point center.
695 v.midp[r] <- coord.mat$end[r]
696 }
697 }
698 # browser()
699 extrCovMidp(v.chrom, v.midp, flanksize, v.strand, m.pts + f.pts*2, ...)
700 }
701 }