Mercurial > repos > artbio > ngsplot
comparison lib/coverage.r @ 0:3ca58369469c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit b'e9fcc157a7f2f2fa9d6ac9a58d425ff17c975f5c\n'
author | artbio |
---|---|
date | Wed, 06 Dec 2017 19:01:53 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3ca58369469c |
---|---|
1 # Function to check if the range exceeds coverage vector boundaries. | |
2 checkBound <- function(start, end, range, chrlen){ | |
3 if(end + range > chrlen || start - range < 1) | |
4 return(FALSE) # out of boundary. | |
5 else | |
6 return(TRUE) | |
7 } | |
8 | |
9 Bin <- function(v, n) { | |
10 # Function to bin a coverage vector. | |
11 # Args: | |
12 # v: coverage vector. | |
13 # n: number of bins. | |
14 # Return: vector of binned values. | |
15 | |
16 bin.bound <- seq(1, length(v), length.out=n + 1) | |
17 sapply(1:n, function(i) { | |
18 mean(v[bin.bound[i]:bin.bound[i + 1]]) | |
19 }) | |
20 } | |
21 | |
22 SplineRev3Sec <- function(cov.list, v.fls, pts.list, v.strand, algo="spline") { | |
23 # For 3 sections of continuous coverage, spline each according to specified | |
24 # data points and return concatenated, interpolated curves. | |
25 # Args: | |
26 # cov.list: a list of coverage vectors. Each vector represents a gene. | |
27 # v.fls: vector of flanking region size. | |
28 # pts.list: a list of three integers for data points. | |
29 # v.strand: factor vector of strands. | |
30 # algo: algorithm used to normalize coverage vectors (spline, bin). | |
31 # Return: matrix of interpolated coverage: each row represents a gene; each | |
32 # column represents a data point. Coverage from exons are concatenated. | |
33 # Notes: the names of cov.list and pts.list must be the same for index purpose. | |
34 # Suggested: use 'l', 'm', and 'r' for the names. | |
35 | |
36 # Create an empty coveage matrix first. | |
37 tot.pts <- sum(unlist(pts.list)) - 2 | |
38 cov.mat <- matrix(nrow=length(cov.list), ncol=tot.pts) | |
39 | |
40 for(i in 1:length(cov.list)) { | |
41 left.cov <- head(cov.list[[i]], n=v.fls[i]) | |
42 right.cov <- tail(cov.list[[i]], n=v.fls[i]) | |
43 mid.cov <- window(cov.list[[i]], start=v.fls[i] + 1, | |
44 width=length(cov.list[[i]]) - 2*v.fls[i]) | |
45 if(algo == "spline") { | |
46 left.cov <- spline(1:length(left.cov), left.cov, n=pts.list$l)$y | |
47 right.cov <- spline(1:length(right.cov), right.cov, n=pts.list$r)$y | |
48 mid.cov <- spline(1:length(mid.cov), mid.cov, n=pts.list$m)$y | |
49 } else if(algo == "bin") { | |
50 left.cov <- Bin(left.cov, n=pts.list$l) | |
51 right.cov <- Bin(right.cov, n=pts.list$r) | |
52 mid.cov <- Bin(mid.cov, n=pts.list$m) | |
53 } else { | |
54 # pass. | |
55 } | |
56 # Fuse the two points at the boundary and concatenate. | |
57 f1 <- (tail(left.cov, n=1) + head(mid.cov, n=1)) / 2 | |
58 f2 <- (tail(mid.cov, n=1) + head(right.cov, n=1)) / 2 | |
59 con.cov <- c(left.cov[-length(left.cov)], f1, | |
60 mid.cov[-c(1, length(mid.cov))], | |
61 f2, right.cov[-1]) | |
62 # browser() | |
63 if(v.strand[i] == '+') { | |
64 cov.mat[i, ] <- con.cov | |
65 } else { | |
66 cov.mat[i, ] <- rev(con.cov) | |
67 } | |
68 } | |
69 | |
70 cov.mat | |
71 } | |
72 | |
73 extrCov3Sec <- function(v.chrom, v.start, v.end, v.fls, v.strand, m.pts, f.pts, | |
74 bufsize, algo, ...) { | |
75 # Extract and interpolate coverage vectors from genomic regions with 3 sections. | |
76 # Args: | |
77 # v.chrom: factor vector of chromosome names. | |
78 # v.start: integer vector of region start. | |
79 # v.end: integer vector of region end. | |
80 # v.fls: integer vector of flanking region size in bps. | |
81 # v.strand: factor vector of gene strands. | |
82 # m.pts: data points for middle interval. | |
83 # f.pts: data points for flanking region. | |
84 # bufsize: integer; buffers are added to both ends of each region. | |
85 # algo: algorithm used to normalize coverage vectors. | |
86 # Return: matrix of interpolated coverage: each row represents a gene; each | |
87 # column represents a data point. | |
88 | |
89 interflank.gr <- GRanges(seqnames=v.chrom, ranges=IRanges( | |
90 start=v.start - v.fls - bufsize, | |
91 end=v.end + v.fls + bufsize)) | |
92 interflank.cov <- covBamExons(interflank.gr, v.strand, ...) | |
93 | |
94 # Trim buffers from coverage vectors. | |
95 interflank.cov <- lapply(interflank.cov, function(v) { | |
96 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
97 }) | |
98 | |
99 # Interpolate and reverse coverage vectors. | |
100 SplineRev3Sec(interflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), | |
101 v.strand, algo) | |
102 } | |
103 | |
104 sub3CovList <- function(all.cov, v.left, v.right) { | |
105 # For a list of coverage vectors, separate them into three lists. | |
106 # Args: | |
107 # all.cov: list of coverage vectors. | |
108 # v.left: integer vector of left flanking size. | |
109 # v.right: integer vector of right flanking size. | |
110 # Return: list of lists of coverage vectors. The outer list is named as: | |
111 # "l", "m", "r". | |
112 | |
113 left.cov <- foreach(i=icount(length(all.cov))) %do% { | |
114 head(all.cov[[i]], n=v.left[i]) | |
115 } | |
116 mid.cov <- foreach(i=icount(length(all.cov))) %do% { | |
117 len <- length(all.cov[[i]]) | |
118 all.cov[[i]][ (v.left[i] + 1) : (len - v.right[i]) ] | |
119 } | |
120 right.cov <- foreach(i=icount(length(all.cov))) %do% { | |
121 tail(all.cov[[i]], n=v.right[i]) | |
122 } | |
123 list(l=left.cov, m=mid.cov, r=right.cov) | |
124 } | |
125 | |
126 extrCovExons <- function(v.chrom, exonranges.list, v.fls, v.strand, | |
127 m.pts, f.pts, bufsize, algo, ...) { | |
128 # Extract coverage vectors for transcripts with exon models. | |
129 # Args: | |
130 # v.chrom: factor vector of chromosome names. | |
131 # exonranges.list: list of IRanges objects for exon coordinates. | |
132 # v.fls: integer vector of flanking region size in bps. | |
133 # v.strand: factor vector of gene strands. | |
134 # m.pts: data points for middle interval. | |
135 # f.pts: data points for flanking region. | |
136 # bufsize: integer; buffers are added to both ends of each region. | |
137 # algo: algorithm used to normalize coverage vectors. | |
138 # Return: matrix of interpolated coverage: each row represents a gene; each | |
139 # column represents a data point. Coverage from exons are concatenated. | |
140 | |
141 # Construct ranges including exon and flanking regions. | |
142 exonflank.list <- vector('list', length=length(exonranges.list)) | |
143 for(i in 1:length(exonranges.list)) { | |
144 r.mod <- exonranges.list[[i]] # IRanges object to be modified. | |
145 n <- length(r.mod) | |
146 # Add flanking and buffer regions. | |
147 start(r.mod)[1] <- start(r.mod)[1] - v.fls[i] - bufsize | |
148 end(r.mod)[n] <- end(r.mod)[n] + v.fls[i] + bufsize | |
149 exonflank.list[[i]] <- GRanges(seqnames=v.chrom[i], ranges=r.mod) | |
150 } | |
151 | |
152 exonflank.cov <- covBamExons(exonflank.list, v.strand, ...) | |
153 | |
154 # Trim buffers from coverage vectors. | |
155 exonflank.cov <- lapply(exonflank.cov, function(v) { | |
156 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
157 }) | |
158 | |
159 SplineRev3Sec(exonflank.cov, v.fls, list(l=f.pts, m=m.pts, r=f.pts), | |
160 v.strand, algo) | |
161 } | |
162 | |
163 | |
164 extrCovMidp <- function(v.chrom, v.midp, flanksize, v.strand, pts, bufsize, | |
165 algo, ...) { | |
166 # Extract coverage vectors with a middle point and symmetric flanking regions. | |
167 # Args: | |
168 # v.chrom: factor vector of chromosome names. | |
169 # v.midp: integer vector of middle points. | |
170 # flanksize: integer of flanking region size in bps. | |
171 # v.strand: factor vector of gene strands. | |
172 # pts: data points to spline into. | |
173 # bufsize: integer; buffers are added to both ends of each region. | |
174 # algo: algorithm used to normalize coverage vectors. | |
175 # Return: matrix of interpolated coverage: each row represents a gene; each | |
176 # column represents a data point. | |
177 | |
178 granges <- GRanges(seqnames=v.chrom, ranges=IRanges( | |
179 start=v.midp - flanksize - bufsize, | |
180 end=v.midp + flanksize + bufsize)) | |
181 cov.list <- covBamExons(granges, v.strand, ...) | |
182 | |
183 # Trim buffers from coverage vectors. | |
184 cov.list <- lapply(cov.list, function(v) { | |
185 window(v, start=bufsize + 1, width=length(v) - 2 * bufsize) | |
186 }) | |
187 | |
188 # Interpolate and reverse coverage vectors and assemble into a matrix. | |
189 cov.mat <- matrix(nrow=length(cov.list), ncol=pts) | |
190 for(i in 1:length(cov.list)) { | |
191 if(is.null(cov.list[[i]])) { | |
192 cov.mat[i, ] <- vector('integer', length=pts) | |
193 } else { | |
194 if(algo == "spline") { | |
195 cov.mat[i, ] <- spline(1:length(cov.list[[i]]), cov.list[[i]], | |
196 n=pts)$y | |
197 } else if(algo == "bin") { | |
198 cov.mat[i, ] <- Bin(cov.list[[i]], n=pts) | |
199 } else { | |
200 # pass. | |
201 } | |
202 if(v.strand[i] == '-') { | |
203 cov.mat[i, ] <- rev(cov.mat[i, ]) | |
204 } | |
205 } | |
206 } | |
207 cov.mat | |
208 } | |
209 | |
210 scanBamRevOrder <- function(org.gr, sbp) { | |
211 # ScanBamParam re-arranges the input genomic ranges. Use range info to | |
212 # construct a string vector to find the order to reverse it. | |
213 org.grnames <- with(org.gr, paste(seqnames, start, end, sep=':')) | |
214 sbw.gr <- as.data.frame(bamWhich(sbp)) # scan-bam-ed | |
215 if('space' %in% names(sbw.gr)) { | |
216 sbw.grnames <- with(sbw.gr, paste(space, start, end, sep=':')) | |
217 } else if('group_name' %in% names(sbw.gr)) { | |
218 sbw.grnames <- with(sbw.gr, paste(group_name, start, end, sep=':')) | |
219 } else { | |
220 stop("Cannot locate chromosome names in extracted short reads. Report | |
221 this problem using issue tracking or discussion forum.\n") | |
222 } | |
223 | |
224 match(org.grnames, sbw.grnames) | |
225 } | |
226 | |
227 genZeroList <- function(llen, v.vlen) { | |
228 # Generate a list of specific length with each element being an Rle vector of | |
229 # zeros with specific length. | |
230 # Args: | |
231 # llen: list length | |
232 # v.vlen: vector of vector lengths. | |
233 | |
234 llen <- as.integer(llen) | |
235 stopifnot(llen > 0) | |
236 res <- vector('list', length=llen) | |
237 for(i in 1:llen) { | |
238 res[[i]] <- Rle(0, v.vlen[i]) | |
239 } | |
240 res | |
241 } | |
242 | |
243 covBamExons <- function(granges.dat, v.strand, bam.file, sn.inbam, fraglen, | |
244 map.qual=20, bowtie=F, | |
245 strand.spec=c('both', 'same', 'opposite')) { | |
246 # Extract coverage vectors from bam file for a list of transcripts of multiple | |
247 # exons. | |
248 # Args: | |
249 # granges.dat: a GRanges object representing a set of genomic ranges or a | |
250 # list of GRanges objects each representing a set of exonic | |
251 # ranges. | |
252 # v.strand: vector of strand info. | |
253 # bam.file: character string refers to the path of a bam file. | |
254 # sn.inbam: vector of chromosome names in the bam file. | |
255 # fraglen: fragment length. | |
256 # map.qual: mapping quality to filter reads. | |
257 # bowtie: boolean to indicate whether the aligner was Bowtie-like or not. | |
258 # strand.spec: string desc. for strand-specific coverage calculation. | |
259 # Return: list of coverage vectors, each vector represents a transcript. | |
260 | |
261 strand.spec <- match.arg(strand.spec) | |
262 | |
263 if(class(granges.dat) == 'list') { | |
264 # Construct a GRanges object representing DNA sequences. | |
265 v.seqnames <- sapply(granges.dat, | |
266 function(x) as.character(seqnames(x)[1])) | |
267 v.start <- sapply(granges.dat, function(x) start(ranges(x))[1]) | |
268 v.end <- sapply(granges.dat, function(x) tail(end(ranges(x)), n=1)) | |
269 granges.dna <- GRanges(seqnames=v.seqnames, | |
270 ranges=IRanges(start=v.start, end=v.end)) | |
271 # Obtain mRNA(including flanking) length for each gene. | |
272 repr.lens <- sapply(granges.dat, function(g) { | |
273 sum(end(g) - start(g) + 1) | |
274 }) | |
275 } else { | |
276 v.seqnames <- as.character(seqnames(granges.dat)) | |
277 v.start <- start(granges.dat) | |
278 v.end <- end(granges.dat) | |
279 granges.dna <- granges.dat | |
280 repr.lens <- v.end - v.start + 1 | |
281 granges.dat <- vector('list', length(granges.dna)) # set null tags. | |
282 } | |
283 | |
284 # Filter transcripts whose chromosomes do not match bam file. | |
285 inbam.mask <- as.character(seqnames(granges.dna)) %in% sn.inbam | |
286 if(!any(inbam.mask)) { # none matches. | |
287 return(genZeroList(length(granges.dna), repr.lens)) | |
288 } | |
289 | |
290 # scanBamWhat: the info that need to be extracted from a bam file. | |
291 sbw <- c('pos', 'qwidth', 'mapq', 'strand', 'rname', | |
292 'mrnm', 'mpos', 'isize') | |
293 sbp <- ScanBamParam(what=sbw, which=granges.dna[inbam.mask], | |
294 flag=scanBamFlag(isUnmappedQuery=F, | |
295 isNotPassingQualityControls=F, | |
296 isDuplicate=F)) | |
297 | |
298 # Scan bam file to retrieve short reads. | |
299 sr.in.ranges <- tryCatch(scanBam(bam.file, param=sbp), error=function(e) e) | |
300 if(class(sr.in.ranges)[1] == 'simpleError') { | |
301 # This is not supposed to happen after those unmatched seqnames are | |
302 # removed. I keep it for safty. | |
303 return(genZeroList(length(granges.dna), repr.lens)) | |
304 } | |
305 | |
306 # Restore the original order. | |
307 sr.in.ranges <- sr.in.ranges[scanBamRevOrder( | |
308 as.data.frame(granges.dna[inbam.mask]), sbp)] | |
309 | |
310 CalcReadsCov <- function(srg, start, end, gr.rna, repr.len, strand) { | |
311 # Calculate short read coverage for each gene/region. | |
312 # Args: | |
313 # srg: extracted short reads in gene. | |
314 # start: start position of the DNA sequence. | |
315 # end: end position of the DNA sequence. | |
316 # gr.rna: GRanges object (multiple ranges) representing exon sequences. | |
317 # This can be NULL indicating the input ranges are DNAs. | |
318 # repr.len: DNA or mRNA sequence length. | |
319 # strand: transcript strand (+/-). | |
320 # Returns: a coverage vector for the gene. | |
321 | |
322 # browser() | |
323 # Special handling for bowtie mapping. | |
324 if(bowtie) { | |
325 srg <- within(srg, mapq[is.na(mapq)] <- 254) # within! | |
326 } | |
327 # Filter short reads by mapping quality. | |
328 all.mask <- srg$mapq >= map.qual | |
329 | |
330 # Subset by strand info. | |
331 if(strand.spec != 'both') { | |
332 if(strand.spec == 'same') { | |
333 s.mask <- srg$strand == as.character(strand) | |
334 } else { | |
335 s.mask <- srg$strand != as.character(strand) | |
336 } | |
337 all.mask <- all.mask & s.mask | |
338 } | |
339 | |
340 # If paired, filter reads that are not properly paired. | |
341 paired <- all(with(srg, is.na(isize) | isize != 0)) | |
342 if(paired) { | |
343 p.mask <- with(srg, rname == mrnm & xor(strand == '+', isize < 0)) | |
344 all.mask <- all.mask & p.mask | |
345 } | |
346 | |
347 # Apply all the filters on short reads. | |
348 srg <- lapply(srg, `[`, which(all.mask)) | |
349 | |
350 # Calculate coverage. | |
351 if(length(srg[[1]]) > 0) { | |
352 if(paired) { | |
353 cov.pos <- with(srg, ifelse(isize < 0, mpos, pos)) | |
354 cov.wd <- abs(srg$isize) | |
355 } else { | |
356 # Adjust negative read positions for physical coverage. | |
357 cov.pos <- with(srg, ifelse(strand == '-', | |
358 pos - fraglen + qwidth, pos)) | |
359 cov.wd <- fraglen | |
360 } | |
361 # Shift reads by subtracting start positions. | |
362 cov.pos <- cov.pos - start + 1 | |
363 # Calculate physical coverage on the whole genebody. | |
364 covg <- coverage(IRanges(start=cov.pos, width=cov.wd), | |
365 width=end - start + 1, method='sort') | |
366 | |
367 if(!is.null(gr.rna)) { # RNA-seq. | |
368 # Shift exonic ranges by subtracting start positions. | |
369 # BE careful with negative start positions! Need to adjust end | |
370 # positions first(or the GRanges lib will emit errors if | |
371 # start > end). | |
372 # Negative start positions happen when flanking region exceeds | |
373 # the chromosomal start. | |
374 if(start > 0) { | |
375 start(gr.rna) <- start(gr.rna) - start + 1 | |
376 end(gr.rna) <- end(gr.rna) - start + 1 | |
377 } else { | |
378 end(gr.rna) <- end(gr.rna) - start + 1 | |
379 start(gr.rna) <- start(gr.rna) - start + 1 | |
380 } | |
381 # Concatenate all exon coverages. | |
382 covg[ranges(gr.rna)] | |
383 } else { # ChIP-seq. | |
384 covg | |
385 } | |
386 } else { | |
387 Rle(0, repr.len) | |
388 } | |
389 } | |
390 | |
391 covg.allgenes <- mapply(CalcReadsCov, srg=sr.in.ranges, | |
392 start=v.start, end=v.end, gr.rna=granges.dat, | |
393 repr.len=repr.lens, strand=v.strand, SIMPLIFY=F) | |
394 } | |
395 | |
396 bamFileList <- function(ctg.tbl) { | |
397 # Determine the bam files involved in the configuration and whether it is a | |
398 # bam file pair setup. | |
399 # Args: | |
400 # ctg.tbl: coverage-genelist-title table. | |
401 | |
402 cov.uniq <- unique(ctg.tbl$cov) | |
403 cov.list <- strsplit(cov.uniq, ':') | |
404 v.nbam <- sapply(cov.list, length) | |
405 v.bbp <- v.nbam == 2 | |
406 if(all(v.bbp)) { | |
407 bbp <- T | |
408 } else if(all(!v.bbp)) { | |
409 bbp <- F | |
410 } else { | |
411 stop("No mix of bam and bam-pair allowed in configuration.\n") | |
412 } | |
413 | |
414 list(bbp=bbp, bam.list=unique(unlist(cov.list))) | |
415 } | |
416 | |
417 estiMapqStyle <- function(bam.file){ | |
418 # Estimate the mapping quality style. Return TRUE if it is SAM standard. | |
419 # Sample 1000 mapped reads from bam file, and if the mapq of reads | |
420 # over half are NA, then return FALSE, because it is quite possible that | |
421 # the aligner using coding style as bowtie, 255 as highest score. | |
422 # Args: | |
423 # bam.file: bam file to be sampled. | |
424 | |
425 sbw <- c('pos', 'qwidth', 'mapq', 'strand') | |
426 sbp <- ScanBamParam(what=sbw, flag=scanBamFlag( | |
427 isUnmappedQuery=F, isDuplicate=F)) | |
428 samp <- BamSampler(bam.file, yieldSize=500) | |
429 samp.reads <- scanBam(samp, param=sbp)[[1]] | |
430 samp.len <- length(samp.reads[["mapq"]]) | |
431 mapq.255 <- sum(is.na(samp.reads[["mapq"]])) | |
432 if(mapq.255/samp.len >= 0.5){ | |
433 return(FALSE) | |
434 }else{ | |
435 return(TRUE) | |
436 } | |
437 } | |
438 | |
439 headerIndexBam <- function(bam.list) { | |
440 # Read bam header to determine mapping method. | |
441 # Index bam files if not done yet. | |
442 # Args: | |
443 # ctg.tbl: coverage-genelist-title table. | |
444 | |
445 v.map.bowtie <- vector('logical', length=length(bam.list)) | |
446 for(i in 1:length(bam.list)) { | |
447 bam.file <- bam.list[i] | |
448 | |
449 # Index bam file. | |
450 if(!file.exists(paste(bam.file, ".bai", sep=""))) { | |
451 indexBam(bam.file) | |
452 } | |
453 | |
454 # Derive mapping program. | |
455 header <- scanBamHeader(bam.file) | |
456 map.prog <- try(strsplit(header[[1]]$text$'@PG'[[1]], ':')[[1]][2], | |
457 silent=T) | |
458 if(class(map.prog) != "try-error") { | |
459 map.style <- grepl('tophat|bowtie|bedtools|star', map.prog, | |
460 ignore.case=T) | |
461 if(map.style){ | |
462 v.map.bowtie[i] <- TRUE | |
463 next | |
464 } | |
465 map.style <- grepl('bwa|casava|gem', map.prog, ignore.case=T) | |
466 if(map.style) { | |
467 v.map.bowtie[i] <- FALSE | |
468 next | |
469 } | |
470 # if(estiMapqStyle(bam.file)){ | |
471 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
472 # standard SAM mapping score will be used.", bam.file)) | |
473 # v.map.bowtie[i] <- FALSE | |
474 # }else{ | |
475 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
476 # Bowtie-like SAM mapping score will be used. Would you mind to tell us what | |
477 # aligner you are using?", bam.file)) | |
478 # v.map.bowtie[i] <- TRUE | |
479 # } | |
480 } | |
481 # else { | |
482 # cat("\n") | |
483 # if(estiMapqStyle(bam.file)){ | |
484 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
485 # standard SAM mapping score will be used.", bam.file)) | |
486 # v.map.bowtie[i] <- FALSE | |
487 # }else{ | |
488 # warning(sprintf("Aligner for: %s cannot be determined. Style of | |
489 # Bowtie-like SAM mapping score will be used.", bam.file)) | |
490 # v.map.bowtie[i] <- TRUE | |
491 # } | |
492 # } | |
493 warning(sprintf("Aligner for: %s cannot be determined. Style of | |
494 standard SAM mapping score will be used. Would you mind submitting an issue | |
495 report to us on Github? This will benefit people using the same aligner.", | |
496 bam.file)) | |
497 v.map.bowtie[i] <- FALSE | |
498 } | |
499 names(v.map.bowtie) <- bam.list | |
500 | |
501 v.map.bowtie | |
502 } | |
503 | |
504 libSizeBam <- function(bam.list) { | |
505 # Obtain library sizes by counting qualified bam records. | |
506 # Args: | |
507 # ctg.tbl: coverage-genelist-title table. | |
508 | |
509 # Count only reads that are mapped, primary, passed quality control and | |
510 # un-duplicated. | |
511 sbp <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=F, isSecondaryAlignment=F, | |
512 isNotPassingQualityControls=F, isDuplicate=F)) | |
513 v.lib.size <- vector('integer', length=length(bam.list)) | |
514 for(i in 1:length(bam.list)) { | |
515 bfn <- bam.list[i] # bam file name. | |
516 cfn <- paste(basename(bfn), '.cnt', sep='') # count file name. | |
517 if(file.exists(cfn)) { | |
518 v.lib.size[i] <- as.integer(readLines(cfn, n=1)) | |
519 } else { | |
520 cnt.bam <- countBam(bfn, param=sbp) | |
521 v.lib.size[i] <- cnt.bam$records | |
522 writeLines(as.character(v.lib.size[i]), cfn) | |
523 } | |
524 names(v.lib.size)[i] <- bfn | |
525 } | |
526 | |
527 v.lib.size | |
528 } | |
529 | |
530 seqnamesBam <- function(bam.list) { | |
531 # Obtain chromosome names for each bam file. This list must be used to filter | |
532 # genomic regions before scanBam or it terminates immaturely. | |
533 # ctg.tbl: coverage-genelist-title table. | |
534 | |
535 # browser() | |
536 sn.list <- lapply(scanBamHeader(bam.list), function(h) { | |
537 names(h$targets) | |
538 }) | |
539 names(sn.list) <- bam.list | |
540 | |
541 sn.list | |
542 } | |
543 | |
544 chrTag <- function(sn.inbam) { | |
545 # Check whether the chromosome name format in the bam file contains 'chr' or not. | |
546 # Args: | |
547 # sn.inbam: seqnames in the bam file. | |
548 | |
549 n.chr <- length(grep('^chr', sn.inbam)) | |
550 if(n.chr == 0) { | |
551 chr.tag <- F | |
552 } else if(n.chr == length(sn.inbam)) { | |
553 chr.tag <- T | |
554 } else { | |
555 return("Inconsistent chromosome names in bam file. Check bam header.") | |
556 } | |
557 | |
558 chr.tag | |
559 } | |
560 | |
561 chunkIndex <- function(tot.gene, gcs) { | |
562 # Create chunk indices according to total number of genes and chunk size. | |
563 # Args: | |
564 # tot.gene: total number of genes. | |
565 # gcs: gene chunk size. | |
566 | |
567 nchk <- ceiling(tot.gene / gcs) # number of chunks. | |
568 chkidx.list <- vector('list', length=nchk) # chunk indices list. | |
569 chk.start <- 1 # chunk start. | |
570 i.chk <- idiv(tot.gene, chunkSize=gcs) | |
571 for(i in 1:nchk) { | |
572 chk.size <- nextElem(i.chk) | |
573 chkidx.list[[i]] <- c(chk.start, chk.start + chk.size - 1) | |
574 chk.start <- chk.start + chk.size | |
575 } | |
576 | |
577 chkidx.list | |
578 } | |
579 | |
580 covMatrix <- function(debug, chkidx.list, coord, rnaseq.gb, exonmodel, libsize, | |
581 spit.dot=T, ...) { | |
582 # Function to generate a coverage matrix for all genes. | |
583 # Args: | |
584 # debug: boolean tag for debugging. | |
585 # chkidx.list: list of (start, end) indices for each chunk. | |
586 # coord: dataframe of gene coordinates. | |
587 # rnaseq.gb: boolean for RNA-seq genebody plot. | |
588 # exonmodel: exon model data object. | |
589 # libsize: total read count for this bam file. | |
590 # spit.dot: boolean to control sptting '.' to consoles. | |
591 # Return: normalized coverage matrix for all genes, each row represents a gene. | |
592 | |
593 | |
594 if(!debug) { | |
595 # Extract coverage and combine into a matrix. | |
596 result.matrix <- foreach(chk=chkidx.list, .combine='rbind', | |
597 .multicombine=T) %dopar% { | |
598 if(spit.dot) { | |
599 cat(".") | |
600 } | |
601 i <- chk[1]:chk[2] # chunk: start -> end | |
602 # If RNA-seq, retrieve exon ranges. | |
603 if(rnaseq.gb) { | |
604 exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) | |
605 } else { | |
606 exonranges.list <- NULL | |
607 } | |
608 doCov(coord[i, ], exonranges.list, ...) | |
609 } | |
610 | |
611 # Floor negative values which are caused by spline. | |
612 result.matrix[result.matrix < 0] <- 0 | |
613 result.matrix / libsize * 1e6 # normalize to RPM. | |
614 | |
615 } else { | |
616 for(c in 1:length(chkidx.list)) { | |
617 chk <- chkidx.list[[c]] | |
618 i <- chk[1]:chk[2] # chunk: start -> end | |
619 cat(".") | |
620 # If RNA-seq, retrieve exon ranges. | |
621 if(rnaseq.gb) { | |
622 exonranges.list <- unlist(exonmodel[coord[i, ]$tid]) | |
623 } else { | |
624 exonranges.list <- NULL | |
625 } | |
626 cov <- doCov(coord[i, ], exonranges.list, ...) | |
627 if(c == 1) { | |
628 result.matrix <- matrix(0, nrow=nrow(coord), ncol=ncol(cov)) | |
629 } | |
630 result.matrix[i, ] <- cov | |
631 } | |
632 # Floor negative values which are caused by spline. | |
633 # browser() | |
634 result.matrix[result.matrix < 0] <- 0 | |
635 result.matrix / libsize * 1e6 # normalize to RPM. | |
636 | |
637 } | |
638 } | |
639 | |
640 | |
641 doCov <- function(coord.mat, exonranges.list, chr.tag, pint, reg2plot, | |
642 flanksize, flankfactor, m.pts, f.pts, ...) { | |
643 # Extract coverage from bam file into a matrix. According to the parameter | |
644 # values, call corresponding functions. | |
645 # Args: | |
646 # coord.mat: matrix of genomic coordinates to extract coverage. | |
647 # exonranges.list: list of IRanges objects, each represents a group of exons. | |
648 # pint: boolean of point interval. | |
649 # reg2plot: string of region to plot. | |
650 # flanksize: flanking region size. | |
651 # flankfactor: flanking region factor. | |
652 # m.pts: data points for middle interval. | |
653 # f.pts: data points for flanking region. | |
654 # Return: matrix of interpolated coverage: each row represents a gene; each | |
655 # column represents a data point. Coverage from exons are concatenated. | |
656 | |
657 v.chrom <- coord.mat$chrom | |
658 # if(!chr.tag) { | |
659 # v.chrom <- sub('chr', '', v.chrom) | |
660 # } | |
661 v.chrom <- as.factor(v.chrom) | |
662 v.strand <- as.factor(coord.mat$strand) | |
663 | |
664 # Figure out interval region sizes and calculate flanking region sizes. | |
665 if(!pint) { # interval regions. | |
666 if(!is.null(exonranges.list)) { # RNA-seq | |
667 if(flankfactor > 0) { | |
668 v.fls <- sapply(exonranges.list, function(t) { | |
669 sum(end(t) - start(t) + 1) * flankfactor | |
670 }) | |
671 } else { | |
672 v.fls <- rep(flanksize, length=nrow(coord.mat)) | |
673 } | |
674 extrCovExons(v.chrom, exonranges.list, v.fls, v.strand, | |
675 m.pts, f.pts, ...) | |
676 } else { # ChIP-seq with intervals. | |
677 v.start <- coord.mat$start | |
678 v.end <- coord.mat$end | |
679 if(flankfactor > 0) { | |
680 v.fls <- round((v.end - v.start + 1) * flankfactor) | |
681 } else { | |
682 v.fls <- rep(flanksize, length=nrow(coord.mat)) | |
683 } | |
684 extrCov3Sec(v.chrom, v.start, v.end, v.fls, v.strand, | |
685 m.pts, f.pts, ...) | |
686 } | |
687 } else { # point center with flanking regions. | |
688 v.midp <- vector('integer', length=nrow(coord.mat)) | |
689 for(r in 1:nrow(coord.mat)) { | |
690 if(reg2plot == 'tss' && v.strand[r] == '+' || | |
691 reg2plot == 'tes' && v.strand[r] == '-') { | |
692 # the left site is center. | |
693 v.midp[r] <- coord.mat$start[r] | |
694 } else { # this also includes BED supplied point center. | |
695 v.midp[r] <- coord.mat$end[r] | |
696 } | |
697 } | |
698 # browser() | |
699 extrCovMidp(v.chrom, v.midp, flanksize, v.strand, m.pts + f.pts*2, ...) | |
700 } | |
701 } |