comparison countwins.r @ 0:d42f4d78c85e draft

Uploaded
author messersc
date Wed, 17 Dec 2014 10:40:23 -0500
parents
children
1 ###############################
2 #### Peak finder for NGS Data
3 #### R script
4 ###############################
5
6
7
8
9 # =======================
10 # User-defined variables
11 # =======================
12 cutoff = NA #To enforce a signal-to-noise (SNR) cutoff ratio, replace NA with the desired value
13 strict = 1 #To make peak calling more or less strict, increase or decrease this number. Only change it if you get an unusually large or small number of peaks.
14 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
15 #=======================> DONE!
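#Illustrative example (value made up): cutoff = 3 would require, in every replicate, a bin's
#mean count to be more than three background standard deviations before it can be called enriched.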
16
17
18
19
20 # ================================
21 # Required Libraries check & load
22 # ================================
23 if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('sqldf', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
24 stop("R packages 'mclust', 'signal', 'sqldf' and 'parallel' are required. Please install them!")
25 }
26 suppressPackageStartupMessages(library("mclust"))
27 suppressPackageStartupMessages(library("signal"))
28 suppressPackageStartupMessages(library("sqldf"))
29 suppressPackageStartupMessages(library("tcltk"))
30 suppressPackageStartupMessages(library("parallel"))
31 #=======================> DONE!
32
33
34
35
36
37 # =================
38 # Custom Functions
39 # =================
40 #Get per-row geometric mean (takes a list, returns a vector, not a list!)
41 geomeanL <- function(mat){
42 n = length(mat)
43 if (n > 1) {
44 mult = (mat[[1]])*(mat[[2]])
45 if (n > 2) {
46 for (i in 3:n) {
47 mult = mult*(mat[[i]])
48 }
49 }
50 mult = mult^(1/n)
51 mat = mult
52 return(mat)
53 } else {
54 return(mat[[1]])
55 }
56 }
57
58
59
60 #Get per-row geometric mean (takes a matrix, returns a vector, not a list!)
61 geomean <- function(mat){
62 n = NCOL(mat)
63 if (n > 1) {
64 mult = (mat[,1])*(mat[,2])
65 if (n > 2) {
66 for (i in 3:n) {
67 mult = mult*(mat[,i])
68 }
69 }
70 mult = mult^(1/n)
71 mat = mult
72 }
73 return(mat)
74 }
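#Minimal sanity check for the two geometric-mean helpers (made-up numbers, not executed):
if (FALSE) {
geomeanL(list(c(1, 4), c(4, 16)))   #list input  -> c(2, 8)
geomean(cbind(c(1, 4), c(4, 16)))   #matrix input -> c(2, 8)
}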
75
76
77
78 #Read in bed(st2) file
79 parsein = function(bedfile) {
80 l = sqldf()
81 readsFile = file(bedfile)
82 reads = sqldf("SELECT V1 + 1 As 'start' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))$start
83 l = sqldf()
84 close(readsFile); return(reads)
85 }
86
87
88 #Read in bedpe(st2) file
89 parseinpe = function(bedfile) {
90 l = sqldf()
91 readsFile = file(bedfile)
92 sebastian = sqldf("SELECT * FROM readsFile", file.format=list(header = FALSE, sep = "\t"))
93 reads = cbind(sebastian[[1]]+1, sebastian[[2]])
94 l = sqldf()
95 close(readsFile); return(reads)
96 }
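#Both parsers expect headerless, tab-separated files: parsein() keeps only column 1 (0-based
#read starts, shifted to 1-based), parseinpe() keeps columns 1 and 2 (fragment start and end).
#Hypothetical usage, assuming such files exist:
if (FALSE) {
reads.se = parsein("rep1.chr1.bed")       #vector of 1-based start positions
reads.pe = parseinpe("rep1.chr1.bedpe")   #two-column matrix: start, end
}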
97
98
99 #Produces normalized extended read counts (takes output of parsein(), returns a vector of floats)
100 countreads = function(bedfile, reads, frag, chromsize, filelist) {
101
102 o = which(filelist == bedfile)
103
104 counts = vector(mode = "numeric", length = chromsize)
105
106 for (j in 1:length(reads[[o]])) {
107 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
108 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
109 }
110 }
111 counts = counts/(mean(counts))
112 return(counts)
113 }
114
115
116
117 #Produces normalized fragment counts for paired-end data (takes output of parseinpe(), returns a vector of floats)
118 countreadspe = function(bedfile, reads, chromsize, filelist) {
119
120 o = which(filelist == bedfile)
121
122 counts = vector(mode = "numeric", length = chromsize)
123
124 for (j in 1:length(reads[[o]][,1])) {
125 counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1
126 }
127 counts = counts/(mean(counts))
128 return(counts)
129 }
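#Worked toy example: with frag = 5, a single-end read starting at position 10 adds +1 to
#counts[10:14]; for paired-end data the fragment's own start and end positions are used instead.
#Dividing by the mean puts replicates of different sequencing depths on a common scale.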
130
131
132
133 #find enriched bins
134 pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) {
135
136 if ((winStart + binSize) <= chromSize) {
137 winEnd = winStart + binSize
138 } else {
139 winEnd = chromSize
140 }
141 binSizeTemp = winEnd - winStart
142 tempend = winEnd - 1
143
144 #extract subset of the background
145 if (bkgd != "None") {
146 Cs = counts[[numdup+1]][winStart:tempend]
147 mCs = mean(Cs)
148 dCs = sd(Cs)
149 } else {
150 Cs = sample(C, binSizeTemp, replace = TRUE)
151 }
152
153
154 #find out whether it's enriched
155 go = rep(0, numdup)
156 for (g in 1:numdup) {
157 mS = (mean(counts[[g]][winStart:tempend]))
158 ratio = mS/dCs
159 if ((mS > (mCs * strict)) && (ratio > cutoff)) {
160 go[g] = 1
161 }
162 }
163 veep = sum(go)
164
165 #get counts when enriched
166 if (veep == numdup) {
167 mS = rep(0, numdup)
168 for (g in 1:numdup) {
169 mS[g] = (mean(counts[[g]][winStart:winEnd]))
170 }
171 cairo = mean(mS)
172 } else {
173 mS = rep(0, numdup)
174 for (g in 1:numdup) {
175 mS[g] = (mean(counts[[g]][winStart:winEnd]))
176 }
177 cairo = -(mean(mS))
178 }
179 return(cairo)
180 }
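#In words: a bin scores as enriched (positive return value) only if every replicate has
#mean(sample counts) > strict * mean(background) and mean(sample counts) / sd(background) > cutoff.
#Toy numbers (made up): with mCs = 1, dCs = 0.5, strict = 1 and cutoff = 3, each replicate
#needs a mean bin count above max(1, 1.5) = 1.5 for the bin to pass.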
181
182
183
184 #find enriched windows
185 pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) {
186
187 plz = which(startlist == winStart)
188 winEnd = coffeeshopSud[plz]
189 rWinSize = winEnd - winStart + 1
190
191
192 if(rWinSize >= winSize) {
193 mS = rep(0, numdup)
194 for (g in 1:numdup) {
195 mS[g] = (sum(counts[[g]][winStart:winEnd]))
196 }
197 veep = sum(mS)
198 } else {
199 veep = FALSE
200 }
201
202 return(veep)
203 }
204
205
206
207 #Initialize mclust clustering parameters
208 smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1
209
210 plz = which(startlist == winStart)
211 winEnd = coffeeshopSud[plz]
212
213 #extract subset of the IP
214 Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup)
215 for (j in 1:numdup) {
216 Rs[,j] = counts[[j]][winStart:winEnd]
217 }
218 #smooth extended read counts
219 for (j in 1:numdup) {
220 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
221 }
222 return(Rs)
223 }
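#filtfilt() from the 'signal' package applies the filter forward and then backward, so the
#80-point moving average used above smooths the counts without shifting peak positions.
#A minimal sketch on made-up data (not executed):
if (FALSE) {
x = sin(seq(0, 10, length.out = 500)) + rnorm(500)
xs = filtfilt(rep(1, 80)/80, 1, x)   #zero-phase moving average, same length as x
}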
224 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
225 noisy = sample(noise, length(sig[,1]), replace = TRUE)
226 clust = me(model, sig+noisy, init)
227 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
228 out = list(bicc = bicc, param = clust$parameters)
229 return(out)
230 }
231 initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function
232
233 n = length(coffeeshopNord)
234 #smooth extended read counts
235 if (cornum > 1) {
236 sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE)
237 } else {
238 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
239 }
240 sig = do.call(rbind, sig)
241
242 #kmeans initialization
243 init = kmeans(sig, clustnummer)
244 init = unmap(init$cluster)
245
246
247 if (cornum > 1) {
248 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
249 } else {
250 param = lapply(modelnames, cluster, sig, init, clustnummer, noise)
251 }
252
253 bicc = vector(mode = "numeric", length = length(modelnames))
254 for (i in 1:length(modelnames)) {
255 bicc[i] = as.numeric(param[[i]]$bicc)
256 }
257 bicc = which.max(bicc)
258
259 out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc])
260 return(out)
261 }
262
263
264
265
266 #find peaks
267 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist) {
268
269
270 plz = which(startlist == winStart)
271 winEnd = coffeeshopSud[plz]
272
273
274 #will store peak information
275 writethis = list()
276
277
278 rWinSizeTemp = winEnd - winStart + 1
279
280 #extract subset of the IP
281 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
282 Rsr = Rs
283 for (j in 1:numdup) {
284 Rsr[,j] = counts[[j]][winStart:winEnd]
285 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
286 }
287
288
289 ####CLUSTERING START HERE#####
290 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
291 clust = em(param$modelname, Rs+noisy, param$initparam)
292 clust$classification = map(clust$z)
293 ####CLUSTERING DONE####
294
295 #check whether clustering succeeded
296 if ((any(diff(clust$classification) != 0)) && (!(any(is.na(clust$classification))))) {
297
298
299 #check whether all replicates agreed on the clustering assignments
300 ccx = 1
301 if (numdup > 1) {
302 cc = vector(mode = "numeric", length = numdup)
303 for (g in 1:numdup) {
304 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
305 }
306 ccx = sum(diff(cc))
307 cluster = cc[1]
308 rm(cc)
309 } else {
310 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
311 ccx = 0
312 }
313 if (ccx == 0) {
314
315 #extract subset of the background
316 if (bkgd != "None") {
317 Cs = counts[[numdup+1]][winStart:winEnd]
318 Cs = filtfilt(rep(1,80)/80,1,Cs)
319 } else {
320 Cs = sample(C, rWinSizeTemp, replace = TRUE)
321 Cs = filtfilt(rep(1,80)/80,1,Cs)
322 }
323
324 #find region boundaries
325 loc = 1:length(clust$classification)
326 gmclass = cbind(loc, clust$classification)
327 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
328 rStart = locPeak[1] #start position of the region
329 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
330
331 #peak resolution check
332 if (resol == "region") {
333 pSize = rEnd - rStart
334 signal = (geomean(Rs[rStart:rEnd,]))
335 signal2 = (signal) - (Cs[rStart:rEnd])
336 gm = mean(signal2)
337 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
338 will2k = wilcox.test(signal, Cs[rStart:rEnd])
339
340 #Is there signal in the region above background
341 if (gm > 0) {
342 writethis[[1]] = rStart + winStart - 1
343 writethis[[2]] = rEnd + winStart
344 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
345 writethis[[4]] = "1000"
346 writethis[[5]] = "."
347 writethis[[6]] = gm
348 writethis[[7]] = will2k$p.value
349 writethis[[8]] = "-1"
350 writethis[[9]] = summit
351 }
352 } else if (resol == "peak") {
353 #find out where separate peaks are
354 d = diff(locPeak[,1])
355 d[length(d)+1] = 0
356 locPeak = cbind(locPeak, d)
357 bound1 = which(locPeak[,3] > 1, arr.ind=TRUE)
358 bound2 = bound1 + 1
359 bound = locPeak[sort(c(bound1,bound2))]
360 bound = c(rStart, bound, rEnd)
361 w = 1
362 warum = 0
363 while (w < length(bound)) {
364 pStart = bound[w] + winStart - 1
365 pEnd = bound[w+1] + winStart
366 pSize = pEnd - pStart
367 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
368 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
369 gm = mean(signal2)
370 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
371 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
372
373 weil = warum * 9
374 #Is there signal in the region above background
375 if (gm > 0) {
376 writethis[[1+weil]] = pStart
377 writethis[[2+weil]] = pEnd
378 writethis[[3+weil]] = paste0(chromName, ".", pStart)
379 writethis[[4+weil]] = "1000"
380 writethis[[5+weil]] = "."
381 writethis[[6+weil]] = gm
382 writethis[[7+weil]] = will2k$p.value
383 writethis[[8+weil]] = "-1"
384 writethis[[9+weil]] = summit
385 }
386 w = w + 2
387 warum = warum + 1
388 }
389
390 } #peak resolution check
391 } #all replicates agree on clustering assignments?
392 } #clustering worked?
393 return(writethis)
394 }
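#The nine writethis fields per reported peak are, in order: start, end, name (chromName.start),
#score ("1000"), strand ("."), signal (mean of the background-subtracted geometric mean),
#Wilcoxon test p-value, a "-1" placeholder, and the summit offset within the peak. This matches
#a narrowPeak-style record without the leading chromosome column.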
395
396
397 #filter return value of findpeak()
398 processPeaks = function(peaks) {
399 peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE)
400 peaks = peaks[peaks[,1] != FALSE,,drop=FALSE]
401 peaks = data.frame(peaks)
402 return(peaks)
403 }
404 #=======================> DONE!
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423 # ==========================
424 # Parse-in System Variables
425 # ==========================
426 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
427
428 #Parsing arguments and storing values
429 for (each.arg in args) {
430 #chromosome size file
431 if (grepl('^-sfile=',each.arg)) {
432 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
433 if (! is.na(arg.split[2]) ) {
434 size.file <- arg.split[2]
435 } else {
436 stop('No genome size file')
437 }
438 }
439 #bed file names
440 if (grepl('^-bednames=',each.arg)) {
441 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
442 if (! is.na(arg.split[2]) ) {
443 bednames <- arg.split[2]
444 } else {
445 stop('No bed file names')
446 }
447 }
449 #fragment length(s)
449 if (grepl('^-frag=',each.arg)) {
450 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
451 if (! is.na(arg.split[2]) ) {
452 frag <- arg.split[2]
453 } else {
454 stop('No fragment length given')
455 }
456 }
458 #background file
458 if (grepl('^-bkgd=',each.arg)) {
459 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
460 if (! is.na(arg.split[2]) ) {
461 bkgd <- arg.split[2]
462 } else {
463 stop('No background file')
464 }
465 }
467 #output directory
467 if (grepl('^-out=',each.arg)) {
468 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
469 if (! is.na(arg.split[2]) ) {
470 out <- arg.split[2]
471 } else {
472 stop('No output directory given')
473 }
474 }
475 #Cluster number
476 if (grepl('^-clustnummer=',each.arg)) {
477 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
478 if (! is.na(arg.split[2]) ) {
479 clustnummer <- as.numeric(arg.split[2])
480 }
481 }
482 #resolution
483 if (grepl('^-resolution=',each.arg)) {
484 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
485 if (! is.na(arg.split[2]) ) {
486 resol <- arg.split[2]
487 }
488 }
489 #processor cores
490 if (grepl('^-p=',each.arg)) {
491 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
492 if (! is.na(arg.split[2]) ) {
493 cornum <- as.numeric(arg.split[2])
494 }
495 }
497 #minimum window size (in bins)
497 if (grepl('^-window=',each.arg)) {
498 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
499 if (! is.na(arg.split[2]) ) {
500 winSize <- arg.split[2]
501 }
502 }
504 #bin size
504 if (grepl('^-bin=',each.arg)) {
505 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
506 if (! is.na(arg.split[2]) ) {
507 binsize <- arg.split[2]
508 }
509 }
510 #type (paired / single)
511 if (grepl('^-type=',each.arg)) {
512 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
513 if (! is.na(arg.split[2]) ) {
514 type <- arg.split[2]
515 }
516 }
517 }
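#Example invocation (hypothetical; all file names and values below are made up):
#Rscript countwins.r -sfile=chr1.size -bednames=rep1.bed,rep2.bed -frag=200,180 -bkgd=ctrl.bed -out=./results -clustnummer=2 -resolution=peak -p=4 -window=2 -bin=50 -type=single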
518
519 ##Parse in variables
520 chromosomes = read.table(size.file, header=FALSE)
521 chromName = chromosomes$V1; #which chromosome
522 chromSize = chromosomes$V2; #chromosome size
523 rm(chromosomes)
524
525 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
526 numdup = length(readsFiles) #number of replicates
527 if (bkgd != "None") {
528 readsFiles[[numdup+1]] = bkgd
529 }
530 winSize = as.numeric(winSize)
531 binSize = as.numeric(binsize)
532 winSize = binSize * winSize
533
534 if (type == "single") {
535 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
536 }
537 rm(bednames)
538
539 if (numdup > 1) {
540 modelnames = c("VVV","VEV")
541 } else {
542 modelnames = "V"
543 }
544
545 options(stringsAsFactors = FALSE)
546 #=======================> DONE!
547
548
549
550
551 # =======================
552 # Some preliminary stuff
553 # =======================
554 if (type == "single") {
555 if (cornum > 1) {
556 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
557 } else {
558 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
559 }
560 }
561
562 if (type == "paired") {
563 if (cornum > 1) {
564 datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
565 } else {
566 datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control)
567 }
568 }
569
570 #minimum peak size (only a recommendation)
571 minpeak = floor(binSize / 4)
572
573 #make bins vector
574 bins = seq(from = 1, to = (chromSize - 1), by = binSize)
575 #=======================> DONE!
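#Toy numbers (made up): with binSize = 50 and chromSize = 1000, bins = 1, 51, 101, ..., 951
#and minpeak = floor(50 / 4) = 12.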
576
577
578
579
580 # ===============
581 # Counting Reads
582 # ===============
583 if (type == "single") {
584 if (cornum > 1) {
585 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
586 } else {
587 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
588 }
589 }
590
591 if (type == "paired") {
592 if (cornum > 1) {
593 counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
594 } else {
595 counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles)
596 }
597 }
598
599 rm(datain)
600
601 #get total counts
602 mS = vector(mode = "numeric", length = numdup)
603 for (g in 1:numdup) {
604 mS[g] = sum(counts[[g]])
605 }
606 totalCounts = sum(mS)
607 rm(mS)
608 #=======================> DONE!
609
610
611
612 # ===================
613 # Estimating Cutoff
614 # ===================
615 if (is.na(cutoff)) {
616 if (bkgd != "None") {
617 cutoff = vector(length = numdup)
618 sdC = sd(counts[[numdup+1]])
619 for (x in 1:numdup) {
620 cutoff[x] = (mean(counts[[x]]))/(sdC)
621 }
622 cutoff = max(cutoff)
623 C = NULL
624 mCs = NULL; dCs = NULL
625 } else {
626 cutoff = vector(length = numdup)
627 mmV = var(geomeanL(counts))
628 mmM = mean(geomeanL(counts))
629 sigma = log(1+((mmV) / ((mmM)^2)))
630 mu = (log(mmM)) - (0.5 * (sigma))
631 C = rlnorm(100000, mu, sqrt(sigma))
632 for (x in 1:numdup) {
633 cutoff[x] = (mean(counts[[x]]))/(sd(C))
634 }
635 cutoff = max(cutoff)
636 mCs = (mean(sample(C, binSize*5, replace = TRUE)))
637 dCs = (sd(sample(C, binSize*5, replace = TRUE)))
638 }
639 }
640 #=======================> DONE!
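#When no background is supplied, the background model is a lognormal whose parameters are
#moment-matched to the geometric-mean coverage: sigma^2 = log(1 + v/m^2) and mu = log(m) - sigma^2/2,
#so the simulated draws in C reproduce mean m and variance v. A quick sanity check on made-up
#moments (not executed):
if (FALSE) {
m = 1; v = 4
sigma2 = log(1 + v / m^2)
mu = log(m) - 0.5 * sigma2
x = rlnorm(100000, mu, sqrt(sigma2))
c(mean(x), var(x))   #approximately c(1, 4)
}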
641
642
643
644
645 # ========================
646 # Picking Enriched Windows
647 # ========================
648 binNum = vector(mode = "numeric", length = 41)
649 binTotalSize = vector(mode = "numeric", length = 41)
650 binTotalSizeRatio = vector(mode = "numeric", length = 41)
651 binTotalCount = vector(mode = "numeric", length = 41)
652 binTotalCountNO = vector(mode = "numeric", length = 41)
653 binTotalCountRatio = vector(mode = "numeric", length = 41)
654 maxiFun = vector(mode = "numeric", length = 41)
655 ggg = seq(1,5, by = 0.1)
656 for (i in 1:41) {
657 if (cornum > 1) {
658 coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE)
659 } else {
660 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd)
661 }
662 coffeeshop = as.numeric(unlist(coffeeshop))
663 coffeeshopYES = coffeeshop[coffeeshop > 0]
664 coffeeshopNO = -(coffeeshop[coffeeshop <= 0])
665
666 binNum[i] = length(coffeeshopYES)
667 binTotalSize[i] = binNum[i] * binSize
668 binTotalSizeRatio[i] = binTotalSize[i] / chromSize
669
670 binTotalCount[i] = sum(coffeeshopYES)
671 binTotalCountNO[i] = sum(coffeeshopNO)
672 binTotalCountRatio[i] = binTotalCount[i] / binTotalCountNO[i]
673
674 maxiFun[i] = binTotalCountRatio[i] / binTotalSizeRatio[i]
675 }
676 rm(bins)
677 #=======================> DONE!
678
679
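#Write one row per tested strictness value: strict (ggg), number of enriched bins, their total
#size and size ratio, enriched and non-enriched count sums, their ratio, and the objective
#maxiFun, into <out>/<chromName>.GOYEAH.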
680 write(paste(ggg, binNum, binTotalSize, binTotalSizeRatio, binTotalCount, binTotalCountNO, binTotalCountRatio, maxiFun, sep = " "), file = paste0(out, "/", chromName, ".GOYEAH"), ncolumns = 1)