comparison peakfinder.r @ 1:243f75d0ed6e draft default tip

Uploaded. Includes new release 1.0.7 with fixed optional controls.
author messersc
date Thu, 19 Feb 2015 05:39:45 -0500
parents d42f4d78c85e
children
comparison
equal deleted inserted replaced
0:d42f4d78c85e 1:243f75d0ed6e
1 ############################### 1 ########################################################################
2 #### Peak finder for NGS Data 2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
3 #### R script 3 # Copyright (C) 2014-2015 Mahmoud Ibrahim
4 ############################### 4 #
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #
18 # Contact: mahmoud.ibrahim@mdc-berlin.de
19 ########################################################################
20
21
5 22
6 23
7 # ======================= 24 # =======================
8 # User-defined variables 25 # User-defined variables
9 # ======================= 26 # =======================
10 cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want 27 samplingSeed = 1011414 #the seed makes the random sampling that JAMM does "deterministic". You can change this to your integer seed of choice to reproduce exact same results or randomize it this way: ceiling(rnorm(1,50,300)).
11 strict = 1 #to make peak calling more / less strict, increase / decrease this number 28 reportNoClust = "n" #report windows for which clustering failed? Reported windows will be marked by "NoClust" in the peak name (4th column).
12 meanAdjust = "n" #adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y". 29 cutoff = NA #To enforce an SNR cutoff ratio for bin enrichment calling, delete NA and enter the number you want.
13 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) 30 strict = 1 #To make bin enrichment calling more / less strict, increase / decrease this number.
31 meanAdjust = "n" #Adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y".
32 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000).
14 #=======================> DONE! 33 #=======================> DONE!
15 34
16 35
17 36
18 37
113 } else { 132 } else {
114 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) 133 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
115 } 134 }
116 counts = counts/mCount 135 counts = counts/mCount
117 } 136 }
118 137
119 return(counts) 138 return(counts)
120 } 139 }
121 140
122 141
123 142
144 } else { 163 } else {
145 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) 164 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
146 } 165 }
147 counts = counts/mCount 166 counts = counts/mCount
148 } 167 }
149 168
150 return(counts) 169 return(counts)
151 } 170 }
152 171
153 172
154 173
166 #extract subset of the background 185 #extract subset of the background
167 if (bkgd != "None") { 186 if (bkgd != "None") {
168 Cs = counts[[numdup+1]][winStart:tempend] 187 Cs = counts[[numdup+1]][winStart:tempend]
169 mCs = mean(Cs) 188 mCs = mean(Cs)
170 dCs = sd(Cs) 189 dCs = sd(Cs)
171 } else { 190 }
172 Cs = sample(C, binSizeTemp, replace = TRUE) 191
173 }
174
175
176 go = rep(0, numdup) 192 go = rep(0, numdup)
177 for (g in 1:numdup) { 193 for (g in 1:numdup) {
178 mS = (mean(counts[[g]][winStart:tempend])) 194 mS = (mean(counts[[g]][winStart:tempend]))
179 ratio = mS/dCs 195 ratio = mS/dCs
180 if ((mS > (mCs * strict)) && (ratio > cutoff)) { 196 if ((mS > (mCs * strict)) && (ratio > cutoff)) {
234 Cs = counts[[numdup+1]] 250 Cs = counts[[numdup+1]]
235 Cmin = min(Cs[Cs > 0]) 251 Cmin = min(Cs[Cs > 0])
236 Cs = Cs[winStart:winEnd] 252 Cs = Cs[winStart:winEnd]
237 Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change 253 Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change
238 } else { 254 } else {
255 set.seed(samplingSeed)
239 Cs = sample(C, rWinSizeTemp, replace = TRUE) 256 Cs = sample(C, rWinSizeTemp, replace = TRUE)
240 Cs = filtfilt(rep(1,80)/80,1,Cs) 257 Cs = filtfilt(rep(1,80)/80,1,Cs)
241 } 258 }
242 259
243 #start scoring 260 #start scoring
263 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) 280 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
264 } 281 }
265 return(Rs) 282 return(Rs)
266 } 283 }
267 cluster = function(model, sig, init, clustnummer, noise) { #helper function2 284 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
285 set.seed(samplingSeed)
268 noisy = sample(noise, length(sig[,1]), replace = TRUE) 286 noisy = sample(noise, length(sig[,1]), replace = TRUE)
269 clust = me(model, sig+noisy, init) 287 clust = me(model, sig+noisy, init)
270 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer) 288 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
271 out = list(bicc = bicc, param = clust$parameters) 289 out = list(bicc = bicc, param = clust$parameters)
272 return(out) 290 return(out)
281 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord) 299 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
282 } 300 }
283 sig = do.call(rbind, sig) 301 sig = do.call(rbind, sig)
284 302
285 #kmeans initialization 303 #kmeans initialization
286 init = kmeans(sig, clustnummer) 304 set.seed(samplingSeed)
305 init = kmeans(sig, clustnummer, nstart = 20)
287 init = unmap(init$cluster) 306 init = unmap(init$cluster)
288 307
289 308
290 if (cornum > 1) { 309 if (cornum > 1) {
291 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE) 310 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
305 324
306 325
307 326
308 327
309 #find peaks 328 #find peaks
310 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust) { 329 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust, clustnummer) {
311 330
312 331
313 plz = which(startlist == winStart) 332 plz = which(startlist == winStart)
314 winEnd = coffeeshopSud[plz] 333 winEnd = coffeeshopSud[plz]
315 334
316 335
317 #will store peak information 336 #will store peak information
318 writethis = list() 337 writethis = list()
338 ccx = 1 #default is clustering didNOT work
319 339
320 340
321 rWinSizeTemp = winEnd - winStart + 1 341 rWinSizeTemp = winEnd - winStart + 1
322 342
323 #extract subset of the IP 343 #extract subset of the IP
336 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j]) 356 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
337 } 357 }
338 } 358 }
339 359
340 if (resol != "window") { 360 if (resol != "window") {
341 361
342 ####CLUSTERING START HERE##### 362 #clustering (take 1)
363 take = 1
364 set.seed(samplingSeed)
343 noisy = sample(noise, rWinSizeTemp, replace = TRUE) 365 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
344 clust = em(param$modelname, Rs+noisy, param$initparam) 366 clust = em(param$modelname, Rs+noisy, param$initparam)
345 clust$classification = map(clust$z) 367 clust$classification = map(clust$z)
346 ####CLUSTERING DONE#### 368 if (!((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification)))))) { #clustering didn't work, take1
347 369
348 #check whether clutering succeeded 370 #repeat clustering from scratch, take 2!
349 if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { 371 set.seed(samplingSeed)
350 372 init = kmeans(Rs, clustnummer, nstart = 20)
351 373 init = unmap(init$cluster)
352 #check whether all components replicates agreed on clustering assignments 374 set.seed(samplingSeed)
353 ccx = 1 375 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
354 if (numdup > 1) { 376 clust = me(param$modelname, Rs+noisy, init)
377 clust$classification = map(clust$z)
378 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
379 ccx = 0 #clustering worked, take2
380 take = 2
381 }
382 } else {ccx = 0} #clustering worked, take1
383
384 if (ccx != 1) { #clustering worked either in take1 or take2
385
386 if (numdup > 1) { #check whether all components replicates agreed on clustering assignments
355 cc = vector(mode = "numeric", length = numdup) 387 cc = vector(mode = "numeric", length = numdup)
356 for (g in 1:numdup) { 388 for (g in 1:numdup) {
357 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) 389 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
358 } 390 }
359 ccx = sum(diff(cc)) 391 ccx = sum(diff(cc))
360 cluster = cc[1] 392 cluster = cc[1]
361 rm(cc) 393 rm(cc)
394
395 if ((ccx != 0) && (take == 1)) { #not all replicates agreed? Repeat the clustering with from scratch if not already done!
396 set.seed(samplingSeed)
397 init = kmeans(Rs, clustnummer, nstart = 20)
398 init = unmap(init$cluster)
399 set.seed(samplingSeed)
400 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
401 clust = me(param$modelname, Rs+noisy, init)
402 clust$classification = map(clust$z)
403 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { #clustering worked? check whether replicates agreed take 3
404 cc = vector(mode = "numeric", length = numdup)
405 for (g in 1:numdup) {
406 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
407 }
408 ccx = sum(diff(cc))
409 cluster = cc[1]
410 rm(cc)
411 take = 3
412 }
413 }
414 } else { #no replicates!
415 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
416 }
417 }
418
419 if ((ccx != 0) && (reportNoClust=="y")) { resol = "window" } #clustering did not work and windows should be reported
420
421
422 if (ccx == 0) { #clustering worked and all replicates agree on the cluster assignments
423
424 #extract subset of the background
425 if (bkgd != "None") {
426 Cs = counts[[numdup+1]][winStart:winEnd]
427 Cs = filtfilt(rep(1,80)/80,1,Cs)
362 } else { 428 } else {
363 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!) 429 set.seed(samplingSeed)
364 ccx = 0 430 Cs = sample(C, rWinSizeTemp, replace = TRUE)
431 Cs = filtfilt(rep(1,80)/80,1,Cs)
365 } 432 }
366 if (ccx == 0) {
367 433
368 #extract subset of the background 434 #find region boundaries
369 if (bkgd != "None") { 435 loc = 1:length(clust$classification)
370 Cs = counts[[numdup+1]][winStart:winEnd] 436 gmclass = cbind(loc, clust$classification)
371 Cs = filtfilt(rep(1,80)/80,1,Cs) 437 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
372 } else { 438 rStart = locPeak[1] #start position of the region
373 Cs = sample(C, rWinSizeTemp, replace = TRUE) 439 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
374 Cs = filtfilt(rep(1,80)/80,1,Cs) 440
441 #peak resolution check
442 if (resol == "region") {
443 pSize = rEnd - rStart
444 signal = (geomean(Rs[rStart:rEnd,]))
445 signal2 = (signal) - (Cs[rStart:rEnd])
446 gm = mean(signal2)
447 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
448 will2k = wilcox.test(signal, Cs[rStart:rEnd])
449
450 #Is there signal in the region above background
451 if (gm > 0) {
452 writethis[[1]] = rStart + winStart - 1
453 writethis[[2]] = rEnd + winStart
454 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
455 writethis[[4]] = "1000"
456 writethis[[5]] = "."
457 writethis[[6]] = gm
458 writethis[[7]] = will2k$p.value
459 writethis[[8]] = "-1"
460 writethis[[9]] = summit
375 } 461 }
376 462 } else if (resol == "peak") {
377 #find region boundaries 463 #find out where separate peaks are
378 loc = 1:length(clust$classification) 464 d = diff(locPeak[,1])
379 gmclass = cbind(loc, clust$classification) 465 d[length(d)+1] = 0
380 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE] 466 locPeak = cbind(locPeak, d)
381 rStart = locPeak[1] #start position of the region 467 bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
382 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region 468 bound2 = bound1 + 1
383 469 bound = locPeak[sort(c(bound1,bound2))]
384 #peak resolution check 470 bound = c(rStart, bound, rEnd)
385 if (resol == "region") { 471 w = 1
386 pSize = rEnd - rStart 472 warum = 0
387 signal = (geomean(Rs[rStart:rEnd,])) 473 while (w < length(bound)) {
388 signal2 = (signal) - (Cs[rStart:rEnd]) 474 pStart = bound[w] + winStart - 1
475 pEnd = bound[w+1] + winStart
476 pSize = pEnd - pStart
477 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
478 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
389 gm = mean(signal2) 479 gm = mean(signal2)
390 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1 480 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
391 will2k = wilcox.test(signal, Cs[rStart:rEnd]) 481 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
392 482
483 weil = warum * 9
393 #Is there signal in the region above background 484 #Is there signal in the region above background
394 if (gm > 0) { 485 if (gm > 0) {
395 writethis[[1]] = rStart + winStart - 1 486 writethis[[1+weil]] = pStart
396 writethis[[2]] = rEnd + winStart 487 writethis[[2+weil]] = pEnd
397 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1) 488 writethis[[3+weil]] = paste0(chromName, ".", pStart)
398 writethis[[4]] = "1000" 489 writethis[[4+weil]] = "1000"
399 writethis[[5]] = "." 490 writethis[[5+weil]] = "."
400 writethis[[6]] = gm 491 writethis[[6+weil]] = gm
401 writethis[[7]] = will2k$p.value 492 writethis[[7+weil]] = will2k$p.value
402 writethis[[8]] = "-1" 493 writethis[[8+weil]] = "-1"
403 writethis[[9]] = summit 494 writethis[[9+weil]] = summit
404 } 495 }
405 } else if (resol == "peak") { 496 w = w + 2
406 #find out where separate peaks are 497 warum = warum + 1
407 d = diff(locPeak[,1]) 498 }
408 d[length(d)+1] = 0 499 } #peak resolution check
409 locPeak = cbind(locPeak, d) 500 } #clustering worked and all replicates agree on clustering assignments?
410 bound1 = which(locPeak[,3] > 1, arr.in=TRUE) 501 } #window resolution check
411 bound2 = bound1 + 1 502
412 bound = locPeak[sort(c(bound1,bound2))] 503 if (resol == "window") {
413 bound = c(rStart, bound, rEnd)
414 w = 1
415 warum = 0
416 while (w < length(bound)) {
417 pStart = bound[w] + winStart - 1
418 pEnd = bound[w+1] + winStart
419 pSize = pEnd - pStart
420 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
421 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
422 gm = mean(signal2)
423 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
424 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
425
426 weil = warum * 9
427 #Is there signal in the region above background
428 if (gm > 0) {
429 writethis[[1+weil]] = pStart
430 writethis[[2+weil]] = pEnd
431 writethis[[3+weil]] = paste0(chromName, ".", pStart)
432 writethis[[4+weil]] = "1000"
433 writethis[[5+weil]] = "."
434 writethis[[6+weil]] = gm
435 writethis[[7+weil]] = will2k$p.value
436 writethis[[8+weil]] = "-1"
437 writethis[[9+weil]] = summit
438 }
439 w = w + 2
440 warum = warum + 1
441 }
442
443 } #peak resolution check
444 } #all replicates agree on clustering assignments?
445 } #clustering worked?
446
447 } else if (resol == "window") { #window resolution check
448 504
449 #extract subset of the background 505 #extract subset of the background
450 if (bkgd != "None") { 506 if (bkgd != "None") {
451 Cs = counts[[numdup+1]][winStart:winEnd] 507 Cs = counts[[numdup+1]][winStart:winEnd]
452 Cs = filtfilt(rep(1,80)/80,1,Cs) 508 Cs = filtfilt(rep(1,80)/80,1,Cs)
453 } else { 509 } else {
510 set.seed(samplingSeed)
454 Cs = sample(C, rWinSizeTemp, replace = TRUE) 511 Cs = sample(C, rWinSizeTemp, replace = TRUE)
455 Cs = filtfilt(rep(1,80)/80,1,Cs) 512 Cs = filtfilt(rep(1,80)/80,1,Cs)
456 } 513 }
457 514
458 #calculate scores 515 #calculate scores
465 522
466 #Is there signal in the region above background 523 #Is there signal in the region above background
467 if (gm > 0) { 524 if (gm > 0) {
468 writethis[[1]] = winStart - 1 525 writethis[[1]] = winStart - 1
469 writethis[[2]] = winEnd 526 writethis[[2]] = winEnd
470 writethis[[3]] = paste0(chromName, ".", winStart -1) 527 writethis[[3]] = paste0(chromName, ".", winStart -1, ".NoClust")
471 writethis[[4]] = "1000" 528 writethis[[4]] = "1000"
472 writethis[[5]] = "." 529 writethis[[5]] = "."
473 writethis[[6]] = gm 530 writethis[[6]] = gm
474 writethis[[7]] = will2k$p.value 531 writethis[[7]] = will2k$p.value
475 writethis[[8]] = "-1" 532 writethis[[8]] = "-1"
476 writethis[[9]] = summit 533 writethis[[9]] = summit
477 } 534 }
478 } #window clustering 535 } #window reporting
479 536
480 return(writethis) 537 return(writethis)
481 } 538 }
482 539
483 540
504 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line 561 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
505 562
506 #Parsing arguments and storing values 563 #Parsing arguments and storing values
507 for (each.arg in args) { 564 for (each.arg in args) {
508 #chormosome size file 565 #chormosome size file
509 if (grepl('^-sfile=',each.arg)) { 566 if (grepl('-sfile=',each.arg)) {
510 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 567 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
511 if (! is.na(arg.split[2]) ) { 568 if (! is.na(arg.split[2]) ) {
512 size.file <- arg.split[2] 569 size.file <- arg.split[2]
513 } else { 570 } else {
514 stop('No genome size file') 571 stop('No genome size file')
515 } 572 }
516 } 573 }
517 #bed file names 574 #bed file names
518 if (grepl('^-bednames=',each.arg)) { 575 if (grepl('-bednames=',each.arg)) {
519 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 576 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
520 if (! is.na(arg.split[2]) ) { 577 if (! is.na(arg.split[2]) ) {
521 bednames <- arg.split[2] 578 bednames <- arg.split[2]
522 } else { 579 } else {
523 stop('No bed file names') 580 stop('No bed file names')
524 } 581 }
525 } 582 }
526 #bed files directory 583 #bed files directory
527 if (grepl('^-frag=',each.arg)) { 584 if (grepl('-frag=',each.arg)) {
528 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 585 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
529 if (! is.na(arg.split[2]) ) { 586 if (! is.na(arg.split[2]) ) {
530 frag <- arg.split[2] 587 frag <- arg.split[2]
531 } else { 588 } else {
532 stop('No fragment length given') 589 stop('No fragment length given')
533 } 590 }
534 } 591 }
535 #bakcground files directory 592 #bakcground files directory
536 if (grepl('^-bkgd=',each.arg)) { 593 if (grepl('-bkgd=',each.arg)) {
537 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 594 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
538 if (! is.na(arg.split[2]) ) { 595 if (! is.na(arg.split[2]) ) {
539 bkgd <- arg.split[2] 596 bkgd <- arg.split[2]
540 } else { 597 } else {
541 stop('No background file') 598 stop('No background file')
542 } 599 }
543 } 600 }
544 #bakcground files directory 601 #bakcground files directory
545 if (grepl('^-out=',each.arg)) { 602 if (grepl('-out=',each.arg)) {
546 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 603 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
547 if (! is.na(arg.split[2]) ) { 604 if (! is.na(arg.split[2]) ) {
548 out <- arg.split[2] 605 out <- arg.split[2]
549 } else { 606 } else {
550 stop('No output directory given') 607 stop('No output directory given')
551 } 608 }
552 } 609 }
553 #Cluster number 610 #Cluster number
554 if (grepl('^-clustnummer=',each.arg)) { 611 if (grepl('-clustnummer=',each.arg)) {
555 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 612 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
556 if (! is.na(arg.split[2]) ) { 613 if (! is.na(arg.split[2]) ) {
557 clustnummer <- as.numeric(arg.split[2]) 614 clustnummer <- as.numeric(arg.split[2])
558 } 615 }
559 } 616 }
560 #resolution 617 #resolution
561 if (grepl('^-resolution=',each.arg)) { 618 if (grepl('-resolution=',each.arg)) {
562 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 619 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
563 if (! is.na(arg.split[2]) ) { 620 if (! is.na(arg.split[2]) ) {
564 resol <- arg.split[2] 621 resol <- arg.split[2]
565 } 622 }
566 } 623 }
567 #processor cores 624 #processor cores
568 if (grepl('^-p=',each.arg)) { 625 if (grepl('-p=',each.arg)) {
569 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 626 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
570 if (! is.na(arg.split[2]) ) { 627 if (! is.na(arg.split[2]) ) {
571 cornum <- as.numeric(arg.split[2]) 628 cornum <- as.numeric(arg.split[2])
572 } 629 }
573 } 630 }
574 #minimum window size 631 #minimum window size
575 if (grepl('^-window=',each.arg)) { 632 if (grepl('-window=',each.arg)) {
576 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 633 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
577 if (! is.na(arg.split[2]) ) { 634 if (! is.na(arg.split[2]) ) {
578 winSize <- arg.split[2] 635 winSize <- arg.split[2]
579 } 636 }
580 } 637 }
581 #window size 638 #window size
582 if (grepl('^-bin=',each.arg)) { 639 if (grepl('-bin=',each.arg)) {
583 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 640 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
584 if (! is.na(arg.split[2]) ) { 641 if (! is.na(arg.split[2]) ) {
585 binsize <- arg.split[2] 642 binsize <- arg.split[2]
586 } 643 }
587 } 644 }
588 #type (paired / single) 645 #type (paired / single)
589 if (grepl('^-type=',each.arg)) { 646 if (grepl('-type=',each.arg)) {
590 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 647 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
591 if (! is.na(arg.split[2]) ) { 648 if (! is.na(arg.split[2]) ) {
592 type <- arg.split[2] 649 type <- arg.split[2]
593 } 650 }
594 } 651 }
595 #chromosome number 652 #chromosome number
596 if (grepl('^-chrcount=',each.arg)) { 653 if (grepl('-chrcount=',each.arg)) {
597 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 654 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
598 if (! is.na(arg.split[2]) ) { 655 if (! is.na(arg.split[2]) ) {
599 chrcount <- as.numeric(arg.split[2]) 656 chrcount <- as.numeric(arg.split[2])
600 } 657 }
601 } 658 }
602 #window enrichment cutoff 659 #window enrichment cutoff
603 if (grepl('^-windowe=',each.arg)) { 660 if (grepl('-windowe=',each.arg)) {
604 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 661 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
605 if (! is.na(arg.split[2]) ) { 662 if (! is.na(arg.split[2]) ) {
606 windowe <- arg.split[2] 663 windowe <- arg.split[2]
607 } 664 }
608 } 665 }
666 #initialize
667 if (grepl('-initModel=',each.arg)) {
668 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
669 if (! is.na(arg.split[2]) ) {
670 initialize <- arg.split[2]
671 }
672 }
673
609 } 674 }
610 675
611 ##Parse in variables 676 ##Parse in variables
612 chromosomes = read.table(size.file, header=FALSE) 677 chromosomes = read.table(size.file, header=FALSE)
613 chromName = chromosomes$V1; #which chromosome 678 chromName = chromosomes$V1; #which chromosome
614 chromSize = chromosomes$V2; #chromosomes size 679 chromSize = chromosomes$V2; #chromosomes size
615 rm(chromosomes) 680 rm(chromosomes)
616 681
682 if (chrcount == 1) {
683 write(paste(samplingSeed), file = paste0(out, "/seed.info"))
684 } else {
685 samplingSeed = as.numeric(read.table(paste0(out, "/seed.info"), header = FALSE)[[1]])
686 }
617 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) 687 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
618 numdup = length(readsFiles) #number of replicates 688 numdup = length(readsFiles) #number of replicates
619 if (bkgd != "None") { 689 if (bkgd != "None") {
620 readsFiles[[numdup+1]] = bkgd 690 readsFiles[[numdup+1]] = bkgd
621 } 691 }
701 771
702 772
703 # ============================ 773 # ============================
704 # Estimating Background Model 774 # Estimating Background Model
705 # ============================ 775 # ============================
706
707 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff) 776 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff)
708 if (is.na(cutoff)) { 777 if (is.na(cutoff)) {
709 if (bkgd != "None") { 778 if (bkgd != "None") {
710 cutoff = vector(length = numdup) 779 cutoff = vector(length = numdup)
711 sdC = sd(counts[[numdup+1]]) 780 sdC = sd(counts[[numdup+1]])
720 cutoff = vector(length = numdup) 789 cutoff = vector(length = numdup)
721 mmV = var(geomeanL(counts)) 790 mmV = var(geomeanL(counts))
722 mmM = mean(geomeanL(counts)) 791 mmM = mean(geomeanL(counts))
723 sigma = log(1+((mmV) / ((mmM)^2))) 792 sigma = log(1+((mmV) / ((mmM)^2)))
724 mu = (log(mmM)) - (0.5 * (sigma)) 793 mu = (log(mmM)) - (0.5 * (sigma))
794 set.seed(samplingSeed)
725 C = rlnorm(100000, mu, sqrt(sigma)) 795 C = rlnorm(100000, mu, sqrt(sigma))
726 for (x in 1:numdup) { 796 for (x in 1:numdup) {
727 cutoff[x] = (mean(counts[[x]]))/(sd(C)) 797 cutoff[x] = (mean(counts[[x]]))/(sd(C))
728 } 798 }
729 cutoff = max(cutoff) 799 cutoff = max(cutoff)
730 mCs = (mean(sample(C, binSize*5, replace = TRUE))) 800 set.seed(samplingSeed)
731 dCs = (sd(sample(C, binSize*5, replace = TRUE))) 801 snow = sample(C, binSize*5, replace = TRUE)
802 mCs = mean(snow)
803 dCs = sd(snow)
732 write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE) 804 write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE)
733 } 805 }
734 } 806 }
735 } else { #bkgd estiamted from before 807 } else { #bkgd estiamted from before
736 bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE) 808 bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE)
742 C = NULL 814 C = NULL
743 mCs = NULL 815 mCs = NULL
744 } else { 816 } else {
745 sigma = as.numeric(bkgdInfo[[1]][2]) 817 sigma = as.numeric(bkgdInfo[[1]][2])
746 mu = as.numeric(bkgdInfo[[1]][3]) 818 mu = as.numeric(bkgdInfo[[1]][3])
819 set.seed(samplingSeed)
747 C = rlnorm(100000, mu, sqrt(sigma)) 820 C = rlnorm(100000, mu, sqrt(sigma))
748 mCs = (mean(sample(C, binSize*5, replace = TRUE))) 821 set.seed(samplingSeed)
749 dCs = (sd(sample(C, binSize*5, replace = TRUE))) 822 snow = sample(C, binSize*5, replace = TRUE)
823 mCs = mean(snow)
824 dCs = sd(snow)
750 } 825 }
751 } 826 }
752 #=======================> DONE! 827 #=======================> DONE!
753 828
754 829
762 } else { 837 } else {
763 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) 838 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd)
764 } 839 }
765 coffeeshop = as.numeric(unlist(coffeeshop)) 840 coffeeshop = as.numeric(unlist(coffeeshop))
766 coffeeshop[coffeeshop != numdup] = 0 841 coffeeshop[coffeeshop != numdup] = 0
842
843 if (sum(coffeeshop) != 0) { #Any enriched bins?
844
767 coffeeshop = c(0, diff(coffeeshop)) 845 coffeeshop = c(0, diff(coffeeshop))
768 coffeeshop = cbind(coffeeshop, bins) 846 coffeeshop = cbind(coffeeshop, bins)
769 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE] 847 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE]
770 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE] 848 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE]
771 coffeeshopNord = coffeeshopNord[,2] 849 coffeeshopNord = coffeeshopNord[,2]
798 # =================================== 876 # ===================================
799 # Initializing Clustering Parameters 877 # Initializing Clustering Parameters
800 # =================================== 878 # ===================================
801 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected? 879 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected?
802 880
803 yummy = ceiling(length(coffeeshop[,1]) / 4) 881 if (initialize == "deterministic") {
804 if (yummy > 20) { 882 yummy = ceiling(length(coffeeshop[,1]) / 1000)
805 yummy = sample(1:yummy, 20) 883 if (yummy == 0) {
806 } else if (yummy > 0) { 884 yummy = 1
807 yummy = 1:yummy 885 }
808 } else { 886 }
809 yummy = 1 887 if (initialize == "stochastic") {
888 yummy = ceiling(length(coffeeshop[,1]) / 4)
889 if (yummy > 20) {
890 set.seed(samplingSeed)
891 yummy = sample(1:yummy, 20)
892 } else if (yummy > 0) {
893 yummy = 1:yummy
894 } else {
895 yummy = 1
896 }
810 } 897 }
811 coffeeshopNord = coffeeshop[yummy,1] 898 coffeeshopNord = coffeeshop[yummy,1]
812 coffeeshopSud = coffeeshop[yummy,2] 899 coffeeshopSud = coffeeshop[yummy,2]
900 set.seed(samplingSeed)
813 noise = rnorm(100000, mean=0, sd=0.1) 901 noise = rnorm(100000, mean=0, sd=0.1)
814 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) 902 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise)
815 #=======================> DONE! 903 #=======================> DONE!
816 904
817 905
826 scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE) 914 scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE)
827 } else { 915 } else {
828 scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1]) 916 scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1])
829 } 917 }
830 scores = unlist(scores) 918 scores = unlist(scores)
831 #scores = scores[is.finite(scores)] #just a sanity check
832 919
833 if (windowe == "auto") { 920 if (windowe == "auto") {
834 lscores = log(scores) 921 lscores = log(scores)
835 if (length(scores) > 0) { 922 if (length(scores) > 0) {
836 if (chrcount == 1) { 923 if (chrcount == 1) {
865 952
866 953
867 # ============== 954 # ==============
868 # Finding Peaks 955 # Finding Peaks
869 # ============== 956 # ==============
870 #if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?, just a sanity check! 957 if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?
871 coffeeshop = coffeeshop[,-3] 958 coffeeshop = coffeeshop[,-3]
872 if (cornum > 1) { 959 if (cornum > 1) {
873 peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, mc.cores = cornum, mc.preschedule = TRUE) 960 peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer, mc.cores = cornum, mc.preschedule = TRUE)
874 } else { 961 } else {
875 peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust) 962 peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer)
876 } 963 }
877 if (!(is.null(peaks))) { #any peaks discovered? 964 if (!(is.null(peaks))) { #any peaks discovered?
878 writethis = processPeaks(peaks) 965 writethis = processPeaks(peaks)
879 #=======================> DONE! 966 #=======================> DONE!
880 967
884 971
885 # ========================= 972 # =========================
886 # Writing Peak Information 973 # Writing Peak Information
887 # ========================= 974 # =========================
888 } else { nothing = TRUE } #no peaks 975 } else { nothing = TRUE } #no peaks
889 #} else { nothing = TRUE } #no enriched windows left after filtering, just a sanity check (at least one window should still be there) 976 } else { nothing = TRUE } #no enriched windows left after filtering
890 } else { nothing = TRUE } #no enriched widnows discovered 977 } else { nothing = TRUE } #no enriched widnows discovered
978 } else { nothing = TRUE; cutthisW = windowe } #no enriched bins discovered
891 979
892 if (isTRUE(nothing)) { 980 if (isTRUE(nothing)) {
893 file.create(paste0(out, "/", chromName, ".peaks.bed")) 981 file.create(paste0(out, "/", chromName, ".peaks.bed"))
894 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) 982 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE)
895 983
896 if (chrcount == 1) { 984 if (chrcount == 1) {
897 message(paste0("No peaks found! - Window Fold Enrichment: ", round(cutthisW,2))) 985 message(paste0("No peaks found! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
898 } else { 986 } else {
899 message("No peaks found!") 987 message("No peaks found!")
900 } 988 }
901 } else { 989 } else {
902 write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = " "), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1) 990 write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = " "), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1)
903 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) 991 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE)
904 992
905 993
906 if (chrcount == 1) { 994 if (chrcount == 1) {
907 message(paste0("Done! - Window Fold Enrichment: ", round(cutthisW,2))) 995 message(paste0("Done! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
908 } else { 996 } else {
909 message("Done!") 997 message("Done!")
910 } 998 }
911 } 999 }
912 #=======================> DONE! 1000 #=======================> DONE!