Mercurial > repos > messersc > jamm
comparison peakfinder.r @ 1:243f75d0ed6e draft default tip
Uploaded.
Includes new release 1.0.7 with fixed optional controls.
author | messersc |
---|---|
date | Thu, 19 Feb 2015 05:39:45 -0500 |
parents | d42f4d78c85e |
children |
comparison
equal
deleted
inserted
replaced
0:d42f4d78c85e | 1:243f75d0ed6e |
---|---|
1 ############################### | 1 ######################################################################## |
2 #### Peak finder for NGS Data | 2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates. |
3 #### R script | 3 # Copyright (C) 2014-2015 Mahmoud Ibrahim |
4 ############################### | 4 # |
5 # This program is free software: you can redistribute it and/or modify | |
6 # it under the terms of the GNU General Public License as published by | |
7 # the Free Software Foundation, either version 3 of the License, or | |
8 # (at your option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public License | |
16 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 # | |
18 # Contact: mahmoud.ibrahim@mdc-berlin.de | |
19 ######################################################################## | |
20 | |
21 | |
5 | 22 |
6 | 23 |
7 # ======================= | 24 # ======================= |
8 # User-defined variables | 25 # User-defined variables |
9 # ======================= | 26 # ======================= |
10 cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want | 27 samplingSeed = 1011414 #the seed makes the random sampling that JAMM does "deterministic". You can change this to your integer seed of choice to reproduce exact same results or randomize it this way: ceiling(rnorm(1,50,300)). |
11 strict = 1 #to make peak calling more / less strict, increase / decrease this number | 28 reportNoClust = "n" #report windows for which clustering failed? Reported windows will be marked by "NoClust" in the peak name (4th column). |
12 meanAdjust = "n" #adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y". | 29 cutoff = NA #To enforce an SNR cutoff ratio for bin enrichment calling, delete NA and enter the number you want. |
13 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) | 30 strict = 1 #To make bin enrichment calling more / less strict, increase / decrease this number. |
31 meanAdjust = "n" #Adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y". | |
32 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000). | |
14 #=======================> DONE! | 33 #=======================> DONE! |
15 | 34 |
16 | 35 |
17 | 36 |
18 | 37 |
113 } else { | 132 } else { |
114 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) | 133 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) |
115 } | 134 } |
116 counts = counts/mCount | 135 counts = counts/mCount |
117 } | 136 } |
118 | 137 |
119 return(counts) | 138 return(counts) |
120 } | 139 } |
121 | 140 |
122 | 141 |
123 | 142 |
144 } else { | 163 } else { |
145 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) | 164 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) |
146 } | 165 } |
147 counts = counts/mCount | 166 counts = counts/mCount |
148 } | 167 } |
149 | 168 |
150 return(counts) | 169 return(counts) |
151 } | 170 } |
152 | 171 |
153 | 172 |
154 | 173 |
166 #extract subset of the background | 185 #extract subset of the background |
167 if (bkgd != "None") { | 186 if (bkgd != "None") { |
168 Cs = counts[[numdup+1]][winStart:tempend] | 187 Cs = counts[[numdup+1]][winStart:tempend] |
169 mCs = mean(Cs) | 188 mCs = mean(Cs) |
170 dCs = sd(Cs) | 189 dCs = sd(Cs) |
171 } else { | 190 } |
172 Cs = sample(C, binSizeTemp, replace = TRUE) | 191 |
173 } | |
174 | |
175 | |
176 go = rep(0, numdup) | 192 go = rep(0, numdup) |
177 for (g in 1:numdup) { | 193 for (g in 1:numdup) { |
178 mS = (mean(counts[[g]][winStart:tempend])) | 194 mS = (mean(counts[[g]][winStart:tempend])) |
179 ratio = mS/dCs | 195 ratio = mS/dCs |
180 if ((mS > (mCs * strict)) && (ratio > cutoff)) { | 196 if ((mS > (mCs * strict)) && (ratio > cutoff)) { |
234 Cs = counts[[numdup+1]] | 250 Cs = counts[[numdup+1]] |
235 Cmin = min(Cs[Cs > 0]) | 251 Cmin = min(Cs[Cs > 0]) |
236 Cs = Cs[winStart:winEnd] | 252 Cs = Cs[winStart:winEnd] |
237 Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change | 253 Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change |
238 } else { | 254 } else { |
255 set.seed(samplingSeed) | |
239 Cs = sample(C, rWinSizeTemp, replace = TRUE) | 256 Cs = sample(C, rWinSizeTemp, replace = TRUE) |
240 Cs = filtfilt(rep(1,80)/80,1,Cs) | 257 Cs = filtfilt(rep(1,80)/80,1,Cs) |
241 } | 258 } |
242 | 259 |
243 #start scoring | 260 #start scoring |
263 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) | 280 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) |
264 } | 281 } |
265 return(Rs) | 282 return(Rs) |
266 } | 283 } |
267 cluster = function(model, sig, init, clustnummer, noise) { #helper function2 | 284 cluster = function(model, sig, init, clustnummer, noise) { #helper function2 |
285 set.seed(samplingSeed) | |
268 noisy = sample(noise, length(sig[,1]), replace = TRUE) | 286 noisy = sample(noise, length(sig[,1]), replace = TRUE) |
269 clust = me(model, sig+noisy, init) | 287 clust = me(model, sig+noisy, init) |
270 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer) | 288 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer) |
271 out = list(bicc = bicc, param = clust$parameters) | 289 out = list(bicc = bicc, param = clust$parameters) |
272 return(out) | 290 return(out) |
281 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord) | 299 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord) |
282 } | 300 } |
283 sig = do.call(rbind, sig) | 301 sig = do.call(rbind, sig) |
284 | 302 |
285 #kmeans initialization | 303 #kmeans initialization |
286 init = kmeans(sig, clustnummer) | 304 set.seed(samplingSeed) |
305 init = kmeans(sig, clustnummer, nstart = 20) | |
287 init = unmap(init$cluster) | 306 init = unmap(init$cluster) |
288 | 307 |
289 | 308 |
290 if (cornum > 1) { | 309 if (cornum > 1) { |
291 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE) | 310 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE) |
305 | 324 |
306 | 325 |
307 | 326 |
308 | 327 |
309 #find peaks | 328 #find peaks |
310 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust) { | 329 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust, clustnummer) { |
311 | 330 |
312 | 331 |
313 plz = which(startlist == winStart) | 332 plz = which(startlist == winStart) |
314 winEnd = coffeeshopSud[plz] | 333 winEnd = coffeeshopSud[plz] |
315 | 334 |
316 | 335 |
317 #will store peak information | 336 #will store peak information |
318 writethis = list() | 337 writethis = list() |
338 ccx = 1 #default is clustering didNOT work | |
319 | 339 |
320 | 340 |
321 rWinSizeTemp = winEnd - winStart + 1 | 341 rWinSizeTemp = winEnd - winStart + 1 |
322 | 342 |
323 #extract subset of the IP | 343 #extract subset of the IP |
336 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j]) | 356 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j]) |
337 } | 357 } |
338 } | 358 } |
339 | 359 |
340 if (resol != "window") { | 360 if (resol != "window") { |
341 | 361 |
342 ####CLUSTERING START HERE##### | 362 #clustering (take 1) |
363 take = 1 | |
364 set.seed(samplingSeed) | |
343 noisy = sample(noise, rWinSizeTemp, replace = TRUE) | 365 noisy = sample(noise, rWinSizeTemp, replace = TRUE) |
344 clust = em(param$modelname, Rs+noisy, param$initparam) | 366 clust = em(param$modelname, Rs+noisy, param$initparam) |
345 clust$classification = map(clust$z) | 367 clust$classification = map(clust$z) |
346 ####CLUSTERING DONE#### | 368 if (!((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification)))))) { #clustering didn't work, take1 |
347 | 369 |
348 #check whether clutering succeeded | 370 #repeat clustering from scratch, take 2! |
349 if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { | 371 set.seed(samplingSeed) |
350 | 372 init = kmeans(Rs, clustnummer, nstart = 20) |
351 | 373 init = unmap(init$cluster) |
352 #check whether all components replicates agreed on clustering assignments | 374 set.seed(samplingSeed) |
353 ccx = 1 | 375 noisy = sample(noise, rWinSizeTemp, replace = TRUE) |
354 if (numdup > 1) { | 376 clust = me(param$modelname, Rs+noisy, init) |
377 clust$classification = map(clust$z) | |
378 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { | |
379 ccx = 0 #clustering worked, take2 | |
380 take = 2 | |
381 } | |
382 } else {ccx = 0} #clustering worked, take1 | |
383 | |
384 if (ccx != 1) { #clustering worked either in take1 or take2 | |
385 | |
386 if (numdup > 1) { #check whether all components replicates agreed on clustering assignments | |
355 cc = vector(mode = "numeric", length = numdup) | 387 cc = vector(mode = "numeric", length = numdup) |
356 for (g in 1:numdup) { | 388 for (g in 1:numdup) { |
357 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) | 389 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) |
358 } | 390 } |
359 ccx = sum(diff(cc)) | 391 ccx = sum(diff(cc)) |
360 cluster = cc[1] | 392 cluster = cc[1] |
361 rm(cc) | 393 rm(cc) |
394 | |
395 if ((ccx != 0) && (take == 1)) { #not all replicates agreed? Repeat the clustering with from scratch if not already done! | |
396 set.seed(samplingSeed) | |
397 init = kmeans(Rs, clustnummer, nstart = 20) | |
398 init = unmap(init$cluster) | |
399 set.seed(samplingSeed) | |
400 noisy = sample(noise, rWinSizeTemp, replace = TRUE) | |
401 clust = me(param$modelname, Rs+noisy, init) | |
402 clust$classification = map(clust$z) | |
403 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { #clustering worked? check whether replicates agreed take 3 | |
404 cc = vector(mode = "numeric", length = numdup) | |
405 for (g in 1:numdup) { | |
406 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) | |
407 } | |
408 ccx = sum(diff(cc)) | |
409 cluster = cc[1] | |
410 rm(cc) | |
411 take = 3 | |
412 } | |
413 } | |
414 } else { #no replicates! | |
415 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!) | |
416 } | |
417 } | |
418 | |
419 if ((ccx != 0) && (reportNoClust=="y")) { resol = "window" } #clustering did not work and windows should be reported | |
420 | |
421 | |
422 if (ccx == 0) { #clustering worked and all replicates agree on the cluster assignments | |
423 | |
424 #extract subset of the background | |
425 if (bkgd != "None") { | |
426 Cs = counts[[numdup+1]][winStart:winEnd] | |
427 Cs = filtfilt(rep(1,80)/80,1,Cs) | |
362 } else { | 428 } else { |
363 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!) | 429 set.seed(samplingSeed) |
364 ccx = 0 | 430 Cs = sample(C, rWinSizeTemp, replace = TRUE) |
431 Cs = filtfilt(rep(1,80)/80,1,Cs) | |
365 } | 432 } |
366 if (ccx == 0) { | |
367 | 433 |
368 #extract subset of the background | 434 #find region boundaries |
369 if (bkgd != "None") { | 435 loc = 1:length(clust$classification) |
370 Cs = counts[[numdup+1]][winStart:winEnd] | 436 gmclass = cbind(loc, clust$classification) |
371 Cs = filtfilt(rep(1,80)/80,1,Cs) | 437 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE] |
372 } else { | 438 rStart = locPeak[1] #start position of the region |
373 Cs = sample(C, rWinSizeTemp, replace = TRUE) | 439 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region |
374 Cs = filtfilt(rep(1,80)/80,1,Cs) | 440 |
441 #peak resolution check | |
442 if (resol == "region") { | |
443 pSize = rEnd - rStart | |
444 signal = (geomean(Rs[rStart:rEnd,])) | |
445 signal2 = (signal) - (Cs[rStart:rEnd]) | |
446 gm = mean(signal2) | |
447 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1 | |
448 will2k = wilcox.test(signal, Cs[rStart:rEnd]) | |
449 | |
450 #Is there signal in the region above background | |
451 if (gm > 0) { | |
452 writethis[[1]] = rStart + winStart - 1 | |
453 writethis[[2]] = rEnd + winStart | |
454 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1) | |
455 writethis[[4]] = "1000" | |
456 writethis[[5]] = "." | |
457 writethis[[6]] = gm | |
458 writethis[[7]] = will2k$p.value | |
459 writethis[[8]] = "-1" | |
460 writethis[[9]] = summit | |
375 } | 461 } |
376 | 462 } else if (resol == "peak") { |
377 #find region boundaries | 463 #find out where separate peaks are |
378 loc = 1:length(clust$classification) | 464 d = diff(locPeak[,1]) |
379 gmclass = cbind(loc, clust$classification) | 465 d[length(d)+1] = 0 |
380 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE] | 466 locPeak = cbind(locPeak, d) |
381 rStart = locPeak[1] #start position of the region | 467 bound1 = which(locPeak[,3] > 1, arr.in=TRUE) |
382 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region | 468 bound2 = bound1 + 1 |
383 | 469 bound = locPeak[sort(c(bound1,bound2))] |
384 #peak resolution check | 470 bound = c(rStart, bound, rEnd) |
385 if (resol == "region") { | 471 w = 1 |
386 pSize = rEnd - rStart | 472 warum = 0 |
387 signal = (geomean(Rs[rStart:rEnd,])) | 473 while (w < length(bound)) { |
388 signal2 = (signal) - (Cs[rStart:rEnd]) | 474 pStart = bound[w] + winStart - 1 |
475 pEnd = bound[w+1] + winStart | |
476 pSize = pEnd - pStart | |
477 signal = (geomean(Rs[(bound[w]):(bound[w+1]),])) | |
478 signal2 = (signal) - (Cs[bound[w]:bound[w+1]]) | |
389 gm = mean(signal2) | 479 gm = mean(signal2) |
390 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1 | 480 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1 |
391 will2k = wilcox.test(signal, Cs[rStart:rEnd]) | 481 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])]) |
392 | 482 |
483 weil = warum * 9 | |
393 #Is there signal in the region above background | 484 #Is there signal in the region above background |
394 if (gm > 0) { | 485 if (gm > 0) { |
395 writethis[[1]] = rStart + winStart - 1 | 486 writethis[[1+weil]] = pStart |
396 writethis[[2]] = rEnd + winStart | 487 writethis[[2+weil]] = pEnd |
397 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1) | 488 writethis[[3+weil]] = paste0(chromName, ".", pStart) |
398 writethis[[4]] = "1000" | 489 writethis[[4+weil]] = "1000" |
399 writethis[[5]] = "." | 490 writethis[[5+weil]] = "." |
400 writethis[[6]] = gm | 491 writethis[[6+weil]] = gm |
401 writethis[[7]] = will2k$p.value | 492 writethis[[7+weil]] = will2k$p.value |
402 writethis[[8]] = "-1" | 493 writethis[[8+weil]] = "-1" |
403 writethis[[9]] = summit | 494 writethis[[9+weil]] = summit |
404 } | 495 } |
405 } else if (resol == "peak") { | 496 w = w + 2 |
406 #find out where separate peaks are | 497 warum = warum + 1 |
407 d = diff(locPeak[,1]) | 498 } |
408 d[length(d)+1] = 0 | 499 } #peak resolution check |
409 locPeak = cbind(locPeak, d) | 500 } #clustering worked and all replicates agree on clustering assignments? |
410 bound1 = which(locPeak[,3] > 1, arr.in=TRUE) | 501 } #window resolution check |
411 bound2 = bound1 + 1 | 502 |
412 bound = locPeak[sort(c(bound1,bound2))] | 503 if (resol == "window") { |
413 bound = c(rStart, bound, rEnd) | |
414 w = 1 | |
415 warum = 0 | |
416 while (w < length(bound)) { | |
417 pStart = bound[w] + winStart - 1 | |
418 pEnd = bound[w+1] + winStart | |
419 pSize = pEnd - pStart | |
420 signal = (geomean(Rs[(bound[w]):(bound[w+1]),])) | |
421 signal2 = (signal) - (Cs[bound[w]:bound[w+1]]) | |
422 gm = mean(signal2) | |
423 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1 | |
424 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])]) | |
425 | |
426 weil = warum * 9 | |
427 #Is there signal in the region above background | |
428 if (gm > 0) { | |
429 writethis[[1+weil]] = pStart | |
430 writethis[[2+weil]] = pEnd | |
431 writethis[[3+weil]] = paste0(chromName, ".", pStart) | |
432 writethis[[4+weil]] = "1000" | |
433 writethis[[5+weil]] = "." | |
434 writethis[[6+weil]] = gm | |
435 writethis[[7+weil]] = will2k$p.value | |
436 writethis[[8+weil]] = "-1" | |
437 writethis[[9+weil]] = summit | |
438 } | |
439 w = w + 2 | |
440 warum = warum + 1 | |
441 } | |
442 | |
443 } #peak resolution check | |
444 } #all replicates agree on clustering assignments? | |
445 } #clustering worked? | |
446 | |
447 } else if (resol == "window") { #window resolution check | |
448 | 504 |
449 #extract subset of the background | 505 #extract subset of the background |
450 if (bkgd != "None") { | 506 if (bkgd != "None") { |
451 Cs = counts[[numdup+1]][winStart:winEnd] | 507 Cs = counts[[numdup+1]][winStart:winEnd] |
452 Cs = filtfilt(rep(1,80)/80,1,Cs) | 508 Cs = filtfilt(rep(1,80)/80,1,Cs) |
453 } else { | 509 } else { |
510 set.seed(samplingSeed) | |
454 Cs = sample(C, rWinSizeTemp, replace = TRUE) | 511 Cs = sample(C, rWinSizeTemp, replace = TRUE) |
455 Cs = filtfilt(rep(1,80)/80,1,Cs) | 512 Cs = filtfilt(rep(1,80)/80,1,Cs) |
456 } | 513 } |
457 | 514 |
458 #calculate scores | 515 #calculate scores |
465 | 522 |
466 #Is there signal in the region above background | 523 #Is there signal in the region above background |
467 if (gm > 0) { | 524 if (gm > 0) { |
468 writethis[[1]] = winStart - 1 | 525 writethis[[1]] = winStart - 1 |
469 writethis[[2]] = winEnd | 526 writethis[[2]] = winEnd |
470 writethis[[3]] = paste0(chromName, ".", winStart -1) | 527 writethis[[3]] = paste0(chromName, ".", winStart -1, ".NoClust") |
471 writethis[[4]] = "1000" | 528 writethis[[4]] = "1000" |
472 writethis[[5]] = "." | 529 writethis[[5]] = "." |
473 writethis[[6]] = gm | 530 writethis[[6]] = gm |
474 writethis[[7]] = will2k$p.value | 531 writethis[[7]] = will2k$p.value |
475 writethis[[8]] = "-1" | 532 writethis[[8]] = "-1" |
476 writethis[[9]] = summit | 533 writethis[[9]] = summit |
477 } | 534 } |
478 } #window clustering | 535 } #window reporting |
479 | 536 |
480 return(writethis) | 537 return(writethis) |
481 } | 538 } |
482 | 539 |
483 | 540 |
504 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line | 561 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line |
505 | 562 |
506 #Parsing arguments and storing values | 563 #Parsing arguments and storing values |
507 for (each.arg in args) { | 564 for (each.arg in args) { |
508 #chormosome size file | 565 #chormosome size file |
509 if (grepl('^-sfile=',each.arg)) { | 566 if (grepl('-sfile=',each.arg)) { |
510 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 567 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
511 if (! is.na(arg.split[2]) ) { | 568 if (! is.na(arg.split[2]) ) { |
512 size.file <- arg.split[2] | 569 size.file <- arg.split[2] |
513 } else { | 570 } else { |
514 stop('No genome size file') | 571 stop('No genome size file') |
515 } | 572 } |
516 } | 573 } |
517 #bed file names | 574 #bed file names |
518 if (grepl('^-bednames=',each.arg)) { | 575 if (grepl('-bednames=',each.arg)) { |
519 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 576 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
520 if (! is.na(arg.split[2]) ) { | 577 if (! is.na(arg.split[2]) ) { |
521 bednames <- arg.split[2] | 578 bednames <- arg.split[2] |
522 } else { | 579 } else { |
523 stop('No bed file names') | 580 stop('No bed file names') |
524 } | 581 } |
525 } | 582 } |
526 #bed files directory | 583 #bed files directory |
527 if (grepl('^-frag=',each.arg)) { | 584 if (grepl('-frag=',each.arg)) { |
528 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 585 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
529 if (! is.na(arg.split[2]) ) { | 586 if (! is.na(arg.split[2]) ) { |
530 frag <- arg.split[2] | 587 frag <- arg.split[2] |
531 } else { | 588 } else { |
532 stop('No fragment length given') | 589 stop('No fragment length given') |
533 } | 590 } |
534 } | 591 } |
535 #bakcground files directory | 592 #bakcground files directory |
536 if (grepl('^-bkgd=',each.arg)) { | 593 if (grepl('-bkgd=',each.arg)) { |
537 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 594 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
538 if (! is.na(arg.split[2]) ) { | 595 if (! is.na(arg.split[2]) ) { |
539 bkgd <- arg.split[2] | 596 bkgd <- arg.split[2] |
540 } else { | 597 } else { |
541 stop('No background file') | 598 stop('No background file') |
542 } | 599 } |
543 } | 600 } |
544 #bakcground files directory | 601 #bakcground files directory |
545 if (grepl('^-out=',each.arg)) { | 602 if (grepl('-out=',each.arg)) { |
546 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 603 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
547 if (! is.na(arg.split[2]) ) { | 604 if (! is.na(arg.split[2]) ) { |
548 out <- arg.split[2] | 605 out <- arg.split[2] |
549 } else { | 606 } else { |
550 stop('No output directory given') | 607 stop('No output directory given') |
551 } | 608 } |
552 } | 609 } |
553 #Cluster number | 610 #Cluster number |
554 if (grepl('^-clustnummer=',each.arg)) { | 611 if (grepl('-clustnummer=',each.arg)) { |
555 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 612 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
556 if (! is.na(arg.split[2]) ) { | 613 if (! is.na(arg.split[2]) ) { |
557 clustnummer <- as.numeric(arg.split[2]) | 614 clustnummer <- as.numeric(arg.split[2]) |
558 } | 615 } |
559 } | 616 } |
560 #resolution | 617 #resolution |
561 if (grepl('^-resolution=',each.arg)) { | 618 if (grepl('-resolution=',each.arg)) { |
562 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 619 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
563 if (! is.na(arg.split[2]) ) { | 620 if (! is.na(arg.split[2]) ) { |
564 resol <- arg.split[2] | 621 resol <- arg.split[2] |
565 } | 622 } |
566 } | 623 } |
567 #processor cores | 624 #processor cores |
568 if (grepl('^-p=',each.arg)) { | 625 if (grepl('-p=',each.arg)) { |
569 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 626 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
570 if (! is.na(arg.split[2]) ) { | 627 if (! is.na(arg.split[2]) ) { |
571 cornum <- as.numeric(arg.split[2]) | 628 cornum <- as.numeric(arg.split[2]) |
572 } | 629 } |
573 } | 630 } |
574 #minimum window size | 631 #minimum window size |
575 if (grepl('^-window=',each.arg)) { | 632 if (grepl('-window=',each.arg)) { |
576 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 633 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
577 if (! is.na(arg.split[2]) ) { | 634 if (! is.na(arg.split[2]) ) { |
578 winSize <- arg.split[2] | 635 winSize <- arg.split[2] |
579 } | 636 } |
580 } | 637 } |
581 #window size | 638 #window size |
582 if (grepl('^-bin=',each.arg)) { | 639 if (grepl('-bin=',each.arg)) { |
583 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 640 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
584 if (! is.na(arg.split[2]) ) { | 641 if (! is.na(arg.split[2]) ) { |
585 binsize <- arg.split[2] | 642 binsize <- arg.split[2] |
586 } | 643 } |
587 } | 644 } |
588 #type (paired / single) | 645 #type (paired / single) |
589 if (grepl('^-type=',each.arg)) { | 646 if (grepl('-type=',each.arg)) { |
590 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 647 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
591 if (! is.na(arg.split[2]) ) { | 648 if (! is.na(arg.split[2]) ) { |
592 type <- arg.split[2] | 649 type <- arg.split[2] |
593 } | 650 } |
594 } | 651 } |
595 #chromosome number | 652 #chromosome number |
596 if (grepl('^-chrcount=',each.arg)) { | 653 if (grepl('-chrcount=',each.arg)) { |
597 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 654 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
598 if (! is.na(arg.split[2]) ) { | 655 if (! is.na(arg.split[2]) ) { |
599 chrcount <- as.numeric(arg.split[2]) | 656 chrcount <- as.numeric(arg.split[2]) |
600 } | 657 } |
601 } | 658 } |
602 #window enrichment cutoff | 659 #window enrichment cutoff |
603 if (grepl('^-windowe=',each.arg)) { | 660 if (grepl('-windowe=',each.arg)) { |
604 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | 661 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] |
605 if (! is.na(arg.split[2]) ) { | 662 if (! is.na(arg.split[2]) ) { |
606 windowe <- arg.split[2] | 663 windowe <- arg.split[2] |
607 } | 664 } |
608 } | 665 } |
666 #initialize | |
667 if (grepl('-initModel=',each.arg)) { | |
668 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
669 if (! is.na(arg.split[2]) ) { | |
670 initialize <- arg.split[2] | |
671 } | |
672 } | |
673 | |
609 } | 674 } |
610 | 675 |
611 ##Parse in variables | 676 ##Parse in variables |
612 chromosomes = read.table(size.file, header=FALSE) | 677 chromosomes = read.table(size.file, header=FALSE) |
613 chromName = chromosomes$V1; #which chromosome | 678 chromName = chromosomes$V1; #which chromosome |
614 chromSize = chromosomes$V2; #chromosomes size | 679 chromSize = chromosomes$V2; #chromosomes size |
615 rm(chromosomes) | 680 rm(chromosomes) |
616 | 681 |
682 if (chrcount == 1) { | |
683 write(paste(samplingSeed), file = paste0(out, "/seed.info")) | |
684 } else { | |
685 samplingSeed = as.numeric(read.table(paste0(out, "/seed.info"), header = FALSE)[[1]]) | |
686 } | |
617 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) | 687 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) |
618 numdup = length(readsFiles) #number of replicates | 688 numdup = length(readsFiles) #number of replicates |
619 if (bkgd != "None") { | 689 if (bkgd != "None") { |
620 readsFiles[[numdup+1]] = bkgd | 690 readsFiles[[numdup+1]] = bkgd |
621 } | 691 } |
701 | 771 |
702 | 772 |
703 # ============================ | 773 # ============================ |
704 # Estimating Background Model | 774 # Estimating Background Model |
705 # ============================ | 775 # ============================ |
706 | |
707 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff) | 776 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff) |
708 if (is.na(cutoff)) { | 777 if (is.na(cutoff)) { |
709 if (bkgd != "None") { | 778 if (bkgd != "None") { |
710 cutoff = vector(length = numdup) | 779 cutoff = vector(length = numdup) |
711 sdC = sd(counts[[numdup+1]]) | 780 sdC = sd(counts[[numdup+1]]) |
720 cutoff = vector(length = numdup) | 789 cutoff = vector(length = numdup) |
721 mmV = var(geomeanL(counts)) | 790 mmV = var(geomeanL(counts)) |
722 mmM = mean(geomeanL(counts)) | 791 mmM = mean(geomeanL(counts)) |
723 sigma = log(1+((mmV) / ((mmM)^2))) | 792 sigma = log(1+((mmV) / ((mmM)^2))) |
724 mu = (log(mmM)) - (0.5 * (sigma)) | 793 mu = (log(mmM)) - (0.5 * (sigma)) |
794 set.seed(samplingSeed) | |
725 C = rlnorm(100000, mu, sqrt(sigma)) | 795 C = rlnorm(100000, mu, sqrt(sigma)) |
726 for (x in 1:numdup) { | 796 for (x in 1:numdup) { |
727 cutoff[x] = (mean(counts[[x]]))/(sd(C)) | 797 cutoff[x] = (mean(counts[[x]]))/(sd(C)) |
728 } | 798 } |
729 cutoff = max(cutoff) | 799 cutoff = max(cutoff) |
730 mCs = (mean(sample(C, binSize*5, replace = TRUE))) | 800 set.seed(samplingSeed) |
731 dCs = (sd(sample(C, binSize*5, replace = TRUE))) | 801 snow = sample(C, binSize*5, replace = TRUE) |
802 mCs = mean(snow) | |
803 dCs = sd(snow) | |
732 write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE) | 804 write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE) |
733 } | 805 } |
734 } | 806 } |
735 } else { #bkgd estiamted from before | 807 } else { #bkgd estiamted from before |
736 bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE) | 808 bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE) |
742 C = NULL | 814 C = NULL |
743 mCs = NULL | 815 mCs = NULL |
744 } else { | 816 } else { |
745 sigma = as.numeric(bkgdInfo[[1]][2]) | 817 sigma = as.numeric(bkgdInfo[[1]][2]) |
746 mu = as.numeric(bkgdInfo[[1]][3]) | 818 mu = as.numeric(bkgdInfo[[1]][3]) |
819 set.seed(samplingSeed) | |
747 C = rlnorm(100000, mu, sqrt(sigma)) | 820 C = rlnorm(100000, mu, sqrt(sigma)) |
748 mCs = (mean(sample(C, binSize*5, replace = TRUE))) | 821 set.seed(samplingSeed) |
749 dCs = (sd(sample(C, binSize*5, replace = TRUE))) | 822 snow = sample(C, binSize*5, replace = TRUE) |
823 mCs = mean(snow) | |
824 dCs = sd(snow) | |
750 } | 825 } |
751 } | 826 } |
752 #=======================> DONE! | 827 #=======================> DONE! |
753 | 828 |
754 | 829 |
762 } else { | 837 } else { |
763 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) | 838 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) |
764 } | 839 } |
765 coffeeshop = as.numeric(unlist(coffeeshop)) | 840 coffeeshop = as.numeric(unlist(coffeeshop)) |
766 coffeeshop[coffeeshop != numdup] = 0 | 841 coffeeshop[coffeeshop != numdup] = 0 |
842 | |
843 if (sum(coffeeshop) != 0) { #Any enriched bins? | |
844 | |
767 coffeeshop = c(0, diff(coffeeshop)) | 845 coffeeshop = c(0, diff(coffeeshop)) |
768 coffeeshop = cbind(coffeeshop, bins) | 846 coffeeshop = cbind(coffeeshop, bins) |
769 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE] | 847 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE] |
770 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE] | 848 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE] |
771 coffeeshopNord = coffeeshopNord[,2] | 849 coffeeshopNord = coffeeshopNord[,2] |
798 # =================================== | 876 # =================================== |
799 # Initializing Clustering Parameters | 877 # Initializing Clustering Parameters |
800 # =================================== | 878 # =================================== |
801 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected? | 879 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected? |
802 | 880 |
803 yummy = ceiling(length(coffeeshop[,1]) / 4) | 881 if (initialize == "deterministic") { |
804 if (yummy > 20) { | 882 yummy = ceiling(length(coffeeshop[,1]) / 1000) |
805 yummy = sample(1:yummy, 20) | 883 if (yummy == 0) { |
806 } else if (yummy > 0) { | 884 yummy = 1 |
807 yummy = 1:yummy | 885 } |
808 } else { | 886 } |
809 yummy = 1 | 887 if (initialize == "stochastic") { |
888 yummy = ceiling(length(coffeeshop[,1]) / 4) | |
889 if (yummy > 20) { | |
890 set.seed(samplingSeed) | |
891 yummy = sample(1:yummy, 20) | |
892 } else if (yummy > 0) { | |
893 yummy = 1:yummy | |
894 } else { | |
895 yummy = 1 | |
896 } | |
810 } | 897 } |
811 coffeeshopNord = coffeeshop[yummy,1] | 898 coffeeshopNord = coffeeshop[yummy,1] |
812 coffeeshopSud = coffeeshop[yummy,2] | 899 coffeeshopSud = coffeeshop[yummy,2] |
900 set.seed(samplingSeed) | |
813 noise = rnorm(100000, mean=0, sd=0.1) | 901 noise = rnorm(100000, mean=0, sd=0.1) |
814 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) | 902 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) |
815 #=======================> DONE! | 903 #=======================> DONE! |
816 | 904 |
817 | 905 |
826 scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE) | 914 scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE) |
827 } else { | 915 } else { |
828 scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1]) | 916 scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1]) |
829 } | 917 } |
830 scores = unlist(scores) | 918 scores = unlist(scores) |
831 #scores = scores[is.finite(scores)] #just a sanity check | |
832 | 919 |
833 if (windowe == "auto") { | 920 if (windowe == "auto") { |
834 lscores = log(scores) | 921 lscores = log(scores) |
835 if (length(scores) > 0) { | 922 if (length(scores) > 0) { |
836 if (chrcount == 1) { | 923 if (chrcount == 1) { |
865 | 952 |
866 | 953 |
867 # ============== | 954 # ============== |
868 # Finding Peaks | 955 # Finding Peaks |
869 # ============== | 956 # ============== |
870 #if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?, just a sanity check! | 957 if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering? |
871 coffeeshop = coffeeshop[,-3] | 958 coffeeshop = coffeeshop[,-3] |
872 if (cornum > 1) { | 959 if (cornum > 1) { |
873 peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, mc.cores = cornum, mc.preschedule = TRUE) | 960 peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer, mc.cores = cornum, mc.preschedule = TRUE) |
874 } else { | 961 } else { |
875 peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust) | 962 peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer) |
876 } | 963 } |
877 if (!(is.null(peaks))) { #any peaks discovered? | 964 if (!(is.null(peaks))) { #any peaks discovered? |
878 writethis = processPeaks(peaks) | 965 writethis = processPeaks(peaks) |
879 #=======================> DONE! | 966 #=======================> DONE! |
880 | 967 |
884 | 971 |
885 # ========================= | 972 # ========================= |
886 # Writing Peak Information | 973 # Writing Peak Information |
887 # ========================= | 974 # ========================= |
888 } else { nothing = TRUE } #no peaks | 975 } else { nothing = TRUE } #no peaks |
889 #} else { nothing = TRUE } #no enriched windows left after filtering, just a sanity check (at least one window should still be there) | 976 } else { nothing = TRUE } #no enriched windows left after filtering |
890 } else { nothing = TRUE } #no enriched widnows discovered | 977 } else { nothing = TRUE } #no enriched widnows discovered |
978 } else { nothing = TRUE; cutthisW = windowe } #no enriched bins discovered | |
891 | 979 |
892 if (isTRUE(nothing)) { | 980 if (isTRUE(nothing)) { |
893 file.create(paste0(out, "/", chromName, ".peaks.bed")) | 981 file.create(paste0(out, "/", chromName, ".peaks.bed")) |
894 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) | 982 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) |
895 | 983 |
896 if (chrcount == 1) { | 984 if (chrcount == 1) { |
897 message(paste0("No peaks found! - Window Fold Enrichment: ", round(cutthisW,2))) | 985 message(paste0("No peaks found! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed)) |
898 } else { | 986 } else { |
899 message("No peaks found!") | 987 message("No peaks found!") |
900 } | 988 } |
901 } else { | 989 } else { |
902 write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = " "), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1) | 990 write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = " "), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1) |
903 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) | 991 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE) |
904 | 992 |
905 | 993 |
906 if (chrcount == 1) { | 994 if (chrcount == 1) { |
907 message(paste0("Done! - Window Fold Enrichment: ", round(cutthisW,2))) | 995 message(paste0("Done! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed)) |
908 } else { | 996 } else { |
909 message("Done!") | 997 message("Done!") |
910 } | 998 } |
911 } | 999 } |
912 #=======================> DONE! | 1000 #=======================> DONE! |