1
|
1 ########################################################################
|
|
2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
|
|
3 # Copyright (C) 2014-2015 Mahmoud Ibrahim
|
|
4 #
|
|
5 # This program is free software: you can redistribute it and/or modify
|
|
6 # it under the terms of the GNU General Public License as published by
|
|
7 # the Free Software Foundation, either version 3 of the License, or
|
|
8 # (at your option) any later version.
|
|
9 #
|
|
10 # This program is distributed in the hope that it will be useful,
|
|
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 # GNU General Public License for more details.
|
|
14 #
|
|
15 # You should have received a copy of the GNU General Public License
|
|
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
17 #
|
|
18 # Contact: mahmoud.ibrahim@mdc-berlin.de
|
|
19 ########################################################################
|
|
20
|
|
21
|
0
|
22
|
|
23
|
|
24 # =======================
|
|
25 # User-defined variables
|
|
26 # =======================
|
1
|
27 samplingSeed = 1011414 #the seed makes the random sampling that JAMM does "deterministic". You can change this to your integer seed of choice to reproduce exact same results or randomize it this way: ceiling(rnorm(1,50,300)).
|
|
28 reportNoClust = "n" #report windows for which clustering failed? Reported windows will be marked by "NoClust" in the peak name (4th column).
|
|
29 cutoff = NA #To enforce an SNR cutoff ratio for bin enrichment calling, delete NA and enter the number you want.
|
|
30 strict = 1 #To make bin enrichment calling more / less strict, increase / decrease this number.
|
|
31 meanAdjust = "n" #Adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y".
|
|
32 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000).
|
0
|
33 #=======================> DONE!
|
|
34
|
|
35
|
|
36
|
|
37
|
|
38 # ================================
|
|
39 # Required Libraries check & load
|
|
40 # ================================
|
|
41 if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
|
|
42 stop("R package 'mclust', 'signal' and 'parallel' are required. Please install them!")
|
|
43 }
|
|
44 suppressPackageStartupMessages(library("mclust"))
|
|
45 suppressPackageStartupMessages(library("signal"))
|
|
46 suppressPackageStartupMessages(library("parallel"))
|
|
47 #=======================> DONE!
|
|
48
|
|
49
|
|
50
|
|
51
|
|
52
|
|
53 # =================
|
|
54 # Custom Functions
|
|
55 # =================
|
|
56 #Get per-row Geometric mean (takes list, returns vectors, not lists!)
|
|
57 geomeanL <- function(mat){
|
|
58 n = length(mat)
|
|
59 if (n > 1) {
|
|
60 mult = (mat[[1]])*(mat[[2]])
|
|
61 if (n > 2) {
|
|
62 for (i in 3:n) {
|
|
63 mult = mult*(mat[[i]])
|
|
64 }
|
|
65 }
|
|
66 mult = mult^(1/n)
|
|
67 mat = mult
|
|
68 return(mat)
|
|
69 } else {
|
|
70 return(mat[[1]])
|
|
71 }
|
|
72 }
|
|
73
|
|
74
|
|
75
|
|
76 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
|
|
77 geomean <- function(mat){
|
|
78 n = NCOL(mat)
|
|
79 if (n > 1) {
|
|
80 mult = (mat[,1])*(mat[,2])
|
|
81 if (n > 2) {
|
|
82 for (i in 3:n) {
|
|
83 mult = mult*(mat[,i])
|
|
84 }
|
|
85 }
|
|
86 mult = mult^(1/n)
|
|
87 mat = mult
|
|
88 }
|
|
89 return(mat)
|
|
90 }
|
|
91
|
|
92
|
|
93
|
|
94 #Read in bed(st2) file
|
|
95 parsein = function(bedfile) {
|
|
96 l = read.table(bedfile, header = FALSE)[[1]]
|
|
97 l = l + 1
|
|
98 return(l)
|
|
99 }
|
|
100
|
|
101
|
|
102 #Read in bedpe(st2) file
|
|
103 parseinpe = function(bedfile) {
|
|
104 l = read.table(bedfile, header = FALSE)
|
|
105 l = cbind((l[[1]] + 1), l[[2]])
|
|
106 return(l)
|
|
107 }
|
|
108
|
|
109
|
|
110 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
|
|
111 countreads = function(bedfile, reads, frag, chromsize, filelist, chrcount) {
|
|
112
|
|
113 o = which(filelist == bedfile)
|
|
114
|
|
115 counts = vector(mode = "numeric", length = chromsize)
|
|
116
|
|
117 for (j in 1:length(reads[[o]])) {
|
|
118 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
|
|
119 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
|
|
120 }
|
|
121 }
|
|
122
|
|
123 mCount = mean(counts)
|
|
124
|
|
125 if (chrcount == 1) {
|
|
126 counts = counts/mCount
|
|
127 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
|
|
128 } else {
|
|
129 meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
|
|
130 if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) {
|
|
131 mCount = meanCounts
|
|
132 } else {
|
|
133 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
|
|
134 }
|
|
135 counts = counts/mCount
|
|
136 }
|
1
|
137
|
0
|
138 return(counts)
|
|
139 }
|
|
140
|
|
141
|
|
142
|
|
143 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
|
|
144 countreadspe = function(bedfile, reads, chromsize, filelist, chrcount) {
|
|
145
|
|
146 o = which(filelist == bedfile)
|
|
147
|
|
148 counts = vector(mode = "numeric", length = chromsize)
|
|
149
|
|
150 for (j in 1:length(reads[[o]][,1])) {
|
|
151 counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1
|
|
152 }
|
|
153
|
|
154 mCount = mean(counts)
|
|
155
|
|
156 if (chrcount == 1) {
|
|
157 counts = counts/mCount
|
|
158 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
|
|
159 } else {
|
|
160 meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
|
|
161 if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) {
|
|
162 mCount = meanCounts
|
|
163 } else {
|
|
164 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
|
|
165 }
|
|
166 counts = counts/mCount
|
|
167 }
|
1
|
168
|
0
|
169 return(counts)
|
|
170 }
|
|
171
|
|
172
|
|
173
|
|
174 #find enriched bins
|
|
175 pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) {
|
|
176
|
|
177 if ((winStart + binSize) <= chromSize) {
|
|
178 winEnd = winStart + binSize
|
|
179 } else {
|
|
180 winEnd = chromSize
|
|
181 }
|
|
182 binSizeTemp = winEnd - winStart
|
|
183 tempend = winEnd - 1
|
|
184
|
|
185 #extract subset of the background
|
|
186 if (bkgd != "None") {
|
|
187 Cs = counts[[numdup+1]][winStart:tempend]
|
|
188 mCs = mean(Cs)
|
|
189 dCs = sd(Cs)
|
|
190 }
|
|
191
|
|
192 go = rep(0, numdup)
|
|
193 for (g in 1:numdup) {
|
|
194 mS = (mean(counts[[g]][winStart:tempend]))
|
|
195 ratio = mS/dCs
|
|
196 if ((mS > (mCs * strict)) && (ratio > cutoff)) {
|
|
197 go[g] = 1
|
|
198 }
|
|
199 }
|
|
200 veep = sum(go)
|
|
201 return(veep)
|
|
202 }
|
|
203
|
|
204
|
|
205
|
|
206
|
|
207 #find enriched wins
|
|
208 pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) {
|
|
209
|
|
210 plz = which(startlist == winStart)
|
|
211 winEnd = coffeeshopSud[plz]
|
|
212 rWinSize = winEnd - winStart + 1
|
|
213
|
|
214
|
|
215 if(rWinSize >= winSize) {
|
|
216 mS = rep(0, numdup)
|
|
217 for (g in 1:numdup) {
|
|
218 mS[g] = (mean(counts[[g]][winStart:winEnd]))
|
|
219 }
|
|
220 veep = mean(mS)
|
|
221 } else {
|
|
222 veep = FALSE
|
|
223 }
|
|
224
|
|
225 return(veep)
|
|
226 }
|
|
227
|
|
228
|
|
229
|
|
230 #score windows for fast analysis
|
|
231 scorewindow = function(winStart, coffeeshopSud, numdup, C, bkgd, counts, startlist) {
|
|
232
|
|
233 plz = which(startlist == winStart)
|
|
234 winEnd = coffeeshopSud[plz]
|
|
235
|
|
236 #will store peak information
|
|
237 writethis = list()
|
|
238
|
|
239 rWinSizeTemp = winEnd - winStart + 1
|
|
240
|
|
241 #extract subset of the IP
|
|
242 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
|
|
243 Rsr = Rs
|
|
244 for (j in 1:numdup) {
|
|
245 Rsr[,j] = counts[[j]][winStart:winEnd]
|
|
246 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
|
|
247 }
|
|
248 #extract subset of the background
|
|
249 if (bkgd != "None") {
|
|
250 Cs = counts[[numdup+1]]
|
|
251 Cmin = min(Cs[Cs > 0])
|
|
252 Cs = Cs[winStart:winEnd]
|
|
253 Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change
|
|
254 } else {
|
1
|
255 set.seed(samplingSeed)
|
0
|
256 Cs = sample(C, rWinSizeTemp, replace = TRUE)
|
|
257 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
258 }
|
|
259
|
|
260 #start scoring
|
|
261 signal = (geomean(Rs))
|
|
262 cairo = (mean(signal)) / (mean(Cs))
|
|
263 return(cairo)
|
|
264 }
|
|
265
|
|
266
|
|
267 #Initialize MClust clustering parameters
|
|
268 smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1
|
|
269
|
|
270 plz = which(startlist == winStart)
|
|
271 winEnd = coffeeshopSud[plz]
|
|
272
|
|
273 #extract subset of the IP
|
|
274 Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup)
|
|
275 for (j in 1:numdup) {
|
|
276 Rs[,j] = counts[[j]][winStart:winEnd]
|
|
277 }
|
|
278 #smooth extended read counts
|
|
279 for (j in 1:numdup) {
|
|
280 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
|
|
281 }
|
|
282 return(Rs)
|
|
283 }
|
|
284 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
|
1
|
285 set.seed(samplingSeed)
|
0
|
286 noisy = sample(noise, length(sig[,1]), replace = TRUE)
|
|
287 clust = me(model, sig+noisy, init)
|
|
288 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
|
|
289 out = list(bicc = bicc, param = clust$parameters)
|
|
290 return(out)
|
|
291 }
|
|
292 initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function
|
|
293
|
|
294 n = length(coffeeshopNord)
|
|
295 #smooth extended read counts
|
|
296 if (cornum > 1) {
|
|
297 sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE)
|
|
298 } else {
|
|
299 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
|
|
300 }
|
|
301 sig = do.call(rbind, sig)
|
|
302
|
|
303 #kmeans initialization
|
1
|
304 set.seed(samplingSeed)
|
|
305 init = kmeans(sig, clustnummer, nstart = 20)
|
0
|
306 init = unmap(init$cluster)
|
|
307
|
|
308
|
|
309 if (cornum > 1) {
|
|
310 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
|
|
311 } else {
|
|
312 param = lapply(modelnames, cluster, sig, init, clustnummer, noise)
|
|
313 }
|
|
314
|
|
315 bicc = vector(mode = "numeric", length = length(modelnames))
|
|
316 for (i in 1:length(modelnames)) {
|
|
317 bicc[i] = as.numeric(param[[i]]$bicc)
|
|
318 }
|
|
319 bicc = which.max(bicc)
|
|
320
|
|
321 out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc])
|
|
322 return(out)
|
|
323 }
|
|
324
|
|
325
|
|
326
|
|
327
|
|
328 #find peaks
|
1
|
329 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust, clustnummer) {
|
0
|
330
|
1
|
331
|
0
|
332 plz = which(startlist == winStart)
|
|
333 winEnd = coffeeshopSud[plz]
|
|
334
|
|
335
|
|
336 #will store peak information
|
|
337 writethis = list()
|
1
|
338 ccx = 1 #default is clustering didNOT work
|
0
|
339
|
|
340
|
|
341 rWinSizeTemp = winEnd - winStart + 1
|
|
342
|
|
343 #extract subset of the IP
|
|
344 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
|
|
345 Rsr = Rs
|
|
346 if (meanAdjust == "y") {
|
|
347 for (j in 1:numdup) {
|
|
348 Rsr[,j] = counts[[j]][winStart:winEnd]
|
|
349 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
|
|
350 kabel = which.max(param$init$mean[j,])
|
|
351 param$initparam$mean[j,kabel] = mean(Rs[,j])
|
|
352 }
|
|
353 } else {
|
|
354 for (j in 1:numdup) {
|
|
355 Rsr[,j] = counts[[j]][winStart:winEnd]
|
|
356 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
|
|
357 }
|
|
358 }
|
|
359
|
|
360 if (resol != "window") {
|
1
|
361
|
|
362 #clustering (take 1)
|
|
363 take = 1
|
|
364 set.seed(samplingSeed)
|
0
|
365 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
|
|
366 clust = em(param$modelname, Rs+noisy, param$initparam)
|
|
367 clust$classification = map(clust$z)
|
1
|
368 if (!((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification)))))) { #clustering didn't work, take1
|
|
369
|
|
370 #repeat clustering from scratch, take 2!
|
|
371 set.seed(samplingSeed)
|
|
372 init = kmeans(Rs, clustnummer, nstart = 20)
|
|
373 init = unmap(init$cluster)
|
|
374 set.seed(samplingSeed)
|
|
375 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
|
|
376 clust = me(param$modelname, Rs+noisy, init)
|
|
377 clust$classification = map(clust$z)
|
|
378 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
|
|
379 ccx = 0 #clustering worked, take2
|
|
380 take = 2
|
|
381 }
|
|
382 } else {ccx = 0} #clustering worked, take1
|
|
383
|
|
384 if (ccx != 1) { #clustering worked either in take1 or take2
|
|
385
|
|
386 if (numdup > 1) { #check whether all components replicates agreed on clustering assignments
|
0
|
387 cc = vector(mode = "numeric", length = numdup)
|
|
388 for (g in 1:numdup) {
|
|
389 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
|
|
390 }
|
|
391 ccx = sum(diff(cc))
|
|
392 cluster = cc[1]
|
|
393 rm(cc)
|
|
394
|
1
|
395 if ((ccx != 0) && (take == 1)) { #not all replicates agreed? Repeat the clustering with from scratch if not already done!
|
|
396 set.seed(samplingSeed)
|
|
397 init = kmeans(Rs, clustnummer, nstart = 20)
|
|
398 init = unmap(init$cluster)
|
|
399 set.seed(samplingSeed)
|
|
400 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
|
|
401 clust = me(param$modelname, Rs+noisy, init)
|
|
402 clust$classification = map(clust$z)
|
|
403 if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { #clustering worked? check whether replicates agreed take 3
|
|
404 cc = vector(mode = "numeric", length = numdup)
|
|
405 for (g in 1:numdup) {
|
|
406 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
|
|
407 }
|
|
408 ccx = sum(diff(cc))
|
|
409 cluster = cc[1]
|
|
410 rm(cc)
|
|
411 take = 3
|
|
412 }
|
0
|
413 }
|
1
|
414 } else { #no replicates!
|
|
415 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
|
|
416 }
|
|
417 }
|
|
418
|
|
419 if ((ccx != 0) && (reportNoClust=="y")) { resol = "window" } #clustering did not work and windows should be reported
|
|
420
|
|
421
|
|
422 if (ccx == 0) { #clustering worked and all replicates agree on the cluster assignments
|
|
423
|
|
424 #extract subset of the background
|
|
425 if (bkgd != "None") {
|
|
426 Cs = counts[[numdup+1]][winStart:winEnd]
|
|
427 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
428 } else {
|
|
429 set.seed(samplingSeed)
|
|
430 Cs = sample(C, rWinSizeTemp, replace = TRUE)
|
|
431 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
432 }
|
0
|
433
|
1
|
434 #find region boundaries
|
|
435 loc = 1:length(clust$classification)
|
|
436 gmclass = cbind(loc, clust$classification)
|
|
437 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
|
|
438 rStart = locPeak[1] #start position of the region
|
|
439 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
|
0
|
440
|
1
|
441 #peak resolution check
|
|
442 if (resol == "region") {
|
|
443 pSize = rEnd - rStart
|
|
444 signal = (geomean(Rs[rStart:rEnd,]))
|
|
445 signal2 = (signal) - (Cs[rStart:rEnd])
|
|
446 gm = mean(signal2)
|
|
447 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
|
|
448 will2k = wilcox.test(signal, Cs[rStart:rEnd])
|
|
449
|
|
450 #Is there signal in the region above background
|
|
451 if (gm > 0) {
|
|
452 writethis[[1]] = rStart + winStart - 1
|
|
453 writethis[[2]] = rEnd + winStart
|
|
454 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
|
|
455 writethis[[4]] = "1000"
|
|
456 writethis[[5]] = "."
|
|
457 writethis[[6]] = gm
|
|
458 writethis[[7]] = will2k$p.value
|
|
459 writethis[[8]] = "-1"
|
|
460 writethis[[9]] = summit
|
|
461 }
|
|
462 } else if (resol == "peak") {
|
|
463 #find out where separate peaks are
|
|
464 d = diff(locPeak[,1])
|
|
465 d[length(d)+1] = 0
|
|
466 locPeak = cbind(locPeak, d)
|
|
467 bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
|
|
468 bound2 = bound1 + 1
|
|
469 bound = locPeak[sort(c(bound1,bound2))]
|
|
470 bound = c(rStart, bound, rEnd)
|
|
471 w = 1
|
|
472 warum = 0
|
|
473 while (w < length(bound)) {
|
|
474 pStart = bound[w] + winStart - 1
|
|
475 pEnd = bound[w+1] + winStart
|
|
476 pSize = pEnd - pStart
|
|
477 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
|
|
478 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
|
0
|
479 gm = mean(signal2)
|
1
|
480 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
|
|
481 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
|
|
482
|
|
483 weil = warum * 9
|
0
|
484 #Is there signal in the region above background
|
|
485 if (gm > 0) {
|
1
|
486 writethis[[1+weil]] = pStart
|
|
487 writethis[[2+weil]] = pEnd
|
|
488 writethis[[3+weil]] = paste0(chromName, ".", pStart)
|
|
489 writethis[[4+weil]] = "1000"
|
|
490 writethis[[5+weil]] = "."
|
|
491 writethis[[6+weil]] = gm
|
|
492 writethis[[7+weil]] = will2k$p.value
|
|
493 writethis[[8+weil]] = "-1"
|
|
494 writethis[[9+weil]] = summit
|
0
|
495 }
|
1
|
496 w = w + 2
|
|
497 warum = warum + 1
|
|
498 }
|
|
499 } #peak resolution check
|
|
500 } #clustering worked and all replicates agree on clustering assignments?
|
|
501 } #window resolution check
|
|
502
|
|
503 if (resol == "window") {
|
0
|
504
|
|
505 #extract subset of the background
|
|
506 if (bkgd != "None") {
|
|
507 Cs = counts[[numdup+1]][winStart:winEnd]
|
|
508 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
509 } else {
|
1
|
510 set.seed(samplingSeed)
|
0
|
511 Cs = sample(C, rWinSizeTemp, replace = TRUE)
|
|
512 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
513 }
|
|
514
|
|
515 #calculate scores
|
|
516 pSize = rWinSizeTemp
|
|
517 signal = geomean(Rs)
|
|
518 signal2 = (signal) - (Cs)
|
|
519 gm = mean(signal2)
|
|
520 summit = which.max(geomean(Rsr)) - 1
|
|
521 will2k = wilcox.test(signal, Cs)
|
|
522
|
|
523 #Is there signal in the region above background
|
|
524 if (gm > 0) {
|
|
525 writethis[[1]] = winStart - 1
|
|
526 writethis[[2]] = winEnd
|
1
|
527 writethis[[3]] = paste0(chromName, ".", winStart -1, ".NoClust")
|
0
|
528 writethis[[4]] = "1000"
|
|
529 writethis[[5]] = "."
|
|
530 writethis[[6]] = gm
|
|
531 writethis[[7]] = will2k$p.value
|
|
532 writethis[[8]] = "-1"
|
|
533 writethis[[9]] = summit
|
|
534 }
|
1
|
535 } #window reporting
|
0
|
536
|
|
537 return(writethis)
|
|
538 }
|
|
539
|
|
540
|
|
541 #filter return value of findpeak()
|
|
542 processPeaks = function(peaks) {
|
|
543 peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE)
|
|
544 peaks = peaks[peaks[,1] != FALSE,,drop=FALSE]
|
|
545 peaks = data.frame(peaks)
|
|
546 return(peaks)
|
|
547 }
|
|
548 #=======================> DONE!
|
|
549
|
|
550
|
|
551
|
|
552
|
|
553
|
|
554
|
|
555
|
|
556
|
|
557
|
|
558 # ==========================
|
|
559 # Parse-in System Variables
|
|
560 # ==========================
|
|
561 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
|
|
562
|
|
563 #Parsing arguments and storing values
|
|
564 for (each.arg in args) {
|
|
565 #chormosome size file
|
1
|
566 if (grepl('-sfile=',each.arg)) {
|
0
|
567 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
568 if (! is.na(arg.split[2]) ) {
|
|
569 size.file <- arg.split[2]
|
|
570 } else {
|
|
571 stop('No genome size file')
|
|
572 }
|
|
573 }
|
|
574 #bed file names
|
1
|
575 if (grepl('-bednames=',each.arg)) {
|
0
|
576 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
577 if (! is.na(arg.split[2]) ) {
|
|
578 bednames <- arg.split[2]
|
|
579 } else {
|
|
580 stop('No bed file names')
|
|
581 }
|
|
582 }
|
|
583 #bed files directory
|
1
|
584 if (grepl('-frag=',each.arg)) {
|
0
|
585 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
586 if (! is.na(arg.split[2]) ) {
|
|
587 frag <- arg.split[2]
|
|
588 } else {
|
|
589 stop('No fragment length given')
|
|
590 }
|
|
591 }
|
|
592 #bakcground files directory
|
1
|
593 if (grepl('-bkgd=',each.arg)) {
|
0
|
594 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
595 if (! is.na(arg.split[2]) ) {
|
|
596 bkgd <- arg.split[2]
|
|
597 } else {
|
|
598 stop('No background file')
|
|
599 }
|
|
600 }
|
|
601 #bakcground files directory
|
1
|
602 if (grepl('-out=',each.arg)) {
|
0
|
603 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
604 if (! is.na(arg.split[2]) ) {
|
|
605 out <- arg.split[2]
|
|
606 } else {
|
|
607 stop('No output directory given')
|
|
608 }
|
|
609 }
|
|
610 #Cluster number
|
1
|
611 if (grepl('-clustnummer=',each.arg)) {
|
0
|
612 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
613 if (! is.na(arg.split[2]) ) {
|
|
614 clustnummer <- as.numeric(arg.split[2])
|
|
615 }
|
|
616 }
|
|
617 #resolution
|
1
|
618 if (grepl('-resolution=',each.arg)) {
|
0
|
619 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
620 if (! is.na(arg.split[2]) ) {
|
|
621 resol <- arg.split[2]
|
|
622 }
|
|
623 }
|
|
624 #processor cores
|
1
|
625 if (grepl('-p=',each.arg)) {
|
0
|
626 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
627 if (! is.na(arg.split[2]) ) {
|
|
628 cornum <- as.numeric(arg.split[2])
|
|
629 }
|
|
630 }
|
|
631 #minimum window size
|
1
|
632 if (grepl('-window=',each.arg)) {
|
0
|
633 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
634 if (! is.na(arg.split[2]) ) {
|
|
635 winSize <- arg.split[2]
|
|
636 }
|
|
637 }
|
|
638 #window size
|
1
|
639 if (grepl('-bin=',each.arg)) {
|
0
|
640 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
641 if (! is.na(arg.split[2]) ) {
|
|
642 binsize <- arg.split[2]
|
|
643 }
|
|
644 }
|
|
645 #type (paired / single)
|
1
|
646 if (grepl('-type=',each.arg)) {
|
0
|
647 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
648 if (! is.na(arg.split[2]) ) {
|
|
649 type <- arg.split[2]
|
|
650 }
|
|
651 }
|
|
652 #chromosome number
|
1
|
653 if (grepl('-chrcount=',each.arg)) {
|
0
|
654 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
655 if (! is.na(arg.split[2]) ) {
|
|
656 chrcount <- as.numeric(arg.split[2])
|
|
657 }
|
|
658 }
|
|
659 #window enrichment cutoff
|
1
|
660 if (grepl('-windowe=',each.arg)) {
|
0
|
661 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
662 if (! is.na(arg.split[2]) ) {
|
|
663 windowe <- arg.split[2]
|
|
664 }
|
|
665 }
|
1
|
666 #initialize
|
|
667 if (grepl('-initModel=',each.arg)) {
|
|
668 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
669 if (! is.na(arg.split[2]) ) {
|
|
670 initialize <- arg.split[2]
|
|
671 }
|
|
672 }
|
|
673
|
0
|
674 }
|
|
675
|
|
676 ##Parse in variables
|
|
677 chromosomes = read.table(size.file, header=FALSE)
|
|
678 chromName = chromosomes$V1; #which chromosome
|
|
679 chromSize = chromosomes$V2; #chromosomes size
|
|
680 rm(chromosomes)
|
|
681
|
1
|
682 if (chrcount == 1) {
|
|
683 write(paste(samplingSeed), file = paste0(out, "/seed.info"))
|
|
684 } else {
|
|
685 samplingSeed = as.numeric(read.table(paste0(out, "/seed.info"), header = FALSE)[[1]])
|
|
686 }
|
0
|
687 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
|
|
688 numdup = length(readsFiles) #number of replicates
|
|
689 if (bkgd != "None") {
|
|
690 readsFiles[[numdup+1]] = bkgd
|
|
691 }
|
|
692 winSize = as.numeric(winSize)
|
|
693 binSize = as.numeric(binsize)
|
|
694 winSize = binSize * winSize
|
|
695
|
|
696 if (type == "single") {
|
|
697 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
|
|
698 }
|
|
699 rm(bednames)
|
|
700
|
|
701 if (numdup > 1) {
|
|
702 modelnames = c("VVV","VEV")
|
|
703 } else {
|
|
704 modelnames = "V"
|
|
705 }
|
|
706
|
|
707 if (windowe != "auto") {
|
|
708 windowe = as.numeric(windowe)
|
|
709 }
|
|
710
|
|
711 nothing = FALSE #default is we found some peaks
|
|
712 options(stringsAsFactors = FALSE)
|
|
713 #=======================> DONE!
|
|
714
|
|
715
|
|
716
|
|
717
|
|
718
|
|
719 # =======================
|
|
720 # Some preliminary stuff
|
|
721 # =======================
|
|
722 if (type == "single") {
|
|
723 if (cornum > 1) {
|
|
724 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
|
|
725 } else {
|
|
726 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
|
|
727 }
|
|
728 }
|
|
729
|
|
730 if (type == "paired") {
|
|
731 if (cornum > 1) {
|
|
732 datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
|
|
733 } else {
|
|
734 datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control)
|
|
735 }
|
|
736 }
|
|
737
|
|
738 #minimum peak size (only a recommendation)
|
|
739 minpeak = floor(binSize / 4)
|
|
740
|
|
741 #make bins vector
|
|
742 bins = seq(from = 1, to = (chromSize - 1), by = binSize)
|
|
743 #=======================> DONE!
|
|
744
|
|
745
|
|
746
|
|
747
|
|
748 # ===============
|
|
749 # Counting Reads
|
|
750 # ===============
|
|
751 if (type == "single") {
|
|
752 if (cornum > 1) {
|
|
753 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount, mc.cores = cornum, mc.preschedule = TRUE)
|
|
754 } else {
|
|
755 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount)
|
|
756 }
|
|
757 }
|
|
758
|
|
759 if (type == "paired") {
|
|
760 if (cornum > 1) {
|
|
761 counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount, mc.cores = cornum, mc.preschedule = TRUE)
|
|
762 } else {
|
|
763 counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount)
|
|
764 }
|
|
765 }
|
|
766
|
|
767 rm(datain)
|
|
768 #=======================> DONE!
|
|
769
|
|
770
|
|
771
|
|
772
|
|
773 # ============================
|
|
774 # Estimating Background Model
|
|
775 # ============================
|
|
776 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff)
|
|
777 if (is.na(cutoff)) {
|
|
778 if (bkgd != "None") {
|
|
779 cutoff = vector(length = numdup)
|
|
780 sdC = sd(counts[[numdup+1]])
|
|
781 for (x in 1:numdup) {
|
|
782 cutoff[x] = (mean(counts[[x]]))/(sdC)
|
|
783 }
|
|
784 cutoff = max(cutoff)
|
|
785 C = NULL
|
|
786 mCs = NULL
|
|
787 write(paste(c(cutoff,NA,NA)), file = paste0(out, "/bkgd.info"), append = TRUE)
|
|
788 } else {
|
|
789 cutoff = vector(length = numdup)
|
|
790 mmV = var(geomeanL(counts))
|
|
791 mmM = mean(geomeanL(counts))
|
|
792 sigma = log(1+((mmV) / ((mmM)^2)))
|
|
793 mu = (log(mmM)) - (0.5 * (sigma))
|
1
|
794 set.seed(samplingSeed)
|
0
|
795 C = rlnorm(100000, mu, sqrt(sigma))
|
|
796 for (x in 1:numdup) {
|
|
797 cutoff[x] = (mean(counts[[x]]))/(sd(C))
|
|
798 }
|
|
799 cutoff = max(cutoff)
|
1
|
800 set.seed(samplingSeed)
|
|
801 snow = sample(C, binSize*5, replace = TRUE)
|
|
802 mCs = mean(snow)
|
|
803 dCs = sd(snow)
|
0
|
804 write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE)
|
|
805 }
|
|
806 }
|
|
807 } else { #bkgd estiamted from before
|
|
808 bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE)
|
|
809 if (is.na(cutoff)) {
|
|
810 cutoff = as.numeric(bkgdInfo[[1]][1])
|
|
811 }
|
|
812
|
|
813 if (bkgd != "None") {
|
|
814 C = NULL
|
|
815 mCs = NULL
|
|
816 } else {
|
|
817 sigma = as.numeric(bkgdInfo[[1]][2])
|
|
818 mu = as.numeric(bkgdInfo[[1]][3])
|
1
|
819 set.seed(samplingSeed)
|
0
|
820 C = rlnorm(100000, mu, sqrt(sigma))
|
1
|
821 set.seed(samplingSeed)
|
|
822 snow = sample(C, binSize*5, replace = TRUE)
|
|
823 mCs = mean(snow)
|
|
824 dCs = sd(snow)
|
0
|
825 }
|
|
826 }
|
|
827 #=======================> DONE!
|
|
828
|
|
829
|
|
830
|
|
831
|
|
832 # ========================
|
|
833 # Picking Enriched Windows
|
|
834 # ========================
|
|
835 if (cornum > 1) {
|
|
836 coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE)
|
|
837 } else {
|
|
838 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd)
|
|
839 }
|
|
840 coffeeshop = as.numeric(unlist(coffeeshop))
|
|
841 coffeeshop[coffeeshop != numdup] = 0
|
1
|
842
|
|
843 if (sum(coffeeshop) != 0) { #Any enriched bins?
|
|
844
|
0
|
845 coffeeshop = c(0, diff(coffeeshop))
|
|
846 coffeeshop = cbind(coffeeshop, bins)
|
|
847 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE]
|
|
848 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE]
|
|
849 coffeeshopNord = coffeeshopNord[,2]
|
|
850 coffeeshopSud = coffeeshopSud[,2] - 1
|
|
851 if (length(coffeeshopSud) < length(coffeeshopNord)) {
|
|
852 coffeeshopSud = c(coffeeshopSud, chromSize)
|
|
853 } else if (length(coffeeshopSud) > length(coffeeshopNord)) {
|
|
854 coffeeshopNord = c(1, coffeeshopNord)
|
|
855 }
|
|
856 if (coffeeshopSud[length(coffeeshopSud)] > chromSize) {
|
|
857 coffeeshopSud[length(coffeeshopSud)] = chromSize
|
|
858 }
|
|
859
|
|
860
|
|
861 if (cornum > 1) {
|
|
862 coffeeshop = mclapply(coffeeshopNord, pickwins, coffeeshopSud, counts, numdup, startlist = coffeeshopNord, winSize, mc.cores = cornum, mc.preschedule = TRUE)
|
|
863 } else {
|
|
864 coffeeshop = lapply(coffeeshopNord, pickwins, coffeeshopSud, counts, numdup, startlist = coffeeshopNord, winSize)
|
|
865 }
|
|
866 coffeeshop = as.numeric(unlist(coffeeshop))
|
|
867 coffeeshop = cbind(coffeeshopNord, coffeeshopSud, coffeeshop)
|
|
868 coffeeshop = coffeeshop[coffeeshop[,3] != FALSE,,drop=FALSE]
|
|
869 coffeeshop = coffeeshop[order(coffeeshop[,3], decreasing = TRUE),]
|
|
870 rm(bins)
|
|
871 #=======================> DONE!
|
|
872
|
|
873
|
|
874
|
|
875
|
|
876 # ===================================
|
|
877 # Initializing Clustering Parameters
|
|
878 # ===================================
|
|
879 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected?
|
|
880
|
1
|
881 if (initialize == "deterministic") {
|
|
882 yummy = ceiling(length(coffeeshop[,1]) / 1000)
|
|
883 if (yummy == 0) {
|
|
884 yummy = 1
|
|
885 }
|
|
886 }
|
|
887 if (initialize == "stochastic") {
|
|
888 yummy = ceiling(length(coffeeshop[,1]) / 4)
|
|
889 if (yummy > 20) {
|
|
890 set.seed(samplingSeed)
|
|
891 yummy = sample(1:yummy, 20)
|
|
892 } else if (yummy > 0) {
|
|
893 yummy = 1:yummy
|
|
894 } else {
|
|
895 yummy = 1
|
|
896 }
|
0
|
897 }
|
|
898 coffeeshopNord = coffeeshop[yummy,1]
|
|
899 coffeeshopSud = coffeeshop[yummy,2]
|
1
|
900 set.seed(samplingSeed)
|
0
|
901 noise = rnorm(100000, mean=0, sd=0.1)
|
|
902 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise)
|
|
903 #=======================> DONE!
|
|
904
|
|
905
|
|
906
|
|
907
|
|
908
|
|
909 # ==========================
|
|
910 # Enriched Window Filtering
|
|
911 # ==========================
|
|
912 if (windowe != 1) { #do it only if window fold enrichment filtering is required
|
|
913 if (cornum > 1) {
|
|
914 scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE)
|
|
915 } else {
|
|
916 scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1])
|
|
917 }
|
|
918 scores = unlist(scores)
|
|
919
|
|
920 if (windowe == "auto") {
|
|
921 lscores = log(scores)
|
|
922 if (length(scores) > 0) {
|
|
923 if (chrcount == 1) {
|
|
924 cutthisTEMP = ((mean(lscores)) + (sd(lscores)*1))
|
|
925 write(paste(cutthisTEMP), file = paste0(out, "/bkgd.info"), append = TRUE)
|
|
926 } else {
|
|
927 cutthisTEMP = as.numeric(bkgdInfo[[1]][4])
|
|
928 }
|
|
929 finalwins = which(lscores > cutthisTEMP)
|
|
930 cutthisW = min(scores[finalwins])
|
|
931 coffeeshop = coffeeshop[finalwins,]
|
|
932 } else {
|
|
933 if (chrcount == 1) {
|
|
934 cutthisTEMP = 0
|
|
935 cutthisW = "Not Applicable, All Windows Analyzed!"
|
|
936 write(paste(cutthisTEMP), file = paste0(out, "/bkgd.info"), append = TRUE)
|
|
937 }
|
|
938 }
|
|
939 } else {
|
|
940 cutthisW = windowe
|
|
941 if (length(scores) > 0) {
|
|
942 finalwins = which(scores >= windowe)
|
|
943 coffeeshop = coffeeshop[finalwins,]
|
|
944 }
|
|
945 }
|
|
946 }
|
|
947 else { cutthisW = 1 }
|
|
948 #=======================> DONE!
|
|
949
|
|
950
|
|
951
|
|
952
|
|
953
|
|
954 # ==============
|
|
955 # Finding Peaks
|
|
956 # ==============
|
1
|
957 if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?
|
0
|
958 coffeeshop = coffeeshop[,-3]
|
|
959 if (cornum > 1) {
|
1
|
960 peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer, mc.cores = cornum, mc.preschedule = TRUE)
|
0
|
961 } else {
|
1
|
962 peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer)
|
0
|
963 }
|
|
964 if (!(is.null(peaks))) { #any peaks discovered?
|
|
965 writethis = processPeaks(peaks)
|
|
966 #=======================> DONE!
|
|
967
|
|
968
|
|
969
|
|
970
|
|
971
|
|
972 # =========================
|
|
973 # Writing Peak Information
|
|
974 # =========================
|
|
975 } else { nothing = TRUE } #no peaks
|
1
|
976 } else { nothing = TRUE } #no enriched windows left after filtering
|
0
|
977 } else { nothing = TRUE } #no enriched widnows discovered
|
1
|
978 } else { nothing = TRUE; cutthisW = windowe } #no enriched bins discovered
|
0
|
979
|
|
980 if (isTRUE(nothing)) {
|
|
981 file.create(paste0(out, "/", chromName, ".peaks.bed"))
|
|
982 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE)
|
|
983
|
|
984 if (chrcount == 1) {
|
1
|
985 message(paste0("No peaks found! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
|
0
|
986 } else {
|
|
987 message("No peaks found!")
|
|
988 }
|
|
989 } else {
|
|
990 write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = " "), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1)
|
|
991 write(paste(chromName, minpeak, sep = " "), file = paste0(out, "/min.peaksize"), append=TRUE)
|
|
992
|
|
993
|
|
994 if (chrcount == 1) {
|
1
|
995 message(paste0("Done! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
|
0
|
996 } else {
|
|
997 message("Done!")
|
|
998 }
|
|
999 }
|
|
1000 #=======================> DONE!
|
|
1001
|
|
1002
|