0
|
1 ###############################
|
|
2 #### Peak finder for NGS Data
|
|
3 #### R script
|
|
4 ###############################
|
|
5
|
|
6
|
|
7
|
|
8
|
|
9 # =======================
|
|
10 # User-defined variables
|
|
11 # =======================
|
|
12 cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want
|
|
13 strict = 1 #to make peak calling more / less strict, increase / decrease this number - Only use this if you are getting an unusually large or small number of peaks.
|
|
14 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
|
|
15 #=======================> DONE!
|
|
16
|
|
17
|
|
18
|
|
19
|
|
20 # ================================
|
|
21 # Required Libraries check & load
|
|
22 # ================================
|
|
23 if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('sqldf', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
|
|
24 stop("R package 'Mclust', 'signal', 'sqldf' and 'parallel' are required. Please install them!")
|
|
25 }
|
|
26 suppressPackageStartupMessages(library("mclust"))
|
|
27 suppressPackageStartupMessages(library("signal"))
|
|
28 suppressPackageStartupMessages(library("sqldf"))
|
|
29 suppressPackageStartupMessages(library("tcltk"))
|
|
30 suppressPackageStartupMessages(library("parallel"))
|
|
31 #=======================> DONE!
|
|
32
|
|
33
|
|
34
|
|
35
|
|
36
|
|
37 # =================
|
|
38 # Custom Functions
|
|
39 # =================
|
|
40 #Get per-row Geometric mean (takes list, returns vectors, not lists!)
|
|
41 geomeanL <- function(mat){
|
|
42 n = length(mat)
|
|
43 if (n > 1) {
|
|
44 mult = (mat[[1]])*(mat[[2]])
|
|
45 if (n > 2) {
|
|
46 for (i in 3:n) {
|
|
47 mult = mult*(mat[[i]])
|
|
48 }
|
|
49 }
|
|
50 mult = mult^(1/n)
|
|
51 mat = mult
|
|
52 return(mat)
|
|
53 } else {
|
|
54 return(mat[[1]])
|
|
55 }
|
|
56 }
|
|
57
|
|
58
|
|
59
|
|
60 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
|
|
61 geomean <- function(mat){
|
|
62 n = NCOL(mat)
|
|
63 if (n > 1) {
|
|
64 mult = (mat[,1])*(mat[,2])
|
|
65 if (n > 2) {
|
|
66 for (i in 3:n) {
|
|
67 mult = mult*(mat[,i])
|
|
68 }
|
|
69 }
|
|
70 mult = mult^(1/n)
|
|
71 mat = mult
|
|
72 }
|
|
73 return(mat)
|
|
74 }
|
|
75
|
|
76
|
|
77
|
|
78 #Read in bed(st2) file
|
|
79 parsein = function(bedfile) {
|
|
80 l = sqldf()
|
|
81 readsFile = file(bedfile)
|
|
82 return(sqldf("SELECT V1 + 1 As 'start' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))$start)
|
|
83 l = sqldf()
|
|
84 close(readsFile)
|
|
85 }
|
|
86
|
|
87
|
|
88 #Read in bedpe(st2) file
|
|
89 parseinpe = function(bedfile) {
|
|
90 l = sqldf()
|
|
91 readsFile = file(bedfile)
|
|
92 sebastian = sqldf("SELECT * FROM readsFile", file.format=list(header = FALSE, sep = "\t"))
|
|
93 return(cbind(sebastian[[1]]+1, sebastian[[2]]))
|
|
94 l = sqldf()
|
|
95 close(readsFile)
|
|
96 }
|
|
97
|
|
98
|
|
99 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
|
|
100 countreads = function(bedfile, reads, frag, chromsize, filelist) {
|
|
101
|
|
102 o = which(filelist == bedfile)
|
|
103
|
|
104 counts = vector(mode = "numeric", length = chromsize)
|
|
105
|
|
106 for (j in 1:length(reads[[o]])) {
|
|
107 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
|
|
108 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
|
|
109 }
|
|
110 }
|
|
111 counts = counts/(mean(counts))
|
|
112 return(counts)
|
|
113 }
|
|
114
|
|
115
|
|
116
|
|
117 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
|
|
118 countreadspe = function(bedfile, reads, chromsize, filelist) {
|
|
119
|
|
120 o = which(filelist == bedfile)
|
|
121
|
|
122 counts = vector(mode = "numeric", length = chromsize)
|
|
123
|
|
124 for (j in 1:length(reads[[o]][,1])) {
|
|
125 counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1
|
|
126 }
|
|
127 counts = counts/(mean(counts))
|
|
128 return(counts)
|
|
129 }
|
|
130
|
|
131
|
|
132
|
|
133 #find enriched bins
|
|
134 pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) {
|
|
135
|
|
136 if ((winStart + binSize) <= chromSize) {
|
|
137 winEnd = winStart + binSize
|
|
138 } else {
|
|
139 winEnd = chromSize
|
|
140 }
|
|
141 binSizeTemp = winEnd - winStart
|
|
142 tempend = winEnd - 1
|
|
143
|
|
144 #extract subset of the background
|
|
145 if (bkgd != "None") {
|
|
146 Cs = counts[[numdup+1]][winStart:tempend]
|
|
147 mCs = mean(Cs)
|
|
148 dCs = sd(Cs)
|
|
149 } else {
|
|
150 Cs = sample(C, binSizeTemp, replace = TRUE)
|
|
151 }
|
|
152
|
|
153
|
|
154 #find out whether it's enriched
|
|
155 go = rep(0, numdup)
|
|
156 for (g in 1:numdup) {
|
|
157 mS = (mean(counts[[g]][winStart:tempend]))
|
|
158 ratio = mS/dCs
|
|
159 if ((mS > (mCs * strict)) && (ratio > cutoff)) {
|
|
160 go[g] = 1
|
|
161 }
|
|
162 }
|
|
163 veep = sum(go)
|
|
164
|
|
165 #get counts when enriched
|
|
166 if (veep == numdup) {
|
|
167 mS = rep(0, numdup)
|
|
168 for (g in 1:numdup) {
|
|
169 mS[g] = (mean(counts[[g]][winStart:winEnd]))
|
|
170 }
|
|
171 cairo = mean(mS)
|
|
172 } else {
|
|
173 mS = rep(0, numdup)
|
|
174 for (g in 1:numdup) {
|
|
175 mS[g] = (mean(counts[[g]][winStart:winEnd]))
|
|
176 }
|
|
177 cairo = -(mean(mS))
|
|
178 }
|
|
179 return(cairo)
|
|
180 }
|
|
181
|
|
182
|
|
183
|
|
184 #find enriched wins
|
|
185 pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) {
|
|
186
|
|
187 plz = which(startlist == winStart)
|
|
188 winEnd = coffeeshopSud[plz]
|
|
189 rWinSize = winEnd - winStart + 1
|
|
190
|
|
191
|
|
192 if(rWinSize >= winSize) {
|
|
193 mS = rep(0, numdup)
|
|
194 for (g in 1:numdup) {
|
|
195 mS[g] = (sum(counts[[g]][winStart:winEnd]))
|
|
196 }
|
|
197 veep = sum(mS)
|
|
198 } else {
|
|
199 veep = FALSE
|
|
200 }
|
|
201
|
|
202 return(veep)
|
|
203 }
|
|
204
|
|
205
|
|
206
|
|
207 #Initialize MClust clustering parameters
|
|
208 smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1
|
|
209
|
|
210 plz = which(startlist == winStart)
|
|
211 winEnd = coffeeshopSud[plz]
|
|
212
|
|
213 #extract subset of the IP
|
|
214 Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup)
|
|
215 for (j in 1:numdup) {
|
|
216 Rs[,j] = counts[[j]][winStart:winEnd]
|
|
217 }
|
|
218 #smooth extended read counts
|
|
219 for (j in 1:numdup) {
|
|
220 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
|
|
221 }
|
|
222 return(Rs)
|
|
223 }
|
|
224 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
|
|
225 noisy = sample(noise, length(sig[,1]), replace = TRUE)
|
|
226 clust = me(model, sig+noisy, init)
|
|
227 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
|
|
228 out = list(bicc = bicc, param = clust$parameters)
|
|
229 return(out)
|
|
230 }
|
|
231 initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function
|
|
232
|
|
233 n = length(coffeeshopNord)
|
|
234 #smooth extended read counts
|
|
235 if (cornum > 1) {
|
|
236 sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE)
|
|
237 } else {
|
|
238 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
|
|
239 }
|
|
240 sig = do.call(rbind, sig)
|
|
241
|
|
242 #kmeans initialization
|
|
243 init = kmeans(sig, clustnummer)
|
|
244 init = unmap(init$cluster)
|
|
245
|
|
246
|
|
247 if (cornum > 1) {
|
|
248 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
|
|
249 } else {
|
|
250 param = lapply(modelnames, cluster, sig, init, clustnummer, noise)
|
|
251 }
|
|
252
|
|
253 bicc = vector(mode = "numeric", length = length(modelnames))
|
|
254 for (i in 1:length(modelnames)) {
|
|
255 bicc[i] = as.numeric(param[[i]]$bicc)
|
|
256 }
|
|
257 bicc = which.max(bicc)
|
|
258
|
|
259 out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc])
|
|
260 return(out)
|
|
261 }
|
|
262
|
|
263
|
|
264
|
|
265
|
|
266 #find peaks
|
|
267 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist) {
|
|
268
|
|
269
|
|
270 plz = which(startlist == winStart)
|
|
271 winEnd = coffeeshopSud[plz]
|
|
272
|
|
273
|
|
274 #will store peak information
|
|
275 writethis = list()
|
|
276
|
|
277
|
|
278 rWinSizeTemp = winEnd - winStart + 1
|
|
279
|
|
280 #extract subset of the IP
|
|
281 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
|
|
282 Rsr = Rs
|
|
283 for (j in 1:numdup) {
|
|
284 Rsr[,j] = counts[[j]][winStart:winEnd]
|
|
285 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
|
|
286 }
|
|
287
|
|
288
|
|
289 ####CLUSTERING START HERE#####
|
|
290 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
|
|
291 clust = em(param$modelname, Rs+noisy, param$initparam)
|
|
292 clust$classification = map(clust$z)
|
|
293 ####CLUSTERING DONE####
|
|
294
|
|
295 #check whether clutering succeeded
|
|
296 if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
|
|
297
|
|
298
|
|
299 #check whether all components replicates agreed on clustering assignments
|
|
300 ccx = 1
|
|
301 if (numdup > 1) {
|
|
302 cc = vector(mode = "numeric", length = numdup)
|
|
303 for (g in 1:numdup) {
|
|
304 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
|
|
305 }
|
|
306 ccx = sum(diff(cc))
|
|
307 cluster = cc[1]
|
|
308 rm(cc)
|
|
309 } else {
|
|
310 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
|
|
311 ccx = 0
|
|
312 }
|
|
313 if (ccx == 0) {
|
|
314
|
|
315 #extract subset of the background
|
|
316 if (bkgd != "None") {
|
|
317 Cs = counts[[numdup+1]][winStart:winEnd]
|
|
318 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
319 } else {
|
|
320 Cs = sample(C, rWinSizeTemp, replace = TRUE)
|
|
321 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
322 }
|
|
323
|
|
324 #find region boundaries
|
|
325 loc = 1:length(clust$classification)
|
|
326 gmclass = cbind(loc, clust$classification)
|
|
327 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
|
|
328 rStart = locPeak[1] #start position of the region
|
|
329 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
|
|
330
|
|
331 #peak resolution check
|
|
332 if (resol == "region") {
|
|
333 pSize = rEnd - rStart
|
|
334 signal = (geomean(Rs[rStart:rEnd,]))
|
|
335 signal2 = (signal) - (Cs[rStart:rEnd])
|
|
336 gm = mean(signal2)
|
|
337 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
|
|
338 will2k = wilcox.test(signal, Cs[rStart:rEnd])
|
|
339
|
|
340 #Is there signal in the region above background
|
|
341 if (gm > 0) {
|
|
342 writethis[[1]] = rStart + winStart - 1
|
|
343 writethis[[2]] = rEnd + winStart
|
|
344 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
|
|
345 writethis[[4]] = "1000"
|
|
346 writethis[[5]] = "."
|
|
347 writethis[[6]] = gm
|
|
348 writethis[[7]] = will2k$p.value
|
|
349 writethis[[8]] = "-1"
|
|
350 writethis[[9]] = summit
|
|
351 }
|
|
352 } else if (resol == "peak") {
|
|
353 #find out where separate peaks are
|
|
354 d = diff(locPeak[,1])
|
|
355 d[length(d)+1] = 0
|
|
356 locPeak = cbind(locPeak, d)
|
|
357 bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
|
|
358 bound2 = bound1 + 1
|
|
359 bound = locPeak[sort(c(bound1,bound2))]
|
|
360 bound = c(rStart, bound, rEnd)
|
|
361 w = 1
|
|
362 warum = 0
|
|
363 while (w < length(bound)) {
|
|
364 pStart = bound[w] + winStart - 1
|
|
365 pEnd = bound[w+1] + winStart
|
|
366 pSize = pEnd - pStart
|
|
367 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
|
|
368 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
|
|
369 gm = mean(signal2)
|
|
370 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
|
|
371 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
|
|
372
|
|
373 weil = warum * 9
|
|
374 #Is there signal in the region above background
|
|
375 if (gm > 0) {
|
|
376 writethis[[1+weil]] = pStart
|
|
377 writethis[[2+weil]] = pEnd
|
|
378 writethis[[3+weil]] = paste0(chromName, ".", pStart)
|
|
379 writethis[[4+weil]] = "1000"
|
|
380 writethis[[5+weil]] = "."
|
|
381 writethis[[6+weil]] = gm
|
|
382 writethis[[7+weil]] = will2k$p.value
|
|
383 writethis[[8+weil]] = "-1"
|
|
384 writethis[[9+weil]] = summit
|
|
385 }
|
|
386 w = w + 2
|
|
387 warum = warum + 1
|
|
388 }
|
|
389
|
|
390 } #peak resolution check
|
|
391 } #all replicates agree on clustering assignments?
|
|
392 } #clustering worked?
|
|
393 return(writethis)
|
|
394 }
|
|
395
|
|
396
|
|
397 #filter return value of findpeak()
|
|
398 processPeaks = function(peaks) {
|
|
399 peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE)
|
|
400 peaks = peaks[peaks[,1] != FALSE,,drop=FALSE]
|
|
401 peaks = data.frame(peaks)
|
|
402 return(peaks)
|
|
403 }
|
|
404 #=======================> DONE!
|
|
405
|
|
406
|
|
407
|
|
408
|
|
409
|
|
410
|
|
411
|
|
412
|
|
413
|
|
414
|
|
415
|
|
416
|
|
417
|
|
418
|
|
419
|
|
420
|
|
421
|
|
422
|
|
423 # ==========================
|
|
424 # Parse-in System Variables
|
|
425 # ==========================
|
|
426 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
|
|
427
|
|
428 #Parsing arguments and storing values
|
|
429 for (each.arg in args) {
|
|
430 #chormosome size file
|
|
431 if (grepl('^-sfile=',each.arg)) {
|
|
432 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
433 if (! is.na(arg.split[2]) ) {
|
|
434 size.file <- arg.split[2]
|
|
435 } else {
|
|
436 stop('No genome size file')
|
|
437 }
|
|
438 }
|
|
439 #bed file names
|
|
440 if (grepl('^-bednames=',each.arg)) {
|
|
441 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
442 if (! is.na(arg.split[2]) ) {
|
|
443 bednames <- arg.split[2]
|
|
444 } else {
|
|
445 stop('No bed file names')
|
|
446 }
|
|
447 }
|
|
448 #bed files directory
|
|
449 if (grepl('^-frag=',each.arg)) {
|
|
450 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
451 if (! is.na(arg.split[2]) ) {
|
|
452 frag <- arg.split[2]
|
|
453 } else {
|
|
454 stop('No fragment length given')
|
|
455 }
|
|
456 }
|
|
457 #bakcground files directory
|
|
458 if (grepl('^-bkgd=',each.arg)) {
|
|
459 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
460 if (! is.na(arg.split[2]) ) {
|
|
461 bkgd <- arg.split[2]
|
|
462 } else {
|
|
463 stop('No background file')
|
|
464 }
|
|
465 }
|
|
466 #bakcground files directory
|
|
467 if (grepl('^-out=',each.arg)) {
|
|
468 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
469 if (! is.na(arg.split[2]) ) {
|
|
470 out <- arg.split[2]
|
|
471 } else {
|
|
472 stop('No output directory given')
|
|
473 }
|
|
474 }
|
|
475 #Cluster number
|
|
476 if (grepl('^-clustnummer=',each.arg)) {
|
|
477 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
478 if (! is.na(arg.split[2]) ) {
|
|
479 clustnummer <- as.numeric(arg.split[2])
|
|
480 }
|
|
481 }
|
|
482 #resolution
|
|
483 if (grepl('^-resolution=',each.arg)) {
|
|
484 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
485 if (! is.na(arg.split[2]) ) {
|
|
486 resol <- arg.split[2]
|
|
487 }
|
|
488 }
|
|
489 #processor cores
|
|
490 if (grepl('^-p=',each.arg)) {
|
|
491 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
492 if (! is.na(arg.split[2]) ) {
|
|
493 cornum <- as.numeric(arg.split[2])
|
|
494 }
|
|
495 }
|
|
496 #minimum window size
|
|
497 if (grepl('^-window=',each.arg)) {
|
|
498 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
499 if (! is.na(arg.split[2]) ) {
|
|
500 winSize <- arg.split[2]
|
|
501 }
|
|
502 }
|
|
503 #window size
|
|
504 if (grepl('^-bin=',each.arg)) {
|
|
505 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
506 if (! is.na(arg.split[2]) ) {
|
|
507 binsize <- arg.split[2]
|
|
508 }
|
|
509 }
|
|
510 #type (paired / single)
|
|
511 if (grepl('^-type=',each.arg)) {
|
|
512 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
513 if (! is.na(arg.split[2]) ) {
|
|
514 type <- arg.split[2]
|
|
515 }
|
|
516 }
|
|
517 }
|
|
518
|
|
519 ##Parse in variables
|
|
520 chromosomes = read.table(size.file, header=FALSE)
|
|
521 chromName = chromosomes$V1; #which chromosome
|
|
522 chromSize = chromosomes$V2; #chromosomes size
|
|
523 rm(chromosomes)
|
|
524
|
|
525 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
|
|
526 numdup = length(readsFiles) #number of replicates
|
|
527 if (bkgd != "None") {
|
|
528 readsFiles[[numdup+1]] = bkgd
|
|
529 }
|
|
530 winSize = as.numeric(winSize)
|
|
531 binSize = as.numeric(binsize)
|
|
532 winSize = binSize * winSize
|
|
533
|
|
534 if (type == "single") {
|
|
535 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
|
|
536 }
|
|
537 rm(bednames)
|
|
538
|
|
539 if (numdup > 1) {
|
|
540 modelnames = c("VVV","VEV")
|
|
541 } else {
|
|
542 modelnames = "V"
|
|
543 }
|
|
544
|
|
545 options(stringsAsFactors = FALSE)
|
|
546 #=======================> DONE!
|
|
547
|
|
548
|
|
549
|
|
550
|
|
551 # =======================
|
|
552 # Some preliminary stuff
|
|
553 # =======================
|
|
554 if (type == "single") {
|
|
555 if (cornum > 1) {
|
|
556 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
|
|
557 } else {
|
|
558 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
|
|
559 }
|
|
560 }
|
|
561
|
|
562 if (type == "paired") {
|
|
563 if (cornum > 1) {
|
|
564 datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
|
|
565 } else {
|
|
566 datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control)
|
|
567 }
|
|
568 }
|
|
569
|
|
570 #minimum peak size (only a recommendation)
|
|
571 minpeak = floor(binSize / 4)
|
|
572
|
|
573 #make bins vector
|
|
574 bins = seq(from = 1, to = (chromSize - 1), by = binSize)
|
|
575 #=======================> DONE!
|
|
576
|
|
577
|
|
578
|
|
579
|
|
580 # ===============
|
|
581 # Counting Reads
|
|
582 # ===============
|
|
583 if (type == "single") {
|
|
584 if (cornum > 1) {
|
|
585 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
|
|
586 } else {
|
|
587 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
|
|
588 }
|
|
589 }
|
|
590
|
|
591 if (type == "paired") {
|
|
592 if (cornum > 1) {
|
|
593 counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
|
|
594 } else {
|
|
595 counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles)
|
|
596 }
|
|
597 }
|
|
598
|
|
599 rm(datain)
|
|
600
|
|
601 #get total counts
|
|
602 mS = vector(mode = "numeric", length = numdup)
|
|
603 for (g in 1:numdup) {
|
|
604 mS[g] = sum(counts[[g]])
|
|
605 }
|
|
606 totalCounts = sum(mS)
|
|
607 rm(mS)
|
|
608 #=======================> DONE!
|
|
609
|
|
610
|
|
611
|
|
612 # ===================
|
|
613 # Estimating Cutoff
|
|
614 # ===================
|
|
615 if (is.na(cutoff)) {
|
|
616 if (bkgd != "None") {
|
|
617 cutoff = vector(length = numdup)
|
|
618 sdC = sd(counts[[numdup+1]])
|
|
619 for (x in 1:numdup) {
|
|
620 cutoff[x] = (mean(counts[[x]]))/(sdC)
|
|
621 }
|
|
622 cutoff = max(cutoff)
|
|
623 C = NULL
|
|
624 mCs = NULL
|
|
625 } else {
|
|
626 cutoff = vector(length = numdup)
|
|
627 mmV = var(geomeanL(counts))
|
|
628 mmM = mean(geomeanL(counts))
|
|
629 sigma = log(1+((mmV) / ((mmM)^2)))
|
|
630 mu = (log(mmM)) - (0.5 * (sigma))
|
|
631 C = rlnorm(100000, mu, sqrt(sigma))
|
|
632 for (x in 1:numdup) {
|
|
633 cutoff[x] = (mean(counts[[x]]))/(sd(C))
|
|
634 }
|
|
635 cutoff = max(cutoff)
|
|
636 mCs = (mean(sample(C, binSize*5, replace = TRUE)))
|
|
637 dCs = (sd(sample(C, binSize*5, replace = TRUE)))
|
|
638 }
|
|
639 }
|
|
640 #=======================> DONE!
|
|
641
|
|
642
|
|
643
|
|
644
|
|
645 # ========================
|
|
646 # Picking Enriched Windows
|
|
647 # ========================
|
|
648 binNum = vector(mode = "numeric", length = 41)
|
|
649 binTotalSize = vector(mode = "numeric", length = 41)
|
|
650 binTotalSizeRatio = vector(mode = "numeric", length = 41)
|
|
651 binTotalCount = vector(mode = "numeric", length = 41)
|
|
652 binTotalCountNO = vector(mode = "numeric", length = 41)
|
|
653 binTotalCountRatio = vector(mode = "numeric", length = 41)
|
|
654 maxiFun = vector(mode = "numeric", length = 41)
|
|
655 ggg = seq(1,5, by = 0.1)
|
|
656 for (i in 1:41) {
|
|
657 if (cornum > 1) {
|
|
658 coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE)
|
|
659 } else {
|
|
660 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd)
|
|
661 }
|
|
662 coffeeshop = as.numeric(unlist(coffeeshop))
|
|
663 coffeeshopYES = coffeeshop[coffeeshop > 0]
|
|
664 coffeeshopNO = -(coffeeshop[coffeeshop <= 0])
|
|
665
|
|
666 binNum[i] = length(coffeeshopYES)
|
|
667 binTotalSize[i] = binNum[i] * binSize
|
|
668 binTotalSizeRatio[i] = binTotalSize[i] / chromSize
|
|
669
|
|
670 binTotalCount[i] = sum(coffeeshopYES)
|
|
671 binTotalCountNO[i] = sum(coffeeshopNO)
|
|
672 binTotalCountRatio[i] = binTotalCount[i] / binTotalCountNO[i]
|
|
673
|
|
674 maxiFun[i] = binTotalCountRatio[i] / binTotalSizeRatio[i]
|
|
675 }
|
|
676 rm(bins)
|
|
677 #=======================> DONE!
|
|
678
|
|
679
|
|
680 write(paste(ggg, binNum, binTotalSize, binTotalSizeRatio, binTotalCount, binTotalCountNO, binTotalCountRatio, maxiFun, sep = " "), file = paste0(file = paste0("/home/mibrahim/Documents/", chromName, ".GOYEAH"), ncolumns = 1))
|