Mercurial > repos > messersc > jamm
comparison countwins.r @ 0:d42f4d78c85e draft
Uploaded
author | messersc |
---|---|
date | Wed, 17 Dec 2014 10:40:23 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d42f4d78c85e |
---|---|
1 ############################### | |
2 #### Peak finder for NGS Data | |
3 #### R script | |
4 ############################### | |
5 | |
6 | |
7 | |
8 | |
9 # ======================= | |
10 # User-defined variables | |
11 # ======================= | |
12 cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want | |
13 strict = 1 #to make peak calling more / less strict, increase / decrease this number - Only use this if you are getting an unusually large or small number of peaks. | |
14 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) | |
15 #=======================> DONE! | |
16 | |
17 | |
18 | |
19 | |
20 # ================================ | |
21 # Required Libraries check & load | |
22 # ================================ | |
23 if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('sqldf', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) { | |
24 stop("R package 'Mclust', 'signal', 'sqldf' and 'parallel' are required. Please install them!") | |
25 } | |
26 suppressPackageStartupMessages(library("mclust")) | |
27 suppressPackageStartupMessages(library("signal")) | |
28 suppressPackageStartupMessages(library("sqldf")) | |
29 suppressPackageStartupMessages(library("tcltk")) | |
30 suppressPackageStartupMessages(library("parallel")) | |
31 #=======================> DONE! | |
32 | |
33 | |
34 | |
35 | |
36 | |
37 # ================= | |
38 # Custom Functions | |
39 # ================= | |
40 #Get per-row Geometric mean (takes list, returns vectors, not lists!) | |
41 geomeanL <- function(mat){ | |
42 n = length(mat) | |
43 if (n > 1) { | |
44 mult = (mat[[1]])*(mat[[2]]) | |
45 if (n > 2) { | |
46 for (i in 3:n) { | |
47 mult = mult*(mat[[i]]) | |
48 } | |
49 } | |
50 mult = mult^(1/n) | |
51 mat = mult | |
52 return(mat) | |
53 } else { | |
54 return(mat[[1]]) | |
55 } | |
56 } | |
57 | |
58 | |
59 | |
60 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!) | |
61 geomean <- function(mat){ | |
62 n = NCOL(mat) | |
63 if (n > 1) { | |
64 mult = (mat[,1])*(mat[,2]) | |
65 if (n > 2) { | |
66 for (i in 3:n) { | |
67 mult = mult*(mat[,i]) | |
68 } | |
69 } | |
70 mult = mult^(1/n) | |
71 mat = mult | |
72 } | |
73 return(mat) | |
74 } | |
75 | |
76 | |
77 | |
78 #Read in bed(st2) file | |
79 parsein = function(bedfile) { | |
80 l = sqldf() | |
81 readsFile = file(bedfile) | |
82 return(sqldf("SELECT V1 + 1 As 'start' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))$start) | |
83 l = sqldf() | |
84 close(readsFile) | |
85 } | |
86 | |
87 | |
88 #Read in bedpe(st2) file | |
89 parseinpe = function(bedfile) { | |
90 l = sqldf() | |
91 readsFile = file(bedfile) | |
92 sebastian = sqldf("SELECT * FROM readsFile", file.format=list(header = FALSE, sep = "\t")) | |
93 return(cbind(sebastian[[1]]+1, sebastian[[2]])) | |
94 l = sqldf() | |
95 close(readsFile) | |
96 } | |
97 | |
98 | |
99 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats) | |
100 countreads = function(bedfile, reads, frag, chromsize, filelist) { | |
101 | |
102 o = which(filelist == bedfile) | |
103 | |
104 counts = vector(mode = "numeric", length = chromsize) | |
105 | |
106 for (j in 1:length(reads[[o]])) { | |
107 if ((reads[[o]][j]+frag[o]-1) <= chromsize) { | |
108 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1 | |
109 } | |
110 } | |
111 counts = counts/(mean(counts)) | |
112 return(counts) | |
113 } | |
114 | |
115 | |
116 | |
117 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats) | |
118 countreadspe = function(bedfile, reads, chromsize, filelist) { | |
119 | |
120 o = which(filelist == bedfile) | |
121 | |
122 counts = vector(mode = "numeric", length = chromsize) | |
123 | |
124 for (j in 1:length(reads[[o]][,1])) { | |
125 counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1 | |
126 } | |
127 counts = counts/(mean(counts)) | |
128 return(counts) | |
129 } | |
130 | |
131 | |
132 | |
133 #find enriched bins | |
134 pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) { | |
135 | |
136 if ((winStart + binSize) <= chromSize) { | |
137 winEnd = winStart + binSize | |
138 } else { | |
139 winEnd = chromSize | |
140 } | |
141 binSizeTemp = winEnd - winStart | |
142 tempend = winEnd - 1 | |
143 | |
144 #extract subset of the background | |
145 if (bkgd != "None") { | |
146 Cs = counts[[numdup+1]][winStart:tempend] | |
147 mCs = mean(Cs) | |
148 dCs = sd(Cs) | |
149 } else { | |
150 Cs = sample(C, binSizeTemp, replace = TRUE) | |
151 } | |
152 | |
153 | |
154 #find out whether it's enriched | |
155 go = rep(0, numdup) | |
156 for (g in 1:numdup) { | |
157 mS = (mean(counts[[g]][winStart:tempend])) | |
158 ratio = mS/dCs | |
159 if ((mS > (mCs * strict)) && (ratio > cutoff)) { | |
160 go[g] = 1 | |
161 } | |
162 } | |
163 veep = sum(go) | |
164 | |
165 #get counts when enriched | |
166 if (veep == numdup) { | |
167 mS = rep(0, numdup) | |
168 for (g in 1:numdup) { | |
169 mS[g] = (mean(counts[[g]][winStart:winEnd])) | |
170 } | |
171 cairo = mean(mS) | |
172 } else { | |
173 mS = rep(0, numdup) | |
174 for (g in 1:numdup) { | |
175 mS[g] = (mean(counts[[g]][winStart:winEnd])) | |
176 } | |
177 cairo = -(mean(mS)) | |
178 } | |
179 return(cairo) | |
180 } | |
181 | |
182 | |
183 | |
184 #find enriched wins | |
185 pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) { | |
186 | |
187 plz = which(startlist == winStart) | |
188 winEnd = coffeeshopSud[plz] | |
189 rWinSize = winEnd - winStart + 1 | |
190 | |
191 | |
192 if(rWinSize >= winSize) { | |
193 mS = rep(0, numdup) | |
194 for (g in 1:numdup) { | |
195 mS[g] = (sum(counts[[g]][winStart:winEnd])) | |
196 } | |
197 veep = sum(mS) | |
198 } else { | |
199 veep = FALSE | |
200 } | |
201 | |
202 return(veep) | |
203 } | |
204 | |
205 | |
206 | |
207 #Initialize MClust clustering parameters | |
208 smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1 | |
209 | |
210 plz = which(startlist == winStart) | |
211 winEnd = coffeeshopSud[plz] | |
212 | |
213 #extract subset of the IP | |
214 Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup) | |
215 for (j in 1:numdup) { | |
216 Rs[,j] = counts[[j]][winStart:winEnd] | |
217 } | |
218 #smooth extended read counts | |
219 for (j in 1:numdup) { | |
220 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) | |
221 } | |
222 return(Rs) | |
223 } | |
224 cluster = function(model, sig, init, clustnummer, noise) { #helper function2 | |
225 noisy = sample(noise, length(sig[,1]), replace = TRUE) | |
226 clust = me(model, sig+noisy, init) | |
227 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer) | |
228 out = list(bicc = bicc, param = clust$parameters) | |
229 return(out) | |
230 } | |
231 initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function | |
232 | |
233 n = length(coffeeshopNord) | |
234 #smooth extended read counts | |
235 if (cornum > 1) { | |
236 sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE) | |
237 } else { | |
238 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord) | |
239 } | |
240 sig = do.call(rbind, sig) | |
241 | |
242 #kmeans initialization | |
243 init = kmeans(sig, clustnummer) | |
244 init = unmap(init$cluster) | |
245 | |
246 | |
247 if (cornum > 1) { | |
248 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE) | |
249 } else { | |
250 param = lapply(modelnames, cluster, sig, init, clustnummer, noise) | |
251 } | |
252 | |
253 bicc = vector(mode = "numeric", length = length(modelnames)) | |
254 for (i in 1:length(modelnames)) { | |
255 bicc[i] = as.numeric(param[[i]]$bicc) | |
256 } | |
257 bicc = which.max(bicc) | |
258 | |
259 out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc]) | |
260 return(out) | |
261 } | |
262 | |
263 | |
264 | |
265 | |
266 #find peaks | |
267 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist) { | |
268 | |
269 | |
270 plz = which(startlist == winStart) | |
271 winEnd = coffeeshopSud[plz] | |
272 | |
273 | |
274 #will store peak information | |
275 writethis = list() | |
276 | |
277 | |
278 rWinSizeTemp = winEnd - winStart + 1 | |
279 | |
280 #extract subset of the IP | |
281 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup) | |
282 Rsr = Rs | |
283 for (j in 1:numdup) { | |
284 Rsr[,j] = counts[[j]][winStart:winEnd] | |
285 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j]) | |
286 } | |
287 | |
288 | |
289 ####CLUSTERING START HERE##### | |
290 noisy = sample(noise, rWinSizeTemp, replace = TRUE) | |
291 clust = em(param$modelname, Rs+noisy, param$initparam) | |
292 clust$classification = map(clust$z) | |
293 ####CLUSTERING DONE#### | |
294 | |
295 #check whether clutering succeeded | |
296 if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { | |
297 | |
298 | |
299 #check whether all components replicates agreed on clustering assignments | |
300 ccx = 1 | |
301 if (numdup > 1) { | |
302 cc = vector(mode = "numeric", length = numdup) | |
303 for (g in 1:numdup) { | |
304 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) | |
305 } | |
306 ccx = sum(diff(cc)) | |
307 cluster = cc[1] | |
308 rm(cc) | |
309 } else { | |
310 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!) | |
311 ccx = 0 | |
312 } | |
313 if (ccx == 0) { | |
314 | |
315 #extract subset of the background | |
316 if (bkgd != "None") { | |
317 Cs = counts[[numdup+1]][winStart:winEnd] | |
318 Cs = filtfilt(rep(1,80)/80,1,Cs) | |
319 } else { | |
320 Cs = sample(C, rWinSizeTemp, replace = TRUE) | |
321 Cs = filtfilt(rep(1,80)/80,1,Cs) | |
322 } | |
323 | |
324 #find region boundaries | |
325 loc = 1:length(clust$classification) | |
326 gmclass = cbind(loc, clust$classification) | |
327 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE] | |
328 rStart = locPeak[1] #start position of the region | |
329 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region | |
330 | |
331 #peak resolution check | |
332 if (resol == "region") { | |
333 pSize = rEnd - rStart | |
334 signal = (geomean(Rs[rStart:rEnd,])) | |
335 signal2 = (signal) - (Cs[rStart:rEnd]) | |
336 gm = mean(signal2) | |
337 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1 | |
338 will2k = wilcox.test(signal, Cs[rStart:rEnd]) | |
339 | |
340 #Is there signal in the region above background | |
341 if (gm > 0) { | |
342 writethis[[1]] = rStart + winStart - 1 | |
343 writethis[[2]] = rEnd + winStart | |
344 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1) | |
345 writethis[[4]] = "1000" | |
346 writethis[[5]] = "." | |
347 writethis[[6]] = gm | |
348 writethis[[7]] = will2k$p.value | |
349 writethis[[8]] = "-1" | |
350 writethis[[9]] = summit | |
351 } | |
352 } else if (resol == "peak") { | |
353 #find out where separate peaks are | |
354 d = diff(locPeak[,1]) | |
355 d[length(d)+1] = 0 | |
356 locPeak = cbind(locPeak, d) | |
357 bound1 = which(locPeak[,3] > 1, arr.in=TRUE) | |
358 bound2 = bound1 + 1 | |
359 bound = locPeak[sort(c(bound1,bound2))] | |
360 bound = c(rStart, bound, rEnd) | |
361 w = 1 | |
362 warum = 0 | |
363 while (w < length(bound)) { | |
364 pStart = bound[w] + winStart - 1 | |
365 pEnd = bound[w+1] + winStart | |
366 pSize = pEnd - pStart | |
367 signal = (geomean(Rs[(bound[w]):(bound[w+1]),])) | |
368 signal2 = (signal) - (Cs[bound[w]:bound[w+1]]) | |
369 gm = mean(signal2) | |
370 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1 | |
371 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])]) | |
372 | |
373 weil = warum * 9 | |
374 #Is there signal in the region above background | |
375 if (gm > 0) { | |
376 writethis[[1+weil]] = pStart | |
377 writethis[[2+weil]] = pEnd | |
378 writethis[[3+weil]] = paste0(chromName, ".", pStart) | |
379 writethis[[4+weil]] = "1000" | |
380 writethis[[5+weil]] = "." | |
381 writethis[[6+weil]] = gm | |
382 writethis[[7+weil]] = will2k$p.value | |
383 writethis[[8+weil]] = "-1" | |
384 writethis[[9+weil]] = summit | |
385 } | |
386 w = w + 2 | |
387 warum = warum + 1 | |
388 } | |
389 | |
390 } #peak resolution check | |
391 } #all replicates agree on clustering assignments? | |
392 } #clustering worked? | |
393 return(writethis) | |
394 } | |
395 | |
396 | |
397 #filter return value of findpeak() | |
398 processPeaks = function(peaks) { | |
399 peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE) | |
400 peaks = peaks[peaks[,1] != FALSE,,drop=FALSE] | |
401 peaks = data.frame(peaks) | |
402 return(peaks) | |
403 } | |
404 #=======================> DONE! | |
405 | |
406 | |
407 | |
408 | |
409 | |
410 | |
411 | |
412 | |
413 | |
414 | |
415 | |
416 | |
417 | |
418 | |
419 | |
420 | |
421 | |
422 | |
423 # ========================== | |
424 # Parse-in System Variables | |
425 # ========================== | |
426 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line | |
427 | |
428 #Parsing arguments and storing values | |
429 for (each.arg in args) { | |
430 #chormosome size file | |
431 if (grepl('^-sfile=',each.arg)) { | |
432 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
433 if (! is.na(arg.split[2]) ) { | |
434 size.file <- arg.split[2] | |
435 } else { | |
436 stop('No genome size file') | |
437 } | |
438 } | |
439 #bed file names | |
440 if (grepl('^-bednames=',each.arg)) { | |
441 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
442 if (! is.na(arg.split[2]) ) { | |
443 bednames <- arg.split[2] | |
444 } else { | |
445 stop('No bed file names') | |
446 } | |
447 } | |
448 #bed files directory | |
449 if (grepl('^-frag=',each.arg)) { | |
450 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
451 if (! is.na(arg.split[2]) ) { | |
452 frag <- arg.split[2] | |
453 } else { | |
454 stop('No fragment length given') | |
455 } | |
456 } | |
457 #bakcground files directory | |
458 if (grepl('^-bkgd=',each.arg)) { | |
459 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
460 if (! is.na(arg.split[2]) ) { | |
461 bkgd <- arg.split[2] | |
462 } else { | |
463 stop('No background file') | |
464 } | |
465 } | |
466 #bakcground files directory | |
467 if (grepl('^-out=',each.arg)) { | |
468 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
469 if (! is.na(arg.split[2]) ) { | |
470 out <- arg.split[2] | |
471 } else { | |
472 stop('No output directory given') | |
473 } | |
474 } | |
475 #Cluster number | |
476 if (grepl('^-clustnummer=',each.arg)) { | |
477 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
478 if (! is.na(arg.split[2]) ) { | |
479 clustnummer <- as.numeric(arg.split[2]) | |
480 } | |
481 } | |
482 #resolution | |
483 if (grepl('^-resolution=',each.arg)) { | |
484 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
485 if (! is.na(arg.split[2]) ) { | |
486 resol <- arg.split[2] | |
487 } | |
488 } | |
489 #processor cores | |
490 if (grepl('^-p=',each.arg)) { | |
491 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
492 if (! is.na(arg.split[2]) ) { | |
493 cornum <- as.numeric(arg.split[2]) | |
494 } | |
495 } | |
496 #minimum window size | |
497 if (grepl('^-window=',each.arg)) { | |
498 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
499 if (! is.na(arg.split[2]) ) { | |
500 winSize <- arg.split[2] | |
501 } | |
502 } | |
503 #window size | |
504 if (grepl('^-bin=',each.arg)) { | |
505 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
506 if (! is.na(arg.split[2]) ) { | |
507 binsize <- arg.split[2] | |
508 } | |
509 } | |
510 #type (paired / single) | |
511 if (grepl('^-type=',each.arg)) { | |
512 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
513 if (! is.na(arg.split[2]) ) { | |
514 type <- arg.split[2] | |
515 } | |
516 } | |
517 } | |
518 | |
519 ##Parse in variables | |
520 chromosomes = read.table(size.file, header=FALSE) | |
521 chromName = chromosomes$V1; #which chromosome | |
522 chromSize = chromosomes$V2; #chromosomes size | |
523 rm(chromosomes) | |
524 | |
525 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) | |
526 numdup = length(readsFiles) #number of replicates | |
527 if (bkgd != "None") { | |
528 readsFiles[[numdup+1]] = bkgd | |
529 } | |
530 winSize = as.numeric(winSize) | |
531 binSize = as.numeric(binsize) | |
532 winSize = binSize * winSize | |
533 | |
534 if (type == "single") { | |
535 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]]) | |
536 } | |
537 rm(bednames) | |
538 | |
539 if (numdup > 1) { | |
540 modelnames = c("VVV","VEV") | |
541 } else { | |
542 modelnames = "V" | |
543 } | |
544 | |
545 options(stringsAsFactors = FALSE) | |
546 #=======================> DONE! | |
547 | |
548 | |
549 | |
550 | |
551 # ======================= | |
552 # Some preliminary stuff | |
553 # ======================= | |
554 if (type == "single") { | |
555 if (cornum > 1) { | |
556 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) | |
557 } else { | |
558 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control) | |
559 } | |
560 } | |
561 | |
562 if (type == "paired") { | |
563 if (cornum > 1) { | |
564 datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) | |
565 } else { | |
566 datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control) | |
567 } | |
568 } | |
569 | |
570 #minimum peak size (only a recommendation) | |
571 minpeak = floor(binSize / 4) | |
572 | |
573 #make bins vector | |
574 bins = seq(from = 1, to = (chromSize - 1), by = binSize) | |
575 #=======================> DONE! | |
576 | |
577 | |
578 | |
579 | |
580 # =============== | |
581 # Counting Reads | |
582 # =============== | |
583 if (type == "single") { | |
584 if (cornum > 1) { | |
585 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) | |
586 } else { | |
587 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles) | |
588 } | |
589 } | |
590 | |
591 if (type == "paired") { | |
592 if (cornum > 1) { | |
593 counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) | |
594 } else { | |
595 counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles) | |
596 } | |
597 } | |
598 | |
599 rm(datain) | |
600 | |
601 #get total counts | |
602 mS = vector(mode = "numeric", length = numdup) | |
603 for (g in 1:numdup) { | |
604 mS[g] = sum(counts[[g]]) | |
605 } | |
606 totalCounts = sum(mS) | |
607 rm(mS) | |
608 #=======================> DONE! | |
609 | |
610 | |
611 | |
612 # =================== | |
613 # Estimating Cutoff | |
614 # =================== | |
615 if (is.na(cutoff)) { | |
616 if (bkgd != "None") { | |
617 cutoff = vector(length = numdup) | |
618 sdC = sd(counts[[numdup+1]]) | |
619 for (x in 1:numdup) { | |
620 cutoff[x] = (mean(counts[[x]]))/(sdC) | |
621 } | |
622 cutoff = max(cutoff) | |
623 C = NULL | |
624 mCs = NULL | |
625 } else { | |
626 cutoff = vector(length = numdup) | |
627 mmV = var(geomeanL(counts)) | |
628 mmM = mean(geomeanL(counts)) | |
629 sigma = log(1+((mmV) / ((mmM)^2))) | |
630 mu = (log(mmM)) - (0.5 * (sigma)) | |
631 C = rlnorm(100000, mu, sqrt(sigma)) | |
632 for (x in 1:numdup) { | |
633 cutoff[x] = (mean(counts[[x]]))/(sd(C)) | |
634 } | |
635 cutoff = max(cutoff) | |
636 mCs = (mean(sample(C, binSize*5, replace = TRUE))) | |
637 dCs = (sd(sample(C, binSize*5, replace = TRUE))) | |
638 } | |
639 } | |
640 #=======================> DONE! | |
641 | |
642 | |
643 | |
644 | |
645 # ======================== | |
646 # Picking Enriched Windows | |
647 # ======================== | |
648 binNum = vector(mode = "numeric", length = 41) | |
649 binTotalSize = vector(mode = "numeric", length = 41) | |
650 binTotalSizeRatio = vector(mode = "numeric", length = 41) | |
651 binTotalCount = vector(mode = "numeric", length = 41) | |
652 binTotalCountNO = vector(mode = "numeric", length = 41) | |
653 binTotalCountRatio = vector(mode = "numeric", length = 41) | |
654 maxiFun = vector(mode = "numeric", length = 41) | |
655 ggg = seq(1,5, by = 0.1) | |
656 for (i in 1:41) { | |
657 if (cornum > 1) { | |
658 coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE) | |
659 } else { | |
660 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd) | |
661 } | |
662 coffeeshop = as.numeric(unlist(coffeeshop)) | |
663 coffeeshopYES = coffeeshop[coffeeshop > 0] | |
664 coffeeshopNO = -(coffeeshop[coffeeshop <= 0]) | |
665 | |
666 binNum[i] = length(coffeeshopYES) | |
667 binTotalSize[i] = binNum[i] * binSize | |
668 binTotalSizeRatio[i] = binTotalSize[i] / chromSize | |
669 | |
670 binTotalCount[i] = sum(coffeeshopYES) | |
671 binTotalCountNO[i] = sum(coffeeshopNO) | |
672 binTotalCountRatio[i] = binTotalCount[i] / binTotalCountNO[i] | |
673 | |
674 maxiFun[i] = binTotalCountRatio[i] / binTotalSizeRatio[i] | |
675 } | |
676 rm(bins) | |
677 #=======================> DONE! | |
678 | |
679 | |
680 write(paste(ggg, binNum, binTotalSize, binTotalSizeRatio, binTotalCount, binTotalCountNO, binTotalCountRatio, maxiFun, sep = " "), file = paste0(file = paste0("/home/mibrahim/Documents/", chromName, ".GOYEAH"), ncolumns = 1)) |