Mercurial > repos > zzhou > spp_phantompeak
comparison spp/R/zroutines.R @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 #library(caTools) | |
2 #dyn.load("src/bed2vector.so"); | |
3 #dyn.load("src/wdl.so"); | |
4 #dyn.load("src/peaks.so"); | |
5 #dyn.load("src/cdensum.so"); | |
6 | |
7 | |
8 # -------- ROUTINES FOR READING IN THE DATA FILES ------------ | |
9 # fix.chromosome.names : remove ".fa" suffix from match sequence names | |
10 read.eland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T,max.eland.tag.length=-1,extended=F,multi=F) { | |
11 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
12 storage.mode(max.eland.tag.length) <- "integer"; | |
13 callfunction <- "read_eland"; | |
14 if(extended) { callfunction <- "read_eland_extended"; }; | |
15 if(multi) { callfunction <- "read_eland_multi"; }; | |
16 tl <- lapply(.Call(callfunction,filename,rtn,max.eland.tag.length),function(d) { | |
17 xo <- order(abs(d$t)); | |
18 d$t <- d$t[xo]; | |
19 d$n <- d$n[xo]; | |
20 if(read.tag.names) { | |
21 d$s <- d$s[xo]; | |
22 } | |
23 return(d); | |
24 }); | |
25 if(fix.chromosome.names) { | |
26 # remove ".fa" | |
27 names(tl) <- gsub("\\.fa","",names(tl)) | |
28 } | |
29 # separate tags and quality | |
30 if(read.tag.names) { | |
31 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); | |
32 } else { | |
33 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
34 } | |
35 } | |
36 | |
37 read.tagalign.tags <- function(filename,fix.chromosome.names=T,fix.quality=T) { | |
38 tl <- lapply(.Call("read_tagalign",filename),function(d) { | |
39 xo <- order(abs(d$t)); | |
40 d$t <- d$t[xo]; | |
41 d$n <- d$n[xo]; | |
42 #if(fix.quality) { | |
43 # d$n <- 4-cut(d$n,breaks=c(0,250,500,750,1000),labels=F) | |
44 #} | |
45 if(fix.quality) { # Anshul: changed the way the quality field is processed | |
46 if (min(d$n)<0.5){ | |
47 d$n = ceiling(1000/4^d$n); | |
48 } | |
49 break.vals <- unique(sort(c(0,unique(d$n)))); | |
50 d$n <- length(break.vals)-1-cut(d$n,breaks=break.vals,labels=F); | |
51 } | |
52 return(d); | |
53 }); | |
54 if(fix.chromosome.names) { | |
55 # remove ".fa" | |
56 names(tl) <- gsub("\\.fa","",names(tl)) | |
57 } | |
58 # separate tags and quality | |
59 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
60 } | |
61 | |
62 | |
63 read.short.arachne.tags <- function(filename,fix.chromosome.names=F) { | |
64 tl <- lapply(.Call("read_arachne",filename),function(d) { | |
65 xo <- order(abs(d$t)); | |
66 d$t <- d$t[xo]; | |
67 d$n <- d$n[xo]; | |
68 return(d); | |
69 }); | |
70 if(fix.chromosome.names) { | |
71 # remove ".fa" | |
72 names(tl) <- gsub("\\.fa","",names(tl)) | |
73 } | |
74 # separate tags and quality | |
75 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
76 } | |
77 | |
78 | |
79 read.arachne.tags <- function(filename,fix.chromosome.names=F) { | |
80 tl <- lapply(.Call("read_arachne_long",filename),function(d) { | |
81 xo <- order(abs(d$t)); | |
82 d$t <- d$t[xo]; | |
83 d$n <- d$n[xo]; | |
84 d$l <- d$l[xo]; | |
85 return(d); | |
86 }); | |
87 if(fix.chromosome.names) { | |
88 # remove ".fa" | |
89 names(tl) <- gsub("\\.fa","",names(tl)) | |
90 } | |
91 # separate tags and quality | |
92 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l))); | |
93 } | |
94 | |
95 read.bowtie.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) { | |
96 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
97 tl <- lapply(.Call("read_bowtie",filename,rtn),function(d) { | |
98 xo <- order(abs(d$t)); | |
99 d$t <- d$t[xo]; | |
100 d$n <- d$n[xo]; | |
101 if(read.tag.names) { | |
102 d$s <- d$s[xo]; | |
103 } | |
104 return(d); | |
105 }); | |
106 if(fix.chromosome.names) { | |
107 # remove ".fa" | |
108 names(tl) <- gsub("\\.fa","",names(tl)) | |
109 } | |
110 # separate tags and quality | |
111 if(read.tag.names) { | |
112 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); | |
113 } else { | |
114 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
115 } | |
116 } | |
117 | |
118 read.bam.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) { | |
119 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
120 tl <- lapply(.Call("read_bam",filename,rtn),function(d) { | |
121 xo <- order(abs(d$t)); | |
122 d$t <- d$t[xo]; | |
123 d$n <- d$n[xo]; | |
124 if(read.tag.names) { | |
125 d$s <- d$s[xo]; | |
126 } | |
127 return(d); | |
128 }); | |
129 if(fix.chromosome.names) { | |
130 # remove ".fa" | |
131 names(tl) <- gsub("\\.fa","",names(tl)) | |
132 } | |
133 # separate tags and quality | |
134 if(read.tag.names) { | |
135 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); | |
136 } else { | |
137 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
138 } | |
139 } | |
140 | |
141 | |
142 read.helicos.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F,include.length.info=T) { | |
143 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
144 tl <- lapply(.Call("read_helicostabf",filename,rtn),function(d) { | |
145 xo <- order(abs(d$t)); | |
146 d$t <- d$t[xo]; | |
147 d$n <- d$n[xo]; | |
148 d$l <- d$l[xo]; | |
149 if(read.tag.names) { | |
150 d$s <- d$s[xo]; | |
151 } | |
152 return(d); | |
153 }); | |
154 if(fix.chromosome.names) { | |
155 # remove ".fa" | |
156 names(tl) <- gsub("\\.fa","",names(tl)) | |
157 } | |
158 # separate tags and quality | |
159 if(read.tag.names) { | |
160 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l),names=lapply(tl,function(d) d$s))); | |
161 } else { | |
162 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l))); | |
163 } | |
164 } | |
165 | |
166 read.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { | |
167 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
168 tl <- lapply(.Call("read_maqmap",filename,rtn),function(d) { | |
169 xo <- order(abs(d$t)); | |
170 d$t <- d$t[xo]; | |
171 d$n <- d$n[xo]; | |
172 if(read.tag.names) { | |
173 d$s <- d$s[xo]; | |
174 } | |
175 return(d); | |
176 }); | |
177 if(fix.chromosome.names) { | |
178 # remove ".fa" | |
179 names(tl) <- gsub("\\.fa","",names(tl)) | |
180 } | |
181 # separate tags and quality | |
182 if(read.tag.names) { | |
183 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); | |
184 } else { | |
185 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
186 } | |
187 } | |
188 | |
189 | |
190 read.bin.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { | |
191 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
192 tl <- lapply(.Call("read_binmaqmap",filename,rtn),function(d) { | |
193 xo <- order(abs(d$t)); | |
194 d$t <- d$t[xo]; | |
195 d$n <- d$n[xo]; | |
196 if(read.tag.names) { | |
197 d$s <- d$s[xo]; | |
198 } | |
199 return(d); | |
200 }); | |
201 if(fix.chromosome.names) { | |
202 # remove ".fa" | |
203 names(tl) <- gsub("\\.fa","",names(tl)) | |
204 } | |
205 # separate tags and quality | |
206 if(read.tag.names) { | |
207 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); | |
208 } else { | |
209 return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); | |
210 } | |
211 } | |
212 | |
213 | |
214 # read in tags from an extended eland format with match length information | |
215 read.meland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { | |
216 if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; | |
217 tl <- lapply(.Call("read_meland",filename,rtn),function(d) { | |
218 xo <- order(abs(d$t)); | |
219 d$t <- d$t[xo]; | |
220 d$n <- d$n[xo]; | |
221 d$l <- d$l[xo]; | |
222 if(read.tag.names) { | |
223 d$s <- d$s[xo]; | |
224 } | |
225 return(d); | |
226 }); | |
227 | |
228 if(fix.chromosome.names) { | |
229 # remove ".fa" | |
230 names(tl) <- gsub("\\.fa","",names(tl)) | |
231 } | |
232 # separate tags and quality | |
233 chrl <- names(tl); names(chrl) <- chrl; | |
234 # reformulate quality scores into monotonic integers | |
235 ml <- max(unlist(lapply(tl,function(d) max(d$l)))); | |
236 qual <- lapply(chrl,function(chr) (ml-tl[[chr]]$l)+tl[[chr]]$n/10); | |
237 if(read.tag.names) { | |
238 return(list(tags=lapply(tl,function(d) d$t),quality=qual,names=lapply(tl,function(d) d$s))); | |
239 } else { | |
240 return(list(tags=lapply(tl,function(d) d$t),quality=qual)); | |
241 } | |
242 } | |
243 | |
244 # -------- ROUTINES FOR ASSESSING BINDING PATTERN AND SELECTING INFORMATIVE TAGS ------------ | |
245 | |
246 # removes tag positions that have anomalously high counts on both strands | |
247 # z - z-score used to determine anomalous bins | |
248 # zo - z used to filter out one-strand matches | |
249 # trim.fraction - fraction of top bins to discard when calculating overall background density | |
250 remove.tag.anomalies <- function(data, bin=1,trim.fraction=1e-3,z=5,zo=3*z) { | |
251 | |
252 t.remove.tag.anomalies <- function(tv,bin=1,trim.fraction=1e-3,z=5,zo=3*z,return.indecies=F) { | |
253 tt <- table(floor(tv/bin)); | |
254 | |
255 # trim value | |
256 stt <- sort(as.numeric(tt)); | |
257 stt <- stt[1:(length(stt)*(1-trim.fraction))]; | |
258 mtc <- mean(stt); tcd <- sqrt(var(stt)); | |
259 | |
260 thr <- max(1,ceiling(mtc+z*tcd)); | |
261 thr.o <- max(1,ceiling(mtc+zo*tcd)); | |
262 # filter tt | |
263 tt <- tt[tt>=thr] | |
264 # get + and - tags | |
265 tp <- as.numeric(names(tt)); | |
266 pti <- tp>0; | |
267 it <- intersect(tp[pti],(-1)*tp[!pti]); | |
268 # add one-strand matches | |
269 it <- unique(c(it,tp[tt>=thr.o])); | |
270 sit <- c(it,(-1)*it); | |
271 | |
272 if(bin>1) { | |
273 sit <- sit*bin; | |
274 sit <- c(sit,unlist(lapply(1:bin,function(i) sit+i))) | |
275 } | |
276 if(return.indecies) { | |
277 return(!tv %in% sit); | |
278 } else { | |
279 return(tv[!tv %in% sit]); | |
280 } | |
281 } | |
282 | |
283 vil <- lapply(data$tags,t.remove.tag.anomalies,return.indecies=T,bin=bin,trim.fraction=trim.fraction,z=z,zo=zo); | |
284 chrl <- names(data$tags); names(chrl) <- chrl; | |
285 data$tags <- lapply(chrl,function(chr) data$tags[[chr]][vil[[chr]]]); | |
286 # count tags to remove empty chromosomes | |
287 nt <- unlist(lapply(data$tags,length)); | |
288 if(any(nt==0)) { | |
289 data$tags <- data$tags[nt!=0] | |
290 } | |
291 | |
292 if(!is.null(data$quality)) { | |
293 data$quality <- lapply(chrl,function(chr) data$quality[[chr]][vil[[chr]]]); | |
294 data$quality <- data$quality[nt!=0]; | |
295 } | |
296 if(!is.null(data$names)) { | |
297 data$names <- lapply(chrl,function(chr) data$names[[chr]][vil[[chr]]]); | |
298 data$names <- data$names[nt!=0]; | |
299 } | |
300 | |
301 return(data); | |
302 } | |
303 | |
304 # caps or removes tag positions that are significantly higher than local background | |
305 remove.local.tag.anomalies <- function(tags,window.size=200,eliminate.fold=10,cap.fold=4,z.threshold=3) { | |
306 lapply(tags,filter.singular.positions.by.local.density,window.size=2e2,eliminate.fold=10,cap.fold=4,z.threshold=3); | |
307 } | |
308 | |
309 | |
310 | |
311 # assess strand cross-correlation, determine peak position, determine appropriate window size | |
312 # for binding detection. | |
313 get.binding.characteristics <- function(data,srange=c(50,500),bin=5,cluster=NULL,debug=F,min.tag.count=1e3,acceptance.z.score=3,remove.tag.anomalies=T,anomalies.z=5,accept.all.tags=F) { | |
314 if(remove.tag.anomalies) { | |
315 data <- remove.tag.anomalies(data,z=anomalies.z); | |
316 } | |
317 | |
318 # take highest quality tag bin | |
319 if(!is.null(data$quality) & !accept.all.tags) { | |
320 min.bin <- min(unlist(lapply(data$quality,min))) | |
321 chrl <- names(data$tags); names(chrl) <- chrl; | |
322 otl <- lapply(chrl,function(chr) data$tags[[chr]][data$quality[[chr]]==min.bin]); | |
323 } else { | |
324 otl <- data$tags; | |
325 } | |
326 # remove empty chromosomes | |
327 otl <- otl[unlist(lapply(otl,length))!=0]; | |
328 | |
329 | |
330 # calculate strand scc | |
331 if(!is.null(cluster)) { | |
332 cc <- clusterApplyLB(cluster,otl,tag.scc,srange=srange,bin=bin); | |
333 names(cc) <- names(otl); | |
334 } else { | |
335 cc <- lapply(otl,tag.scc,srange=srange,bin=bin); | |
336 } | |
337 ccl<-list(sample=cc); | |
338 ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,return.ac=T,ttl=list(sample=otl),plot=F)[[1]] | |
339 ccl.av <- data.frame(x=as.numeric(names(ccl.av)),y=as.numeric(ccl.av)); | |
340 | |
341 # find peak | |
342 pi <- which.max(ccl.av$y); | |
343 | |
344 # determine width at third-height | |
345 th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/3+ccl.av$y[length(ccl.av$y)] | |
346 whs <- max(ccl.av$x[ccl.av$y>=th]); | |
347 | |
348 if (! is.integer(whs)) { # Anshul: added this to avoid situations where whs ends up being -Inf | |
349 whs <- ccl.av$x[ min(c(2*pi,length(ccl.av$y))) ] | |
350 } | |
351 | |
352 # determine acceptance of different quality bins | |
353 | |
354 # calculates tag scc for the best tags, and combinations of best tag category with every other category | |
355 # for subsequent selection of acceptable categories | |
356 scc.acceptance.calc <- function() { | |
357 | |
358 qr <- range(unlist(lapply(data$quality,range))) | |
359 | |
360 # start with best tags | |
361 | |
362 # determine half-width for scc calculations | |
363 pi <- which.max(ccl.av$y); | |
364 | |
365 # determine width at half-height | |
366 th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/2+ccl.av$y[length(ccl.av$y)] | |
367 lwhs <- max(ccl.av$x[ccl.av$y>=th])-ccl.av$x[pi]; | |
368 lwhs <- max(c(20,bin*10,lwhs)); | |
369 srange <- ccl.av$x[pi]+c(-lwhs,lwhs) | |
370 | |
371 # calculate chromosome-average scc | |
372 t.scc <- function(tags) { | |
373 if(is.null(cluster)) { | |
374 cc <- lapply(tags,tag.scc,srange=srange,bin=bin); | |
375 } else { | |
376 cc <- clusterApplyLB(cluster,tags,tag.scc,srange=srange,bin=bin); names(cc) <- names(tags); | |
377 } | |
378 return(t.plotavcc(1,type='l',ccl=list(cc),ttl=list(tags),plot=F,return.ac=T)) | |
379 } | |
380 | |
381 | |
382 # returns info list for a given tag length (lv), mismatch count (nv) | |
383 t.cat <- function(qual) { | |
384 # construct tag set | |
385 if(qual==qr[1]) { | |
386 ts <- otl; | |
387 } else { | |
388 nts <- names(otl); names(nts) <- nts; | |
389 # select tags | |
390 at <- lapply(nts,function(chr) data$tags[[chr]][data$quality[[chr]]==qual]); | |
391 ntags <- sum(unlist(lapply(at,length))); | |
392 if(ntags<min.tag.count) { return(NULL); } | |
393 | |
394 # append to otl | |
395 ts <- lapply(nts,function(nam) c(otl[[nam]],at[[nam]])); | |
396 } | |
397 | |
398 return(t.scc(ts)); | |
399 } | |
400 | |
401 | |
402 # calculate cross-correlation values for each quality bin | |
403 ql <- sort(unique(unlist(lapply(data$quality,unique)))); names(ql) <- ql; | |
404 | |
405 qccl <- lapply(ql,t.cat); | |
406 | |
407 # acceptance tests | |
408 ac <- c(T,unlist(lapply(qccl[-1],function(d) if(is.null(d)) { return(F) } else { t.test(d-qccl[[as.character(min.bin)]],alternative="greater")$p.value<pnorm(acceptance.z.score,lower.tail=F) }))); names(ac) <- names(qccl); | |
409 return(list(informative.bins=ac,quality.cc=qccl)) | |
410 } | |
411 | |
412 if(accept.all.tags | is.null(data$quality)) { | |
413 return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs)) | |
414 } else { | |
415 acc <- scc.acceptance.calc(); | |
416 return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs,quality.bin.acceptance=acc)); | |
417 } | |
418 | |
419 } | |
420 | |
421 | |
422 # select a set of informative tags based on the pre-calculated binding characteristics | |
423 select.informative.tags <- function(data,binding.characteristics=NULL) { | |
424 if(is.null(binding.characteristics)) { | |
425 return(data$tags); | |
426 } | |
427 if(is.null(binding.characteristics$quality.bin.acceptance)) { | |
428 cat("binding characteristics doesn't contain quality selection info, accepting all tags\n"); | |
429 return(data$tags); | |
430 } | |
431 | |
432 ib <- binding.characteristics$quality.bin.acceptance$informative.bins; | |
433 abn <- names(ib)[ib] | |
434 | |
435 chrl <- names(data$tags); names(chrl) <- chrl; | |
436 lapply(chrl,function(chr) { | |
437 data$tags[[chr]][as.character(data$quality[[chr]]) %in% abn] | |
438 }) | |
439 } | |
440 | |
441 # -------- ROUTINES FOR CALLING BINDING POSITIONS ------------ | |
442 | |
443 # determine binding positions | |
444 # signal.data - IP tag lists | |
445 # control.data - input tag lists | |
446 # e.value - desired E-value threshold (either E-value or FDR threshold must be provided) | |
447 # fdr - desired FDR threshold | |
448 # min.dist - minimal distance between detected positions | |
449 # tag.count.whs - size of the window to be used to estimate confidence interval of the peak fold enrichment ratios | |
450 # enrichmnent.z - Z-score defining the desired confidence level for enrichment interval estimates | |
451 # enrichment.background.scales - define how many tiems larger should be the window for estimating background | |
452 # tag density when evaluating peak enrichment confidence intervals. | |
453 # If multiple values are given, multiple independent interval estimates will be | |
454 # calculated. | |
455 # tec.filter - whether to mask out the regions that exhibit significant background enrichment | |
456 # tec.window.size, tec.z - window size and Z-score for maksing out significant background enrichment regions | |
457 # | |
458 # If the control.data is not provided, the method will assess significance of the determined binding positions | |
459 # based on the randomizations of the original data. The following paramters control such randomizations: | |
460 # n.randomizations - number of randomizations to be performed | |
461 # shuffle.window - size of the bin that defines the tags that are kept together during randomization. | |
462 # value of 0 means that all tags are shuffled independently | |
463 # | |
464 # Binding detection methods: | |
465 # tag.wtd - default method. | |
466 # must specify parameter "whs", which is the half-size of the window used to calculate binding scores | |
467 # tag.lwcc - LWCC method; | |
468 # must specify whs - a size of the window used to calculate binding scores | |
469 # can specify isize (default=15bp) - size of the internal window that is masked out | |
470 find.binding.positions <- function(signal.data,f=1,e.value=NULL,fdr=NULL, masked.data=NULL,control.data=NULL,whs=200,min.dist=200,window.size=4e7,cluster=NULL,debug=T,n.randomizations=3,shuffle.window=1,min.thr=2,topN=NULL, tag.count.whs=100, enrichment.z=2, method=tag.wtd, tec.filter=T,tec.window.size=1e4,tec.z=5,tec.masking.window.size=tec.window.size, tec.poisson.z=5,tec.poisson.ratio=5, tec=NULL, n.control.samples=1, enrichment.scale.down.control=F, enrichment.background.scales=c(1,5,10), use.randomized.controls=F, background.density.scaling=T, mle.filter=F, min.mle.threshold=1, ...) { | |
471 | |
472 if(f<1) { | |
473 if(debug) { cat("subsampling signal ... "); } | |
474 signal.data <- lapply(signal.data,function(x) sample(x,length(x)*f)) | |
475 if(debug) { cat("done\n"); } | |
476 } | |
477 | |
478 | |
479 if(!is.null(control.data) & !use.randomized.controls) { | |
480 # limit both control and signal data to a common set of chromosomes | |
481 chrl <- intersect(names(signal.data),names(control.data)); | |
482 signal.data <- signal.data[chrl]; | |
483 control.data <- control.data[chrl]; | |
484 control <- list(control.data); | |
485 } else { | |
486 control <- NULL; | |
487 } | |
488 | |
489 prd <- lwcc.prediction(signal.data,min.dist=min.dist,whs=whs,window.size=window.size,e.value=e.value,fdr=fdr,debug=debug,n.randomizations=n.randomizations,shuffle.window=shuffle.window,min.thr=min.thr,cluster=cluster,method=method,bg.tl=control.data,mask.tl=masked.data, topN=topN, control=control,tec.filter=tec.filter,tec.z=tec.z,tec.window.size=tec.window.size, tec.masking.window.size=tec.masking.window.size, tec.poisson.z=tec.poisson.z,tec.poisson.ratio=tec.poisson.ratio, background.density.scaling=background.density.scaling, ...); | |
490 | |
491 # add tag counts | |
492 chrl <- names(prd$npl); names(chrl) <- chrl; | |
493 prd$npl <- lapply(chrl,function(chr) { | |
494 pd <- prd$npl[[chr]]; | |
495 pd$nt <- points.within(abs(signal.data[[chr]]),pd$x-tag.count.whs,pd$x+tag.count.whs,return.point.counts=T); | |
496 return(pd); | |
497 }); | |
498 prd$f <- f; | |
499 prd$n <- sum(unlist(lapply(signal.data,length))); | |
500 if(!is.null(control.data)) { | |
501 prd$n.bg <- sum(unlist(lapply(control.data,length))); | |
502 } | |
503 | |
504 # calculate enrichment ratios | |
505 prd <- calculate.enrichment.estimates(prd,signal.data,control.data=control.data,fraction=1,tag.count.whs=tag.count.whs,z=enrichment.z,scale.down.control=enrichment.scale.down.control,background.scales=enrichment.background.scales); | |
506 | |
507 if(mle.filter) { | |
508 if(!is.null(prd$npl)) { | |
509 if(length(prd$npl)>1) { | |
510 mle.columns <- grep("enr.mle",colnames(prd$npl[[1]])); | |
511 if(length(mle.columns)>1) { | |
512 prd$npl <- lapply(prd$npl,function(d) d[apply(d[,mle.columns],1,function(x) all(x>min.mle.threshold)),]) | |
513 } | |
514 } | |
515 } | |
516 } | |
517 | |
518 prd$whs <- whs; | |
519 | |
520 return(prd); | |
521 } | |
522 | |
523 | |
524 | |
525 # -------- ROUTINES FOR WRITING OUT TAG DENSITY AND ENRICHMENT PROFILES ------------ | |
526 # calculate smoothed tag density, optionally subtracting the background | |
527 get.smoothed.tag.density <- function(signal.tags,control.tags=NULL,bandwidth=150,bg.weight=NULL,tag.shift=146/2,step=round(bandwidth/3),background.density.scaling=T,rngl=NULL,scale.by.dataset.size=F) { | |
528 chrl <- names(signal.tags); names(chrl) <- chrl; | |
529 | |
530 if(!is.null(control.tags)) { | |
531 bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling); | |
532 } | |
533 | |
534 if(scale.by.dataset.size) { | |
535 den.scaling <- dataset.density.size(signal.tags,background.density.scaling=background.density.scaling)/1e6; | |
536 } else { | |
537 den.scaling <- 1; | |
538 } | |
539 | |
540 lapply(chrl,function(chr) { | |
541 ad <- abs(signal.tags[[chr]]+tag.shift); | |
542 rng <- NULL; | |
543 if(!is.null(rngl)) { | |
544 rng <- rngl[[chr]]; | |
545 } | |
546 if(is.null(rng)) { | |
547 rng <- range(ad); | |
548 } | |
549 | |
550 ds <- densum(ad,bw=bandwidth,from=rng[1],to=rng[2],return.x=T,step=step); | |
551 if(!is.null(control.tags)) { | |
552 if(!is.null(control.tags[[chr]])) { | |
553 bsd <- densum(abs(control.tags[[chr]]+tag.shift),bw=bandwidth,from=rng[1],to=rng[2],return.x=F,step=step); | |
554 ds$y <- ds$y-bsd*bg.weight; | |
555 } | |
556 } | |
557 return(data.frame(x=seq(ds$x[1],ds$x[2],by=step),y=den.scaling*ds$y)) | |
558 }) | |
559 } | |
560 | |
561 # get smoothed maximum likelihood estimate of the log2 signal to control enrichment ratio | |
562 get.smoothed.enrichment.mle <- function(signal.tags, control.tags, tag.shift=146/2, background.density.scaling=F, pseudocount=1,bg.weight=NULL, ... ) { | |
563 # determine common range | |
564 chrl <- intersect(names(signal.tags),names(control.tags)); names(chrl) <- chrl; | |
565 rngl <- lapply(chrl,function(chr) range(c(range(abs(signal.tags[[chr]]+tag.shift)),range(abs(control.tags[[chr]]+tag.shift))))) | |
566 ssd <- get.smoothed.tag.density(signal.tags, rngl=rngl, ..., scale.by.dataset.size=F) | |
567 csd <- get.smoothed.tag.density(control.tags, rngl=rngl, ..., scale.by.dataset.size=F) | |
568 if(is.null(bg.weight)) { | |
569 bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling); | |
570 } | |
571 cmle <- lapply(chrl,function(chr) { d <- ssd[[chr]]; d$y <- log2(d$y+pseudocount) - log2(csd[[chr]]$y+pseudocount) - log2(bg.weight); return(d); }) | |
572 } | |
573 | |
574 | |
575 # returns a conservative upper/lower bound profile (log2) given signal tag list, background tag list and window scales | |
576 get.conservative.fold.enrichment.profile <- function(ftl,btl,fws,bwsl=c(1,5,25,50)*fws,step=50,tag.shift=146/2,alpha=0.05,use.most.informative.scale=F,quick.calculation=T,background.density.scaling=T,bg.weight=NULL,posl=NULL,return.mle=F) { | |
577 # include only chromosomes with more than 2 reads | |
578 ftl <- ftl[unlist(lapply(ftl,length))>2] | |
579 chrl <- names(ftl); names(chrl) <- chrl; | |
580 if(!is.null(posl)) { | |
581 chrl <- chrl[chrl %in% names(posl)]; | |
582 } | |
583 # calculate background tag ratio | |
584 if(is.null(bg.weight)) { | |
585 bg.weight <- dataset.density.ratio(ftl,btl,background.density.scaling=background.density.scaling); | |
586 } | |
587 lapply(chrl,function(chr) { | |
588 if(is.null(btl[[chr]])) { bt <- c(); } else { bt <- abs(btl[[chr]]+tag.shift); } | |
589 if(is.null(posl)) { | |
590 x <- mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha); | |
591 } else { | |
592 x <- mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha,pos=posl[[chr]]); | |
593 } | |
594 # compose profile showing lower bound for enriched, upper bound for depleted regions | |
595 ps <- rep(1,length(x$mle)); | |
596 vi <- which(!is.na(x$lb) & x$lb>1); | |
597 ps[vi] <- x$lb[vi]; | |
598 vi <- which(!is.na(x$ub) & x$ub<1); | |
599 ps[vi] <- x$ub[vi]; | |
600 ps <- log2(ps); | |
601 if(is.null(posl)) { | |
602 if(return.mle) { | |
603 return(data.frame(x=seq(x$x$s,x$x$e,by=x$x$step),y=ps,mle=log2(x$mle),lb=log2(x$lb),ub=log2(x$ub))); | |
604 } else { | |
605 return(data.frame(x=seq(x$x$s,x$x$e,by=x$x$step),y=ps)); | |
606 } | |
607 } else { | |
608 if(return.mle) { | |
609 return(data.frame(x=posl[[chr]],y=ps,mle=log2(x$mle),lb=log2(x$lb),ub=log2(x$ub))); | |
610 } else { | |
611 return(data.frame(x=posl[[chr]],y=ps)); | |
612 } | |
613 } | |
614 }) | |
615 } | |
616 | |
617 | |
618 # write a per-chromosome $x/$y data structure into a wig file | |
619 writewig <- function(dat,fname,feature,threshold=5,zip=F) { | |
620 chrl <- names(dat); names(chrl) <- chrl; | |
621 invisible(lapply(chrl,function(chr) { | |
622 bdiff <- dat[[chr]]; | |
623 ind <- seq(1,length(bdiff$x)); | |
624 ind <- ind[!is.na(bdiff$y[ind])]; | |
625 header <- chr==chrl[1]; | |
626 write.probe.wig(chr,bdiff$x[ind],bdiff$y[ind],fname,append=!header,feature=feature,header=header); | |
627 })) | |
628 if(zip) { | |
629 zf <- paste(fname,"zip",sep="."); | |
630 system(paste("zip \"",zf,"\" \"",fname,"\"",sep="")); | |
631 system(paste("rm \"",fname,"\"",sep="")); | |
632 return(zf); | |
633 } else { | |
634 return(fname); | |
635 } | |
636 } | |
637 | |
638 | |
639 | |
640 # -------- ROUTINES FOR ANALYZING SATURATION PROPERTIES ------------ | |
641 | |
642 # PUBLIC | |
643 # calculate minimal saturation enrichment ratios (MSER) | |
644 get.mser <- function(signal.data,control.data,n.chains=5,step.size=1e5, chains=NULL, cluster=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), n.steps=1, ...) { | |
645 if(is.null(chains)) { | |
646 ci <- c(1:n.chains); names(ci) <- ci; | |
647 if(is.null(cluster)) { | |
648 chains <- lapply(ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...); | |
649 } else { | |
650 chains <- clusterApplyLB(cluster,ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...); | |
651 names(chains) <- ci; | |
652 } | |
653 } | |
654 cvl <- mser.chain.interpolation(chains=chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=F); | |
655 if(n.steps>1) { | |
656 msers <- cvl; | |
657 } else { | |
658 msers <- unlist(lapply(cvl,function(d) d$me)) | |
659 } | |
660 if(return.chains) { | |
661 return(list(mser=msers,chains=chains)); | |
662 } else { | |
663 return(msers); | |
664 } | |
665 } | |
666 | |
667 # PUBLIC | |
668 # interpolate MSER dependency on tag counts | |
669 get.mser.interpolation <- function(signal.data,control.data,target.fold.enrichment=5,n.chains=10,n.steps=6,step.size=1e5, chains=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), excluded.steps=c(seq(2,n.steps-2)), ...) { | |
670 msers <- get.mser(signal.data,control.data,n.chains=n.chains,n.steps=n.steps,step.size=step.size,chains=chains,test.agrement=test.agreement,return.chains=T,enrichment.background.scales=enrichment.background.scales,excluded.steps=excluded.steps, ...); | |
671 | |
672 # adjust sizes in case a subset of chromosomes was used | |
673 mser <- mser.chain.interpolation(chains=msers$chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=T); | |
674 sr <- sum(unlist(lapply(signal.data,length)))/mser[[1]][[1]]$n[1]; | |
675 | |
676 # Subsampling each chain requires removing a fraction of each chromosome's | |
677 # tag list. To get the exact step.size, this often leaves chromosomes with | |
678 # a non-integer number of tags. The non-integer values are floored, so each | |
679 # chr can contribute at most 0.999.. <= 1 error to the step.size. | |
680 floor.error <- length(msers$chains[[1]][[1]]$npl) | |
681 intpn <- lapply(mser,function(ms) { | |
682 lmvo <- do.call(rbind,ms) | |
683 lmvo$n <- lmvo$n*sr; | |
684 # Don't select rows corresponding to excluded.steps | |
685 # Keep in mind that nd values are negative. | |
686 lmvo <- lmvo[lmvo$nd <= (lmvo$nd[1] + floor.error) & lmvo$nd >= (lmvo$nd[1] - floor.error),]; | |
687 lmvo <- na.omit(lmvo); | |
688 if(any(lmvo$me==1)) { | |
689 return(list(prediction=NA)); | |
690 } | |
691 lmvo$n <- log10(lmvo$n); lmvo$me <- log10(lmvo$me-1) | |
692 # remove non-standard steps | |
693 emvf <- lm(me ~ n,data=lmvo); | |
694 tfe <- (log10(target.fold.enrichment-1)-coef(emvf)[[1]])/coef(emvf)[[2]]; | |
695 tfen <- 10^tfe; | |
696 return(list(prediction=tfen,log10.fit=emvf)); | |
697 }) | |
698 | |
699 if(return.chains) { | |
700 return(list(interpolation=intpn,chains=msers$chains)) | |
701 } else { | |
702 return(intpn); | |
703 } | |
704 | |
705 return(msers); | |
706 | |
707 } | |
708 | |
709 | |
710 # output binding detection results to a text file | |
711 # the file will contain a table with each row corresponding | |
712 # to a detected position, with the following columns: | |
713 # chr - chromosome or target sequence | |
714 # pos - position of detected binding site on the chromosome/sequence | |
715 # score - a score reflecting magnitude of the binding | |
716 # Evalue - E-value corresponding to the peak magnitude | |
717 # FDR - FDR corresponding to the peak magnitude | |
718 # enrichment.lb - lower bound of the fold-enrichment ratio | |
719 # enrichment.mle - maximum likelihood estimate of the fold-enrichment ratio | |
720 output.binding.results <- function(results,filename) { | |
721 write(file=filename,"chr\tpos\tscore\tEvalue\tFDR\tenrichment.lb\tenrichment.mle",append=F); | |
722 chrl <- names(results$npl); names(chrl) <- chrl; | |
723 x <- lapply(chrl,function(chr) { | |
724 d <- results$npl[[chr]]; | |
725 if(dim(d)[1]>0) { | |
726 if(results$thr$type=="topN") { | |
727 od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,enr,enr.mle))) | |
728 } else { | |
729 od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,evalue,fdr,enr,enr.mle))) | |
730 } | |
731 write.table(od,file=filename,col.names=F,row.names=F,sep="\t",append=T,quote=F) | |
732 } | |
733 }) | |
734 } | |
735 | |
736 | |
737 # -------- LOW-LEVEL ROUTINES ------------ | |
738 | |
739 # calculates tag strand cross-correlation for a range of shifts (on positive strand) | |
740 tag.scc <- function(tags,srange=c(50,250),bin=1,tt=NULL,llim=10) { | |
741 if(is.null(tt)) { | |
742 tt <- table(sign(tags)*as.integer(floor(abs(tags)/bin+0.5))); | |
743 } | |
744 if(!is.null(llim)) { l <- mean(tt); tt <- tt[tt<llim*l] } | |
745 tc <- as.integer(names(tt)); | |
746 tt <- as.numeric(tt); | |
747 | |
748 pv <- tt; pv[tc<0]<-0; | |
749 nv <- tt; nv[tc>0]<-0; | |
750 | |
751 pti <- which(tc>0) | |
752 nti <- which(tc<0); | |
753 | |
754 ptc <- tc[pti]; | |
755 ntc <- (-1)*tc[nti]; | |
756 | |
757 ptv <- tt[pti]; | |
758 ntv <- tt[nti]; | |
759 | |
760 trng <- range(c(range(ptc),range(ntc))) | |
761 l <- diff(trng)+1; | |
762 rm(tc,tt); | |
763 | |
764 mp <- sum(ptv)*bin/l; mn <- sum(ntv)*bin/l; | |
765 ptv <- ptv-mp; ntv <- ntv-mn; | |
766 ss <- sqrt((sum(ptv*ptv)+(l-length(ptv))*mp^2) * (sum(ntv*ntv)+(l-length(ntv))*mn^2)); | |
767 | |
768 t.cor <- function(s) { | |
769 smi <- match(ptc+s,ntc); | |
770 return((sum(ptv[!is.na(smi)]*ntv[na.omit(smi)]) - | |
771 mn*sum(ptv[is.na(smi)]) - | |
772 mp*sum(ntv[-na.omit(smi)]) + | |
773 mp*mn*(l-length(ptv)-length(ntv)+length(which(!is.na(smi)))))/ss); | |
774 } | |
775 shifts <- floor(seq(srange[1],srange[2],by=bin)/bin+0.5); | |
776 scc <- unlist(lapply(shifts,t.cor)); names(scc) <- shifts*bin; | |
777 return(scc); | |
778 } | |
779 | |
780 | |
781 # plot tag cross-correlation | |
782 t.plotcc <- function(ac, lab=c(10,5,7), ylab="correlation", xlab="lag", pch=19, grid.i=c(-5:5), grid.s=10, type='b', plot.grid=F, cols=c(1,2,4,"orange",8,"pink"), min.peak.x=NULL, xlim=NULL, plot.147=F, plot.max=T, rmw=1, rescale=F, legendx="right", ltys=rep(1,length(ac)), ...) { | |
783 if(is.list(ac)) { | |
784 cols <- cols[1:length(ac)]; | |
785 | |
786 if(!is.null(xlim)) { | |
787 vx <- as.numeric(names(ac[[1]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]); | |
788 ac[[1]] <- (ac[[1]])[vx]; | |
789 } else { | |
790 xlim <- range(as.numeric(names(ac[[1]]))); | |
791 } | |
792 | |
793 | |
794 plot(as.numeric(names(ac[[1]])),runmean(ac[[1]],rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, col=cols[1], xlim=xlim, lty=ltys[1], ...); | |
795 if(length(ac)>1) { | |
796 for(i in seq(2,length(ac))) { | |
797 irng <- range(ac[[i]]); | |
798 vx <- as.numeric(names(ac[[i]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]); | |
799 if(rescale) { | |
800 lines(as.numeric(names(ac[[i]])[vx]),runmean((ac[[i]][vx]-irng[1])/diff(irng)*diff(range(ac[[1]]))+min(ac[[1]]),rmw),col=cols[i],lty=ltys[i]); | |
801 } else { | |
802 lines(as.numeric(names(ac[[i]]))[vx],runmean(ac[[i]][vx],rmw),col=cols[i],lty=ltys[i]); | |
803 } | |
804 } | |
805 } | |
806 if(is.null(min.peak.x)) { | |
807 m <- as.numeric(names(ac[[1]])[which.max(ac[[1]])]); | |
808 } else { | |
809 sac <- (ac[[1]])[which(as.numeric(names(ac[[1]]))>min.peak.x)] | |
810 m <- as.numeric(names(sac)[which.max(sac)]); | |
811 } | |
812 legend(x="topright",bty="n",legend=c(names(ac)),col=cols,lty=ltys) | |
813 } else { | |
814 if(!is.null(xlim)) { | |
815 vx <- as.numeric(names(ac)); | |
816 vx <- which(vx>=xlim[1] & vx<=xlim[2]); | |
817 ac <- ac[vx]; | |
818 } else { | |
819 xlim <- range(as.numeric(names(ac))); | |
820 } | |
821 | |
822 plot(names(ac),runmean(ac,rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, xlim=xlim, ...); | |
823 if(is.null(min.peak.x)) { | |
824 m <- as.numeric(names(ac)[which.max(ac)]); | |
825 } else { | |
826 sac <- ac[which(names(ac)>min.peak.x)] | |
827 m <- as.numeric(names(sac)[which.max(sac)]); | |
828 } | |
829 } | |
830 if(plot.147) { | |
831 abline(v=147,lty=2,col=8); | |
832 } | |
833 if(plot.grid) { | |
834 abline(v=m+grid.i*grid.s,lty=3,col="pink"); | |
835 } | |
836 if(plot.max) { | |
837 abline(v=m,lty=2,col=2); | |
838 legend(x=legendx,bty="n",legend=c(paste("max at ",m,"bp",sep=""))); | |
839 return(m); | |
840 } | |
841 } | |
842 | |
843 # plot chromosome-acerage cross-correlation | |
844 t.plotavcc <- function(ci, main=paste(ci,"chromosome average"), ccl=tl.cc, return.ac=F, ttl=tl, plot=T, ... ) { | |
845 cc <- ccl[[ci]]; | |
846 if(length(cc)==1) { return(cc[[1]]) }; | |
847 if(length(cc)==0) { return(c()) }; | |
848 ac <- do.call(rbind,cc); | |
849 # omit NA chromosomes | |
850 ina <- apply(ac,1,function(d) any(is.na(d))); | |
851 | |
852 tags <- ttl[[ci]]; | |
853 avw <- unlist(lapply(tags,length)); avw <- avw/sum(avw); | |
854 ac <- ac[!ina,]; avw <- avw[!ina]; | |
855 ac <- apply(ac,2,function(x) sum(x*avw)); | |
856 if(plot) { | |
857 m <- t.plotcc(ac, main=main, ...); | |
858 if(!return.ac) { return(m) } | |
859 } | |
860 if(return.ac) { return(ac) } | |
861 } | |
862 | |
863 t.plotchrcc <- function(ci,ncol=4, ccl=tl.cc, ... ) { | |
864 cc <- ccl[[ci]]; | |
865 ac <- do.call(rbind,cc); | |
866 par(mfrow = c(length(cc)/ncol,ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8) | |
867 lapply(names(cc),function(ch) { t.plotcc(cc[[ch]],main=paste(ci,": chr",ch,sep=""), ...) }) | |
868 } | |
869 | |
870 t.plotavccl <- function(ci, ccl=tl.ccl, main=paste(ci,"chromosome average"), rtl=tl, ... ) { | |
871 #cc <- lapply(ccl[[ci]],function(x) { if(!is.null(x$M)) { x$M <- NULL;}; return(x); }); | |
872 cc <- ccl[[ci]]; | |
873 chrs <- names(cc[[1]]); names(chrs) <- chrs; | |
874 acl <- lapply(cc,function(x) do.call(rbind,x)); | |
875 tags <- rtl[[ci]][chrs]; | |
876 avw <- unlist(lapply(tags,length)); avw <- avw/sum(avw); | |
877 acl <- lapply(acl,function(ac) apply(ac,2,function(x) sum(x*avw))) | |
878 t.plotcc(acl, main=main, ...); | |
879 } | |
880 | |
881 t.plotchrccl <- function(ci,ccl=tl.ccl,ncol=4, ... ) { | |
882 par(mfrow = c(length(cc[[1]])/ncol,ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8) | |
883 lapply(names(cc[[1]]),function(ch) { t.plotcc(lapply(cc,function(x) x[[ch]]),main=paste(ci,": chr",ch,sep=""), ...) }) | |
884 } | |
885 | |
886 | |
887 | |
888 show.scc <- function(tl,srange,cluster=NULL) { | |
889 if(!is.null(cluster)) { | |
890 cc <- clusterApplyLB(cluster,tl,tag.scc,srange=srange); | |
891 names(cc) <- names(tl); | |
892 } else { | |
893 cc <- lapply(tl,tag.scc,srange=srange); | |
894 } | |
895 par(mfrow = c(1,1), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8); | |
896 ccl<-list(sample=cc); | |
897 ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,xlim=srange,return.ac=F,ttl=list(sample=tl),main="")[[1]] | |
898 } | |
899 | |
900 # find regions of significant tag enrichment | |
901 find.significantly.enriched.regions <- function(signal.data,control.data,window.size=500,multiplier=1,z.thr=3,mcs=0,debug=F,background.density.scaling=T,masking.window.size=window.size,poisson.z=0,poisson.ratio=4,either=F,tag.shift=146/2,bg.weight=NULL) { | |
902 if(is.null(bg.weight)) { | |
903 bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling); | |
904 } | |
905 | |
906 if(debug) { | |
907 cat("bg.weight=",bg.weight,"\n"); | |
908 } | |
909 chrl <- names(signal.data); names(chrl) <- chrl; | |
910 tec <- lapply(chrl,function(chr) { | |
911 d <- tag.enrichment.clusters(signal.data[[chr]],control.data[[chr]],bg.weight=bg.weight*multiplier,thr=z.thr,wsize=window.size,mcs=mcs,min.tag.count.z=poisson.z,min.tag.count.ratio=poisson.ratio,either=either,tag.shift=tag.shift); | |
912 d$s <- d$s-masking.window.size/2; d$e <- d$e+masking.window.size/2; | |
913 return(d); | |
914 }) | |
915 } | |
916 | |
917 | |
918 # given tag position vectors, find contigs of significant enrichment of signal over background | |
919 # thr - z score threshold | |
920 # mcs - minimal cluster size | |
921 # bg.weight - fraction by which background counts should be multipled | |
922 # min.tag.count.z will impose a poisson constraint based on randomized signal in parallel of background constaint (0 - no constraint) | |
923 tag.enrichment.clusters <- function(signal,background,wsize=200,thr=3,mcs=1,bg.weight=1,min.tag.count.z=0,tag.av.den=NULL,min.tag.count.thr=0,min.tag.count.ratio=4,either=F,tag.shift=146/2) { | |
924 if(is.null(tag.av.den)) { | |
925 tag.av.den <- length(signal)/diff(range(abs(signal))); | |
926 } | |
927 if(min.tag.count.z>0) { | |
928 min.tag.count.thr <- qpois(pnorm(min.tag.count.z,lower.tail=F),min.tag.count.ratio*tag.av.den*wsize,lower.tail=F) | |
929 } else { | |
930 min.tag.count.thr <- 0; | |
931 } | |
932 | |
933 #if(bg.weight!=1) { | |
934 # background <- sample(background,length(background)*(bg.weight),replace=T); | |
935 #} | |
936 # make up combined position, flag vectors | |
937 pv <- abs(c(signal,background)+tag.shift); | |
938 fv <- c(rep(1,length(signal)),rep(0,length(background))); | |
939 po <- order(pv); | |
940 pv <- pv[po]; | |
941 fv <- fv[po]; | |
942 | |
943 #thr <- pnorm(thr,lower.tail=F); | |
944 | |
945 storage.mode(wsize) <- storage.mode(mcs) <- storage.mode(fv) <- "integer"; | |
946 storage.mode(thr) <- storage.mode(pv) <- "double"; | |
947 storage.mode(bg.weight) <- "double"; | |
948 storage.mode(min.tag.count.thr) <- "double"; | |
949 either <- as.integer(either); | |
950 storage.mode(either) <- "integer"; | |
951 | |
952 z <- .Call("find_poisson_enrichment_clusters",pv,fv,wsize,thr,mcs,bg.weight,min.tag.count.thr,either) | |
953 return(z); | |
954 } | |
955 | |
956 | |
957 | |
958 | |
959 | |
960 # estimates threshold, calculates predictions on complete data and randomized data | |
961 # input: tvl | |
962 # control - a list of control tag datasets | |
963 # no randomization is done if control is supplied | |
964 # return.rtp - return randomized tag peaks - do not fit thresholds or do actual predictions | |
965 # topN - use min threshold to do a run, return topN peaks from entire genome | |
966 # threshold - specify a user-defined threshold | |
967 lwcc.prediction <- function(tvl,e.value=NULL, fdr=0.01, chrl=names(tvl), min.thr=0, n.randomizations=1, shuffle.window=1, debug=T, predict.on.random=F, shuffle.both.strands=T,strand.shuffle.only=F, return.rtp=F, control=NULL, print.level=0, threshold=NULL, topN=NULL, bg.tl=NULL, tec.filter=T, tec.window.size=1e3,tec.z=3, tec.masking.window.size=tec.window.size, tec.poisson.z=3,tec.poisson.ratio=4, bg.reverse=T, return.control.predictions=F, return.core.data=F, background.density.scaling=T, ... ) { | |
968 | |
969 control.predictions <- NULL; | |
970 core.data <- list(); | |
971 | |
972 if(!is.null(bg.tl) & tec.filter) { | |
973 if(debug) { cat("finding background exclusion regions ... "); } | |
974 tec <- find.significantly.enriched.regions(bg.tl,tvl,window.size=tec.window.size,z.thr=tec.z,masking.window.size=tec.masking.window.size,poisson.z=tec.poisson.z,poisson.ratio=tec.poisson.ratio,background.density.scaling=background.density.scaling,either=T); | |
975 if(return.core.data) { | |
976 core.data <- c(core.data,list(tec=tec)); | |
977 } | |
978 if(debug) { cat("done\n"); } | |
979 } | |
980 | |
981 | |
982 if(is.null(threshold) & is.null(topN)) { # threshold determination is needed | |
983 # generate control predictions | |
984 if(!is.null(control)) { | |
985 if(debug) { cat("determining peaks on provided",length(control),"control datasets:\n"); } | |
986 if(!is.null(bg.tl)) { | |
987 if(bg.reverse) { | |
988 if(debug) { cat("using reversed signal for FDR calculations\n"); } | |
989 rbg.tl <- tvl; | |
990 } else { | |
991 if(debug) { cat("generating randomized (within chromosome) background ... "); } | |
992 rbg.tl <- lapply(bg.tl,function(d) { | |
993 if(length(d)<2) { return(d); } | |
994 rng <- range(abs(d)); | |
995 rd <- round(runif(length(d),rng[1],rng[2])); | |
996 nrd <- sample(1:length(rd),length(which(d<0))); | |
997 rd[nrd] <- rd[nrd]*(-1); | |
998 return(rd); | |
999 }) | |
1000 if(debug) { cat("done\n"); } | |
1001 } | |
1002 } else { | |
1003 rbg.tl <- NULL; | |
1004 } | |
1005 n.randomizations <- length(control); | |
1006 #signal.size <- sum(unlist(lapply(tvl,length))); | |
1007 rtp <- lapply(control,function(d) { | |
1008 # calculate tag.weight | |
1009 #tag.weight <- sum(unlist(lapply(tvl,length)))/sum(unlist(lapply(d,length))); | |
1010 tag.weight <- dataset.density.ratio(tvl,d,background.density.scaling=background.density.scaling); | |
1011 #cat("tag.weight=",tag.weight," "); | |
1012 return(window.call.mirror.binding(d,min.thr=min.thr, tag.weight=tag.weight,bg.tl=rbg.tl, debug=debug, round.up=T,background.density.scaling=background.density.scaling, ...)); | |
1013 #return(window.call.mirror.binding(d,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster)) | |
1014 }); | |
1015 if(return.core.data) { | |
1016 core.data <- c(core.data,list(rtp.unfiltered=rtp)); | |
1017 } | |
1018 if(tec.filter) { | |
1019 if(debug) { cat("excluding systematic background anomalies ... "); } | |
1020 rtp <- lapply(rtp,filter.binding.sites,tec,exclude=T); | |
1021 if(debug) { cat("done\n"); } | |
1022 } | |
1023 } else { | |
1024 if(debug) { cat("determining peaks on ",n.randomizations,"randomized datasets:\n"); } | |
1025 rtp <- lapply(1:n.randomizations,function(i) { | |
1026 rd <- generate.randomized.data(tvl,shuffle.window=shuffle.window,shuffle.both.strands=shuffle.both.strands,strand.shuffle.only=strand.shuffle.only); | |
1027 return(window.call.mirror.binding(rd,min.thr=min.thr,bg.tl=bg.tl, debug=debug, ...)); | |
1028 #return(window.call.mirror.binding(rd,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist)) | |
1029 }); | |
1030 } | |
1031 if(return.control.predictions) { | |
1032 control.predictions <- rtp; | |
1033 } | |
1034 rtp <- do.call(rbind,lapply(rtp,function(d) do.call(rbind,d))); # merge tables | |
1035 | |
1036 # generate real data predictions | |
1037 if(debug) { cat("determining peaks on real data:\n"); } | |
1038 npl <- window.call.mirror.binding(tvl,min.thr=min.thr,bg.tl=bg.tl, debug=debug, background.density.scaling=background.density.scaling, ...); | |
1039 #npl <- window.call.mirror.binding(tvl,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster); | |
1040 if(return.core.data) { | |
1041 core.data <- c(core.data,list(npl.unfiltered=npl)); | |
1042 } | |
1043 | |
1044 if(!is.null(bg.tl) & tec.filter) { | |
1045 if(debug) { cat("excluding systematic background anomalies ... "); } | |
1046 npl <- filter.binding.sites(npl,tec,exclude=T); | |
1047 if(debug) { cat("done\n"); } | |
1048 } | |
1049 | |
1050 # calculate E-value and FDRs for all of the peaks | |
1051 if(debug) { cat("calculating statistical thresholds\n"); } | |
1052 chrl <- names(npl); names(chrl) <- chrl; | |
1053 npld <- do.call(rbind,lapply(names(npl),function(chr) { k <- npl[[chr]]; if(!is.null(k) & dim(k)[1]>0) { k$chr <- rep(chr,dim(k)[1]) }; return(k) })) | |
1054 npld <- cbind(npld,get.eval.fdr.vectors(npld$y,rtp$y)); | |
1055 # correct for n.randomizations | |
1056 npld$fdr <- npld$fdr/n.randomizations; | |
1057 npld$evalue <- npld$evalue/n.randomizations; | |
1058 | |
1059 if(return.core.data) { | |
1060 core.data <- c(core.data,list(npld=npld)); | |
1061 } | |
1062 | |
1063 # determine actual thresholds | |
1064 if(is.null(e.value)) { | |
1065 if(is.null(fdr)) { fdr <- 0.01; } | |
1066 thr <- list(root=min(npld$y[npld$fdr<=fdr]),type="FDR",fdr=fdr) | |
1067 if(debug) { cat("FDR",fdr,"threshold=",thr$root,"\n"); } | |
1068 } else { | |
1069 # determine threshold based on e-value | |
1070 thr <- list(root=min(npld$y[npld$evalue<=e.value]),type="Evalue",e.value=e.value) | |
1071 if(debug) { cat("E-value",e.value,"threshold=",thr$root,"\n"); } | |
1072 } | |
1073 | |
1074 | |
1075 npld <- npld[npld$y>=thr$root,]; | |
1076 if(dim(npld)[1]>0) { | |
1077 npl <- tapply(c(1:dim(npld)[1]),as.factor(npld$chr),function(ii) {df <- npld[ii,]; df$chr <- NULL; return(df) }); | |
1078 } else { | |
1079 npl <- list(); | |
1080 } | |
1081 } else { | |
1082 if(is.null(threshold)) { | |
1083 thr <- list(root=min.thr,type="minimal"); | |
1084 } else { | |
1085 thr <- list(root=threshold,type="user specified"); | |
1086 } | |
1087 | |
1088 cat("calling binding positions using",thr$type,"threshold (",thr$root,") :\n"); | |
1089 npl <- window.call.mirror.binding(tvl=tvl,min.thr=thr$root,bg.tl=bg.tl, debug=debug, ...); | |
1090 if(!is.null(bg.tl) & tec.filter) { | |
1091 if(debug) { cat("excluding systematic background anomalies ... "); } | |
1092 npl <- filter.binding.sites(npl,tec,exclude=T); | |
1093 if(debug) { cat("done\n"); } | |
1094 } | |
1095 | |
1096 if(!is.null(topN)) { | |
1097 # determine threshold based on topN peaks | |
1098 ay <- unlist(lapply(npl,function(d) d$y)); | |
1099 if(length(ay)>topN) { | |
1100 thr <- list(root=sort(ay,decreasing=T)[topN],type="topN",topN=topN); | |
1101 cat(paste("determined topN threshold :",thr$root,"\n")); | |
1102 npl <- lapply(npl,function(d) d[d$y>thr$root,]); | |
1103 } | |
1104 } | |
1105 } | |
1106 | |
1107 if(return.core.data) { | |
1108 return(c(list(npl=npl,thr=thr),core.data)); | |
1109 } | |
1110 if(return.control.predictions & !is.null(control.predictions)) { | |
1111 return(list(npl=npl,thr=thr,control.predictions=control.predictions)); | |
1112 } | |
1113 return(list(npl=npl,thr=thr)); | |
1114 } | |
1115 | |
1116 # window tag difference method | |
1117 wtd <- function(x,y,s,e,whs=200,return.peaks=T,min.thr=5,min.dist=200,step=1,direct.count=F,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=1,mask.x=NULL,mask.y=NULL,ignore.masking=F, bg.whs=whs, round.up=F, ...) { | |
1118 ignore.masking <- ignore.masking | (is.null(mask.x) & is.null(mask.y)); | |
1119 if(step>1) { | |
1120 x <- floor(x/step+0.5); y <- floor(y/step+0.5) | |
1121 | |
1122 if(!is.null(bg.x)) { | |
1123 bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) | |
1124 } | |
1125 | |
1126 if(!is.null(mask.x)) { | |
1127 mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) | |
1128 } | |
1129 | |
1130 | |
1131 whs <- floor(whs/step+0.5); | |
1132 bg.whs <- floor(bg.whs/step+0.5); | |
1133 min.dist <- floor(min.dist/step +0.5); | |
1134 s <- floor(s/step+0.5) | |
1135 e <- floor(e/step+0.5) | |
1136 } | |
1137 | |
1138 # scale bg.weight, since within calculation they are considered independent | |
1139 bg.weight <- bg.weight*tag.weight; | |
1140 | |
1141 rx <- c(s-whs,e+whs); | |
1142 | |
1143 # compile tag vectors | |
1144 xt <- table(x); | |
1145 xh <- integer(diff(rx)+1); | |
1146 xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); | |
1147 | |
1148 yt <- table(y); | |
1149 yh <- integer(diff(rx)+1); | |
1150 yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); | |
1151 | |
1152 # compile background vectors | |
1153 if(!is.null(bg.x) & length(bg.x)>0) { | |
1154 bg.subtract <- 1; | |
1155 | |
1156 bg.xt <- table(bg.x); | |
1157 bg.xh <- integer(diff(rx)+1); | |
1158 bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt); | |
1159 rm(bg.xt); | |
1160 | |
1161 bg.yt <- table(bg.y); | |
1162 bg.yh <- integer(diff(rx)+1); | |
1163 bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); | |
1164 rm(bg.yt); | |
1165 | |
1166 # adjust bg.weight according to bg.whs | |
1167 if(bg.whs!=whs) { | |
1168 bg.weight <- bg.weight*whs/bg.whs; | |
1169 } | |
1170 } else { | |
1171 bg.subtract <- 0; | |
1172 bg.xh <- bg.yh <- c(); | |
1173 } | |
1174 | |
1175 # record masked positions | |
1176 if(!ignore.masking) { | |
1177 if(!is.null(mask.x) & length(mask.x)>0) { | |
1178 mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); | |
1179 mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; | |
1180 xh[mvx-rx[1]+1] <- -1; | |
1181 } | |
1182 | |
1183 if(!is.null(mask.y) & length(mask.y)>0) { | |
1184 mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); | |
1185 mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; | |
1186 yh[mvy-rx[1]+1] <- -1; | |
1187 } | |
1188 } | |
1189 | |
1190 rm(xt,yt); | |
1191 | |
1192 if(round.up) { round.up <- 1; } else { round.up <- 0; } | |
1193 | |
1194 storage.mode(xh) <- storage.mode(yh) <- "integer"; | |
1195 storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; | |
1196 nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(bg.whs) <- "integer"; | |
1197 rp <- as.integer(return.peaks); | |
1198 dcon <- as.integer(direct.count); | |
1199 storage.mode(rp) <- storage.mode(min.dist) <- "integer"; | |
1200 storage.mode(min.thr) <- "double"; | |
1201 storage.mode(dcon) <- "integer"; | |
1202 storage.mode(tag.weight) <- "double"; | |
1203 storage.mode(bg.weight) <- "double"; | |
1204 storage.mode(bg.subtract) <- "integer"; | |
1205 storage.mode(round.up) <- "integer"; | |
1206 im <- as.integer(ignore.masking); | |
1207 storage.mode(im) <- "integer"; | |
1208 z <- .Call("wtd",xh,yh,whs,rp,min.dist,min.thr,dcon,tag.weight,im,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up); | |
1209 if(return.peaks) { | |
1210 return(data.frame(x=(z$x+rx[1])*step,y=z$v)); | |
1211 } else { | |
1212 return(list(x=rx*step,y=z)); | |
1213 } | |
1214 } | |
1215 | |
1216 | |
1217 tag.wtd <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) { | |
1218 x <- ctv[ctv>=s & ctv<=e]; | |
1219 y <- (-1)*ctv[ctv<=-s & ctv>=-e]; | |
1220 | |
1221 if(!is.null(bg.ctv)) { | |
1222 bg.x <- bg.ctv[bg.ctv>=s & bg.ctv<=e]; | |
1223 bg.y <- (-1)*bg.ctv[bg.ctv<=-s & bg.ctv>=-e]; | |
1224 } else { | |
1225 bg.x <- bg.y <- NULL; | |
1226 } | |
1227 | |
1228 if(!is.null(mask.ctv)) { | |
1229 mask.x <- mask.ctv[mask.ctv>=s & mask.ctv<=e]; | |
1230 mask.y <- (-1)*mask.ctv[mask.ctv<=-s & mask.ctv>=-e]; | |
1231 } else { | |
1232 mask.x <- mask.y <- NULL; | |
1233 } | |
1234 | |
1235 if(length(x)==0 | length(y) ==0) { | |
1236 if(return.peaks) { | |
1237 return(data.frame(x=c(),y=c())); | |
1238 } else { | |
1239 rx <- range(c(x,y)); | |
1240 return(list(x=rx,y=numeric(diff(rx)+1))); | |
1241 } | |
1242 } else { | |
1243 return(wtd(x,y,s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...)) | |
1244 } | |
1245 } | |
1246 | |
1247 # shuffles tags in chromosome blocks of a specified size | |
1248 # note: all coordinates should be positive | |
1249 tag.block.shuffle <- function(tags,window.size=100) { | |
1250 if(length(tags)<3) { | |
1251 warning("too few tags for shuffling"); | |
1252 return(tags); | |
1253 } | |
1254 rng <- range(tags); | |
1255 #if(rng[1]<0) { stop("negative tag coordinates found") } | |
1256 if(diff(rng)<=window.size) { | |
1257 warning(paste("tag range (",diff(rng),") is smaller than shuffle window size")); | |
1258 return(tags); | |
1259 } | |
1260 | |
1261 if(window.size==0) { | |
1262 return(as.integer(runif(length(tags),min=rng[1],max=rng[2]))) | |
1263 } else if(window.size==1) { | |
1264 tt <- table(tags); | |
1265 return(rep(runif(length(tt),min=rng[1],max=rng[2]),as.integer(tt))) | |
1266 } else { | |
1267 # block positions | |
1268 bp <- tags %/% window.size; | |
1269 # block-relative tag positions | |
1270 rp <- tags %% window.size; | |
1271 | |
1272 # shuffle block positions | |
1273 bpu <- unique(bp); | |
1274 rbp <- range(bpu); | |
1275 bps <- as.integer(runif(length(bpu),min=rbp[1],max=rbp[2])); | |
1276 bpi <- match(bp,bpu); | |
1277 sbp <- bps[bpi]; | |
1278 #sbp <- rbp[1]+match(bp,sample(rbp[1]:rbp[2])) | |
1279 return(sbp*window.size+rp); | |
1280 } | |
1281 } | |
1282 | |
1283 | |
1284 # calculate window cross-correlation | |
1285 lwcc <- function(x,y,s,e,whs=100,isize=20,return.peaks=T,min.thr=1,min.dist=100,step=1,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=NULL,mask.x=NULL,mask.y=NULL,bg.whs=whs,round.up=F) { | |
1286 if(step>1) { | |
1287 x <- floor(x/step+0.5); y <- floor(y/step+0.5) | |
1288 | |
1289 if(!is.null(bg.x)) { | |
1290 bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) | |
1291 } | |
1292 | |
1293 if(!is.null(mask.x)) { | |
1294 mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) | |
1295 } | |
1296 | |
1297 whs <- floor(whs/step+0.5); | |
1298 bg.whs <- floor(bg.whs/step+0.5); | |
1299 isize <- floor(isize/step+0.5); | |
1300 min.dist <- floor(min.dist/step +0.5); | |
1301 s <- floor(s/step+0.5) | |
1302 e <- floor(e/step+0.5) | |
1303 } | |
1304 | |
1305 # scale bg.weight, since within calculation they are considered independent | |
1306 bg.weight <- bg.weight*tag.weight; | |
1307 | |
1308 | |
1309 rx <- c(s-whs,e+whs); | |
1310 xt <- table(x); | |
1311 xh <- integer(diff(rx)+1); | |
1312 xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); | |
1313 | |
1314 yt <- table(y); | |
1315 | |
1316 yh <- integer(diff(rx)+1); | |
1317 yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); | |
1318 | |
1319 # compile background vectors | |
1320 if(!is.null(bg.x) & length(bg.x)>0) { | |
1321 bg.subtract <- 1; | |
1322 | |
1323 bg.xt <- table(bg.x); | |
1324 bg.xh <- integer(diff(rx)+1); | |
1325 bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt); | |
1326 rm(bg.xt); | |
1327 | |
1328 bg.yt <- table(bg.y); | |
1329 bg.yh <- integer(diff(rx)+1); | |
1330 bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); | |
1331 rm(bg.yt); | |
1332 | |
1333 # adjust bg.weight according to bg.whs | |
1334 bg.weight <- bg.weight*(whs-isize)/bg.whs; | |
1335 } else { | |
1336 bg.subtract <- 0; | |
1337 bg.xh <- bg.yh <- c(); | |
1338 } | |
1339 | |
1340 # record masked positions | |
1341 if(!is.null(mask.x) & length(mask.x)>0) { | |
1342 mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); | |
1343 mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; | |
1344 | |
1345 xh[mvx-rx[1]+1] <- -1; | |
1346 } | |
1347 | |
1348 if(!is.null(mask.y) & length(mask.y)>0) { | |
1349 mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); | |
1350 mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; | |
1351 yh[mvy-rx[1]+1] <- -1; | |
1352 } | |
1353 | |
1354 rm(xt,yt); | |
1355 if(round.up) { round.up <- 1; } else { round.up <- 0; } | |
1356 | |
1357 storage.mode(xh) <- storage.mode(yh) <- "integer"; | |
1358 storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; | |
1359 nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(isize) <- storage.mode(bg.whs) <- "integer"; | |
1360 rp <- as.integer(return.peaks); | |
1361 storage.mode(rp) <- storage.mode(min.dist) <- "integer"; | |
1362 storage.mode(min.thr) <- "double"; | |
1363 storage.mode(tag.weight) <- "double"; | |
1364 storage.mode(bg.weight) <- "double"; | |
1365 storage.mode(bg.subtract) <- "integer"; | |
1366 storage.mode(round.up) <- "integer"; | |
1367 | |
1368 # allocate return arrays | |
1369 #cc <- numeric(nx); storage.mode(cc) <- "double"; | |
1370 z <- .Call("lwcc",xh,yh,whs,isize,rp,min.dist,min.thr,tag.weight,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up); | |
1371 if(return.peaks) { | |
1372 return(data.frame(x=(z$x+rx[1])*step,y=z$v)); | |
1373 } else { | |
1374 return(list(x=rx*step,y=z)); | |
1375 } | |
1376 } | |
1377 | |
1378 | |
1379 tag.lwcc <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) { | |
1380 x <- ctv[ctv>=s & ctv<=e]; | |
1381 y <- (-1)*ctv[ctv<=-s & ctv>=-e]; | |
1382 | |
1383 if(!is.null(bg.ctv)) { | |
1384 bg.x <- bg.ctv[bg.ctv>=s & bg.ctv<=e]; | |
1385 bg.y <- (-1)*bg.ctv[bg.ctv<=-s & bg.ctv>=-e]; | |
1386 } else { | |
1387 bg.x <- bg.y <- NULL; | |
1388 } | |
1389 | |
1390 if(!is.null(mask.ctv)) { | |
1391 mask.x <- mask.ctv[mask.ctv>=s & mask.ctv<=e]; | |
1392 mask.y <- (-1)*mask.ctv[mask.ctv<=-s & mask.ctv>=-e]; | |
1393 } else { | |
1394 mask.x <- mask.y <- NULL; | |
1395 } | |
1396 | |
1397 if(length(x)==0 | length(y) ==0) { | |
1398 if(return.peaks) { | |
1399 return(data.frame(x=c(),y=c())); | |
1400 } else { | |
1401 rx <- range(c(x,y)); | |
1402 return(list(x=rx,y=numeric(diff(rx)+1))); | |
1403 } | |
1404 } else { | |
1405 return(lwcc(x,y, s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...)) | |
1406 } | |
1407 } | |
1408 | |
1409 # determine mirror-based binding positions using sliding window along each chromosome | |
1410 # extra parameters are passed on to call.nucleosomes() | |
1411 window.call.mirror.binding <- function(tvl,window.size=4e7, debug=T, cluster=NULL, bg.tl=NULL, mask.tl=NULL, background.density.scaling=T, ...) { | |
1412 chrl <- names(tvl); | |
1413 # determine bg.weight | |
1414 if(!is.null(bg.tl)) { | |
1415 bg.weight <- dataset.density.ratio(tvl,bg.tl,background.density.scaling=background.density.scaling); | |
1416 } else { | |
1417 bg.weight <- NULL; | |
1418 } | |
1419 if(debug) { | |
1420 cat("bg.weight=",bg.weight," "); | |
1421 } | |
1422 | |
1423 names(chrl) <- chrl; | |
1424 | |
1425 if(is.null(cluster)) { | |
1426 return(lapply(chrl,function(chr) { | |
1427 bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; | |
1428 mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; | |
1429 | |
1430 window.chr.call.mirror.binding(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv),window.size=window.size,chr=chr,debug=debug, bg.weight=bg.weight, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ...); | |
1431 })); | |
1432 } else { | |
1433 # add bg.ctv and mask.ctv to parallel call | |
1434 tvll <- lapply(chrl,function(chr) { | |
1435 bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; | |
1436 mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; | |
1437 return(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv)) | |
1438 }); | |
1439 bl <- clusterApplyLB(cluster,tvll,window.chr.call.mirror.binding,window.size=window.size,debug=debug, bg.weight=bg.weight, ...); | |
1440 names(bl) <- chrl; | |
1441 return(bl); | |
1442 } | |
1443 } | |
1444 | |
1445 window.chr.call.mirror.binding <- function(ctvl,window.size,debug=T, chr="NA", cluster=NULL, method=tag.wtd, bg.ctv=NULL, mask.ctv=NULL, ...) { | |
1446 ctv <- ctvl$ctv; bg.ctv <- ctvl$bg.ctv; mask.ctv <- ctvl$mask.ctv; | |
1447 if(is.null(ctv)) { return(data.frame(x=c(),y=c())) } | |
1448 if(length(ctv)<2) { return(data.frame(x=c(),y=c())) } | |
1449 | |
1450 dr <- range(unlist(lapply(ctv,function(x) range(abs(x))))) | |
1451 n.windows <- ceiling(diff(dr)/window.size); | |
1452 | |
1453 | |
1454 pinfo <- c(); | |
1455 if(debug) { | |
1456 cat(paste("processing ",chr," in ",n.windows," steps [",sep="")); | |
1457 } | |
1458 for(i in 1:n.windows) { | |
1459 s <- dr[1]+(i-1)*window.size; | |
1460 npn <- method(s=s, e=s+window.size,ctv=ctv, return.peaks=T, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ... ); | |
1461 if(length(npn) > 0) { pinfo <- rbind(pinfo,npn) } | |
1462 if(debug) { | |
1463 cat("."); | |
1464 } | |
1465 } | |
1466 if(debug) { | |
1467 cat(paste("] done (",dim(pinfo)[1],"positions)\n")); | |
1468 } else { | |
1469 cat("."); | |
1470 } | |
1471 return(data.frame(x=pinfo[,1],y=pinfo[,2])); | |
1472 } | |
1473 | |
1474 generate.randomized.data <- function(data,shuffle.window=1,shuffle.both.strands=T,strand.shuffle.only=F,chrl=names(data)) { | |
1475 names(chrl) <- unlist(chrl); | |
1476 if(strand.shuffle.only) { | |
1477 # shuffle just strand assignment, not tag positions | |
1478 rt <- lapply(data[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T)); | |
1479 } else { | |
1480 if(shuffle.both.strands) { | |
1481 rt <- lapply(data[unlist(chrl)],function(tv) { | |
1482 pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tag.block.shuffle(tv[-pti],window.size=shuffle.window))) | |
1483 }); | |
1484 } else { | |
1485 rt <- lapply(data[unlist(chrl)],function(tv) { pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tv[-pti]))}); | |
1486 } | |
1487 } | |
1488 } | |
1489 | |
1490 # determine threshold based on E value | |
1491 # for efficiency chrl should include just one or two small chromosomes | |
1492 # optional parameters are passed to call.nucleosomes() | |
1493 determine.lwcc.threshold <- function(tvl,chrl=names(tvl),e.value=100, n.randomizations=1, min.thr=1, debug=F, tol=1e-2, shuffle.window=1, shuffle.both.strands=T, return.rtp=F, control=NULL, strand.shuffle=F, ...) { | |
1494 names(chrl) <- unlist(chrl); | |
1495 | |
1496 # determine fraction of total tags contained in the specified nucleosomes | |
1497 ntags <- sum(unlist(lapply(tvl,function(cv) length(cv)))); | |
1498 nctags <- sum(unlist(lapply(chrl, function(cn) length(tvl[[cn]])))); | |
1499 # calculate actual target E value | |
1500 if(!is.null(control)) { | |
1501 n.randomizations <- length(control); | |
1502 } | |
1503 eval <- e.value*n.randomizations*nctags/ntags | |
1504 if(eval<1) { | |
1505 warning("specified e.value and set of chromosomes results in target e.value of less than 1"); | |
1506 eval <- 1; | |
1507 } | |
1508 | |
1509 if(debug) { | |
1510 cat(paste("randomizations =",n.randomizations," chromosomes =",length(chrl),"\n")) | |
1511 cat(paste("adjusted target eval =",eval,"\ngenerating randomized tag peaks ...")); | |
1512 } | |
1513 | |
1514 # get peaks on randomized tags | |
1515 if(is.null(control)) { | |
1516 rtp <- data.frame(do.call(rbind,lapply(1:n.randomizations,function(i) { | |
1517 if(strand.shuffle) { | |
1518 # shuffle just strand assignment, not tag positions | |
1519 rt <- lapply(tvl[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T)); | |
1520 } else { | |
1521 if(shuffle.both.strands) { | |
1522 rt <- lapply(tvl[unlist(chrl)],function(tv) { | |
1523 pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tag.block.shuffle(tv[-pti],window.size=shuffle.window))) | |
1524 }); | |
1525 } else { | |
1526 rt <- lapply(tvl[unlist(chrl)],function(tv) { pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tv[-pti]))}); | |
1527 } | |
1528 } | |
1529 if(debug) { | |
1530 cat("."); | |
1531 } | |
1532 rl <- window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...); | |
1533 | |
1534 return(do.call(rbind,rl)) | |
1535 #return(do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, whs=100,isize=10,window.size=3e7,min.dist=200))) | |
1536 }))); | |
1537 | |
1538 } else { | |
1539 if(debug) { | |
1540 cat(" using provided controls "); | |
1541 } | |
1542 rtp <- data.frame(do.call(rbind,lapply(control,function(rt) do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...))))) | |
1543 } | |
1544 | |
1545 if(return.rtp) { | |
1546 return(rtp) | |
1547 } | |
1548 | |
1549 if(debug) { | |
1550 cat(" done\nfinding threshold ."); | |
1551 } | |
1552 | |
1553 # determine range and starting value | |
1554 rng <- c(min.thr,max(na.omit(rtp$y))) | |
1555 | |
1556 # find E value threshold | |
1557 count.nucs.f <- function(nthr) { | |
1558 return(eval-length(which(rtp$y>=nthr))); | |
1559 } | |
1560 | |
1561 # estimate position of the root by downward bisection iterations | |
1562 mv <- c(eval); mvp <- c(rng[2]); ni <- 1; | |
1563 max.it <- 2*as.integer(log2(rng[2]/rng[1])+0.5); | |
1564 while((ni<=max.it) & (mv[1]>=0)) { | |
1565 np <- mvp[1]/2; | |
1566 npv <- count.nucs.f(np); | |
1567 mv <- c(npv,mv); | |
1568 mvp <- c(np,mvp); | |
1569 ni <- ni+1; | |
1570 } | |
1571 | |
1572 | |
1573 if(ni>max.it) { | |
1574 # determine lowest value | |
1575 if(debug) { | |
1576 cat(paste("exceeded max.it (",max.it,"), returning lowest point",signif(mvp[1],4))); | |
1577 } | |
1578 return(list(root=mvp[1])) | |
1579 } else { | |
1580 rng <- mvp[1:2]; | |
1581 if(mv[2]==0) rng[2] <- mvp[3]; | |
1582 if(debug) { | |
1583 cat(paste("bound to (",signif(rng[1],4),signif(rng[2],4),") ")); | |
1584 } | |
1585 } | |
1586 | |
1587 # find root on the right side | |
1588 x <- uniroot(count.nucs.f,rng,tol=tol); | |
1589 #x$max <- o$par; | |
1590 #x$f.max <- (-1)*o$value; | |
1591 if(debug) { | |
1592 cat(paste(" done (thr=",signif(x$root,4),")\n")); | |
1593 } | |
1594 return(x); | |
1595 | |
1596 } | |
1597 | |
1598 | |
1599 # determine membership of points in fragments | |
1600 points.within <- function(x,fs,fe,return.list=F,return.unique=F,sorted=F,return.point.counts=F) { | |
1601 if(is.null(x) | length(x) < 1) { return(c()) }; | |
1602 if(!sorted) { | |
1603 ox <- rank(x,ties="first"); | |
1604 x <- sort(x); | |
1605 } | |
1606 | |
1607 se <- c(fs,fe); | |
1608 fi <- seq(1:length(fs)); | |
1609 fi <- c(fi,-1*fi); | |
1610 | |
1611 fi <- fi[order(se)]; | |
1612 se <- sort(se); | |
1613 | |
1614 storage.mode(x) <- storage.mode(fi) <- storage.mode(se) <- "integer"; | |
1615 if(return.unique) { iu <- 1; } else { iu <- 0; } | |
1616 if(return.list) { il <- 1; } else { il <- 0; } | |
1617 if(return.point.counts) { rpc <- 1; } else { rpc <- 0; } | |
1618 storage.mode(iu) <- storage.mode(il) <- storage.mode(rpc) <- "integer"; | |
1619 result <- .Call("points_within",x,se,fi,il,iu,rpc); | |
1620 if(!sorted & !return.point.counts) { | |
1621 result <- result[ox]; | |
1622 } | |
1623 return(result); | |
1624 } | |
1625 | |
1626 | |
1627 # determine cooridnates of points x relative to signed | |
1628 # positions pos within size range | |
1629 get.relative.coordinates <- function(x,pos,size,sorted=F) { | |
1630 if(!sorted) { | |
1631 op <- order(abs(pos)); | |
1632 x <- sort(x); pos <- pos[op]; | |
1633 } | |
1634 #dyn.load("~/zhao/sc/peaks.so"); | |
1635 storage.mode(x) <- storage.mode(pos) <- storage.mode(size) <- "integer"; | |
1636 rf <- .Call("get_relative_coordinates",x,pos,size); | |
1637 if(!sorted) { | |
1638 rf$i <- op[rf$i]; | |
1639 } else { | |
1640 return(rf$i); | |
1641 } | |
1642 return(rf); | |
1643 } | |
1644 | |
1645 # given list of magnitude values for signal(x) and control (y), | |
1646 # return a dataframe with $e.val and $fdr | |
1647 get.eval.fdr.vectors <- function(x,y) { | |
1648 nx <- length(x); ny <- length(y); | |
1649 if(nx==0) { return(data.frame(evalue=c(),fdr=c())) } | |
1650 if(ny==0) { return(data.frame(evalue=rep(0,nx),fdr=rep(1,nx))) } | |
1651 ex <- ecdf(x); ey <- ecdf(y); | |
1652 | |
1653 evals <- (1-ey(x))*ny; | |
1654 yvals <- (1-ex(x))*nx; | |
1655 fdr <- (evals+0.5)/(yvals+0.5); # with pseudo-counts | |
1656 fdr[yvals==0] <- min(fdr); # correct for undercounts | |
1657 # find a min x corresponding to a minimal FDR | |
1658 mfdr <- min(fdr); | |
1659 mfdrmx <- min(x[fdr==mfdr]); | |
1660 # correct | |
1661 fdr[x>=mfdrmx] <- mfdr; | |
1662 return(data.frame(evalue=(evals+1),fdr=fdr)); | |
1663 } | |
1664 | |
1665 | |
1666 # filter predictions to remove calls failling into the tag enrichment clusters ( chr list of $s/$e dfs) | |
1667 filter.binding.sites <- function(bd,tec,exclude=F) { | |
1668 chrl <- names(bd); names(chrl) <- chrl; | |
1669 lapply(chrl,function(chr) { | |
1670 cbd <- bd[[chr]]; | |
1671 if(is.null(cbd)) { return(NULL) }; | |
1672 if(length(cbd)==0) { return(NULL) }; | |
1673 if(dim(cbd)[1]>0) { | |
1674 ctec <- tec[[chr]]; | |
1675 if(length(ctec$s)>0) { | |
1676 if(exclude) { | |
1677 pwi <- which(points.within(cbd$x,ctec$s,ctec$e)== -1); | |
1678 } else { | |
1679 pwi <- which(points.within(cbd$x,ctec$s,ctec$e)> -1); | |
1680 } | |
1681 return(cbd[pwi,]); | |
1682 } else { | |
1683 if(exclude) { | |
1684 return(cbd); | |
1685 } else { | |
1686 return(data.frame(x=c(),y=c())); | |
1687 } | |
1688 } | |
1689 } else { | |
1690 return(cbd); | |
1691 } | |
1692 }); | |
1693 } | |
1694 | |
1695 | |
1696 # PUBLIC | |
1697 # generate predictions on sequential (chained) subsamples of data | |
1698 # if step.size <1, it is intepreted as a fraciton and a each subsequent subsample | |
1699 # is of a size (1-fraction.step)*N (N - size of the signal data); | |
1700 # otherwise the step.size is interpreted as a number of tags, and each subsample is of the size N-step.size | |
1701 get.subsample.chain.calls <- function(signal.data,control.data,n.steps=NULL,step.size=1e6,subsample.control=F,debug=F,min.ntags=1e3, excluded.steps=c(), test.chromosomes=NULL, ... ) { | |
1702 | |
1703 if(!is.null(test.chromosomes)) { | |
1704 # adjust step size | |
1705 sz <- sum(unlist(lapply(signal.data,length))) | |
1706 signal.data <- signal.data[test.chromosomes]; | |
1707 control.data <- control.data[test.chromosomes]; | |
1708 | |
1709 if(step.size>1) { | |
1710 step.size <- step.size*sum(unlist(lapply(signal.data,length)))/sz; | |
1711 # cat("adjusted step.size=",step.size,"\n"); | |
1712 } | |
1713 } | |
1714 | |
1715 if(is.null(n.steps)) { | |
1716 if(step.size<1) { | |
1717 # down to 10% | |
1718 n.steps <- log(0.1)/log(step.size); | |
1719 } else { | |
1720 n.steps <- floor(sum(unlist(lapply(signal.data,length)))/step.size) | |
1721 } | |
1722 } | |
1723 if(subsample.control & !is.null(control.data)) { | |
1724 # normalize control to the signal size | |
1725 if(debug) { cat("pre-subsampling control.\n"); } | |
1726 bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) | |
1727 control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight,replace=(bg.weight>1))) | |
1728 } | |
1729 calls <- list(); | |
1730 callnames <- c(); | |
1731 for(i in 0:n.steps) { | |
1732 if(debug) { cat("chained subsample step",i,":\n"); } | |
1733 if(!i %in% excluded.steps) { | |
1734 ans <- list(find.binding.positions(signal.data=signal.data,control.data=control.data,debug=debug, skip.control.normalization=T, ...)); | |
1735 names(ans) <- as.character(c(i)); | |
1736 calls <- c(calls,ans); | |
1737 callnames <- c(callnames,i); | |
1738 } | |
1739 # subsample | |
1740 if(step.size<1) { | |
1741 # fraction steps | |
1742 f <- 1-step.size; | |
1743 } else { | |
1744 # bin steps | |
1745 sz <- sum(unlist(lapply(signal.data,length))); | |
1746 f <- (sz-step.size)/sz; | |
1747 if(f<=0) break; | |
1748 } | |
1749 if(debug) { cat("chained subsampling using fraction",f,".\n"); } | |
1750 signal.data <- lapply(signal.data,function(d) sample(d,length(d)*f)); | |
1751 if(subsample.control & !is.null(control.data)) { | |
1752 control.data <- lapply(control.data,function(d) sample(d,length(d)*f)); | |
1753 } | |
1754 sz <- sum(unlist(lapply(signal.data,length))); | |
1755 if(sz<min.ntags) break; | |
1756 } | |
1757 names(calls) <- callnames; | |
1758 return(calls); | |
1759 } | |
1760 | |
1761 | |
1762 # chain-subsample dataset and calculate MSER interpolation | |
1763 mser.chain.interpolation <- function(signal.data=NULL,control.data=NULL,chains=NULL,n.chains=5,debug=F, enrichment.background.scales=c(1,5), test.agreement=0.99, agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr", return.lists=F, ...) { | |
1764 if(is.null(chains)) { | |
1765 cn <- c(1:n.chains); names(cn) <- cn; | |
1766 tf <- function(i, ...) get.subsample.chain.calls(signal.data,control.data,debug=debug, enrichment.background.scales=enrichment.background.scales, ...); | |
1767 chains <- lapply(cn,tf,...); | |
1768 } | |
1769 names(enrichment.background.scales) <- enrichment.background.scales; | |
1770 lapply(enrichment.background.scales,function(scale) { | |
1771 actual.enr.field <- enr.field; | |
1772 if(scale>1) { | |
1773 actual.enr.field <- paste(actual.enr.field,scale,sep="."); | |
1774 } | |
1775 | |
1776 cvl <- lapply(chains,function(chain) { | |
1777 nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T); | |
1778 nd <- diff(nn); | |
1779 nn <- nn[-length(nn)]; | |
1780 me <- lapply(c(2:length(chain)),function(i) { | |
1781 sla <- t.precalculate.ref.peak.agreement(chain[[i-1]],chain[i],agreement.distance=agreement.distance,enr.field=actual.enr.field) | |
1782 me <- t.find.min.saturated.enr(sla,thr=1-test.agreement) | |
1783 menr <- max(min(na.omit(unlist(lapply(chain[[i-1]]$npl,function(d) d[actual.enr.field])))),min(na.omit(unlist(lapply(chain[[i]]$npl,function(d) d[actual.enr.field])))),1) | |
1784 if(me<=menr) { me <- 1; }; | |
1785 return(me); | |
1786 }) | |
1787 data.frame(n=nn,me=unlist(me),nd=nd); | |
1788 }); | |
1789 if(return.lists) { return(cvl) } | |
1790 cvl <- na.omit(do.call(rbind,cvl)); | |
1791 if(return.median) { | |
1792 tv <- tapply(cvl$me,as.factor(cvl$n),median) | |
1793 } else { | |
1794 tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim); | |
1795 } | |
1796 df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv)); | |
1797 return(df[order(df$n,decreasing=T),]) | |
1798 }) | |
1799 } | |
1800 | |
1801 | |
1802 | |
1803 # returns agreement as a function of dataset size, possibly filtering peaks by min.enr threshold, and by max.fdr | |
1804 chain.to.reference.comparison <- function(chains,min.enr=NULL,debug=F,agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr",max.fdr=NULL) { | |
1805 cvl <- lapply(chains,function(chain) { | |
1806 # filter chain by fdr | |
1807 if(!is.null(max.fdr)) { | |
1808 chain <- lapply(chain,function(d) { d$npl <- lapply(d$npl,function(cd) cd[cd$fdr<=max.fdr,]); return(d); }); | |
1809 } | |
1810 nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T); | |
1811 nn <- nn[-length(nn)]; | |
1812 me <- lapply(c(2:length(chain)),function(i) { | |
1813 sla <- t.precalculate.ref.peak.agreement(chain[[1]],chain[i],agreement.distance=agreement.distance,enr.field=enr.field) | |
1814 # calculate overlap | |
1815 x <- lapply(sla,function(mpd) { | |
1816 if(!is.null(min.enr)) { | |
1817 | |
1818 me <- mpd$re >= min.enr; | |
1819 me[is.na(me)] <- F; | |
1820 mpd <- mpd[me,]; | |
1821 ome <- mpd$oe < min.enr; | |
1822 ome[is.na(ome)] <- T; | |
1823 mpd$ov[ome] <- 0; | |
1824 } | |
1825 return(mean(mpd$ov)); | |
1826 }) | |
1827 }) | |
1828 | |
1829 data.frame(n=nn,me=unlist(me)); | |
1830 }); | |
1831 | |
1832 cvl <- na.omit(do.call(rbind,cvl)); | |
1833 if(return.median) { | |
1834 tv <- tapply(cvl$me,as.factor(cvl$n),median) | |
1835 } else { | |
1836 tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim); | |
1837 } | |
1838 df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv)); | |
1839 return(df[order(df$n,decreasing=T),]) | |
1840 } | |
1841 | |
1842 | |
1843 # estimates enrichment confidence interval based on 2*tag.count.whs window around each position, and a z-score (alpha/2) | |
1844 # if(multiple.background.scales=T) the enrichment is also estimated using 5- and 10-fold increased background tag window | |
1845 # adds $enr (lower bound), $enr.ub (upper bound) and $enr.mle fields | |
1846 calculate.enrichment.estimates <- function(binding.positions,signal.data=NULL,control.data=NULL,fraction=1,tag.count.whs=100,z=2,effective.genome.size=3e9,scale.down.control=F,background.scales=c(1),bg.weight=NULL) { | |
1847 f <- fraction; | |
1848 qv <- pnorm(z,lower.tail=F); | |
1849 cn <- names(binding.positions$npl); names(cn) <- cn; | |
1850 | |
1851 if(is.null(control.data)) { | |
1852 # estimate from gamma distribution | |
1853 fg.lambda <- f*sum(unlist(lapply(signal.data,length)))*2*tag.count.whs/effective.genome.size; | |
1854 binding.positions$npl <- lapply(binding.positions$npl,function(d) { | |
1855 d$enr <- qgamma(qv,d$nt,scale=1)/fg.lambda; | |
1856 d$enr.ub <- qgamma(1-qv,d$nt,scale=1)/fg.lambda; | |
1857 d$enr.mle <- d$nt/fg.lambda; | |
1858 return(d); | |
1859 }); | |
1860 } else { | |
1861 # estimate using beta distribution | |
1862 if(is.null(bg.weight)) { | |
1863 bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) | |
1864 } | |
1865 | |
1866 if(scale.down.control) { | |
1867 # sample down control to be the same size as true signal.data (bg.weight*f) | |
1868 control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight*f,replace=(f*bg.weight>1))) | |
1869 #bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) | |
1870 bg.weight <- 1/f; | |
1871 | |
1872 } | |
1873 | |
1874 binding.positions$enrichment.bg.weight <- bg.weight; | |
1875 binding.positions$enrichment.whs <- tag.count.whs; | |
1876 binding.positions$enrichment.z <- z; | |
1877 | |
1878 binding.positions$npl <- lapply(cn,function(chr) { | |
1879 d <- binding.positions$npl[[chr]]; | |
1880 | |
1881 edf <- lapply(background.scales,function(background.width.multiplier) { | |
1882 sig.mult <- bg.weight*f/background.width.multiplier; | |
1883 nbg <- points.within(abs(control.data[[chr]]),d$x-tag.count.whs*background.width.multiplier,d$x+tag.count.whs*background.width.multiplier,return.point.counts=T,return.unique=F); | |
1884 | |
1885 nfg <- d$nt; | |
1886 | |
1887 | |
1888 # Poisson ratio Bayesian LB with non-informative prior (Clopper & Pearson 1934) | |
1889 nf <- ((nfg+0.5)/(nbg+0.5))*qf(1-qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F) | |
1890 nf <- nf/sig.mult; | |
1891 | |
1892 ub <- ((nfg+0.5)/(nbg+0.5))*qf(qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F) | |
1893 ub <- ub/sig.mult; | |
1894 | |
1895 mle <- (nfg+0.5)/(nbg+0.5); | |
1896 mle <- mle/sig.mult; | |
1897 if(is.null(nbg)) { nbg <- numeric(0) } | |
1898 if(is.null(nf)) { nf <- numeric(0) } | |
1899 if(is.null(ub)) { ub <- numeric(0) } | |
1900 if(is.null(mle)) { mle <- numeric(0) } | |
1901 return(data.frame(nbg=nbg,lb=nf,ub=ub,mle=mle)) | |
1902 }) | |
1903 | |
1904 adf <- do.call(cbind,lapply(c(1:length(background.scales)),function(i) { | |
1905 df <- edf[[i]]; | |
1906 cn <- c("nbgt","enr","enr.ub","enr.mle"); | |
1907 if(background.scales[i]>1) { | |
1908 cn <- paste(cn,as.character(background.scales[i]),sep="."); | |
1909 } | |
1910 names(df) <- cn; | |
1911 return(df); | |
1912 })) | |
1913 | |
1914 return(cbind(d,adf)); | |
1915 }); | |
1916 } | |
1917 | |
1918 return(binding.positions); | |
1919 } | |
1920 | |
1921 | |
1922 # precalculate peak agreement of a sampling list given a reference | |
1923 t.precalculate.ref.peak.agreement <- function(ref,sf,agreement.distance=50,enr.field="enr") { | |
1924 ref <- ref$npl; | |
1925 cn <- names(ref); names(cn) <- cn; | |
1926 | |
1927 # for each sampling round | |
1928 lapply(sf,function(sd) { | |
1929 # calculate overlap | |
1930 | |
1931 ov <- data.frame(do.call(rbind,lapply(cn,function(chr) { | |
1932 if(dim(ref[[chr]])[1]<1) { return(cbind(ov=c(),re=c(),oe=c())) }; | |
1933 pwi <- points.within(ref[[chr]]$x,sd$npl[[chr]]$x-agreement.distance,sd$npl[[chr]]$x+agreement.distance); | |
1934 pwi[pwi==-1] <- NA; | |
1935 renr <- ref[[chr]][,enr.field] | |
1936 oenr <- sd$npl[[chr]][,enr.field][pwi]; | |
1937 if(length(oenr)==0) { oenr <- rep(NA,length(renr)); } | |
1938 return(cbind(ov=as.integer(!is.na(pwi)),re=renr,oe=oenr)); | |
1939 }))) | |
1940 }) | |
1941 } | |
1942 | |
1943 | |
1944 # find minimal saturated enrichment given a list of replicate agreement matrices (for one fraction) | |
1945 t.find.min.saturated.enr <- function(pal,thr=0.01,plot=F,return.number.of.peaks=F,plot.individual=T,return.median=F,return.vector=F) { | |
1946 nr <- length(pal); | |
1947 # merge replicate data frames | |
1948 mpd <- data.frame(do.call(rbind,pal)); | |
1949 | |
1950 mpd$re[is.na(mpd$re)] <- Inf; | |
1951 mpd$oe[is.na(mpd$oe)] <- Inf; | |
1952 | |
1953 | |
1954 | |
1955 # round up values to avoid miscounting | |
1956 mpd$re <- round(mpd$re,digits=2); | |
1957 mpd$oe <- round(mpd$oe,digits=2); | |
1958 | |
1959 me <- pmin(mpd$re,mpd$oe); | |
1960 ome <- order(me,decreasing=T); | |
1961 df <- data.frame(me=me[ome],ov=mpd$ov[ome]); | |
1962 recdf <- ecdf(-mpd$re); ren <- length(mpd$re); | |
1963 | |
1964 # collapse equal peak heights | |
1965 xk <- tapply(df$ov,as.factor(df$me),sum); xk <- data.frame(ov=as.numeric(xk),me=as.numeric(names(xk))); xk <- xk[order(xk$me,decreasing=T),]; | |
1966 | |
1967 | |
1968 cso <- cumsum(xk$ov)/(recdf(-xk$me)*ren); | |
1969 cso[is.na(cso)] <- 0; | |
1970 cso[!is.finite(cso)] <- 0; | |
1971 mv <- max(which(cso >= 1-thr)) | |
1972 menr <- xk$me[mv]; | |
1973 | |
1974 ir <- lapply(pal,function(d) { | |
1975 d$re[is.na(d$re)] <- Inf; | |
1976 d$oe[is.na(d$oe)] <- Inf; | |
1977 | |
1978 me <- pmin(d$re,d$oe); | |
1979 ome <- order(me,decreasing=T); | |
1980 df <- data.frame(me=me[ome],ov=d$ov[ome]); | |
1981 cso <- cumsum(df$ov)/c(1:length(df$ov)); | |
1982 mv <- max(which(cso >= 1-thr)) | |
1983 menr <- df$me[mv]; | |
1984 return(list(df=df,menr=menr)); | |
1985 }); | |
1986 | |
1987 if(plot) { | |
1988 par(mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8); | |
1989 plot(df$me,cumsum(df$ov)/c(1:length(df$ov)),type='l',ylab="fraction of positions overlapping with reference",xlab="minimal enrichment of binding positions",xlim=c(min(df$me),2*menr)); | |
1990 abline(h=1-thr,lty=2,col=4) | |
1991 if(plot.individual) { | |
1992 lapply(ir,function(d) { | |
1993 df <- d$df; | |
1994 lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=8); | |
1995 abline(v=menr,col="pink",lty=3) | |
1996 }); | |
1997 lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=1); | |
1998 } | |
1999 abline(v=menr,col=2,lty=2) | |
2000 legend(x="bottomright",lty=c(1,2,1,3,2),col=c(1,2,8,"pink",4),legend=c("combined samples","combined sample MSER","individual samples","individual MSERs","consistency threshold")); | |
2001 } | |
2002 | |
2003 if(return.number.of.peaks) { | |
2004 mpd <- data.frame(do.call(rbind,pal)); | |
2005 return(length(which(!is.na(mpd$re) & mpd$re >=menr))/nr); | |
2006 } else { | |
2007 if(return.vector) { | |
2008 return(unlist(lapply(ir,function(d) d$menr))); | |
2009 } | |
2010 if(return.median) { | |
2011 return(median(unlist(lapply(ir,function(d) d$menr)))); | |
2012 } else { | |
2013 return(menr); | |
2014 } | |
2015 } | |
2016 } | |
2017 | |
2018 | |
2019 | |
2020 # determine d1/d2 dataset size ratio. If background.density.scaling=F, the ratio of tag counts is returned. | |
2021 # if background.density.scaling=T, regions of significant tag enrichment are masked prior to ratio calculation. | |
2022 dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { | |
2023 if(!background.density.scaling) { | |
2024 return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length)))) | |
2025 } | |
2026 | |
2027 chrl <- intersect(names(d1),names(d2)); | |
2028 ntc <- do.call(rbind,lapply(chrl,function(chr) { | |
2029 x1 <- tag.enrichment.clusters(abs(d1[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) | |
2030 x2 <- tag.enrichment.clusters(abs(d2[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) | |
2031 return(c(length(which(points.within(abs(d1[[chr]]),c(x1$s,x2$s)-wsize/2,c(x1$e,x2$e)+wsize/2)==-1)),length(which(points.within(abs(d2[[chr]]),c(x1$s,x2$s)-wsize/2,c(x1$e,x2$e)+wsize/2)==-1)))) | |
2032 })) | |
2033 ntcs <- apply(ntc,2,sum); | |
2034 #print(ntcs/c(sum(unlist(lapply(d1,length))),sum(unlist(lapply(d2,length))))); | |
2035 return(ntcs[1]/ntcs[2]) | |
2036 } | |
2037 | |
2038 # returns effective size of the dataset based on the same logic as dataset.density.ratio | |
2039 dataset.density.size <- function(d1,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { | |
2040 if(!background.density.scaling) { | |
2041 return(sum(unlist(lapply(d1,length)))) | |
2042 } | |
2043 | |
2044 chrl <- names(d1); | |
2045 ntc <- lapply(chrl,function(chr) { | |
2046 x1 <- tag.enrichment.clusters(abs(d1[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) | |
2047 return(length(which(points.within(abs(d1[[chr]]),x1$s-wsize/2,x1$e+wsize/2)==-1))) | |
2048 }) | |
2049 return(sum(unlist(ntc))) | |
2050 } | |
2051 | |
2052 old.dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { | |
2053 if(!background.density.scaling) { | |
2054 return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length)))) | |
2055 } | |
2056 | |
2057 t.chromosome.counts <- function(tl) { | |
2058 lapply(tl,function(d) { | |
2059 x <- tag.enrichment.clusters(abs(d),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) | |
2060 x$s <- x$s-wsize/2; x$e <- x$e+wsize/2; | |
2061 x <- regionset.intersection.c(list(x),do.union=T) | |
2062 return(c(n=length(which(points.within(abs(d),x$s,x$e)==-1)),s=diff(range(abs(d))),m=sum(x$e-x$s))); | |
2063 }) | |
2064 } | |
2065 | |
2066 l1 <- t.chromosome.counts(d1); | |
2067 l2 <- t.chromosome.counts(d2); | |
2068 | |
2069 l2 <- data.frame(do.call(rbind,l2[names(l1)])); | |
2070 l1 <- data.frame(do.call(rbind,l1)); | |
2071 | |
2072 # genome size | |
2073 gs <- sum(pmax(l1$s,l2$s)) | |
2074 | |
2075 den1 <- sum(l1$n)/(gs-sum(l1$m)) | |
2076 den2 <- sum(l2$n)/(gs-sum(l2$m)) | |
2077 return(den1/den2); | |
2078 } | |
2079 | |
2080 | |
2081 | |
2082 | |
2083 # calculate cumulative density based on sum of scaled gaussian curves | |
2084 # (by Michael Tolstorukov) | |
2085 # | |
2086 # vin - input vector; bw -- standard deviation, dw-gaussina cutoff in stdev; dout - output "density") | |
2087 # output - if return.x=F vector of cumulative density values corresponding to integer positions described by range(vin) | |
2088 # output - if return.x=T a data structure with $x and $y corresponding to the cumulative density | |
2089 # optional match.wt.f is a function that will return weights for a tag vector | |
2090 densum <- function(vin,bw=5,dw=3,match.wt.f=NULL,return.x=T,from=min(vin),to=max(vin),step=1) { | |
2091 # construct vector of unique tags and their counts | |
2092 tc <- table(vin[vin>=from & vin<=to]); | |
2093 pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"; | |
2094 tc <- as.numeric(tc); storage.mode(tc) <- "double"; | |
2095 n <- length(pos) | |
2096 # weight counts | |
2097 if(!is.null(match.wt.f)) { | |
2098 tc <- tc*match.wt.f(pos); | |
2099 } | |
2100 | |
2101 rng <- c(from,to); | |
2102 if(rng[1]<0) { stop("range extends into negative values") } | |
2103 if(range(pos)[1]<0) { stop("position vector contains negative values") } | |
2104 | |
2105 storage.mode(n) <- storage.mode(rng) <- storage.mode(bw) <- storage.mode(dw) <- storage.mode(step) <- "integer"; | |
2106 | |
2107 spos <- rng[1]; storage.mode(spos) <- "double"; | |
2108 | |
2109 dlength <- floor((rng[2] - rng[1])/step) + 1; # length of output array | |
2110 if(dlength<1) { stop("zero data range") } | |
2111 dout <- numeric(dlength); storage.mode(dout) <- "double"; | |
2112 storage.mode(dlength) <- "integer"; | |
2113 .C("cdensum",n,pos,tc,spos,bw,dw,dlength,step,dout,DUP=F); | |
2114 | |
2115 if(return.x) { | |
2116 return(list(x=c(rng[1],rng[1]+step*(dlength-1)),y=dout,step=step)) | |
2117 } else { | |
2118 return(dout) | |
2119 } | |
2120 } | |
2121 | |
2122 # count tags within sliding window of a specified size | |
2123 # vin - tag vector (postive values, pre-shifted) | |
2124 # window.size/window.step - window characteristics | |
2125 # tv - optional, pre-sorted, pre-trimmed tag vector | |
2126 window.tag.count <- function(vin,window.size,window.step=1,return.x=T,from=min(vin)+floor(window.size/2),to=max(vin)-floor(window.size/2),tv=NULL) { | |
2127 whs <- floor(window.size/2); | |
2128 # select tags with margins | |
2129 if(is.null(tv)) { | |
2130 tv <- sort(vin[vin>=from-whs-1 & vin<=to+whs+1]) | |
2131 } | |
2132 storage.mode(tv) <- "double"; | |
2133 n <- length(tv) | |
2134 nsteps <- ceiling((to-from)/window.step); | |
2135 | |
2136 storage.mode(n) <- storage.mode(nsteps) <- storage.mode(window.size) <- storage.mode(window.step) <- "integer"; | |
2137 | |
2138 spos <- from; storage.mode(spos) <- "double"; | |
2139 | |
2140 if(nsteps<1) { stop("zero data range") } | |
2141 #dout <- integer(nsteps); storage.mode(dout) <- "integer"; | |
2142 #.C("window_n_tags",n,tv,spos,window.size,window.step,nsteps,dout,DUP=F); | |
2143 dout <- .Call("cwindow_n_tags",tv,spos,window.size,window.step,nsteps); | |
2144 | |
2145 if(return.x) { | |
2146 return(list(x=c(from,from+(nsteps-1)*window.step),y=dout,step=window.step)) | |
2147 } else { | |
2148 return(dout) | |
2149 } | |
2150 } | |
2151 | |
2152 # count tags in windows around specified positions (pos) | |
2153 window.tag.count.around <- function(vin,window.size,pos,return.x=T,tc=NULL,sorted=F) { | |
2154 if(is.null(tc)) { | |
2155 tc <- table(vin); | |
2156 } | |
2157 if(!sorted) { | |
2158 op <- rank(pos); | |
2159 pos <- sort(pos); | |
2160 } | |
2161 storage.mode(pos) <- "double"; | |
2162 tpos <- as.integer(names(tc)); storage.mode(tpos) <- "double"; | |
2163 tc <- as.integer(tc); storage.mode(tc) <- "integer"; | |
2164 | |
2165 whs <- floor(window.size/2); | |
2166 | |
2167 storage.mode(whs) <- "integer"; | |
2168 twc <- .Call("cwindow_n_tags_around",tpos,tc,pos,whs); | |
2169 if(return.x) { | |
2170 if(sorted) { | |
2171 return(data.frame(x=pos,y=twc)); | |
2172 } else { | |
2173 return(data.frame(x=pos[op],y=twc[op])); | |
2174 } | |
2175 } else { | |
2176 if(sorted) { | |
2177 return(twc); | |
2178 } else { | |
2179 return(twc[op]); | |
2180 } | |
2181 } | |
2182 } | |
2183 | |
2184 # given a tag vector (signed), identify and clean up (either remove or cap) singular positions that exceed local tag density | |
2185 # vin - tag vector | |
2186 # cap.fold - maximal fold over enrichment over local density allowed for a single tag position, at which the tag count is capped | |
2187 # eliminate.fold - max fold enrichment that, when exceeded, results in exclusion of all the tags at that position (e.g. counted as anomaly) | |
2188 # z.threshold - Z-score used to determine max allowed counts | |
2189 filter.singular.positions.by.local.density <- function(tags,window.size=200,cap.fold=4,eliminate.fold=10,z.threshold=3) { | |
2190 # tabulate tag positions | |
2191 if(length(tags)<2) { return(tags); }; | |
2192 | |
2193 tc <- table(tags); | |
2194 pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"; | |
2195 tc <- as.integer(tc); storage.mode(tc) <- "integer"; | |
2196 n <- length(pos); | |
2197 | |
2198 whs <- floor(window.size/2); | |
2199 | |
2200 storage.mode(n) <- storage.mode(whs) <- "integer"; | |
2201 twc <- .Call("cwindow_n_tags_around",pos,tc,pos,whs); | |
2202 twc <- (twc-tc+1)/window.size; # local density | |
2203 | |
2204 pv <- pnorm(z.threshold,lower.tail=F) | |
2205 # exclude | |
2206 max.counts <- qpois(pv,twc*eliminate.fold,lower.tail=F) | |
2207 tc[tc>max.counts] <- 0; | |
2208 # cap | |
2209 max.counts <- qpois(pv,twc*cap.fold,lower.tail=F) | |
2210 ivi <- which(tc>max.counts); | |
2211 tc[ivi] <- max.counts[ivi]+1; | |
2212 | |
2213 # reconstruct tag vector | |
2214 tv <- rep(pos,tc); | |
2215 to <- order(abs(tv)); tv <- tv[to]; | |
2216 return(tv); | |
2217 } | |
2218 | |
2219 | |
2220 | |
2221 # calculates enrichment bounds using multiple background scales | |
2222 # ft - foreground tags (pre-shifted, positive) | |
2223 # bt - background tags | |
2224 # fws - foreground window size | |
2225 # bwsl - background window size list | |
2226 # step - window step | |
2227 # rng - from/to coordinates (to will be adjusted according to step) | |
2228 # | |
2229 # returns: a list with $x ($s $e $step), $lb vector and $mle vector ($ub if calculate.upper.bound=T) | |
2230 mbs.enrichment.bounds <- function(ft,bt,fws,bwsl,step=1,rng=NULL,alpha=0.05,calculate.upper.bound=F,bg.weight=length(ft)/length(bt),use.most.informative.scale=F,quick.calculation=F,pos=NULL) { | |
2231 # determine range | |
2232 if(is.null(rng)) { | |
2233 rng <- range(range(ft)); | |
2234 } | |
2235 # foreground counts | |
2236 if(is.null(pos)) { | |
2237 fwc <- window.tag.count(ft,fws,window.step=step,from=rng[1],to=rng[2],return.x=T); | |
2238 } else { | |
2239 fwc <- window.tag.count.around(ft,fws,pos,return.x=T) | |
2240 } | |
2241 fwc$y <- fwc$y+0.5; | |
2242 | |
2243 zal <- qnorm(alpha/2,lower.tail=F); | |
2244 | |
2245 # background counts | |
2246 bt <- sort(bt); | |
2247 if(!is.null(pos)) { | |
2248 tc <- table(bt); | |
2249 } | |
2250 bgcm <- lapply(bwsl,function(bgws) { | |
2251 if(is.null(pos)) { | |
2252 window.tag.count(bt,bgws,window.step=step,from=rng[1],to=rng[2],return.x=F,tv=bt)+0.5; | |
2253 } else { | |
2254 window.tag.count.around(bt,bgws,pos,return.x=F,tc=tc)+0.5 | |
2255 } | |
2256 }) | |
2257 if(!is.null(pos)) { | |
2258 rm(tc); | |
2259 } | |
2260 | |
2261 # pick most informative scale | |
2262 if(use.most.informative.scale) { | |
2263 bgcm <- t(do.call(cbind,bgcm)) | |
2264 isi <- max.col(t((bgcm)/(bwsl/fws))) # add pseudo-counts to select lowest scale in case of a tie | |
2265 | |
2266 bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)] | |
2267 | |
2268 if(quick.calculation) { | |
2269 rte <- fwc$y+bgc-0.25*zal*zal; rte[rte<0] <- 0; | |
2270 dn <- bgc - 0.25*zal*zal; | |
2271 lbm=(sqrt(fwc$y*bgc) - 0.5*zal*sqrt(rte))/dn; | |
2272 ivi <- which(lbm<0); | |
2273 lbm <- lbm*lbm*bwsl[isi]/fws/bg.weight; | |
2274 lbm[rte<=0] <- 1; | |
2275 lbm[dn<=0] <- 1; | |
2276 lbm[ivi] <- 1; | |
2277 } else { | |
2278 lbm <- (fwc$y/bgc)*qf(1-alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight; | |
2279 } | |
2280 | |
2281 mle <- fwc$y/bgc*bwsl[isi]/fws/bg.weight; mle[is.nan(mle)] <- Inf; mle[is.na(mle)] <- Inf; | |
2282 | |
2283 rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle); | |
2284 | |
2285 if(calculate.upper.bound) { | |
2286 isi <- max.col(t((-bgcm)/(bwsl/fws))) # add pseudo-counts to select highest scale in case of a tie | |
2287 bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)] | |
2288 | |
2289 if(quick.calculation) { | |
2290 ubm=(sqrt(fwc$y*bgc) + 0.5*zal*sqrt(rte))/dn; | |
2291 ivi <- which(ubm<0); | |
2292 ubm <- ubm*ubm*bwsl[isi]/fws/bg.weight; | |
2293 ubm[rte<=0] <- 1; | |
2294 ubm[ivi] <- 1; | |
2295 lbm[dn<=0] <- 1; | |
2296 } else { | |
2297 ubm <- (fwc$y/bgc)*qf(alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight; | |
2298 } | |
2299 rl <- c(rl,list(ub=ubm)); | |
2300 } | |
2301 return(rl); | |
2302 | |
2303 } else { | |
2304 # determine lower bounds | |
2305 lbm <- lapply(c(1:length(bgcm)),function(i) { | |
2306 nbg <- bgcm[[i]]; | |
2307 if(quick.calculation) { | |
2308 rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0; | |
2309 dn <- (nbg - 0.25*zal*zal); | |
2310 lbm=(sqrt(fwc$y*nbg) - 0.5*zal*sqrt(rte))/dn; | |
2311 ivi <- which(lbm<0); | |
2312 lbm <- lbm*lbm*bwsl[i]/fws/bg.weight; | |
2313 lbm[rte<=0] <- 1; | |
2314 lbm[dn<=0] <- 1; | |
2315 lbm[ivi] <- 1; | |
2316 return(lbm); | |
2317 } else { | |
2318 return((fwc$y/nbg)*qf(1-alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight); | |
2319 } | |
2320 }) | |
2321 lbm <- do.call(pmin,lbm); | |
2322 | |
2323 # calculate mle | |
2324 #mle <- do.call(pmin,lapply(bgcm,function(bgc) fwc/bgc)) | |
2325 mle <- do.call(pmin,lapply(c(1:length(bgcm)),function(i) { | |
2326 bgc <- bgcm[[i]]; | |
2327 x <- fwc$y/bgc*bwsl[i]/fws/bg.weight; x[is.nan(x)] <- Inf; x[is.na(x)] <- Inf; return(x); | |
2328 })) | |
2329 | |
2330 rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle); | |
2331 | |
2332 if(calculate.upper.bound) { | |
2333 # determine upper bound | |
2334 ubm <- lapply(c(1:length(bgcm)),function(i) { | |
2335 nbg <- bgcm[[i]]; | |
2336 if(quick.calculation) { | |
2337 rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0; | |
2338 dn <- (nbg - 0.25*zal*zal); | |
2339 ubm=(sqrt(fwc$y*nbg) + 0.5*zal*sqrt(rte))/dn; | |
2340 ivi <- which(ubm<0); | |
2341 ubm <- ubm*ubm*bwsl[i]/fws/bg.weight; | |
2342 ubm[rte<=0] <- 1; | |
2343 ubm[dn<=0] <- 1; | |
2344 ubm[ivi] <- 1; | |
2345 return(ubm); | |
2346 } else { | |
2347 return((fwc$y/nbg)*qf(alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight); | |
2348 } | |
2349 }) | |
2350 ubm <- do.call(pmax,ubm); | |
2351 rl <- c(rl,list(ub=ubm)); | |
2352 } | |
2353 | |
2354 return(rl); | |
2355 } | |
2356 } | |
2357 | |
2358 write.probe.wig <- function(chr,pos,val,fname,append=F,feature="M",probe.length=35,header=T) { | |
2359 min.dist <- min(diff(pos)); | |
2360 if(probe.length>=min.dist) { | |
2361 probe.length <- min.dist-1; | |
2362 cat("warning: adjusted down wig segment length to",probe.length,"\n"); | |
2363 } | |
2364 mdat <- data.frame(chr,as.integer(pos),as.integer(pos+probe.length),val) | |
2365 | |
2366 if(header) { | |
2367 write(paste("track type=wiggle_0 name=\"Bed Format\" description=\"",feature,"\" visibility=dense color=200,100,0 altColor=0,100,200 priority=20",sep=""),file=fname,append=append) | |
2368 write.table(mdat,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=T); | |
2369 } else { | |
2370 write.table(mdat,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=append); | |
2371 } | |
2372 | |
2373 } | |
2374 | |
2375 # returns intersection of multiple region sets | |
2376 # each regionset needs to contain $s, $e and optional $v column | |
2377 regionset.intersection.c <- function(rsl,max.val=-1,do.union=F) { | |
2378 # translate into position/flag form | |
2379 rfl <- lapply(rsl,function(rs) { | |
2380 rp <- c(rs$s,rs$e); rf <- c(rep(c(1,-1),each=length(rs$s))); | |
2381 | |
2382 ro <- order(rp); | |
2383 rp <- rp[ro]; rf <- rf[ro]; | |
2384 if(!is.null(rs$v)) { | |
2385 rv <- c(rs$v,rs$v)[ro]; | |
2386 return(data.frame(p=as.numeric(rp),f=as.integer(rf),v=as.numeric(rv))); | |
2387 } else { | |
2388 return(data.frame(p=as.numeric(rp),f=as.integer(rf))); | |
2389 } | |
2390 }) | |
2391 rfd <- data.frame(do.call(rbind,lapply(1:length(rfl),function(i) { | |
2392 d <- rfl[[i]]; d$f <- d$f*i; return(d); | |
2393 }))) | |
2394 rfd <- rfd[order(rfd$p),]; | |
2395 if(is.null(rfd$v)) { max.val <- 0; } | |
2396 if(do.union) { ur <- 1; } else { ur <- 0; }; | |
2397 rl <- .Call("region_intersection",as.integer(length(rfl)),as.numeric(rfd$p),as.integer(rfd$f),as.numeric(rfd$v),as.integer(max.val),as.integer(ur)); | |
2398 return(data.frame(do.call(cbind,rl))); | |
2399 } | |
2400 | |
2401 | |
2402 # idenfity if binding peak falls within a larger region of significant tag enrichment, and if so record its booundaries | |
2403 add.broad.peak.regions <- function(chip.tags,input.tags,bp,window.size=500,z.thr=2) { | |
2404 se <- find.significantly.enriched.regions(chip.tags,input.tags,window.size=window.size,z.thr=z.thr,poisson.z=0,poisson.ratio=0,either=F) | |
2405 chrl <- names(bp$npl); names(chrl) <- chrl; | |
2406 bnpl <- lapply(chrl,function(chr) { | |
2407 npl <- bp$npl[[chr]]; | |
2408 if(is.null(npl) | dim(npl)[1]<1) { | |
2409 return(npl); | |
2410 } | |
2411 pi <- points.within(npl$x,se[[chr]]$s,se[[chr]]$e,return.list=T); | |
2412 | |
2413 pm <- do.call(rbind,lapply(pi,function(rl) { | |
2414 if(length(rl)>0) { | |
2415 return(range(c(se[[chr]]$s[rl],se[[chr]]$e[rl]))) | |
2416 } else { | |
2417 return(c(NA,NA)); | |
2418 } | |
2419 })) | |
2420 | |
2421 npl$rs <- pm[,1]; | |
2422 npl$re <- pm[,2]; | |
2423 return(npl); | |
2424 }) | |
2425 bp$npl <- bnpl; | |
2426 return(bp); | |
2427 } | |
2428 | |
2429 # writing out binding results in a narrowpeak format, incorporating broad region boundaries if they are present | |
2430 # if broad region info is not present, margin is used to determine region width. The default margin is equal | |
2431 # to the window half size used to call the binding peaks | |
2432 write.narrowpeak.binding <- function(bd,fname,margin=bd$whs,npeaks=NA) { # Anshul: added npeaks option | |
2433 if(is.null(margin)) { margin <- 50; } | |
2434 chrl <- names(bd$npl); names(chrl) <- chrl; | |
2435 md <- do.call(rbind,lapply(chrl,function(chr) { | |
2436 df <- bd$npl[[chr]]; | |
2437 x <- df$x; | |
2438 rs <- df$rs; if(is.null(rs)) { rs <- rep(NA,length(x)) } | |
2439 re <- df$re; if(is.null(re)) { re <- rep(NA,length(x)) } | |
2440 #ivi <- which(is.na(rs)); if(any(ivi)) {rs[ivi] <- x[ivi]-margin;} | |
2441 ivi <- which(is.na(rs)); if(any(ivi)) {rs[ivi] <- pmax(0,x[ivi]-margin);} # Anshul: added the pmax (0, ...) to avoid negative peak starts | |
2442 ivi <- which(is.na(re)); if(any(ivi)) {re[ivi] <- x[ivi]+margin;} | |
2443 #cbind(chr,rs,re,".","0",".",df$y,-1,format(df$fdr,scientific=T,digits=3),x-rs) | |
2444 cbind(chr,rs,re,".","0",".",df$y,-1,-log10(df$fdr),x-rs) # Anshul: converted fdr to -log10 | |
2445 })) | |
2446 md <- md[order(as.numeric(md[,7]),decreasing=T),] | |
2447 if (!is.na(npeaks)) { # Anshul: added this option to print a limited number of peaks | |
2448 npeaks <- min(nrow(md),npeaks) | |
2449 md <- md[1:npeaks,] | |
2450 } | |
2451 write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F); | |
2452 } | |
2453 | |
2454 | |
2455 get.broad.enrichment.clusters <- function(signal.data,control.data,window.size=1e3,z.thr=3, tag.shift=146/2,background.density.scaling=F, ... ) { | |
2456 # find significantly enriched clusters | |
2457 bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling); | |
2458 se <- find.significantly.enriched.regions(signal.data,control.data,window.size=window.size,z.thr=z.thr,tag.shift=tag.shift, bg.weight=bg.weight, ...) | |
2459 chrl <- names(se); names(chrl) <- chrl; | |
2460 se <- lapply(chrl,function(chr) { | |
2461 d <- se[[chr]]; | |
2462 if(length(d$s>1)) { | |
2463 d <- regionset.intersection.c(list(d,d),do.union=T); | |
2464 sc <- points.within(abs(signal.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T); | |
2465 cc <- points.within(abs(control.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T); | |
2466 d$rv <- log2((sc+1)/(cc+1)/bg.weight); | |
2467 return(d); | |
2468 } else { | |
2469 return(d) | |
2470 } | |
2471 }) | |
2472 } | |
2473 | |
2474 write.broadpeak.info <- function(bp,fname) { | |
2475 chrl <- names(bp); names(chrl) <- chrl; | |
2476 chrl <- chrl[unlist(lapply(bp,function(d) length(d$s)))>0] | |
2477 md <- do.call(rbind,lapply(chrl,function(chr) { | |
2478 df <- bp[[chr]]; | |
2479 cbind(chr,df$s,df$e,".","0",".",df$rv,-1,-1) | |
2480 })) | |
2481 md <- md[order(as.numeric(md[,7]),decreasing=T),] | |
2482 write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F); | |
2483 } | |
2484 | |
2485 | |
2486 get.clusters2 <- function(x,CL) { | |
2487 temp <- which(diff(x) != 0) | |
2488 begin <- c(1, temp + 1) | |
2489 end <- c(temp, length(x)) | |
2490 size <- end - begin + 1 | |
2491 | |
2492 begin <- begin[size >= CL] | |
2493 end <- end[size >= CL] | |
2494 size <- size[size >= CL] | |
2495 | |
2496 size <- size[x[end] != 0] | |
2497 begin <- begin[x[end] != 0] | |
2498 end <- end[x[end] != 0] | |
2499 | |
2500 return (list(size=size,begin=begin,end=end)) | |
2501 } |