comparison makeTSSdist.R @ 78:84e2feff4698 draft

Uploaded
author jbrayet
date Thu, 13 Aug 2015 07:04:39 -0400
parents 1546b1cc0655
children
comparison
equal deleted inserted replaced
77:89d18f4dd051 78:84e2feff4698
1 #!/usr/bin/env Rscript
2
3 #usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
4 args <- commandArgs(TRUE)
5 myStep <- type.convert(args[2])
6 maxValue <- type.convert(args[3])
7
8 dataTable <-read.table(file=paste(args[4],".genes.ClosestPeakDist", sep=""), header=TRUE);
9 chip.genes.ClosestPeakDist<-data.frame(dataTable)
10 ifReg <- 0
11 if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
12 ifReg <- 1
13 }
14 ifControl <- 0
15
16 ifPDF <- 0
17 if (length(args)>=8) {
18 ifPDF=args[8]
19 }
20 if (length(args)==7 & args[7]==1) {
21 ifPDF=1
22 }
23
24 suppressMessages(library(Hmisc))
25
26 if (length(args)>=7 & args[7]!=1 & args[7]!=0) {
27 dataTable <-read.table(file=paste(args[7],".genes.ClosestPeakDist", sep=""), header=TRUE);
28 control.genes.ClosestPeakDist<-data.frame(dataTable)
29 ifControl <- 1
30 }
31 if (ifReg & ifControl) {
32 if (ifPDF==1) {
33 pdf(file = args[5], width = 19, height = 8, pointsize = 20, bg = "white")
34 } else {
35 png(file = args[5], type="cairo", width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
36 plot(1:10)
37 }
38 op <- par(mfrow = c(2,3))
39 } else {
40 if (ifPDF==1) {
41 pdf(file = args[5], width = 10, height = 13, pointsize = 20, bg = "white")
42 } else {
43 png(file = args[5], type="cairo", width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
44 plot(1:10)
45 }
46 op <- par(mfrow = c(2,1))
47 }
48 myColor <- 1
49 myColor[1] <- colors()[131]
50 myColor[2] <- "darkolivegreen3"
51 myColor[3] <- "azure4"
52 myColor[4] <- "royalblue3"
53 myColor[5] <- colors()[17]
54
55 myColorControl <- 1
56
57 myColorControl[1] <- colors()[24]
58 myColorControl[2] <- colors()[278]
59 myColorControl[3] <- colors()[305]
60 myColorControl[4] <- colors()[219]
61 myColorControl[5] <- colors()[343]
62
63
64
65 #for cumulative:
66 dist_real_f <- chip.genes.ClosestPeakDist
67 if (ifControl) {
68 dist_control_f <- control.genes.ClosestPeakDist
69 }
70 step <- myStep
71 lim <- maxValue
72 x <- 0
73 count <- 1
74 countL <-1
75 n.types <- 1
76 myLevels <- 0
77 countTotalCont <- 0
78 countTotal <-0
79 countLCont <- 0
80 cumTotalCont <- 0
81 if (ifReg) {
82 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
83 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
84 cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
85 for (i in c(1:n.types)) {
86 t <- which ((dist_real_f$Reg==myLevels[i]))
87 countL[i] <- length(t)
88 }
89 count <-1
90 for (i in seq(length=lim/step +1, from=0, by=step)) {
91 for (t in c(1:n.types)) {
92 tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
93 cum[count,t] <- length(tt)
94 }
95 x[count] <- i
96 count <- count + 1
97 }
98 ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
99 for (i in c(2:n.types)) {
100 ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
101 }
102 myLocCol <- myColor[2]
103
104 par(mar=c(5.1, 7.1, 4.1, 2.1))
105
106 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
107 for (i in c(2:n.types)) {
108 colorr <- i+1
109 myLocCol <- c(myLocCol,myColor[colorr])
110 lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
111 # print (myColor[colorr])
112 }
113
114 gradi <- 1000
115 if (lim>10000) {
116 gradi <- 10000
117 }
118 if (lim<3000) {
119 gradi <- 500
120 }
121 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
122 axisxlab <- paste(axisx/1000,"", sep = "")
123 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
124 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
125
126 minor.tick(nx=5,tick.ratio=0.5)
127
128 legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
129
130 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
131
132 if (ifControl) {
133 count <-1
134 n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
135 myLevels <- levels(control.genes.ClosestPeakDist$Reg)
136 cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
137 for (i in c(1:n.types)) {
138 t <- which ((dist_control_f$Reg==myLevels[i]))
139 countLCont[i] <- length(t)
140 }
141 for (i in seq(length=lim/step +1, from=0, by=step)) {
142 for (t in c(1:n.types)) {
143 tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
144 cumCont[count,t] <- length(tt)
145 }
146 x[count] <- i
147 count <- count + 1
148 }
149 ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
150 for (i in c(2:n.types)) {
151 ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
152 }
153 myLocColCntr <- myColorControl[2]
154 plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
155 for (i in c(2:n.types)) {
156 colorr <- i+1
157 myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
158 lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
159 }
160 if (lim>10000) {
161 gradi <- 10000
162 }
163 if (lim<3000) {
164 gradi <- 500
165 }
166 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
167 axisxlab <- paste(axisx/1000, sep = "")
168 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
169 minor.tick(nx=5,tick.ratio=0.5)
170 legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
171 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
172 #real_vs_control_cumulative:
173 count <-1
174 countTotal <- length(dist_real_f$Reg)
175 cumTotal <- 0
176 for (i in seq(length=lim/step +1, from=0, by=step)) {
177 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
178 cumTotal[count] <- length(t)
179 x[count] <- i
180 count <- count + 1
181 }
182 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
183 gradi <- 1000
184 if (lim>10000) {
185 gradi <- 10000
186 }
187 if (lim<3000) {
188 gradi <- 500
189 }
190 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
191 axisxlab <- paste(axisx/1000, sep = "")
192 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
193 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
194 minor.tick(nx=5,tick.ratio=0.5)
195 countTotalCont <- length(dist_control_f$Reg)
196 cumTotalCont <- 0
197 count <- 1
198 for (i in seq(length=lim/step +1, from=0, by=step)) {
199 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
200 cumTotalCont[count] <- length(t)
201 x[count] <- i
202 count <- count + 1
203 }
204 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
205 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
206 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
207 }
208 } else {
209 countTotal <- length(dist_real_f$Reg)
210 cumTotal <- 0
211 count <-1
212
213 gradi <- 1000
214 if (lim>10000) {
215 gradi <- 10000
216 }
217 if (lim<3000) {
218 gradi <- 500
219 }
220
221 for (i in seq(length=lim/step +1, from=0, by=step)) {
222 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
223 cumTotal[count] <- length(t)
224 x[count] <- i
225 count <- count + 1
226 }
227 par(mar=c(5.1, 7.1, 4.1, 2.1))
228
229 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
230 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
231 axisxlab <- paste(axisx/1000, sep = "")
232 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
233 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
234 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
235 if (ifControl) {
236 countTotalCont <- length(dist_control_f$Reg)
237 cumTotalCont <- 0
238 count <- 1
239 for (i in seq(length=lim/step +1, from=0, by=step)) {
240 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
241 cumTotalCont[count] <- length(t)
242 x[count] <- i
243 count <- count + 1
244 }
245 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
246 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
247 } else {
248 legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
249 }
250 }
251
252 sink(args[6], append=FALSE, split=FALSE)
253 if (ifReg) {
254 if (ifControl) {
255 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
256 cat("\t")
257 for (i in c(1:n.types)) {
258 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
259 cat("\t")
260 }
261
262 for (i in c(1:n.types)) {
263 cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
264 cat("\t")
265 }
266 cat("\n")
267 for (i in c(1:length(x))) {
268 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
269 cat("\t")
270 for (t in c(1:n.types)) {
271 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
272 }
273 for (t in c(1:n.types)) {
274 cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
275 }
276 cat("\n")
277 }
278 }else {
279 cat (paste("Dist_TSS","\t",sep=''))
280 for (i in c(1:n.types)) {
281 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
282 }
283 cat("\n")
284 for (i in c(1:length(x))) {
285 cat(paste(x[i],"\t",sep=""))
286 for (t in c(1:n.types)) {
287 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
288 }
289 cat("\n")
290 }
291 }
292 } else {
293 if (ifControl) {
294 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
295 cat("\n")
296 for (i in c(1:length(x))) {
297 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
298 cat("\n")
299 }
300 }else {
301 cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
302 cat("\n")
303 for (i in c(1:length(x))) {
304 cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
305 cat("\n")
306 }
307
308 }
309 }
310
311
312 #stop sinking:
313 sink()
314
315 #around TSS:
316 lim <- maxValue
317 step <- myStep
318 my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
319 chip.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
320 dist_real_f <- chip.genes
321 if (ifControl) {
322 control.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
323 dist_control_f<-data.frame(control.genes)
324 }
325 if (ifReg) {
326 #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
327 #myLevels <- levels(dist_real_f$Reg)
328
329 t<- which (dist_real_f$Reg==myLevels[1])
330 values_real <-dist_real_f$Dist[t]
331 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
332 ymax <- max(hTSSreal$density, na.rm=TRUE)
333 for (i in c(2:n.types)) {
334 t<- which (dist_real_f$Reg==myLevels[i])
335 values_real <-dist_real_f$Dist[t]
336 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
337 ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
338 }
339
340
341 t<- which (dist_real_f$Reg==myLevels[1])
342 values_real <-dist_real_f$Dist[t]
343 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
344 plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" )
345
346 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
347
348 for (i in c(2:n.types)) {
349 t<- which (dist_real_f$Reg==myLevels[i])
350 values_real <-dist_real_f$Dist[t]
351 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
352 lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
353 }
354 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
355
356 gradi <- 1000
357 if (lim>10000) {
358 gradi <- 10000
359 }
360 if (lim<3000) {
361 gradi <- 500
362 }
363
364 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
365 axisxlab <- paste(axisx/1000, sep = "")
366 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
367
368
369 #minor.tick(nx=10,tick.ratio=0.5)
370 if (ifControl) {
371 t<- which (dist_control_f$Reg==myLevels[1])
372 values_control <-dist_control_f$Dist[t]
373 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
374 ymax <- max(hTSScontrol$density, na.rm=TRUE)
375 for (i in c(2:n.types)) {
376 t<- which (dist_control_f$Reg==myLevels[i])
377 values_control <-dist_control_f$Dist[t]
378 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
379 ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
380 }
381 t<- which (dist_control_f$Reg==myLevels[1])
382 values_control <-dist_control_f$Dist[t]
383 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
384 plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" )
385 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
386 for (i in c(2:n.types)) {
387 t<- which (dist_control_f$Reg==myLevels[i])
388 values_control <-dist_control_f$Dist[t]
389 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
390 lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
391 }
392
393 gradi <- 1000
394 if (lim>10000) {
395 gradi <- 10000
396 }
397 if (lim<3000) {
398 gradi <- 500
399 }
400
401 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
402 axisxlab <- paste(axisx/1000, sep = "")
403 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
404
405 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
406
407 # minor.tick(nx=10,tick.ratio=0.5)
408 #control vs real
409 values_real <-dist_real_f$Dist
410 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
411 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
412 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
413 ymax <- max(hTSSreal$density, na.rm=TRUE)
414 values_control <-dist_control_f$Dist
415 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
416 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
417 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
418
419 gradi <- 1000
420 if (lim>10000) {
421 gradi <- 10000
422 }
423 if (lim<3000) {
424 gradi <- 500
425 }
426
427 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
428 axisxlab <- paste(axisx/1000, sep = "")
429 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
430
431
432
433 # minor.tick(nx=10,tick.ratio=0.5)
434 }
435 } else {
436 values_real <-dist_real_f$Dist
437 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
438 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
439 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
440 ymax <- max(hTSSreal$density, na.rm=TRUE)
441 if (ifControl) {
442 values_control <-dist_control_f$Dist
443 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
444 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
445 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
446 } else {
447 legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
448 }
449
450 gradi <- 1000
451 if (lim>10000) {
452 gradi <- 10000
453 }
454 if (lim<3000) {
455 gradi <- 500
456 }
457
458 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
459 axisxlab <- paste(axisx/1000, sep = "")
460 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
461
462
463 # minor.tick(nx=10,tick.ratio=0.5)
464 }
465 suppressMessages(dev.off())
466 q();
467 cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
468 for (xval in c(minValue:maxValue)) {
469 for (i in (1:length(chipHist$mids))) {
470 if (xval==chipHist$mids[i]) {
471 ychip <- chipHist$counts[i]
472 }
473 }
474 for (i in (1:length(controlHist$mids))) {
475 if (xval==controlHist$mids[i]) {
476 ycontrol <- controlHist$counts[i]
477 }
478 }
479 cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
480 }