comparison makeTSSdist.R @ 25:f5fe1f0cfbba draft

Uploaded
author jbrayet
date Wed, 12 Aug 2015 04:47:20 -0400
parents
children
comparison
equal deleted inserted replaced
24:307c899c4f11 25:f5fe1f0cfbba
1 #!/usr/bin/env Rscript
2
3 #usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
4 args <- commandArgs(TRUE)
5 #print (args)
6 myStep <- type.convert(args[2])
7 maxValue <- type.convert(args[3])
8
9 dataTable <-read.table(file=paste(args[4],".genes.ClosestPeakDist", sep=""), header=TRUE);
10 chip.genes.ClosestPeakDist<-data.frame(dataTable)
11 ifReg <- 0
12 if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
13 ifReg <- 1
14 }
15 ifControl <- 0
16
17
18 ifPDF <- 0
19 if (length(args)>=8) {
20 ifPDF=args[8]
21 }
22 if (length(args)==7 & args[7]==1) {
23 ifPDF=1
24 }
25
26 suppressMessages(library(Hmisc))
27
28 if (length(args)>=7 & args[7]!=1 & args[7]!=0) {
29 dataTable <-read.table(file=paste(args[7],".genes.ClosestPeakDist", sep=""), header=TRUE);
30 control.genes.ClosestPeakDist<-data.frame(dataTable)
31 ifControl <- 1
32 }
33 if (ifReg & ifControl) {
34 if (ifPDF==1) {
35 pdf(file = args[5], width = 19, height = 8, pointsize = 20, bg = "white")
36 } else {
37 bitmap(file = args[5], type = "png16m", width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
38 plot(1:10)
39 }
40 op <- par(mfrow = c(2,3))
41 } else {
42 if (ifPDF==1) {
43 pdf(file = args[5], width = 10, height = 13, pointsize = 20, bg = "white")
44 } else {
45 bitmap(file = args[5], type = "png16m", width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
46 plot(1:10)
47 }
48 # plot(1:10)
49 op <- par(mfrow = c(2,1))
50 }
51 myColor <- 1
52 myColor[1] <- colors()[131]
53 myColor[2] <- "darkolivegreen3"
54 myColor[3] <- "azure4"
55 myColor[4] <- "royalblue3"
56 myColor[5] <- colors()[17]
57
58 myColorControl <- 1
59
60 myColorControl[1] <- colors()[24]
61 myColorControl[2] <- colors()[278]
62 myColorControl[3] <- colors()[305]
63 myColorControl[4] <- colors()[219]
64 myColorControl[5] <- colors()[343]
65
66
67
68 #for cumulative:
69 dist_real_f <- chip.genes.ClosestPeakDist
70 if (ifControl) {
71 dist_control_f <- control.genes.ClosestPeakDist
72 }
73 step <- myStep
74 lim <- maxValue
75 x <- 0
76 count <- 1
77 countL <-1
78 n.types <- 1
79 myLevels <- 0
80 countTotalCont <- 0
81 countTotal <-0
82 countLCont <- 0
83 cumTotalCont <- 0
84 if (ifReg) {
85 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
86 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
87 cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
88 for (i in c(1:n.types)) {
89 t <- which ((dist_real_f$Reg==myLevels[i]))
90 countL[i] <- length(t)
91 }
92 count <-1
93 for (i in seq(length=lim/step +1, from=0, by=step)) {
94 for (t in c(1:n.types)) {
95 tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
96 cum[count,t] <- length(tt)
97 }
98 x[count] <- i
99 count <- count + 1
100 }
101 ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
102 for (i in c(2:n.types)) {
103 ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
104 }
105 myLocCol <- myColor[2]
106
107 par(mar=c(5.1, 7.1, 4.1, 2.1))
108
109 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
110 for (i in c(2:n.types)) {
111 colorr <- i+1
112 myLocCol <- c(myLocCol,myColor[colorr])
113 lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
114 # print (myColor[colorr])
115 }
116
117 gradi <- 1000
118 if (lim>10000) {
119 gradi <- 10000
120 }
121 if (lim<3000) {
122 gradi <- 500
123 }
124 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
125 axisxlab <- paste(axisx/1000,"", sep = "")
126 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
127 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
128
129 minor.tick(nx=5,tick.ratio=0.5)
130
131 legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
132
133 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
134
135 if (ifControl) {
136 count <-1
137 n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
138 myLevels <- levels(control.genes.ClosestPeakDist$Reg)
139 cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
140 for (i in c(1:n.types)) {
141 t <- which ((dist_control_f$Reg==myLevels[i]))
142 countLCont[i] <- length(t)
143 }
144 for (i in seq(length=lim/step +1, from=0, by=step)) {
145 for (t in c(1:n.types)) {
146 tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
147 cumCont[count,t] <- length(tt)
148 }
149 x[count] <- i
150 count <- count + 1
151 }
152 ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
153 for (i in c(2:n.types)) {
154 ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
155 }
156 myLocColCntr <- myColorControl[2]
157 plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
158 for (i in c(2:n.types)) {
159 colorr <- i+1
160 myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
161 lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
162 }
163 if (lim>10000) {
164 gradi <- 10000
165 }
166 if (lim<3000) {
167 gradi <- 500
168 }
169 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
170 axisxlab <- paste(axisx/1000, sep = "")
171 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
172 minor.tick(nx=5,tick.ratio=0.5)
173 legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
174 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
175 #real_vs_control_cumulative:
176 count <-1
177 countTotal <- length(dist_real_f$Reg)
178 cumTotal <- 0
179 for (i in seq(length=lim/step +1, from=0, by=step)) {
180 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
181 cumTotal[count] <- length(t)
182 x[count] <- i
183 count <- count + 1
184 }
185 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
186 gradi <- 1000
187 if (lim>10000) {
188 gradi <- 10000
189 }
190 if (lim<3000) {
191 gradi <- 500
192 }
193 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
194 axisxlab <- paste(axisx/1000, sep = "")
195 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
196 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
197 minor.tick(nx=5,tick.ratio=0.5)
198 countTotalCont <- length(dist_control_f$Reg)
199 cumTotalCont <- 0
200 count <- 1
201 for (i in seq(length=lim/step +1, from=0, by=step)) {
202 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
203 cumTotalCont[count] <- length(t)
204 x[count] <- i
205 count <- count + 1
206 }
207 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
208 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
209 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
210 }
211 } else {
212 countTotal <- length(dist_real_f$Reg)
213 cumTotal <- 0
214 count <-1
215
216 gradi <- 1000
217 if (lim>10000) {
218 gradi <- 10000
219 }
220 if (lim<3000) {
221 gradi <- 500
222 }
223
224 for (i in seq(length=lim/step +1, from=0, by=step)) {
225 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
226 cumTotal[count] <- length(t)
227 x[count] <- i
228 count <- count + 1
229 }
230 par(mar=c(5.1, 7.1, 4.1, 2.1))
231
232 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
233 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
234 axisxlab <- paste(axisx/1000, sep = "")
235 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
236 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
237 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
238 if (ifControl) {
239 countTotalCont <- length(dist_control_f$Reg)
240 cumTotalCont <- 0
241 count <- 1
242 for (i in seq(length=lim/step +1, from=0, by=step)) {
243 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
244 cumTotalCont[count] <- length(t)
245 x[count] <- i
246 count <- count + 1
247 }
248 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
249 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
250 } else {
251 legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
252 }
253 }
254
255 sink(args[6], append=FALSE, split=FALSE)
256 if (ifReg) {
257 if (ifControl) {
258 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
259 cat("\t")
260 for (i in c(1:n.types)) {
261 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
262 cat("\t")
263 }
264
265 for (i in c(1:n.types)) {
266 cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
267 cat("\t")
268 }
269 cat("\n")
270 for (i in c(1:length(x))) {
271 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
272 cat("\t")
273 for (t in c(1:n.types)) {
274 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
275 }
276 for (t in c(1:n.types)) {
277 cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
278 }
279 cat("\n")
280 }
281 }else {
282 cat (paste("Dist_TSS","\t",sep=''))
283 for (i in c(1:n.types)) {
284 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
285 }
286 cat("\n")
287 for (i in c(1:length(x))) {
288 cat(paste(x[i],"\t",sep=""))
289 for (t in c(1:n.types)) {
290 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
291 }
292 cat("\n")
293 }
294 }
295 } else {
296 if (ifControl) {
297 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
298 cat("\n")
299 for (i in c(1:length(x))) {
300 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
301 cat("\n")
302 }
303 }else {
304 cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
305 cat("\n")
306 for (i in c(1:length(x))) {
307 cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
308 cat("\n")
309 }
310
311 }
312 }
313
314
315 #stop sinking:
316 sink()
317
318 #around TSS:
319 lim <- maxValue
320 step <- myStep
321 my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
322 chip.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
323 dist_real_f <- chip.genes
324 if (ifControl) {
325 control.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
326 dist_control_f<-data.frame(control.genes)
327 }
328 if (ifReg) {
329 #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
330 #myLevels <- levels(dist_real_f$Reg)
331
332 t<- which (dist_real_f$Reg==myLevels[1])
333 values_real <-dist_real_f$Dist[t]
334 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
335 ymax <- max(hTSSreal$density, na.rm=TRUE)
336 for (i in c(2:n.types)) {
337 t<- which (dist_real_f$Reg==myLevels[i])
338 values_real <-dist_real_f$Dist[t]
339 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
340 ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
341 }
342
343
344 t<- which (dist_real_f$Reg==myLevels[1])
345 values_real <-dist_real_f$Dist[t]
346 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
347 plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" )
348
349 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
350
351 for (i in c(2:n.types)) {
352 t<- which (dist_real_f$Reg==myLevels[i])
353 values_real <-dist_real_f$Dist[t]
354 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
355 lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
356 }
357 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
358
359 gradi <- 1000
360 if (lim>10000) {
361 gradi <- 10000
362 }
363 if (lim<3000) {
364 gradi <- 500
365 }
366
367 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
368 axisxlab <- paste(axisx/1000, sep = "")
369 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
370
371
372 #minor.tick(nx=10,tick.ratio=0.5)
373 if (ifControl) {
374 t<- which (dist_control_f$Reg==myLevels[1])
375 values_control <-dist_control_f$Dist[t]
376 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
377 ymax <- max(hTSScontrol$density, na.rm=TRUE)
378 for (i in c(2:n.types)) {
379 t<- which (dist_control_f$Reg==myLevels[i])
380 values_control <-dist_control_f$Dist[t]
381 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
382 ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
383 }
384 t<- which (dist_control_f$Reg==myLevels[1])
385 values_control <-dist_control_f$Dist[t]
386 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
387 plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" )
388 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
389 for (i in c(2:n.types)) {
390 t<- which (dist_control_f$Reg==myLevels[i])
391 values_control <-dist_control_f$Dist[t]
392 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
393 lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
394 }
395
396 gradi <- 1000
397 if (lim>10000) {
398 gradi <- 10000
399 }
400 if (lim<3000) {
401 gradi <- 500
402 }
403
404 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
405 axisxlab <- paste(axisx/1000, sep = "")
406 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
407
408 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
409
410 # minor.tick(nx=10,tick.ratio=0.5)
411 #control vs real
412 values_real <-dist_real_f$Dist
413 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
414 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
415 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
416 ymax <- max(hTSSreal$density, na.rm=TRUE)
417 values_control <-dist_control_f$Dist
418 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
419 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
420 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
421
422 gradi <- 1000
423 if (lim>10000) {
424 gradi <- 10000
425 }
426 if (lim<3000) {
427 gradi <- 500
428 }
429
430 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
431 axisxlab <- paste(axisx/1000, sep = "")
432 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
433
434
435
436 # minor.tick(nx=10,tick.ratio=0.5)
437 }
438 } else {
439 values_real <-dist_real_f$Dist
440 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
441 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
442 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
443 ymax <- max(hTSSreal$density, na.rm=TRUE)
444 if (ifControl) {
445 values_control <-dist_control_f$Dist
446 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
447 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
448 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
449 } else {
450 legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
451 }
452
453 gradi <- 1000
454 if (lim>10000) {
455 gradi <- 10000
456 }
457 if (lim<3000) {
458 gradi <- 500
459 }
460
461 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
462 axisxlab <- paste(axisx/1000, sep = "")
463 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
464
465
466 # minor.tick(nx=10,tick.ratio=0.5)
467 }
468 suppressMessages(dev.off())
469 q();
470 cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
471 for (xval in c(minValue:maxValue)) {
472 for (i in (1:length(chipHist$mids))) {
473 if (xval==chipHist$mids[i]) {
474 ychip <- chipHist$counts[i]
475 }
476 }
477 for (i in (1:length(controlHist$mids))) {
478 if (xval==controlHist$mids[i]) {
479 ycontrol <- controlHist$counts[i]
480 }
481 }
482 cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
483 }