comparison makeTSSdist.R @ 33:7705cb40c04b draft

Uploaded
author jbrayet
date Wed, 12 Aug 2015 05:38:55 -0400
parents
children
comparison
equal deleted inserted replaced
32:e89a2dd5de6d 33:7705cb40c04b
1 #!/usr/bin/env Rscript
2
3 #usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
4 args <- commandArgs(TRUE)
5 #print (args)
6 myStep <- type.convert(args[2])
7 maxValue <- type.convert(args[3])
8
9 dataTable <-read.table(file=paste(args[4],".genes.ClosestPeakDist", sep=""), header=TRUE);
10 chip.genes.ClosestPeakDist<-data.frame(dataTable)
11 ifReg <- 0
12 if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
13 ifReg <- 1
14 }
15 ifControl <- 0
16
17 options(bitmapType='cairo')
18
19 ifPDF <- 0
20 if (length(args)>=8) {
21 ifPDF=args[8]
22 }
23 if (length(args)==7 & args[7]==1) {
24 ifPDF=1
25 }
26
27 suppressMessages(library(Hmisc))
28
29 if (length(args)>=7 & args[7]!=1 & args[7]!=0) {
30 dataTable <-read.table(file=paste(args[7],".genes.ClosestPeakDist", sep=""), header=TRUE);
31 control.genes.ClosestPeakDist<-data.frame(dataTable)
32 ifControl <- 1
33 }
34 if (ifReg & ifControl) {
35 if (ifPDF==1) {
36 pdf(file = args[5], width = 19, height = 8, pointsize = 20, bg = "white")
37 } else {
38 png(filename = args[5], width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
39 plot(1:10)
40 }
41 op <- par(mfrow = c(2,3))
42 } else {
43 if (ifPDF==1) {
44 pdf(file = args[5], width = 10, height = 13, pointsize = 20, bg = "white")
45 } else {
46 png(filename = args[5], width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
47 plot(1:10)
48 }
49 # plot(1:10)
50 op <- par(mfrow = c(2,1))
51 }
52 myColor <- 1
53 myColor[1] <- colors()[131]
54 myColor[2] <- "darkolivegreen3"
55 myColor[3] <- "azure4"
56 myColor[4] <- "royalblue3"
57 myColor[5] <- colors()[17]
58
59 myColorControl <- 1
60
61 myColorControl[1] <- colors()[24]
62 myColorControl[2] <- colors()[278]
63 myColorControl[3] <- colors()[305]
64 myColorControl[4] <- colors()[219]
65 myColorControl[5] <- colors()[343]
66
67
68
69 #for cumulative:
70 dist_real_f <- chip.genes.ClosestPeakDist
71 if (ifControl) {
72 dist_control_f <- control.genes.ClosestPeakDist
73 }
74 step <- myStep
75 lim <- maxValue
76 x <- 0
77 count <- 1
78 countL <-1
79 n.types <- 1
80 myLevels <- 0
81 countTotalCont <- 0
82 countTotal <-0
83 countLCont <- 0
84 cumTotalCont <- 0
85 if (ifReg) {
86 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
87 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
88 cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
89 for (i in c(1:n.types)) {
90 t <- which ((dist_real_f$Reg==myLevels[i]))
91 countL[i] <- length(t)
92 }
93 count <-1
94 for (i in seq(length=lim/step +1, from=0, by=step)) {
95 for (t in c(1:n.types)) {
96 tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
97 cum[count,t] <- length(tt)
98 }
99 x[count] <- i
100 count <- count + 1
101 }
102 ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
103 for (i in c(2:n.types)) {
104 ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
105 }
106 myLocCol <- myColor[2]
107
108 par(mar=c(5.1, 7.1, 4.1, 2.1))
109
110 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
111 for (i in c(2:n.types)) {
112 colorr <- i+1
113 myLocCol <- c(myLocCol,myColor[colorr])
114 lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
115 # print (myColor[colorr])
116 }
117
118 gradi <- 1000
119 if (lim>10000) {
120 gradi <- 10000
121 }
122 if (lim<3000) {
123 gradi <- 500
124 }
125 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
126 axisxlab <- paste(axisx/1000,"", sep = "")
127 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
128 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
129
130 minor.tick(nx=5,tick.ratio=0.5)
131
132 legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
133
134 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
135
136 if (ifControl) {
137 count <-1
138 n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
139 myLevels <- levels(control.genes.ClosestPeakDist$Reg)
140 cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
141 for (i in c(1:n.types)) {
142 t <- which ((dist_control_f$Reg==myLevels[i]))
143 countLCont[i] <- length(t)
144 }
145 for (i in seq(length=lim/step +1, from=0, by=step)) {
146 for (t in c(1:n.types)) {
147 tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
148 cumCont[count,t] <- length(tt)
149 }
150 x[count] <- i
151 count <- count + 1
152 }
153 ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
154 for (i in c(2:n.types)) {
155 ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
156 }
157 myLocColCntr <- myColorControl[2]
158 plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
159 for (i in c(2:n.types)) {
160 colorr <- i+1
161 myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
162 lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
163 }
164 if (lim>10000) {
165 gradi <- 10000
166 }
167 if (lim<3000) {
168 gradi <- 500
169 }
170 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
171 axisxlab <- paste(axisx/1000, sep = "")
172 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
173 minor.tick(nx=5,tick.ratio=0.5)
174 legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
175 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
176 #real_vs_control_cumulative:
177 count <-1
178 countTotal <- length(dist_real_f$Reg)
179 cumTotal <- 0
180 for (i in seq(length=lim/step +1, from=0, by=step)) {
181 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
182 cumTotal[count] <- length(t)
183 x[count] <- i
184 count <- count + 1
185 }
186 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
187 gradi <- 1000
188 if (lim>10000) {
189 gradi <- 10000
190 }
191 if (lim<3000) {
192 gradi <- 500
193 }
194 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
195 axisxlab <- paste(axisx/1000, sep = "")
196 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
197 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
198 minor.tick(nx=5,tick.ratio=0.5)
199 countTotalCont <- length(dist_control_f$Reg)
200 cumTotalCont <- 0
201 count <- 1
202 for (i in seq(length=lim/step +1, from=0, by=step)) {
203 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
204 cumTotalCont[count] <- length(t)
205 x[count] <- i
206 count <- count + 1
207 }
208 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
209 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
210 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
211 }
212 } else {
213 countTotal <- length(dist_real_f$Reg)
214 cumTotal <- 0
215 count <-1
216
217 gradi <- 1000
218 if (lim>10000) {
219 gradi <- 10000
220 }
221 if (lim<3000) {
222 gradi <- 500
223 }
224
225 for (i in seq(length=lim/step +1, from=0, by=step)) {
226 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
227 cumTotal[count] <- length(t)
228 x[count] <- i
229 count <- count + 1
230 }
231 par(mar=c(5.1, 7.1, 4.1, 2.1))
232
233 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
234 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
235 axisxlab <- paste(axisx/1000, sep = "")
236 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
237 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
238 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
239 if (ifControl) {
240 countTotalCont <- length(dist_control_f$Reg)
241 cumTotalCont <- 0
242 count <- 1
243 for (i in seq(length=lim/step +1, from=0, by=step)) {
244 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
245 cumTotalCont[count] <- length(t)
246 x[count] <- i
247 count <- count + 1
248 }
249 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
250 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
251 } else {
252 legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
253 }
254 }
255
256 sink(args[6], append=FALSE, split=FALSE)
257 if (ifReg) {
258 if (ifControl) {
259 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
260 cat("\t")
261 for (i in c(1:n.types)) {
262 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
263 cat("\t")
264 }
265
266 for (i in c(1:n.types)) {
267 cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
268 cat("\t")
269 }
270 cat("\n")
271 for (i in c(1:length(x))) {
272 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
273 cat("\t")
274 for (t in c(1:n.types)) {
275 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
276 }
277 for (t in c(1:n.types)) {
278 cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
279 }
280 cat("\n")
281 }
282 }else {
283 cat (paste("Dist_TSS","\t",sep=''))
284 for (i in c(1:n.types)) {
285 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
286 }
287 cat("\n")
288 for (i in c(1:length(x))) {
289 cat(paste(x[i],"\t",sep=""))
290 for (t in c(1:n.types)) {
291 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
292 }
293 cat("\n")
294 }
295 }
296 } else {
297 if (ifControl) {
298 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
299 cat("\n")
300 for (i in c(1:length(x))) {
301 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
302 cat("\n")
303 }
304 }else {
305 cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
306 cat("\n")
307 for (i in c(1:length(x))) {
308 cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
309 cat("\n")
310 }
311
312 }
313 }
314
315
316 #stop sinking:
317 sink()
318
319 #around TSS:
320 lim <- maxValue
321 step <- myStep
322 my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
323 chip.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
324 dist_real_f <- chip.genes
325 if (ifControl) {
326 control.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
327 dist_control_f<-data.frame(control.genes)
328 }
329 if (ifReg) {
330 #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
331 #myLevels <- levels(dist_real_f$Reg)
332
333 t<- which (dist_real_f$Reg==myLevels[1])
334 values_real <-dist_real_f$Dist[t]
335 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
336 ymax <- max(hTSSreal$density, na.rm=TRUE)
337 for (i in c(2:n.types)) {
338 t<- which (dist_real_f$Reg==myLevels[i])
339 values_real <-dist_real_f$Dist[t]
340 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
341 ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
342 }
343
344
345 t<- which (dist_real_f$Reg==myLevels[1])
346 values_real <-dist_real_f$Dist[t]
347 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
348 plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" )
349
350 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
351
352 for (i in c(2:n.types)) {
353 t<- which (dist_real_f$Reg==myLevels[i])
354 values_real <-dist_real_f$Dist[t]
355 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
356 lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
357 }
358 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
359
360 gradi <- 1000
361 if (lim>10000) {
362 gradi <- 10000
363 }
364 if (lim<3000) {
365 gradi <- 500
366 }
367
368 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
369 axisxlab <- paste(axisx/1000, sep = "")
370 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
371
372
373 #minor.tick(nx=10,tick.ratio=0.5)
374 if (ifControl) {
375 t<- which (dist_control_f$Reg==myLevels[1])
376 values_control <-dist_control_f$Dist[t]
377 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
378 ymax <- max(hTSScontrol$density, na.rm=TRUE)
379 for (i in c(2:n.types)) {
380 t<- which (dist_control_f$Reg==myLevels[i])
381 values_control <-dist_control_f$Dist[t]
382 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
383 ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
384 }
385 t<- which (dist_control_f$Reg==myLevels[1])
386 values_control <-dist_control_f$Dist[t]
387 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
388 plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" )
389 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
390 for (i in c(2:n.types)) {
391 t<- which (dist_control_f$Reg==myLevels[i])
392 values_control <-dist_control_f$Dist[t]
393 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
394 lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
395 }
396
397 gradi <- 1000
398 if (lim>10000) {
399 gradi <- 10000
400 }
401 if (lim<3000) {
402 gradi <- 500
403 }
404
405 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
406 axisxlab <- paste(axisx/1000, sep = "")
407 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
408
409 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
410
411 # minor.tick(nx=10,tick.ratio=0.5)
412 #control vs real
413 values_real <-dist_real_f$Dist
414 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
415 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
416 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
417 ymax <- max(hTSSreal$density, na.rm=TRUE)
418 values_control <-dist_control_f$Dist
419 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
420 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
421 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
422
423 gradi <- 1000
424 if (lim>10000) {
425 gradi <- 10000
426 }
427 if (lim<3000) {
428 gradi <- 500
429 }
430
431 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
432 axisxlab <- paste(axisx/1000, sep = "")
433 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
434
435
436
437 # minor.tick(nx=10,tick.ratio=0.5)
438 }
439 } else {
440 values_real <-dist_real_f$Dist
441 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
442 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
443 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
444 ymax <- max(hTSSreal$density, na.rm=TRUE)
445 if (ifControl) {
446 values_control <-dist_control_f$Dist
447 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
448 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
449 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
450 } else {
451 legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
452 }
453
454 gradi <- 1000
455 if (lim>10000) {
456 gradi <- 10000
457 }
458 if (lim<3000) {
459 gradi <- 500
460 }
461
462 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
463 axisxlab <- paste(axisx/1000, sep = "")
464 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
465
466
467 # minor.tick(nx=10,tick.ratio=0.5)
468 }
469 suppressMessages(dev.off())
470 q();
471 cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
472 for (xval in c(minValue:maxValue)) {
473 for (i in (1:length(chipHist$mids))) {
474 if (xval==chipHist$mids[i]) {
475 ychip <- chipHist$counts[i]
476 }
477 }
478 for (i in (1:length(controlHist$mids))) {
479 if (xval==controlHist$mids[i]) {
480 ycontrol <- controlHist$counts[i]
481 }
482 }
483 cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
484 }