comparison makeTSSdist.R @ 31:ecc3686fd8e0 draft

Uploaded
author jbrayet
date Wed, 12 Aug 2015 05:36:25 -0400
parents
children
comparison
equal deleted inserted replaced
30:99ca7c46fe9c 31:ecc3686fd8e0
1 #!/usr/bin/env Rscript
2
3 #usage $0 STEP RIGHT chipPeaks outputFile.png output.txt [controlPeaks] [1 for pdf]
4 args <- commandArgs(TRUE)
5 #print (args)
6 myStep <- type.convert(args[2])
7 maxValue <- type.convert(args[3])
8
9 dataTable <-read.table(file=paste(args[4],".genes.ClosestPeakDist", sep=""), header=TRUE);
10 chip.genes.ClosestPeakDist<-data.frame(dataTable)
11 ifReg <- 0
12 if (length(unique(chip.genes.ClosestPeakDist$Reg))>1) {
13 ifReg <- 1
14 }
15 ifControl <- 0
16
17 print(capabilities())
18 options(bitmapType='cairo')
19
20 ifPDF <- 0
21 if (length(args)>=8) {
22 ifPDF=args[8]
23 }
24 if (length(args)==7 & args[7]==1) {
25 ifPDF=1
26 }
27
28 suppressMessages(library(Hmisc))
29
30 if (length(args)>=7 & args[7]!=1 & args[7]!=0) {
31 dataTable <-read.table(file=paste(args[7],".genes.ClosestPeakDist", sep=""), header=TRUE);
32 control.genes.ClosestPeakDist<-data.frame(dataTable)
33 ifControl <- 1
34 }
35 if (ifReg & ifControl) {
36 if (ifPDF==1) {
37 pdf(file = args[5], width = 19, height = 8, pointsize = 20, bg = "white")
38 } else {
39 png(filename = args[5], width = 1440 , height = 680, units = "px", pointsize = 20, bg = "white", res = NA)
40 plot(1:10)
41 }
42 op <- par(mfrow = c(2,3))
43 } else {
44 if (ifPDF==1) {
45 pdf(file = args[5], width = 10, height = 13, pointsize = 20, bg = "white")
46 } else {
47 png(filename = args[5], width = 680, height = 880, units = "px", pointsize = 20, bg = "white", res = NA)
48 plot(1:10)
49 }
50 # plot(1:10)
51 op <- par(mfrow = c(2,1))
52 }
53 myColor <- 1
54 myColor[1] <- colors()[131]
55 myColor[2] <- "darkolivegreen3"
56 myColor[3] <- "azure4"
57 myColor[4] <- "royalblue3"
58 myColor[5] <- colors()[17]
59
60 myColorControl <- 1
61
62 myColorControl[1] <- colors()[24]
63 myColorControl[2] <- colors()[278]
64 myColorControl[3] <- colors()[305]
65 myColorControl[4] <- colors()[219]
66 myColorControl[5] <- colors()[343]
67
68
69
70 #for cumulative:
71 dist_real_f <- chip.genes.ClosestPeakDist
72 if (ifControl) {
73 dist_control_f <- control.genes.ClosestPeakDist
74 }
75 step <- myStep
76 lim <- maxValue
77 x <- 0
78 count <- 1
79 countL <-1
80 n.types <- 1
81 myLevels <- 0
82 countTotalCont <- 0
83 countTotal <-0
84 countLCont <- 0
85 cumTotalCont <- 0
86 if (ifReg) {
87 n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
88 myLevels <- levels(chip.genes.ClosestPeakDist$Reg)
89 cum = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
90 for (i in c(1:n.types)) {
91 t <- which ((dist_real_f$Reg==myLevels[i]))
92 countL[i] <- length(t)
93 }
94 count <-1
95 for (i in seq(length=lim/step +1, from=0, by=step)) {
96 for (t in c(1:n.types)) {
97 tt <- which ((dist_real_f$Reg==myLevels[t])&(dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
98 cum[count,t] <- length(tt)
99 }
100 x[count] <- i
101 count <- count + 1
102 }
103 ymax <- max(cum[,1]/countL[1], na.rm=TRUE)
104 for (i in c(2:n.types)) {
105 ymax <- max(ymax,max(cum[,i]/countL[i], na.rm=TRUE))
106 }
107 myLocCol <- myColor[2]
108
109 par(mar=c(5.1, 7.1, 4.1, 2.1))
110
111 plot (x,cum[,1]/countL[1] ,col = myColor[2],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
112 for (i in c(2:n.types)) {
113 colorr <- i+1
114 myLocCol <- c(myLocCol,myColor[colorr])
115 lines (x,cum[,i]/countL[i] ,col = myColor[colorr],type="l", lwd = 2)
116 # print (myColor[colorr])
117 }
118
119 gradi <- 1000
120 if (lim>10000) {
121 gradi <- 10000
122 }
123 if (lim<3000) {
124 gradi <- 500
125 }
126 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
127 axisxlab <- paste(axisx/1000,"", sep = "")
128 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
129 ymax <- max(cum[,i]/countL[i], na.rm=TRUE)
130
131 minor.tick(nx=5,tick.ratio=0.5)
132
133 legend(lim*0.45, ymax*0.45, myLevels, cex=1, lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
134
135 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
136
137 if (ifControl) {
138 count <-1
139 n.types <- length(levels(control.genes.ClosestPeakDist$Reg))
140 myLevels <- levels(control.genes.ClosestPeakDist$Reg)
141 cumCont = matrix( 0, nrow=lim/step +1, ncol=n.types, byrow = TRUE)
142 for (i in c(1:n.types)) {
143 t <- which ((dist_control_f$Reg==myLevels[i]))
144 countLCont[i] <- length(t)
145 }
146 for (i in seq(length=lim/step +1, from=0, by=step)) {
147 for (t in c(1:n.types)) {
148 tt <- which ((dist_control_f$Reg==myLevels[t])&(dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
149 cumCont[count,t] <- length(tt)
150 }
151 x[count] <- i
152 count <- count + 1
153 }
154 ymax <- max(cumCont[,1]/countLCont[1], na.rm=TRUE)
155 for (i in c(2:n.types)) {
156 ymax <- max(ymax,max(cumCont[,i]/countLCont[i], na.rm=TRUE))
157 }
158 myLocColCntr <- myColorControl[2]
159 plot (x,cumCont[,1]/countLCont[1] ,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" , ylim=c(0,ymax))
160 for (i in c(2:n.types)) {
161 colorr <- i+1
162 myLocColCntr <- c(myLocColCntr,myColorControl[colorr])
163 lines (x,cumCont[,i]/countLCont[i] ,col = myColorControl[colorr],type="l", lwd = 2)
164 }
165 if (lim>10000) {
166 gradi <- 10000
167 }
168 if (lim<3000) {
169 gradi <- 500
170 }
171 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
172 axisxlab <- paste(axisx/1000, sep = "")
173 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
174 minor.tick(nx=5,tick.ratio=0.5)
175 legend(lim*0.45, ymax*0.45, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
176 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
177 #real_vs_control_cumulative:
178 count <-1
179 countTotal <- length(dist_real_f$Reg)
180 cumTotal <- 0
181 for (i in seq(length=lim/step +1, from=0, by=step)) {
182 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
183 cumTotal[count] <- length(t)
184 x[count] <- i
185 count <- count + 1
186 }
187 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
188 gradi <- 1000
189 if (lim>10000) {
190 gradi <- 10000
191 }
192 if (lim<3000) {
193 gradi <- 500
194 }
195 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
196 axisxlab <- paste(axisx/1000, sep = "")
197 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
198 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
199 minor.tick(nx=5,tick.ratio=0.5)
200 countTotalCont <- length(dist_control_f$Reg)
201 cumTotalCont <- 0
202 count <- 1
203 for (i in seq(length=lim/step +1, from=0, by=step)) {
204 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
205 cumTotalCont[count] <- length(t)
206 x[count] <- i
207 count <- count + 1
208 }
209 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
210 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
211 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
212 }
213 } else {
214 countTotal <- length(dist_real_f$Reg)
215 cumTotal <- 0
216 count <-1
217
218 gradi <- 1000
219 if (lim>10000) {
220 gradi <- 10000
221 }
222 if (lim<3000) {
223 gradi <- 500
224 }
225
226 for (i in seq(length=lim/step +1, from=0, by=step)) {
227 t <- which ((dist_real_f$Dist<=i)&(dist_real_f$Dist>=-i))
228 cumTotal[count] <- length(t)
229 x[count] <- i
230 count <- count + 1
231 }
232 par(mar=c(5.1, 7.1, 4.1, 2.1))
233
234 plot (x,cumTotal/countTotal ,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(0, lim),xaxt="n" )
235 axisx <- seq(length=lim/gradi+1, from=0, by=gradi)
236 axisxlab <- paste(axisx/1000, sep = "")
237 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
238 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (cumulative)")
239 ymax <- max(cumTotal/countTotal, na.rm=TRUE)
240 if (ifControl) {
241 countTotalCont <- length(dist_control_f$Reg)
242 cumTotalCont <- 0
243 count <- 1
244 for (i in seq(length=lim/step +1, from=0, by=step)) {
245 t <- which ((dist_control_f$Dist<=i)&(dist_control_f$Dist>=-i))
246 cumTotalCont[count] <- length(t)
247 x[count] <- i
248 count <- count + 1
249 }
250 lines (x,cumTotalCont/countTotalCont ,col = colors()[328],type="l", lwd = 2)
251 legend(lim*0.45, ymax*0.45, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
252 } else {
253 legend(lim*0.45, ymax*0.45, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
254 }
255 }
256
257 sink(args[6], append=FALSE, split=FALSE)
258 if (ifReg) {
259 if (ifControl) {
260 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
261 cat("\t")
262 for (i in c(1:n.types)) {
263 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", sep=""))
264 cat("\t")
265 }
266
267 for (i in c(1:n.types)) {
268 cat(paste("% ", myLevels[i]," genes w/ a peak in Control", sep=""))
269 cat("\t")
270 }
271 cat("\n")
272 for (i in c(1:length(x))) {
273 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
274 cat("\t")
275 for (t in c(1:n.types)) {
276 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
277 }
278 for (t in c(1:n.types)) {
279 cat(paste(cumCont[i,t]/countLCont[t],"\t", sep=""))
280 }
281 cat("\n")
282 }
283 }else {
284 cat (paste("Dist_TSS","\t",sep=''))
285 for (i in c(1:n.types)) {
286 cat(paste("% ", myLevels[i]," genes w/ a peak in ChIP", "\t", sep=""))
287 }
288 cat("\n")
289 for (i in c(1:length(x))) {
290 cat(paste(x[i],"\t",sep=""))
291 for (t in c(1:n.types)) {
292 cat(paste(cum[i,t]/countL[t],"\t", sep=""))
293 }
294 cat("\n")
295 }
296 }
297 } else {
298 if (ifControl) {
299 cat (paste("Dist_TSS","% genes w/ a peak in ChIP","% genes w/ a peak in control",sep='\t'))
300 cat("\n")
301 for (i in c(1:length(x))) {
302 cat(paste(x[i],cumTotal[i]/countTotal,cumTotalCont[i]/countTotalCont,sep="\t"))
303 cat("\n")
304 }
305 }else {
306 cat (paste("Dist_TSS","% genes w/ a peak in ChIP",sep='\t'))
307 cat("\n")
308 for (i in c(1:length(x))) {
309 cat(paste(x[i],cumTotal[i]/countTotal,sep="\t"))
310 cat("\n")
311 }
312
313 }
314 }
315
316
317 #stop sinking:
318 sink()
319
320 #around TSS:
321 lim <- maxValue
322 step <- myStep
323 my_breaks <- seq(length=lim/step*2 +1, from=-lim, by=step)
324 chip.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
325 dist_real_f <- chip.genes
326 if (ifControl) {
327 control.genes <- read.table(file=paste(args[4],".genes", sep=""), header=TRUE) ;
328 dist_control_f<-data.frame(control.genes)
329 }
330 if (ifReg) {
331 #n.types <- length(levels(chip.genes.ClosestPeakDist$Reg))
332 #myLevels <- levels(dist_real_f$Reg)
333
334 t<- which (dist_real_f$Reg==myLevels[1])
335 values_real <-dist_real_f$Dist[t]
336 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
337 ymax <- max(hTSSreal$density, na.rm=TRUE)
338 for (i in c(2:n.types)) {
339 t<- which (dist_real_f$Reg==myLevels[i])
340 values_real <-dist_real_f$Dist[t]
341 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
342 ymax <- max(ymax,max(hTSSreal$density, na.rm=TRUE))
343 }
344
345
346 t<- which (dist_real_f$Reg==myLevels[1])
347 values_real <-dist_real_f$Dist[t]
348 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
349 plot (hTSSreal$mids,hTSSreal$density,col = myLocCol[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax), xaxt="n" )
350
351 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
352
353 for (i in c(2:n.types)) {
354 t<- which (dist_real_f$Reg==myLevels[i])
355 values_real <-dist_real_f$Dist[t]
356 hTSSreal = hist(values_real,plot=FALSE,breaks = c(min(values_real),my_breaks,max(values_real)) )
357 lines (hTSSreal$mids,hTSSreal$density,col = myLocCol[i],type="l", lwd = 2)
358 }
359 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocCol, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
360
361 gradi <- 1000
362 if (lim>10000) {
363 gradi <- 10000
364 }
365 if (lim<3000) {
366 gradi <- 500
367 }
368
369 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
370 axisxlab <- paste(axisx/1000, sep = "")
371 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
372
373
374 #minor.tick(nx=10,tick.ratio=0.5)
375 if (ifControl) {
376 t<- which (dist_control_f$Reg==myLevels[1])
377 values_control <-dist_control_f$Dist[t]
378 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
379 ymax <- max(hTSScontrol$density, na.rm=TRUE)
380 for (i in c(2:n.types)) {
381 t<- which (dist_control_f$Reg==myLevels[i])
382 values_control <-dist_control_f$Dist[t]
383 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
384 ymax <- max(ymax,max(hTSScontrol$density, na.rm=TRUE))
385 }
386 t<- which (dist_control_f$Reg==myLevels[1])
387 values_control <-dist_control_f$Dist[t]
388 hTSScontrol= hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
389 plot (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),ylim = c(0, ymax),xaxt="n" )
390 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
391 for (i in c(2:n.types)) {
392 t<- which (dist_control_f$Reg==myLevels[i])
393 values_control <-dist_control_f$Dist[t]
394 hTSScontrol = hist(values_control,plot=FALSE,breaks = c(min(values_control),my_breaks,max(values_control)) )
395 lines (hTSScontrol$mids,hTSScontrol$density,col = myLocColCntr[i],type="l", lwd = 2)
396 }
397
398 gradi <- 1000
399 if (lim>10000) {
400 gradi <- 10000
401 }
402 if (lim<3000) {
403 gradi <- 500
404 }
405
406 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
407 axisxlab <- paste(axisx/1000, sep = "")
408 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
409
410 legend(lim*0.1, ymax*0.9, myLevels, cex=1 , lwd = 2, bty = "n", col = myLocColCntr, lty = c(1), pt.bg= c(myLocCol) , merge = TRUE)
411
412 # minor.tick(nx=10,tick.ratio=0.5)
413 #control vs real
414 values_real <-dist_real_f$Dist
415 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
416 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
417 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
418 ymax <- max(hTSSreal$density, na.rm=TRUE)
419 values_control <-dist_control_f$Dist
420 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
421 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
422 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
423
424 gradi <- 1000
425 if (lim>10000) {
426 gradi <- 10000
427 }
428 if (lim<3000) {
429 gradi <- 500
430 }
431
432 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
433 axisxlab <- paste(axisx/1000, sep = "")
434 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
435
436
437
438 # minor.tick(nx=10,tick.ratio=0.5)
439 }
440 } else {
441 values_real <-dist_real_f$Dist
442 hTSSreal = hist(values_real, plot=FALSE, breaks = c(min(values_real),my_breaks,max(values_real)) )
443 plot (hTSSreal$mids,hTSSreal$density,col = myColor[1],type="l", main="",xlab="",ylab="", lwd = 2, xlim = c(-lim, lim),xaxt="n")
444 title( main="",xlab="Distance from TSS (Kb)",ylab="Proportion of genes with a peak\nat a given distance (density)")
445 ymax <- max(hTSSreal$density, na.rm=TRUE)
446 if (ifControl) {
447 values_control <-dist_control_f$Dist
448 hTSScontrol = hist(values_control, plot=FALSE, breaks = c(min(values_control),my_breaks,max(values_control)) )
449 lines (hTSScontrol$mids,hTSScontrol$density,col = colors()[328],type="l", lwd = 2)
450 legend(lim*0.2, ymax*0.9, c("ChIP","Control"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1], colors()[328]), lty = c(1), pt.bg= c(myColor[1], colors()[328]) , merge = TRUE)
451 } else {
452 legend(lim*0.2, ymax*0.9, c("ChIP"), cex=1 , lwd = 2, bty = "n", col = c(myColor[1]), lty = c(1), pt.bg= c(myColor[1]) , merge = TRUE)
453 }
454
455 gradi <- 1000
456 if (lim>10000) {
457 gradi <- 10000
458 }
459 if (lim<3000) {
460 gradi <- 500
461 }
462
463 axisx <- seq(length=2*lim/gradi+1, from=-lim, by=gradi)
464 axisxlab <- paste(axisx/1000, sep = "")
465 axis(1, at=axisx,labels=axisxlab , las=1, cex.axis=1)
466
467
468 # minor.tick(nx=10,tick.ratio=0.5)
469 }
470 suppressMessages(dev.off())
471 q();
472 cat (paste("peak height","# peaks in ChIP","# peaks in Control","#control/chip","\n",sep='\t'))
473 for (xval in c(minValue:maxValue)) {
474 for (i in (1:length(chipHist$mids))) {
475 if (xval==chipHist$mids[i]) {
476 ychip <- chipHist$counts[i]
477 }
478 }
479 for (i in (1:length(controlHist$mids))) {
480 if (xval==controlHist$mids[i]) {
481 ycontrol <- controlHist$counts[i]
482 }
483 }
484 cat (paste(xval,ychip,ycontrol,ycontrol/ychip,"\n",sep='\t'))
485 }