comparison offtarget_v7.R @ 0:b73bf7e5bb41 draft default tip

Uploaded
author mkim8
date Fri, 14 Mar 2014 14:31:15 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b73bf7e5bb41
1 library(methods)
2 library(lattice)
3 suppressMessages(library(Matrix))
4 suppressMessages(library(ClassComparison))
5 suppressMessages(library(glmnet))
6 options(warn=-1)
7
8 #######################
9 ### arguments input ###
10 #######################
11
12 arg=commandArgs(TRUE)
13
14 InputFile=arg[1] # must be given by user including both
15 # response variable and siRNA sequences
16
17 Seed=as.integer(arg[2]) # Name: "Seed"
18 # specify the seed region to be used in analysis
19 # an interger out of 1~14.
20 # default is 2, which means 2~7 hexamer is used
21
22 Strand=arg[3] # Name:"Strand Orientation for Analysis"
23 # specify which strand is used in analysis
24 # must be given by user, options are "sense",
25 #"antisense" or "both"
26
27 Lambda=as.numeric(arg[4]) # Name:"Lambda Value"
28 # penalty parameter in LASSO regression
29 # default = 0.001, which can be changed by user
30 # in order to perform differennt LASSO estimation
31
32 Library=arg[5] # Name:"siRNA Library to use"
33 # user has to specify which siRNA library will be
34 # used in analysis can be "Custom", "Ambion", "New Dharmacon"
35 # or "Old Dharmacon". Default is "Custom"
36
37 ### new parameter ###
38
39 Str=as.numeric(arg[6]) # Name:"Strength of seed-linked effect“
40 # specify cutoff for strenth of seed-linked effet, must be positive
41 # default is 1
42
43 Sig=as.numeric(arg[7]) # Name:"Significance (P value)"
44 # specify cutoff for significance (p value)
45 # default is 0.01
46
47 #################
48 ### functions ###
49 #################
50
51 # uppercase a sequence and transform T to U ###
52
53 TtoU=function(x){
54 x=as.character(x)
55 x=toupper(x)
56 x=chartr("T", "U", x)
57 return(x)
58 }
59
60 # reverse a sequence #
61
62 StrReverse <- function(x){
63 x=as.character(x)
64 x=sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
65 return(x)
66 }
67
68 # generate complementary sequence #
69
70 DuplexReverse=function(x)
71 {
72 x=as.character(x)
73 x=gsub("U","T",x)
74 x=gsub("A","U",x)
75 x=gsub("T","A",x)
76 x=gsub("G","g",x)
77 x=gsub("C","G",x)
78 x=gsub("g","C",x)
79 x=StrReverse(x)
80 return(x)
81 }
82
83 # check if input contains AUGC only #
84
85 SeqCheck=function(x){
86 x=as.character(x)
87 x=strsplit(x, "")
88 x=unlist(x)
89 if(sum(x %in% c("A", "U", "G", "C")) == length(x) ){
90 return(TRUE)
91 }
92 else{
93 return(FALSE)
94 }
95 }
96
97 # extract seed sequence from siRNA #
98
99 SeedExtract=function(x, start=2, end=7){
100 x=as.character(x)
101 x=substr(x, start, end)
102 return(x)
103 }
104
105 # examine if a pool contain a seed #
106
107 SeedExist=function(dfr, s){
108 dfr=data.frame(dfr)
109 s=as.character(s)
110 return(as.numeric(apply(dfr == s, 1, sum)))
111 }
112
113 # seed analysis #
114
115 SeedAnalysis=function(Z, data, start=2, end=7){
116
117 print("Generating seed.dfr ... ... ")
118 data=data.frame(data)
119 seed.dfr=data
120 for(i in 1:ncol(seed.dfr)){
121 seed.dfr[, i]=as.character(SeedExtract(seed.dfr[, i], start=start, end=end))
122 }
123 colnames(seed.dfr)=paste("seed", 1:ncol(seed.dfr), sep=".")
124 print("Generating seed.table ... ... ")
125 seed.table=data.frame(table(unlist(seed.dfr)))
126 names(seed.table)=c("seed", "family.size")
127
128 seed.table$ks.p.value=rep(NA, length(seed.table$seed))
129 y=Z
130 for(i in 1:length(seed.table$seed)){
131 s=SeedExist(seed.dfr, seed.table$seed[i])
132 seed.table$ks.p.value[i]=
133 ks.test(y[s > 0], y[s== 0])$p.value
134 print(paste("seed ", i))
135 }
136
137 return(list(Z=Z, data=data, seed.table=seed.table, seed.dfr=seed.dfr))
138 }
139
140 # offtarget analysis #
141
142 OfftargetAnalysis=function(object, s=0.001){
143
144 seed.dfr=object$seed.dfr
145 seed.table=object$seed.table
146 y=object$Z
147
148 print("Generating Matrix X ... ... ")
149 s.sub=as.character(seed.table$seed)
150 x=matrix(0, length(y), length(s.sub))
151 colnames(x)=as.character(s.sub)
152
153 for(i in 1:length(s.sub)){
154 n=as.character(s.sub[i])
155 x[, n]=SeedExist(seed.dfr, n)
156 print(paste("Matrix X seed ", i))
157 }
158
159 print("Fitting ... ... ")
160 gc()
161
162 fit=glmnet(x, y)
163
164 print("offtarget analysis")
165 offtarget=data.frame(as.matrix(coef(fit, s=s)))
166 offtarget$seed=rownames(offtarget)
167 offtarget=offtarget[, c(2, 1)]
168 names(offtarget)[2]="coef"
169 rownames(offtarget)=NULL
170
171 offtarget=merge(offtarget, seed.table, by="seed", all.x=T)
172 y.fit=as.numeric(predict(fit, x, s=s))
173 y.corrected=y-y.fit
174 print("Off-target analysis Done!")
175
176 return(list(fit=fit, X=x, offtarget=offtarget, Z.corrected=y.corrected))
177
178
179 }
180
181 # generating data frame of siRNA sequence #
182
183 SeqGenerate=function(SeqDat){
184 SenseSeq=data.frame(apply(SeqDat, 2, TtoU))
185 AntiSeq=data.frame(apply(SenseSeq, 2, DuplexReverse))
186 for(i in 1:ncol(SenseSeq)){
187 AntiSeq[, i]=DuplexReverse(SenseSeq[, i])
188 }
189 BothSeq=cbind(SenseSeq, AntiSeq)
190 names(BothSeq)=paste("duplex", 1:ncol(BothSeq), sep=".")
191 output=list(SenseSeq=SenseSeq, AntiSeq=AntiSeq, BothSeq=BothSeq)
192 return(output)
193 }
194
195 # DataAnalysis #
196
197 DataAnalysis=function(Z, Seq, SeedStart, Strand, Lambda){
198
199 SeedEnd=SeedStart+5
200
201 analysis=list()
202
203 if(Strand == 'sense'){
204 Seq=Seq$SenseSeq
205 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
206 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
207 }
208 else if(Strand == "antisense"){
209 Seq=Seq$AntiSeq
210 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
211 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
212 }
213 else if(Strand == 'both'){
214 Seq=Seq$BothSeq
215 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
216 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
217 }
218 return(analysis)
219 }
220
221 # new functions #
222
223 # FDR calculation #
224
225 fdr.bum=function(p.value, Sig){
226 p.sum=attributes(summary(Bum(p.value), tau=Sig))$estimates
227 f=p.sum[4]/(p.sum[2]+p.sum[4])
228 f=as.numeric(f)
229 return(f)
230 }
231
232 # load miRNA database #
233
234 load("/home/galaxy/galaxy-dist/tools/offtarget/miRNA.RData")
235
236 ##################
237 ### data input ###
238 ##################
239
240 # data input #
241
242 SeedStart=Seed
243 inputfile=read.csv(InputFile, header=F)
244 id=inputfile[, 1]
245 Z=as.numeric(inputfile[, 2])
246
247 # Z score checking #
248
249 if(sum(is.na(Z)) > 0){
250 print("Missing Value in Experimental Readouts!")
251 quit()
252 }
253
254 if(sum(Z == "") > 0){
255 print("Missing Value in Experimental Readouts!")
256 quit()
257 }
258
259 #####################
260 ### data analysis ###
261 #####################
262
263 ### Custom ###
264
265 if(Library == "custom"){
266
267 ### check if user selected wrong "Library" parameter ###
268 if(ncol(inputfile) < 3){
269 print("siRNA sequences not available and please check your Library selection")
270 quit()
271 }
272 ##################################################################
273
274 SeqDat=inputfile[, 3:ncol(inputfile)]
275 SeqDat=data.frame(SeqDat)
276
277 if(sum(is.na(unlist(SeqDat))) > 0){
278 print("Missing Value in siRNA sequences!")
279 quit()
280 }
281
282 if(sum(unlist(SeqDat) == "") > 0){
283 print("Missing Value in siRNA sequences!")
284 quit()
285 }
286
287 Seq=SeqGenerate(SeqDat)
288
289 SenseSeq=Seq$SenseSeq
290 Check=data.frame(apply(SenseSeq, 1, SeqCheck))
291
292 if(sum(Check == FALSE) > 0){
293 print("Non-Sequence Value in siRNA sequences!")
294 quit()
295 }
296 else{
297 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
298 }
299 }
300
301 ### Ambion ###
302
303 if(Library == "ambion"){
304
305 id=as.integer(id)
306
307 if(sum(is.na(id)) > 0){
308 print("Missing Value in Gene ID!")
309 quit()
310 }
311
312 if(sum(id == "") > 0){
313 print("Missing Value in Gene ID!")
314 quit()
315 }
316
317 load("/home/galaxy/galaxy-dist/tools/offtarget/Ambion.RData")
318 id.ambion=unique(ambion$id)
319
320 if(length(id[!(id %in% id.ambion)]) > 0){
321 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
322 id[!(id %in% id.ambion)]), "unmatched.id.csv")
323 }
324
325 Dat=data.frame(id=id, z=Z)
326 Dat=merge(Dat, ambion, by="id", sort=F)
327 Z=Dat$z
328 id=Dat$id
329 SeqDat=Dat[, 6:8]
330 Seq=SeqGenerate(SeqDat)
331 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
332
333 }
334
335 ### New Dhar ###
336
337 if(Library == "new_dharmacon"){
338
339 id=as.integer(id)
340
341 if(sum(is.na(id)) > 0){
342 print("Missing Value in Gene ID!")
343 quit()
344 }
345
346 if(sum(id == "") > 0){
347 print("Missing Value in Gene ID!")
348 quit()
349 }
350
351 load("/home/galaxy/galaxy-dist/tools/offtarget/New_Dhar.RData")
352 id.new.dhar=unique(new.dhar$id)
353
354 if(length(id[!(id %in% id.new.dhar)]) > 0){
355 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
356 id[!(id %in% id.new.dhar)]), "unmatched.id.csv")
357 }
358
359 Dat=data.frame(id=id, z=Z)
360 Dat=merge(Dat, new.dhar, by="id", sort=F)
361 Z=Dat$z
362 id=Dat$id
363 SeqDat=Dat[, 7:10]
364 Seq=SeqGenerate(SeqDat)
365 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
366
367 }
368
369 ### Old Dhar ###
370
371 if(Library == "old_dharmacon"){
372
373 id=as.integer(id)
374
375 if(sum(is.na(id)) > 0){
376 print("Missing Value in Gene ID!")
377 quit()
378 }
379
380 if(sum(id == "") > 0){
381 print("Missing Value in Gene ID!")
382 quit()
383 }
384
385 load("/home/galaxy/galaxy-dist/tools/offtarget/Old_Dhar.RData")
386 id.old.dhar=unique(old.dhar$id)
387
388 if(length(id[!(id %in% id.old.dhar)]) > 0){
389 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
390 id[!(id %in% id.old.dhar)]), "unmatched.id.csv")
391 }
392
393 Dat=data.frame(id=id, z=Z)
394 Dat=merge(Dat, old.dhar, by="id", sort=F)
395 Z=Dat$z
396 id=Dat$id
397 SeqDat=Dat[, 7:10]
398 Seq=SeqGenerate(SeqDat)
399 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
400
401 }
402
403 ######################
404 ### results output ###
405 ######################
406
407 # code chagned below #
408
409 # generating output files #
410
411 duplex.dfr=analysis$seed.analysis$data
412 names(duplex.dfr)=paste("duplex", 1:length(names(duplex.dfr)), sep=".")
413 seed.dfr=analysis$seed.analysis$seed.dfr
414 offtarget=analysis$offtarget.analysis$offtarget
415 Z.corrected=analysis$offtarget.analysis$Z.corrected
416 output.corr=cbind(ID=id, Z.Score=Z, Corrected.Z.Score=Z.corrected,
417 duplex.dfr, seed.dfr)
418
419
420 print("outputing results... ...")
421
422 if(Library == 'custom'){
423
424 print("outputing corrected Z score... ...")
425 write.csv(output.corr, "CorrectedZScore.csv", row.names=F) ### export corrected z score
426
427 print("outputing summary... ...")
428 write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table
429
430 print("outputing offtarget seed families... ...")
431
432 off.target=offtarget$seed[offtarget$ks.p.value < Sig & (
433 offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup
434
435 write.csv(offtarget[offtarget$seed %in% off.target, ],
436 "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families
437
438 print("outputing miRNAs... ...")
439 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){
440 write.csv("No miRNA detected", "miRNA.csv", row.names=F)
441 fe="No"
442 } ### export miRNAs table sharing common seed
443
444 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){
445 write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F)
446 fe="Yes"
447 } ### export miRNAs talbe sharing common seed
448
449 if(length(off.target) > 0){
450 for(i in 1:length(off.target)){
451
452 n=off.target[i]
453 si=SeedExist(seed.dfr, n)
454 si.offtarget=output.corr[si == 1, -3]
455 si.offtarget$off.target.seed=n
456 si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ]
457 print(paste(n, "siRNA pools ... ..."))
458 write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F)
459
460 }
461 }
462 }else{
463
464 print("outputing corrected Z score... ...")
465 write.csv(output.corr[, 1:3], "CorrectedZScore.csv", row.names=F)
466
467 print("outputing summary... ...")
468 write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table
469
470 print("outputing offtarget seed families... ...")
471
472 off.target=offtarget$seed[offtarget$ks.p.value < Sig & (
473 offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup
474
475 write.csv(offtarget[offtarget$seed %in% off.target, ],
476 "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families
477
478 print("outputing miRNAs... ...")
479 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){
480 write.csv("No miRNA detected", "miRNA.csv", row.names=F)
481 fe="No"
482 } ### export miRNAs table sharing common seed
483
484 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){
485 write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F)
486 fe="Yes"
487 } ### export miRNAs talbe sharing common seed
488
489 if(length(off.target) > 0){
490 for(i in 1:length(off.target)){
491
492 n=off.target[i]
493 si=SeedExist(seed.dfr, n)
494 si.offtarget=output.corr[si == 1, -3]
495 si.offtarget$off.target.seed=n
496 si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ]
497 print(paste(n, "siRNA pools ... ..."))
498 write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F)
499
500 }
501 }
502 }
503
504
505 # generating output figures #
506
507 print("outputing figures... ...")
508
509 jpeg(file="SeedFamilyOffTarget.jpeg", height=600, width=800)
510 par(mar=c(5, 5, 2, 2))
511 plot(offtarget$coef, -log10(offtarget$ks.p.value),
512 pch=20, col="blue", yaxt="n", cex.axis=1.5, cex.lab=2,
513 xlab="Strength of seed-linked effect", ylab="Significance (P value)")
514 p.log=ceiling(-log10(min(offtarget$ks.p.value, na.rm=T))/2)
515 p.range=10^(0:-p.log*2)
516 axis(2, p.range,
517 at=-log(p.range, 10),
518 cex.axis=1.5)
519 off.target=offtarget$seed[offtarget$coef < -Str & offtarget$ks.p.value < Sig]
520 nn=length(off.target)
521 points(offtarget$coef[offtarget$seed %in% off.target],
522 -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]),
523 col="red", pch=20)
524 off.target=offtarget$seed[offtarget$coef > Str & offtarget$ks.p.value < Sig]
525 np=length(off.target)
526 points(offtarget$coef[offtarget$seed %in% off.target],
527 -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]),
528 col="green", pch=20)
529 abline(v=-Str, col="red", lwd=3)
530 abline(v=Str, col="green", lwd=3)
531 abline(h=-log10(Sig), col="blue", lwd=3)
532 dev.off()
533
534 # generating legend #
535
536 jpeg(file="SeedFamilyOffTargetLegend.jpeg", width=500, height=100)
537 par(mar=c(0, 0, 0, 0))
538 plot(c(0, 2), c(0.8, 1.2), type="n", bty="n", yaxt="n", xaxt='n')
539 points(c(0, 0), c(1.1, 0.9), col=c('red', "green"), pch=20, cex=2)
540 text(1, 1.1, paste('N: ', nn, " (Strength < ", -Str, ", P value < ", Sig, ")", sep=""), cex=2)
541 text(1, 0.9, paste('N: ', np, " (Strength > ", Str, ", P value < ", Sig, ")", sep=""), cex=2)
542 dev.off()
543
544 # new output #
545
546 # generating summary #
547
548 print("Ansysis Summary ... ...")
549
550 print("p value ... ...")
551 p.value=offtarget$ks.p.value[offtarget$coef < -Str | offtarget$coef > Str]
552
553 print("FDR ... ...")
554 f=fdr.bum(p.value, Sig)
555 print("rounding ... ...")
556 f=round(f, digits=3)
557 print(f)
558 print("FDR done ... ...")
559
560
561 print("off.sum ... ...")
562 off.sum=matrix(rep("", 30), 15, 2)
563 off.sum[1,1]="Analysis is successfully done!"
564 off.sum[3,1]="Parameters set up"
565 off.sum[4:15, 1]=c("Strand Orientation for Analysis:", "Lambda value:", "Seed:",
566 "siRNA Library:", "Strength of seed-linked effect:", "P value:", "",
567 "Analysis summary",
568 "Number of negative off-target seed families:",
569 "Number of positive off-target seed families:", "FDR:",
570 "Identify miRNA with phenotypical effects:")
571 off.sum[4:15, 2]=c(Strand, Lambda, Seed, Library, Str, Sig, "", "", nn, np, f, fe)
572 write.table(off.sum, "Analysis_Summary.csv", row.names=F, col.names=F, sep=",")
573
574 print("Analysis done ... ...")
575