Mercurial > repos > mkim8 > mintest3
comparison offtarget_v7.R @ 0:b73bf7e5bb41 draft default tip
Uploaded
author | mkim8 |
---|---|
date | Fri, 14 Mar 2014 14:31:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b73bf7e5bb41 |
---|---|
1 library(methods) | |
2 library(lattice) | |
3 suppressMessages(library(Matrix)) | |
4 suppressMessages(library(ClassComparison)) | |
5 suppressMessages(library(glmnet)) | |
6 options(warn=-1) | |
7 | |
8 ####################### | |
9 ### arguments input ### | |
10 ####################### | |
11 | |
12 arg=commandArgs(TRUE) | |
13 | |
14 InputFile=arg[1] # must be given by user including both | |
15 # response variable and siRNA sequences | |
16 | |
17 Seed=as.integer(arg[2]) # Name: "Seed" | |
18 # specify the seed region to be used in analysis | |
19 # an interger out of 1~14. | |
20 # default is 2, which means 2~7 hexamer is used | |
21 | |
22 Strand=arg[3] # Name:"Strand Orientation for Analysis" | |
23 # specify which strand is used in analysis | |
24 # must be given by user, options are "sense", | |
25 #"antisense" or "both" | |
26 | |
27 Lambda=as.numeric(arg[4]) # Name:"Lambda Value" | |
28 # penalty parameter in LASSO regression | |
29 # default = 0.001, which can be changed by user | |
30 # in order to perform differennt LASSO estimation | |
31 | |
32 Library=arg[5] # Name:"siRNA Library to use" | |
33 # user has to specify which siRNA library will be | |
34 # used in analysis can be "Custom", "Ambion", "New Dharmacon" | |
35 # or "Old Dharmacon". Default is "Custom" | |
36 | |
37 ### new parameter ### | |
38 | |
39 Str=as.numeric(arg[6]) # Name:"Strength of seed-linked effect“ | |
40 # specify cutoff for strenth of seed-linked effet, must be positive | |
41 # default is 1 | |
42 | |
43 Sig=as.numeric(arg[7]) # Name:"Significance (P value)" | |
44 # specify cutoff for significance (p value) | |
45 # default is 0.01 | |
46 | |
47 ################# | |
48 ### functions ### | |
49 ################# | |
50 | |
51 # uppercase a sequence and transform T to U ### | |
52 | |
53 TtoU=function(x){ | |
54 x=as.character(x) | |
55 x=toupper(x) | |
56 x=chartr("T", "U", x) | |
57 return(x) | |
58 } | |
59 | |
60 # reverse a sequence # | |
61 | |
62 StrReverse <- function(x){ | |
63 x=as.character(x) | |
64 x=sapply(lapply(strsplit(x, NULL), rev), paste, collapse="") | |
65 return(x) | |
66 } | |
67 | |
68 # generate complementary sequence # | |
69 | |
70 DuplexReverse=function(x) | |
71 { | |
72 x=as.character(x) | |
73 x=gsub("U","T",x) | |
74 x=gsub("A","U",x) | |
75 x=gsub("T","A",x) | |
76 x=gsub("G","g",x) | |
77 x=gsub("C","G",x) | |
78 x=gsub("g","C",x) | |
79 x=StrReverse(x) | |
80 return(x) | |
81 } | |
82 | |
83 # check if input contains AUGC only # | |
84 | |
85 SeqCheck=function(x){ | |
86 x=as.character(x) | |
87 x=strsplit(x, "") | |
88 x=unlist(x) | |
89 if(sum(x %in% c("A", "U", "G", "C")) == length(x) ){ | |
90 return(TRUE) | |
91 } | |
92 else{ | |
93 return(FALSE) | |
94 } | |
95 } | |
96 | |
97 # extract seed sequence from siRNA # | |
98 | |
99 SeedExtract=function(x, start=2, end=7){ | |
100 x=as.character(x) | |
101 x=substr(x, start, end) | |
102 return(x) | |
103 } | |
104 | |
105 # examine if a pool contain a seed # | |
106 | |
107 SeedExist=function(dfr, s){ | |
108 dfr=data.frame(dfr) | |
109 s=as.character(s) | |
110 return(as.numeric(apply(dfr == s, 1, sum))) | |
111 } | |
112 | |
113 # seed analysis # | |
114 | |
115 SeedAnalysis=function(Z, data, start=2, end=7){ | |
116 | |
117 print("Generating seed.dfr ... ... ") | |
118 data=data.frame(data) | |
119 seed.dfr=data | |
120 for(i in 1:ncol(seed.dfr)){ | |
121 seed.dfr[, i]=as.character(SeedExtract(seed.dfr[, i], start=start, end=end)) | |
122 } | |
123 colnames(seed.dfr)=paste("seed", 1:ncol(seed.dfr), sep=".") | |
124 print("Generating seed.table ... ... ") | |
125 seed.table=data.frame(table(unlist(seed.dfr))) | |
126 names(seed.table)=c("seed", "family.size") | |
127 | |
128 seed.table$ks.p.value=rep(NA, length(seed.table$seed)) | |
129 y=Z | |
130 for(i in 1:length(seed.table$seed)){ | |
131 s=SeedExist(seed.dfr, seed.table$seed[i]) | |
132 seed.table$ks.p.value[i]= | |
133 ks.test(y[s > 0], y[s== 0])$p.value | |
134 print(paste("seed ", i)) | |
135 } | |
136 | |
137 return(list(Z=Z, data=data, seed.table=seed.table, seed.dfr=seed.dfr)) | |
138 } | |
139 | |
140 # offtarget analysis # | |
141 | |
142 OfftargetAnalysis=function(object, s=0.001){ | |
143 | |
144 seed.dfr=object$seed.dfr | |
145 seed.table=object$seed.table | |
146 y=object$Z | |
147 | |
148 print("Generating Matrix X ... ... ") | |
149 s.sub=as.character(seed.table$seed) | |
150 x=matrix(0, length(y), length(s.sub)) | |
151 colnames(x)=as.character(s.sub) | |
152 | |
153 for(i in 1:length(s.sub)){ | |
154 n=as.character(s.sub[i]) | |
155 x[, n]=SeedExist(seed.dfr, n) | |
156 print(paste("Matrix X seed ", i)) | |
157 } | |
158 | |
159 print("Fitting ... ... ") | |
160 gc() | |
161 | |
162 fit=glmnet(x, y) | |
163 | |
164 print("offtarget analysis") | |
165 offtarget=data.frame(as.matrix(coef(fit, s=s))) | |
166 offtarget$seed=rownames(offtarget) | |
167 offtarget=offtarget[, c(2, 1)] | |
168 names(offtarget)[2]="coef" | |
169 rownames(offtarget)=NULL | |
170 | |
171 offtarget=merge(offtarget, seed.table, by="seed", all.x=T) | |
172 y.fit=as.numeric(predict(fit, x, s=s)) | |
173 y.corrected=y-y.fit | |
174 print("Off-target analysis Done!") | |
175 | |
176 return(list(fit=fit, X=x, offtarget=offtarget, Z.corrected=y.corrected)) | |
177 | |
178 | |
179 } | |
180 | |
181 # generating data frame of siRNA sequence # | |
182 | |
183 SeqGenerate=function(SeqDat){ | |
184 SenseSeq=data.frame(apply(SeqDat, 2, TtoU)) | |
185 AntiSeq=data.frame(apply(SenseSeq, 2, DuplexReverse)) | |
186 for(i in 1:ncol(SenseSeq)){ | |
187 AntiSeq[, i]=DuplexReverse(SenseSeq[, i]) | |
188 } | |
189 BothSeq=cbind(SenseSeq, AntiSeq) | |
190 names(BothSeq)=paste("duplex", 1:ncol(BothSeq), sep=".") | |
191 output=list(SenseSeq=SenseSeq, AntiSeq=AntiSeq, BothSeq=BothSeq) | |
192 return(output) | |
193 } | |
194 | |
195 # DataAnalysis # | |
196 | |
197 DataAnalysis=function(Z, Seq, SeedStart, Strand, Lambda){ | |
198 | |
199 SeedEnd=SeedStart+5 | |
200 | |
201 analysis=list() | |
202 | |
203 if(Strand == 'sense'){ | |
204 Seq=Seq$SenseSeq | |
205 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) | |
206 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) | |
207 } | |
208 else if(Strand == "antisense"){ | |
209 Seq=Seq$AntiSeq | |
210 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) | |
211 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) | |
212 } | |
213 else if(Strand == 'both'){ | |
214 Seq=Seq$BothSeq | |
215 analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) | |
216 analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) | |
217 } | |
218 return(analysis) | |
219 } | |
220 | |
221 # new functions # | |
222 | |
223 # FDR calculation # | |
224 | |
225 fdr.bum=function(p.value, Sig){ | |
226 p.sum=attributes(summary(Bum(p.value), tau=Sig))$estimates | |
227 f=p.sum[4]/(p.sum[2]+p.sum[4]) | |
228 f=as.numeric(f) | |
229 return(f) | |
230 } | |
231 | |
232 # load miRNA database # | |
233 | |
234 load("/home/galaxy/galaxy-dist/tools/offtarget/miRNA.RData") | |
235 | |
236 ################## | |
237 ### data input ### | |
238 ################## | |
239 | |
240 # data input # | |
241 | |
242 SeedStart=Seed | |
243 inputfile=read.csv(InputFile, header=F) | |
244 id=inputfile[, 1] | |
245 Z=as.numeric(inputfile[, 2]) | |
246 | |
247 # Z score checking # | |
248 | |
249 if(sum(is.na(Z)) > 0){ | |
250 print("Missing Value in Experimental Readouts!") | |
251 quit() | |
252 } | |
253 | |
254 if(sum(Z == "") > 0){ | |
255 print("Missing Value in Experimental Readouts!") | |
256 quit() | |
257 } | |
258 | |
259 ##################### | |
260 ### data analysis ### | |
261 ##################### | |
262 | |
263 ### Custom ### | |
264 | |
265 if(Library == "custom"){ | |
266 | |
267 ### check if user selected wrong "Library" parameter ### | |
268 if(ncol(inputfile) < 3){ | |
269 print("siRNA sequences not available and please check your Library selection") | |
270 quit() | |
271 } | |
272 ################################################################## | |
273 | |
274 SeqDat=inputfile[, 3:ncol(inputfile)] | |
275 SeqDat=data.frame(SeqDat) | |
276 | |
277 if(sum(is.na(unlist(SeqDat))) > 0){ | |
278 print("Missing Value in siRNA sequences!") | |
279 quit() | |
280 } | |
281 | |
282 if(sum(unlist(SeqDat) == "") > 0){ | |
283 print("Missing Value in siRNA sequences!") | |
284 quit() | |
285 } | |
286 | |
287 Seq=SeqGenerate(SeqDat) | |
288 | |
289 SenseSeq=Seq$SenseSeq | |
290 Check=data.frame(apply(SenseSeq, 1, SeqCheck)) | |
291 | |
292 if(sum(Check == FALSE) > 0){ | |
293 print("Non-Sequence Value in siRNA sequences!") | |
294 quit() | |
295 } | |
296 else{ | |
297 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) | |
298 } | |
299 } | |
300 | |
301 ### Ambion ### | |
302 | |
303 if(Library == "ambion"){ | |
304 | |
305 id=as.integer(id) | |
306 | |
307 if(sum(is.na(id)) > 0){ | |
308 print("Missing Value in Gene ID!") | |
309 quit() | |
310 } | |
311 | |
312 if(sum(id == "") > 0){ | |
313 print("Missing Value in Gene ID!") | |
314 quit() | |
315 } | |
316 | |
317 load("/home/galaxy/galaxy-dist/tools/offtarget/Ambion.RData") | |
318 id.ambion=unique(ambion$id) | |
319 | |
320 if(length(id[!(id %in% id.ambion)]) > 0){ | |
321 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", | |
322 id[!(id %in% id.ambion)]), "unmatched.id.csv") | |
323 } | |
324 | |
325 Dat=data.frame(id=id, z=Z) | |
326 Dat=merge(Dat, ambion, by="id", sort=F) | |
327 Z=Dat$z | |
328 id=Dat$id | |
329 SeqDat=Dat[, 6:8] | |
330 Seq=SeqGenerate(SeqDat) | |
331 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) | |
332 | |
333 } | |
334 | |
335 ### New Dhar ### | |
336 | |
337 if(Library == "new_dharmacon"){ | |
338 | |
339 id=as.integer(id) | |
340 | |
341 if(sum(is.na(id)) > 0){ | |
342 print("Missing Value in Gene ID!") | |
343 quit() | |
344 } | |
345 | |
346 if(sum(id == "") > 0){ | |
347 print("Missing Value in Gene ID!") | |
348 quit() | |
349 } | |
350 | |
351 load("/home/galaxy/galaxy-dist/tools/offtarget/New_Dhar.RData") | |
352 id.new.dhar=unique(new.dhar$id) | |
353 | |
354 if(length(id[!(id %in% id.new.dhar)]) > 0){ | |
355 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", | |
356 id[!(id %in% id.new.dhar)]), "unmatched.id.csv") | |
357 } | |
358 | |
359 Dat=data.frame(id=id, z=Z) | |
360 Dat=merge(Dat, new.dhar, by="id", sort=F) | |
361 Z=Dat$z | |
362 id=Dat$id | |
363 SeqDat=Dat[, 7:10] | |
364 Seq=SeqGenerate(SeqDat) | |
365 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) | |
366 | |
367 } | |
368 | |
369 ### Old Dhar ### | |
370 | |
371 if(Library == "old_dharmacon"){ | |
372 | |
373 id=as.integer(id) | |
374 | |
375 if(sum(is.na(id)) > 0){ | |
376 print("Missing Value in Gene ID!") | |
377 quit() | |
378 } | |
379 | |
380 if(sum(id == "") > 0){ | |
381 print("Missing Value in Gene ID!") | |
382 quit() | |
383 } | |
384 | |
385 load("/home/galaxy/galaxy-dist/tools/offtarget/Old_Dhar.RData") | |
386 id.old.dhar=unique(old.dhar$id) | |
387 | |
388 if(length(id[!(id %in% id.old.dhar)]) > 0){ | |
389 write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", | |
390 id[!(id %in% id.old.dhar)]), "unmatched.id.csv") | |
391 } | |
392 | |
393 Dat=data.frame(id=id, z=Z) | |
394 Dat=merge(Dat, old.dhar, by="id", sort=F) | |
395 Z=Dat$z | |
396 id=Dat$id | |
397 SeqDat=Dat[, 7:10] | |
398 Seq=SeqGenerate(SeqDat) | |
399 analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) | |
400 | |
401 } | |
402 | |
403 ###################### | |
404 ### results output ### | |
405 ###################### | |
406 | |
407 # code chagned below # | |
408 | |
409 # generating output files # | |
410 | |
411 duplex.dfr=analysis$seed.analysis$data | |
412 names(duplex.dfr)=paste("duplex", 1:length(names(duplex.dfr)), sep=".") | |
413 seed.dfr=analysis$seed.analysis$seed.dfr | |
414 offtarget=analysis$offtarget.analysis$offtarget | |
415 Z.corrected=analysis$offtarget.analysis$Z.corrected | |
416 output.corr=cbind(ID=id, Z.Score=Z, Corrected.Z.Score=Z.corrected, | |
417 duplex.dfr, seed.dfr) | |
418 | |
419 | |
420 print("outputing results... ...") | |
421 | |
422 if(Library == 'custom'){ | |
423 | |
424 print("outputing corrected Z score... ...") | |
425 write.csv(output.corr, "CorrectedZScore.csv", row.names=F) ### export corrected z score | |
426 | |
427 print("outputing summary... ...") | |
428 write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table | |
429 | |
430 print("outputing offtarget seed families... ...") | |
431 | |
432 off.target=offtarget$seed[offtarget$ks.p.value < Sig & ( | |
433 offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup | |
434 | |
435 write.csv(offtarget[offtarget$seed %in% off.target, ], | |
436 "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families | |
437 | |
438 print("outputing miRNAs... ...") | |
439 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){ | |
440 write.csv("No miRNA detected", "miRNA.csv", row.names=F) | |
441 fe="No" | |
442 } ### export miRNAs table sharing common seed | |
443 | |
444 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){ | |
445 write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) | |
446 fe="Yes" | |
447 } ### export miRNAs talbe sharing common seed | |
448 | |
449 if(length(off.target) > 0){ | |
450 for(i in 1:length(off.target)){ | |
451 | |
452 n=off.target[i] | |
453 si=SeedExist(seed.dfr, n) | |
454 si.offtarget=output.corr[si == 1, -3] | |
455 si.offtarget$off.target.seed=n | |
456 si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ] | |
457 print(paste(n, "siRNA pools ... ...")) | |
458 write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F) | |
459 | |
460 } | |
461 } | |
462 }else{ | |
463 | |
464 print("outputing corrected Z score... ...") | |
465 write.csv(output.corr[, 1:3], "CorrectedZScore.csv", row.names=F) | |
466 | |
467 print("outputing summary... ...") | |
468 write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table | |
469 | |
470 print("outputing offtarget seed families... ...") | |
471 | |
472 off.target=offtarget$seed[offtarget$ks.p.value < Sig & ( | |
473 offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup | |
474 | |
475 write.csv(offtarget[offtarget$seed %in% off.target, ], | |
476 "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families | |
477 | |
478 print("outputing miRNAs... ...") | |
479 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){ | |
480 write.csv("No miRNA detected", "miRNA.csv", row.names=F) | |
481 fe="No" | |
482 } ### export miRNAs table sharing common seed | |
483 | |
484 if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){ | |
485 write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) | |
486 fe="Yes" | |
487 } ### export miRNAs talbe sharing common seed | |
488 | |
489 if(length(off.target) > 0){ | |
490 for(i in 1:length(off.target)){ | |
491 | |
492 n=off.target[i] | |
493 si=SeedExist(seed.dfr, n) | |
494 si.offtarget=output.corr[si == 1, -3] | |
495 si.offtarget$off.target.seed=n | |
496 si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ] | |
497 print(paste(n, "siRNA pools ... ...")) | |
498 write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F) | |
499 | |
500 } | |
501 } | |
502 } | |
503 | |
504 | |
505 # generating output figures # | |
506 | |
507 print("outputing figures... ...") | |
508 | |
509 jpeg(file="SeedFamilyOffTarget.jpeg", height=600, width=800) | |
510 par(mar=c(5, 5, 2, 2)) | |
511 plot(offtarget$coef, -log10(offtarget$ks.p.value), | |
512 pch=20, col="blue", yaxt="n", cex.axis=1.5, cex.lab=2, | |
513 xlab="Strength of seed-linked effect", ylab="Significance (P value)") | |
514 p.log=ceiling(-log10(min(offtarget$ks.p.value, na.rm=T))/2) | |
515 p.range=10^(0:-p.log*2) | |
516 axis(2, p.range, | |
517 at=-log(p.range, 10), | |
518 cex.axis=1.5) | |
519 off.target=offtarget$seed[offtarget$coef < -Str & offtarget$ks.p.value < Sig] | |
520 nn=length(off.target) | |
521 points(offtarget$coef[offtarget$seed %in% off.target], | |
522 -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), | |
523 col="red", pch=20) | |
524 off.target=offtarget$seed[offtarget$coef > Str & offtarget$ks.p.value < Sig] | |
525 np=length(off.target) | |
526 points(offtarget$coef[offtarget$seed %in% off.target], | |
527 -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), | |
528 col="green", pch=20) | |
529 abline(v=-Str, col="red", lwd=3) | |
530 abline(v=Str, col="green", lwd=3) | |
531 abline(h=-log10(Sig), col="blue", lwd=3) | |
532 dev.off() | |
533 | |
534 # generating legend # | |
535 | |
536 jpeg(file="SeedFamilyOffTargetLegend.jpeg", width=500, height=100) | |
537 par(mar=c(0, 0, 0, 0)) | |
538 plot(c(0, 2), c(0.8, 1.2), type="n", bty="n", yaxt="n", xaxt='n') | |
539 points(c(0, 0), c(1.1, 0.9), col=c('red', "green"), pch=20, cex=2) | |
540 text(1, 1.1, paste('N: ', nn, " (Strength < ", -Str, ", P value < ", Sig, ")", sep=""), cex=2) | |
541 text(1, 0.9, paste('N: ', np, " (Strength > ", Str, ", P value < ", Sig, ")", sep=""), cex=2) | |
542 dev.off() | |
543 | |
544 # new output # | |
545 | |
546 # generating summary # | |
547 | |
548 print("Ansysis Summary ... ...") | |
549 | |
550 print("p value ... ...") | |
551 p.value=offtarget$ks.p.value[offtarget$coef < -Str | offtarget$coef > Str] | |
552 | |
553 print("FDR ... ...") | |
554 f=fdr.bum(p.value, Sig) | |
555 print("rounding ... ...") | |
556 f=round(f, digits=3) | |
557 print(f) | |
558 print("FDR done ... ...") | |
559 | |
560 | |
561 print("off.sum ... ...") | |
562 off.sum=matrix(rep("", 30), 15, 2) | |
563 off.sum[1,1]="Analysis is successfully done!" | |
564 off.sum[3,1]="Parameters set up" | |
565 off.sum[4:15, 1]=c("Strand Orientation for Analysis:", "Lambda value:", "Seed:", | |
566 "siRNA Library:", "Strength of seed-linked effect:", "P value:", "", | |
567 "Analysis summary", | |
568 "Number of negative off-target seed families:", | |
569 "Number of positive off-target seed families:", "FDR:", | |
570 "Identify miRNA with phenotypical effects:") | |
571 off.sum[4:15, 2]=c(Strand, Lambda, Seed, Library, Str, Sig, "", "", nn, np, f, fe) | |
572 write.table(off.sum, "Analysis_Summary.csv", row.names=F, col.names=F, sep=",") | |
573 | |
574 print("Analysis done ... ...") | |
575 |