diff offtarget_v7.R @ 0:b73bf7e5bb41 draft default tip

Uploaded
author mkim8
date Fri, 14 Mar 2014 14:31:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/offtarget_v7.R	Fri Mar 14 14:31:15 2014 -0400
@@ -0,0 +1,575 @@
+library(methods)
+library(lattice)
+suppressMessages(library(Matrix))
+suppressMessages(library(ClassComparison))
+suppressMessages(library(glmnet))
+options(warn=-1)
+
+#######################
+### arguments input ###
+#######################
+
+arg=commandArgs(TRUE)
+
+InputFile=arg[1]          # must be given by user including both 
+                          # response variable and siRNA sequences
+
+Seed=as.integer(arg[2])   # Name: "Seed"
+                          # specify the seed region to be used in analysis 
+                          # an interger out of 1~14.
+                          # default is 2, which means 2~7 hexamer is used 
+
+Strand=arg[3]             # Name:"Strand Orientation for Analysis"
+                          # specify which strand is used in analysis 
+                          # must be given by user, options are "sense", 
+                          #"antisense" or "both"
+
+Lambda=as.numeric(arg[4]) # Name:"Lambda Value"
+                          # penalty parameter in LASSO regression 
+                          # default = 0.001, which can be changed by user
+                          # in order to perform differennt LASSO estimation
+
+Library=arg[5]            # Name:"siRNA Library to use"
+                          # user has to specify which siRNA library will be 
+                          # used in analysis can be "Custom", "Ambion", "New Dharmacon"
+                          # or "Old Dharmacon". Default is "Custom"
+
+### new parameter ###
+
+Str=as.numeric(arg[6])    # Name:"Strength of seed-linked effect“
+                          # specify cutoff for strenth of seed-linked effet, must be positive
+                          # default is 1
+
+Sig=as.numeric(arg[7])    # Name:"Significance (P value)"
+                          # specify cutoff for significance (p value)
+                          # default is 0.01
+                           
+#################
+### functions ###
+#################
+
+# uppercase a sequence and transform T to U ###
+
+TtoU=function(x){
+  x=as.character(x)
+  x=toupper(x)
+  x=chartr("T", "U", x)
+  return(x)
+}
+
+# reverse a sequence #
+
+StrReverse <- function(x){
+  x=as.character(x)
+  x=sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
+  return(x)
+} 
+
+# generate complementary sequence #
+
+DuplexReverse=function(x)
+{
+  x=as.character(x)
+  x=gsub("U","T",x)
+  x=gsub("A","U",x)
+  x=gsub("T","A",x)
+  x=gsub("G","g",x)
+  x=gsub("C","G",x)
+  x=gsub("g","C",x)
+  x=StrReverse(x)
+  return(x)
+}
+
+# check if input contains AUGC only #
+
+SeqCheck=function(x){
+  x=as.character(x)
+  x=strsplit(x, "")
+  x=unlist(x)
+  if(sum(x %in% c("A", "U", "G", "C")) == length(x) ){
+    return(TRUE)
+  }
+  else{
+    return(FALSE)
+  }
+}
+
+# extract seed sequence from siRNA #
+
+SeedExtract=function(x, start=2, end=7){
+  x=as.character(x)
+  x=substr(x, start, end)
+  return(x)
+}
+
+# examine if a pool contain a seed #
+
+SeedExist=function(dfr, s){
+  dfr=data.frame(dfr)
+  s=as.character(s)
+  return(as.numeric(apply(dfr == s, 1, sum)))
+}
+
+# seed analysis #
+
+SeedAnalysis=function(Z, data, start=2, end=7){
+  
+  print("Generating seed.dfr ... ... ")
+  data=data.frame(data)
+  seed.dfr=data
+  for(i in 1:ncol(seed.dfr)){
+    seed.dfr[, i]=as.character(SeedExtract(seed.dfr[, i], start=start, end=end))
+  }
+  colnames(seed.dfr)=paste("seed", 1:ncol(seed.dfr), sep=".")
+  print("Generating seed.table ... ... ")
+  seed.table=data.frame(table(unlist(seed.dfr)))
+  names(seed.table)=c("seed", "family.size")
+  
+  seed.table$ks.p.value=rep(NA, length(seed.table$seed))
+  y=Z
+  for(i in 1:length(seed.table$seed)){
+    s=SeedExist(seed.dfr, seed.table$seed[i])
+    seed.table$ks.p.value[i]=
+      ks.test(y[s > 0], y[s== 0])$p.value
+    print(paste("seed ", i))
+  }
+  
+  return(list(Z=Z, data=data, seed.table=seed.table, seed.dfr=seed.dfr))
+}
+
+# offtarget analysis #
+
+OfftargetAnalysis=function(object, s=0.001){
+  
+  seed.dfr=object$seed.dfr
+  seed.table=object$seed.table
+  y=object$Z
+  
+  print("Generating Matrix X ... ... ")
+  s.sub=as.character(seed.table$seed)
+  x=matrix(0, length(y), length(s.sub))
+  colnames(x)=as.character(s.sub)
+  
+  for(i in 1:length(s.sub)){
+    n=as.character(s.sub[i])
+    x[, n]=SeedExist(seed.dfr, n)
+    print(paste("Matrix X seed ", i))
+  }
+  
+  print("Fitting ... ... ")
+  gc()
+  
+  fit=glmnet(x, y)
+  
+  print("offtarget analysis")
+  offtarget=data.frame(as.matrix(coef(fit, s=s)))
+  offtarget$seed=rownames(offtarget)
+  offtarget=offtarget[, c(2, 1)]
+  names(offtarget)[2]="coef"
+  rownames(offtarget)=NULL
+  
+  offtarget=merge(offtarget, seed.table, by="seed", all.x=T)
+  y.fit=as.numeric(predict(fit, x, s=s))
+  y.corrected=y-y.fit
+  print("Off-target analysis Done!")
+  
+  return(list(fit=fit, X=x, offtarget=offtarget, Z.corrected=y.corrected))
+  
+  
+}
+
+# generating data frame of siRNA sequence #
+
+SeqGenerate=function(SeqDat){
+  SenseSeq=data.frame(apply(SeqDat, 2, TtoU))
+  AntiSeq=data.frame(apply(SenseSeq, 2, DuplexReverse))
+  for(i in 1:ncol(SenseSeq)){
+    AntiSeq[, i]=DuplexReverse(SenseSeq[, i])
+  }
+  BothSeq=cbind(SenseSeq, AntiSeq)
+  names(BothSeq)=paste("duplex", 1:ncol(BothSeq), sep=".")
+  output=list(SenseSeq=SenseSeq, AntiSeq=AntiSeq, BothSeq=BothSeq)
+  return(output)
+}
+
+# DataAnalysis #
+
+DataAnalysis=function(Z, Seq, SeedStart, Strand, Lambda){
+  
+  SeedEnd=SeedStart+5
+  
+  analysis=list()
+  
+  if(Strand == 'sense'){
+    Seq=Seq$SenseSeq
+    analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
+    analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
+  }
+  else if(Strand == "antisense"){
+    Seq=Seq$AntiSeq
+    analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
+    analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
+  }
+  else if(Strand == 'both'){
+    Seq=Seq$BothSeq
+    analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd)
+    analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda)
+  }
+  return(analysis)
+}
+
+# new functions #
+
+# FDR calculation # 
+
+fdr.bum=function(p.value, Sig){
+  p.sum=attributes(summary(Bum(p.value), tau=Sig))$estimates
+  f=p.sum[4]/(p.sum[2]+p.sum[4])
+  f=as.numeric(f)
+  return(f)
+}
+
+# load miRNA database #
+
+load("/home/galaxy/galaxy-dist/tools/offtarget/miRNA.RData")
+
+##################
+### data input ###
+##################
+
+# data input #
+
+SeedStart=Seed
+inputfile=read.csv(InputFile, header=F)
+id=inputfile[, 1]
+Z=as.numeric(inputfile[, 2]) 
+
+# Z score checking #
+
+if(sum(is.na(Z)) > 0){
+  print("Missing Value in Experimental Readouts!")
+  quit()
+}
+
+if(sum(Z == "") > 0){
+  print("Missing Value in Experimental Readouts!")
+  quit()
+}
+
+#####################
+### data analysis ###
+#####################
+
+### Custom ###
+
+if(Library == "custom"){
+  
+  ### check if user selected wrong "Library" parameter ###
+  if(ncol(inputfile) < 3){
+    print("siRNA sequences not available and please check your Library selection")
+    quit()
+  }
+  ##################################################################
+  
+  SeqDat=inputfile[, 3:ncol(inputfile)] 
+  SeqDat=data.frame(SeqDat)
+  
+  if(sum(is.na(unlist(SeqDat))) > 0){
+    print("Missing Value in siRNA sequences!")
+    quit()
+  }
+  
+  if(sum(unlist(SeqDat) == "") > 0){
+    print("Missing Value in siRNA sequences!")
+    quit()
+  }
+  
+  Seq=SeqGenerate(SeqDat)
+  
+  SenseSeq=Seq$SenseSeq
+  Check=data.frame(apply(SenseSeq, 1, SeqCheck))
+  
+  if(sum(Check == FALSE) > 0){
+    print("Non-Sequence Value in siRNA sequences!")
+    quit()
+  }
+  else{
+    analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
+  }
+}
+
+### Ambion ###
+
+if(Library == "ambion"){
+  
+  id=as.integer(id)
+  
+  if(sum(is.na(id)) > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  if(sum(id == "") > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  load("/home/galaxy/galaxy-dist/tools/offtarget/Ambion.RData")
+  id.ambion=unique(ambion$id)
+  
+  if(length(id[!(id %in% id.ambion)]) > 0){
+    write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
+                id[!(id %in% id.ambion)]), "unmatched.id.csv")
+  }
+
+  Dat=data.frame(id=id, z=Z)
+  Dat=merge(Dat, ambion, by="id", sort=F)
+  Z=Dat$z
+  id=Dat$id
+  SeqDat=Dat[, 6:8]
+  Seq=SeqGenerate(SeqDat)
+  analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
+  
+}
+
+### New Dhar ###
+
+if(Library == "new_dharmacon"){
+  
+  id=as.integer(id)
+  
+  if(sum(is.na(id)) > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  if(sum(id == "") > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  load("/home/galaxy/galaxy-dist/tools/offtarget/New_Dhar.RData")
+  id.new.dhar=unique(new.dhar$id)
+  
+  if(length(id[!(id %in% id.new.dhar)]) > 0){
+    write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
+                id[!(id %in% id.new.dhar)]), "unmatched.id.csv")
+  }
+  
+  Dat=data.frame(id=id, z=Z)
+  Dat=merge(Dat, new.dhar, by="id", sort=F)
+  Z=Dat$z
+  id=Dat$id
+  SeqDat=Dat[, 7:10]
+  Seq=SeqGenerate(SeqDat)
+  analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
+  
+}
+
+### Old Dhar ###
+
+if(Library == "old_dharmacon"){
+  
+  id=as.integer(id)
+  
+  if(sum(is.na(id)) > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  if(sum(id == "") > 0){
+    print("Missing Value in Gene ID!")
+    quit()
+  }
+  
+  load("/home/galaxy/galaxy-dist/tools/offtarget/Old_Dhar.RData")
+  id.old.dhar=unique(old.dhar$id)
+  
+  if(length(id[!(id %in% id.old.dhar)]) > 0){
+    write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis",
+                id[!(id %in% id.old.dhar)]), "unmatched.id.csv")
+  }
+  
+  Dat=data.frame(id=id, z=Z)
+  Dat=merge(Dat, old.dhar, by="id", sort=F)
+  Z=Dat$z
+  id=Dat$id
+  SeqDat=Dat[, 7:10]
+  Seq=SeqGenerate(SeqDat)
+  analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda)
+  
+}
+
+######################
+### results output ###
+######################
+
+# code chagned below #
+
+# generating output files #
+
+duplex.dfr=analysis$seed.analysis$data 
+names(duplex.dfr)=paste("duplex", 1:length(names(duplex.dfr)), sep=".") 
+seed.dfr=analysis$seed.analysis$seed.dfr 
+offtarget=analysis$offtarget.analysis$offtarget 
+Z.corrected=analysis$offtarget.analysis$Z.corrected
+output.corr=cbind(ID=id, Z.Score=Z, Corrected.Z.Score=Z.corrected, 
+                  duplex.dfr, seed.dfr)
+
+
+print("outputing results... ...")
+ 
+if(Library == 'custom'){
+  
+  print("outputing corrected Z score... ...")
+  write.csv(output.corr, "CorrectedZScore.csv", row.names=F)   ### export corrected z score 
+  
+  print("outputing summary... ...")
+  write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE)   ### output seed family summary table
+  
+  print("outputing offtarget seed families... ...")
+  
+  off.target=offtarget$seed[offtarget$ks.p.value < Sig & (  
+    offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup
+  
+  write.csv(offtarget[offtarget$seed %in% off.target, ], 
+            "Off-Target_Seed_Families.csv", row.names=FALSE)   ### export off-target seed families 
+  
+  print("outputing miRNAs... ...")
+  if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) ==  0){
+    write.csv("No miRNA detected", "miRNA.csv", row.names=F) 
+    fe="No"
+  }   ### export miRNAs table sharing common seed
+  
+  if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){
+    write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) 
+    fe="Yes"
+  }   ### export miRNAs talbe sharing common seed 
+  
+  if(length(off.target) > 0){
+    for(i in 1:length(off.target)){
+      
+      n=off.target[i]
+      si=SeedExist(seed.dfr, n)
+      si.offtarget=output.corr[si == 1, -3]
+      si.offtarget$off.target.seed=n
+      si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ]
+      print(paste(n, "siRNA pools ... ..."))
+      write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n,  ".csv",  sep=""), row.names=F)
+      
+    }
+  } 
+}else{
+  
+  print("outputing corrected Z score... ...")
+  write.csv(output.corr[, 1:3], "CorrectedZScore.csv", row.names=F)
+  
+  print("outputing summary... ...")
+  write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE)   ### output seed family summary table
+  
+  print("outputing offtarget seed families... ...")
+  
+  off.target=offtarget$seed[offtarget$ks.p.value < Sig & (  
+    offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup
+  
+  write.csv(offtarget[offtarget$seed %in% off.target, ], 
+            "Off-Target_Seed_Families.csv", row.names=FALSE)   ### export off-target seed families 
+  
+  print("outputing miRNAs... ...")
+  if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) ==  0){
+    write.csv("No miRNA detected", "miRNA.csv", row.names=F) 
+    fe="No"
+  }   ### export miRNAs table sharing common seed
+  
+  if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){
+    write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) 
+    fe="Yes"
+  }   ### export miRNAs talbe sharing common seed 
+  
+  if(length(off.target) > 0){
+    for(i in 1:length(off.target)){
+      
+      n=off.target[i]
+      si=SeedExist(seed.dfr, n)
+      si.offtarget=output.corr[si == 1, -3]
+      si.offtarget$off.target.seed=n
+      si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ]
+      print(paste(n, "siRNA pools ... ..."))
+      write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n,  ".csv",  sep=""), row.names=F)
+      
+    }
+  }
+}  
+
+
+# generating output figures # 
+
+print("outputing figures... ...")
+
+jpeg(file="SeedFamilyOffTarget.jpeg", height=600, width=800) 
+par(mar=c(5, 5, 2, 2))
+plot(offtarget$coef, -log10(offtarget$ks.p.value), 
+     pch=20, col="blue", yaxt="n", cex.axis=1.5, cex.lab=2, 
+     xlab="Strength of seed-linked effect", ylab="Significance (P value)")
+p.log=ceiling(-log10(min(offtarget$ks.p.value, na.rm=T))/2)
+p.range=10^(0:-p.log*2)
+axis(2, p.range, 
+     at=-log(p.range, 10),
+     cex.axis=1.5)
+off.target=offtarget$seed[offtarget$coef < -Str & offtarget$ks.p.value < Sig]
+nn=length(off.target)
+points(offtarget$coef[offtarget$seed %in% off.target], 
+      -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), 
+      col="red", pch=20)
+off.target=offtarget$seed[offtarget$coef > Str & offtarget$ks.p.value < Sig]
+np=length(off.target)
+points(offtarget$coef[offtarget$seed %in% off.target], 
+       -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), 
+       col="green", pch=20)
+abline(v=-Str, col="red", lwd=3)
+abline(v=Str, col="green", lwd=3)
+abline(h=-log10(Sig), col="blue", lwd=3)
+dev.off()
+
+# generating legend #
+
+jpeg(file="SeedFamilyOffTargetLegend.jpeg", width=500, height=100) 
+par(mar=c(0, 0, 0, 0))
+plot(c(0, 2), c(0.8, 1.2), type="n", bty="n", yaxt="n", xaxt='n')
+points(c(0, 0), c(1.1, 0.9), col=c('red', "green"), pch=20, cex=2)
+text(1, 1.1, paste('N: ', nn, " (Strength < ", -Str, ", P value < ", Sig, ")", sep=""), cex=2)
+text(1, 0.9, paste('N: ', np, " (Strength > ", Str, ",  P value < ", Sig, ")", sep=""), cex=2)
+dev.off()
+
+# new output #
+
+# generating summary #
+
+print("Ansysis Summary ... ...")
+
+print("p value ... ...")
+p.value=offtarget$ks.p.value[offtarget$coef < -Str | offtarget$coef > Str]
+
+print("FDR ... ...")
+f=fdr.bum(p.value, Sig)
+print("rounding ... ...")
+f=round(f, digits=3)
+print(f)
+print("FDR done ... ...")
+
+
+print("off.sum ... ...")
+off.sum=matrix(rep("", 30), 15, 2)
+off.sum[1,1]="Analysis is successfully done!"
+off.sum[3,1]="Parameters set up"
+off.sum[4:15, 1]=c("Strand Orientation for Analysis:", "Lambda value:", "Seed:", 
+                   "siRNA Library:", "Strength of seed-linked effect:", "P value:", "",
+                   "Analysis summary",
+                   "Number of negative off-target seed families:", 
+                   "Number of positive off-target seed families:", "FDR:", 
+                   "Identify miRNA with phenotypical effects:")
+off.sum[4:15, 2]=c(Strand, Lambda, Seed, Library, Str, Sig, "", "", nn, np, f, fe)
+write.table(off.sum, "Analysis_Summary.csv", row.names=F, col.names=F, sep=",")
+
+print("Analysis done ... ...")
+