Mercurial > repos > mkim8 > mintest3
view offtarget_v7.R @ 0:b73bf7e5bb41 draft default tip
Uploaded
author | mkim8 |
---|---|
date | Fri, 14 Mar 2014 14:31:15 -0400 |
parents | |
children |
line wrap: on
line source
library(methods) library(lattice) suppressMessages(library(Matrix)) suppressMessages(library(ClassComparison)) suppressMessages(library(glmnet)) options(warn=-1) ####################### ### arguments input ### ####################### arg=commandArgs(TRUE) InputFile=arg[1] # must be given by user including both # response variable and siRNA sequences Seed=as.integer(arg[2]) # Name: "Seed" # specify the seed region to be used in analysis # an interger out of 1~14. # default is 2, which means 2~7 hexamer is used Strand=arg[3] # Name:"Strand Orientation for Analysis" # specify which strand is used in analysis # must be given by user, options are "sense", #"antisense" or "both" Lambda=as.numeric(arg[4]) # Name:"Lambda Value" # penalty parameter in LASSO regression # default = 0.001, which can be changed by user # in order to perform differennt LASSO estimation Library=arg[5] # Name:"siRNA Library to use" # user has to specify which siRNA library will be # used in analysis can be "Custom", "Ambion", "New Dharmacon" # or "Old Dharmacon". Default is "Custom" ### new parameter ### Str=as.numeric(arg[6]) # Name:"Strength of seed-linked effect“ # specify cutoff for strenth of seed-linked effet, must be positive # default is 1 Sig=as.numeric(arg[7]) # Name:"Significance (P value)" # specify cutoff for significance (p value) # default is 0.01 ################# ### functions ### ################# # uppercase a sequence and transform T to U ### TtoU=function(x){ x=as.character(x) x=toupper(x) x=chartr("T", "U", x) return(x) } # reverse a sequence # StrReverse <- function(x){ x=as.character(x) x=sapply(lapply(strsplit(x, NULL), rev), paste, collapse="") return(x) } # generate complementary sequence # DuplexReverse=function(x) { x=as.character(x) x=gsub("U","T",x) x=gsub("A","U",x) x=gsub("T","A",x) x=gsub("G","g",x) x=gsub("C","G",x) x=gsub("g","C",x) x=StrReverse(x) return(x) } # check if input contains AUGC only # SeqCheck=function(x){ x=as.character(x) x=strsplit(x, "") x=unlist(x) if(sum(x %in% c("A", "U", "G", "C")) == length(x) ){ return(TRUE) } else{ return(FALSE) } } # extract seed sequence from siRNA # SeedExtract=function(x, start=2, end=7){ x=as.character(x) x=substr(x, start, end) return(x) } # examine if a pool contain a seed # SeedExist=function(dfr, s){ dfr=data.frame(dfr) s=as.character(s) return(as.numeric(apply(dfr == s, 1, sum))) } # seed analysis # SeedAnalysis=function(Z, data, start=2, end=7){ print("Generating seed.dfr ... ... ") data=data.frame(data) seed.dfr=data for(i in 1:ncol(seed.dfr)){ seed.dfr[, i]=as.character(SeedExtract(seed.dfr[, i], start=start, end=end)) } colnames(seed.dfr)=paste("seed", 1:ncol(seed.dfr), sep=".") print("Generating seed.table ... ... ") seed.table=data.frame(table(unlist(seed.dfr))) names(seed.table)=c("seed", "family.size") seed.table$ks.p.value=rep(NA, length(seed.table$seed)) y=Z for(i in 1:length(seed.table$seed)){ s=SeedExist(seed.dfr, seed.table$seed[i]) seed.table$ks.p.value[i]= ks.test(y[s > 0], y[s== 0])$p.value print(paste("seed ", i)) } return(list(Z=Z, data=data, seed.table=seed.table, seed.dfr=seed.dfr)) } # offtarget analysis # OfftargetAnalysis=function(object, s=0.001){ seed.dfr=object$seed.dfr seed.table=object$seed.table y=object$Z print("Generating Matrix X ... ... ") s.sub=as.character(seed.table$seed) x=matrix(0, length(y), length(s.sub)) colnames(x)=as.character(s.sub) for(i in 1:length(s.sub)){ n=as.character(s.sub[i]) x[, n]=SeedExist(seed.dfr, n) print(paste("Matrix X seed ", i)) } print("Fitting ... ... ") gc() fit=glmnet(x, y) print("offtarget analysis") offtarget=data.frame(as.matrix(coef(fit, s=s))) offtarget$seed=rownames(offtarget) offtarget=offtarget[, c(2, 1)] names(offtarget)[2]="coef" rownames(offtarget)=NULL offtarget=merge(offtarget, seed.table, by="seed", all.x=T) y.fit=as.numeric(predict(fit, x, s=s)) y.corrected=y-y.fit print("Off-target analysis Done!") return(list(fit=fit, X=x, offtarget=offtarget, Z.corrected=y.corrected)) } # generating data frame of siRNA sequence # SeqGenerate=function(SeqDat){ SenseSeq=data.frame(apply(SeqDat, 2, TtoU)) AntiSeq=data.frame(apply(SenseSeq, 2, DuplexReverse)) for(i in 1:ncol(SenseSeq)){ AntiSeq[, i]=DuplexReverse(SenseSeq[, i]) } BothSeq=cbind(SenseSeq, AntiSeq) names(BothSeq)=paste("duplex", 1:ncol(BothSeq), sep=".") output=list(SenseSeq=SenseSeq, AntiSeq=AntiSeq, BothSeq=BothSeq) return(output) } # DataAnalysis # DataAnalysis=function(Z, Seq, SeedStart, Strand, Lambda){ SeedEnd=SeedStart+5 analysis=list() if(Strand == 'sense'){ Seq=Seq$SenseSeq analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) } else if(Strand == "antisense"){ Seq=Seq$AntiSeq analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) } else if(Strand == 'both'){ Seq=Seq$BothSeq analysis$seed.analysis=SeedAnalysis(Z=Z, data=Seq, start=SeedStart, end=SeedEnd) analysis$offtarget.analysis=OfftargetAnalysis(analysis$seed.analysis, s=Lambda) } return(analysis) } # new functions # # FDR calculation # fdr.bum=function(p.value, Sig){ p.sum=attributes(summary(Bum(p.value), tau=Sig))$estimates f=p.sum[4]/(p.sum[2]+p.sum[4]) f=as.numeric(f) return(f) } # load miRNA database # load("/home/galaxy/galaxy-dist/tools/offtarget/miRNA.RData") ################## ### data input ### ################## # data input # SeedStart=Seed inputfile=read.csv(InputFile, header=F) id=inputfile[, 1] Z=as.numeric(inputfile[, 2]) # Z score checking # if(sum(is.na(Z)) > 0){ print("Missing Value in Experimental Readouts!") quit() } if(sum(Z == "") > 0){ print("Missing Value in Experimental Readouts!") quit() } ##################### ### data analysis ### ##################### ### Custom ### if(Library == "custom"){ ### check if user selected wrong "Library" parameter ### if(ncol(inputfile) < 3){ print("siRNA sequences not available and please check your Library selection") quit() } ################################################################## SeqDat=inputfile[, 3:ncol(inputfile)] SeqDat=data.frame(SeqDat) if(sum(is.na(unlist(SeqDat))) > 0){ print("Missing Value in siRNA sequences!") quit() } if(sum(unlist(SeqDat) == "") > 0){ print("Missing Value in siRNA sequences!") quit() } Seq=SeqGenerate(SeqDat) SenseSeq=Seq$SenseSeq Check=data.frame(apply(SenseSeq, 1, SeqCheck)) if(sum(Check == FALSE) > 0){ print("Non-Sequence Value in siRNA sequences!") quit() } else{ analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) } } ### Ambion ### if(Library == "ambion"){ id=as.integer(id) if(sum(is.na(id)) > 0){ print("Missing Value in Gene ID!") quit() } if(sum(id == "") > 0){ print("Missing Value in Gene ID!") quit() } load("/home/galaxy/galaxy-dist/tools/offtarget/Ambion.RData") id.ambion=unique(ambion$id) if(length(id[!(id %in% id.ambion)]) > 0){ write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", id[!(id %in% id.ambion)]), "unmatched.id.csv") } Dat=data.frame(id=id, z=Z) Dat=merge(Dat, ambion, by="id", sort=F) Z=Dat$z id=Dat$id SeqDat=Dat[, 6:8] Seq=SeqGenerate(SeqDat) analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) } ### New Dhar ### if(Library == "new_dharmacon"){ id=as.integer(id) if(sum(is.na(id)) > 0){ print("Missing Value in Gene ID!") quit() } if(sum(id == "") > 0){ print("Missing Value in Gene ID!") quit() } load("/home/galaxy/galaxy-dist/tools/offtarget/New_Dhar.RData") id.new.dhar=unique(new.dhar$id) if(length(id[!(id %in% id.new.dhar)]) > 0){ write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", id[!(id %in% id.new.dhar)]), "unmatched.id.csv") } Dat=data.frame(id=id, z=Z) Dat=merge(Dat, new.dhar, by="id", sort=F) Z=Dat$z id=Dat$id SeqDat=Dat[, 7:10] Seq=SeqGenerate(SeqDat) analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) } ### Old Dhar ### if(Library == "old_dharmacon"){ id=as.integer(id) if(sum(is.na(id)) > 0){ print("Missing Value in Gene ID!") quit() } if(sum(id == "") > 0){ print("Missing Value in Gene ID!") quit() } load("/home/galaxy/galaxy-dist/tools/offtarget/Old_Dhar.RData") id.old.dhar=unique(old.dhar$id) if(length(id[!(id %in% id.old.dhar)]) > 0){ write.csv(c("No Matching siRNA Sequences with Below Gene ID and Removed from Analysis", id[!(id %in% id.old.dhar)]), "unmatched.id.csv") } Dat=data.frame(id=id, z=Z) Dat=merge(Dat, old.dhar, by="id", sort=F) Z=Dat$z id=Dat$id SeqDat=Dat[, 7:10] Seq=SeqGenerate(SeqDat) analysis=DataAnalysis(Z, Seq, SeedStart, Strand, Lambda) } ###################### ### results output ### ###################### # code chagned below # # generating output files # duplex.dfr=analysis$seed.analysis$data names(duplex.dfr)=paste("duplex", 1:length(names(duplex.dfr)), sep=".") seed.dfr=analysis$seed.analysis$seed.dfr offtarget=analysis$offtarget.analysis$offtarget Z.corrected=analysis$offtarget.analysis$Z.corrected output.corr=cbind(ID=id, Z.Score=Z, Corrected.Z.Score=Z.corrected, duplex.dfr, seed.dfr) print("outputing results... ...") if(Library == 'custom'){ print("outputing corrected Z score... ...") write.csv(output.corr, "CorrectedZScore.csv", row.names=F) ### export corrected z score print("outputing summary... ...") write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table print("outputing offtarget seed families... ...") off.target=offtarget$seed[offtarget$ks.p.value < Sig & ( offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup write.csv(offtarget[offtarget$seed %in% off.target, ], "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families print("outputing miRNAs... ...") if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){ write.csv("No miRNA detected", "miRNA.csv", row.names=F) fe="No" } ### export miRNAs table sharing common seed if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){ write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) fe="Yes" } ### export miRNAs talbe sharing common seed if(length(off.target) > 0){ for(i in 1:length(off.target)){ n=off.target[i] si=SeedExist(seed.dfr, n) si.offtarget=output.corr[si == 1, -3] si.offtarget$off.target.seed=n si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ] print(paste(n, "siRNA pools ... ...")) write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F) } } }else{ print("outputing corrected Z score... ...") write.csv(output.corr[, 1:3], "CorrectedZScore.csv", row.names=F) print("outputing summary... ...") write.csv(offtarget, "Seed_Famliy_Summary.csv", row.names=FALSE) ### output seed family summary table print("outputing offtarget seed families... ...") off.target=offtarget$seed[offtarget$ks.p.value < Sig & ( offtarget$coef < -Str | offtarget$coef > Str)] # select off target by custom setup write.csv(offtarget[offtarget$seed %in% off.target, ], "Off-Target_Seed_Families.csv", row.names=FALSE) ### export off-target seed families print("outputing miRNAs... ...") if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) == 0){ write.csv("No miRNA detected", "miRNA.csv", row.names=F) fe="No" } ### export miRNAs table sharing common seed if(nrow(miRNA[miRNA$Seed.sequence %in% off.target, ]) != 0){ write.csv(miRNA[miRNA$Seed.sequence %in% off.target, ], "miRNA.csv", row.names=F) fe="Yes" } ### export miRNAs talbe sharing common seed if(length(off.target) > 0){ for(i in 1:length(off.target)){ n=off.target[i] si=SeedExist(seed.dfr, n) si.offtarget=output.corr[si == 1, -3] si.offtarget$off.target.seed=n si.offtarget=si.offtarget[order(si.offtarget$Z.Score), ] print(paste(n, "siRNA pools ... ...")) write.csv(si.offtarget, paste("siRNAsPool_OffTargetSeedFamily_", n, ".csv", sep=""), row.names=F) } } } # generating output figures # print("outputing figures... ...") jpeg(file="SeedFamilyOffTarget.jpeg", height=600, width=800) par(mar=c(5, 5, 2, 2)) plot(offtarget$coef, -log10(offtarget$ks.p.value), pch=20, col="blue", yaxt="n", cex.axis=1.5, cex.lab=2, xlab="Strength of seed-linked effect", ylab="Significance (P value)") p.log=ceiling(-log10(min(offtarget$ks.p.value, na.rm=T))/2) p.range=10^(0:-p.log*2) axis(2, p.range, at=-log(p.range, 10), cex.axis=1.5) off.target=offtarget$seed[offtarget$coef < -Str & offtarget$ks.p.value < Sig] nn=length(off.target) points(offtarget$coef[offtarget$seed %in% off.target], -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), col="red", pch=20) off.target=offtarget$seed[offtarget$coef > Str & offtarget$ks.p.value < Sig] np=length(off.target) points(offtarget$coef[offtarget$seed %in% off.target], -log10(offtarget$ks.p.value[offtarget$seed %in% off.target]), col="green", pch=20) abline(v=-Str, col="red", lwd=3) abline(v=Str, col="green", lwd=3) abline(h=-log10(Sig), col="blue", lwd=3) dev.off() # generating legend # jpeg(file="SeedFamilyOffTargetLegend.jpeg", width=500, height=100) par(mar=c(0, 0, 0, 0)) plot(c(0, 2), c(0.8, 1.2), type="n", bty="n", yaxt="n", xaxt='n') points(c(0, 0), c(1.1, 0.9), col=c('red', "green"), pch=20, cex=2) text(1, 1.1, paste('N: ', nn, " (Strength < ", -Str, ", P value < ", Sig, ")", sep=""), cex=2) text(1, 0.9, paste('N: ', np, " (Strength > ", Str, ", P value < ", Sig, ")", sep=""), cex=2) dev.off() # new output # # generating summary # print("Ansysis Summary ... ...") print("p value ... ...") p.value=offtarget$ks.p.value[offtarget$coef < -Str | offtarget$coef > Str] print("FDR ... ...") f=fdr.bum(p.value, Sig) print("rounding ... ...") f=round(f, digits=3) print(f) print("FDR done ... ...") print("off.sum ... ...") off.sum=matrix(rep("", 30), 15, 2) off.sum[1,1]="Analysis is successfully done!" off.sum[3,1]="Parameters set up" off.sum[4:15, 1]=c("Strand Orientation for Analysis:", "Lambda value:", "Seed:", "siRNA Library:", "Strength of seed-linked effect:", "P value:", "", "Analysis summary", "Number of negative off-target seed families:", "Number of positive off-target seed families:", "FDR:", "Identify miRNA with phenotypical effects:") off.sum[4:15, 2]=c(Strand, Lambda, Seed, Library, Str, Sig, "", "", nn, np, f, fe) write.table(off.sum, "Analysis_Summary.csv", row.names=F, col.names=F, sep=",") print("Analysis done ... ...")