# HG changeset patch # User davidvanzessen # Date 1399642532 14400 # Node ID 7d97fa9a042332f707840fc72576eff7e51be2b5 Uploaded diff -r 000000000000 -r 7d97fa9a0423 RScript.bak.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RScript.bak.r Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,317 @@ +#options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +args <- commandArgs(trailingOnly = TRUE) + +inFile = args[1] +outFile = args[2] +outDir = args[3] +clonalType = args[4] + +if (!("gridExtra" %in% rownames(installed.packages()))) { + install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") +} +library(gridExtra) +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +require(ggplot2) +if (!("plyr" %in% rownames(installed.packages()))) { + install.packages("plyr", repos="http://cran.xl-mirror.nl/") +} +require(plyr) + +if (!("data.table" %in% rownames(installed.packages()))) { + install.packages("data.table", repos="http://cran.xl-mirror.nl/") +} +library(data.table) + + +test = read.table(inFile, sep="\t", header=TRUE, fill=T, comment.char="") + +test = test[test$Sample != "",] + +test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) +test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) +test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene) + +#test$VDJCDR3 = do.call(paste, c(test[c("Top.V.Gene", "Top.D.Gene", "Top.J.Gene","CDR3.Seq.DNA")], sep = ":")) +test$VDJCDR3 = do.call(paste, c(test[unlist(strsplit(clonalType, ","))], sep = ":")) + +PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ] +if("Functionality" %in% colnames(test)) { + PROD = test[test$Functionality == "productive" | test$Functionality == "productive (see comment)", ] +} + +NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | 
test$CDR3.Found.How == "NOT_FOUND" , ] + +#PRODF = PROD[ -1] + +PRODF = PROD + +#PRODF = unique(PRODF) +PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] + +PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")]) +PRODFV$Length = as.numeric(PRODFV$Length) +Total = 0 +Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) + +PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")]) +PRODFD$Length = as.numeric(PRODFD$Length) +Total = 0 +Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) + +PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")]) +PRODFJ$Length = as.numeric(PRODFJ$Length) +Total = 0 +Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) + +V = 
c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53") +tcV = textConnection(V) +Vchain = read.table(tcV, sep="\t", header=TRUE) +PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) +close(tcV) + +D = c("v.name\tchr.orderD\nIGHD1-1\t1\nIGHD2-2\t2\nIGHD3-3\t3\nIGHD6-6\t4\nIGHD1-7\t5\nIGHD2-8\t6\nIGHD3-9\t7\nIGHD3-10\t8\nIGHD4-11\t9\nIGHD5-12\t10\nIGHD6-13\t11\nIGHD1-14\t12\nIGHD2-15\t13\nIGHD3-16\t14\nIGHD4-17\t15\nIGHD5-18\t16\nIGHD6-19\t17\nIGHD1-20\t18\nIGHD2-21\t19\nIGHD3-22\t20\nIGHD4-23\t21\nIGHD5-24\t22\nIGHD6-25\t23\nIGHD1-26\t24\nIGHD7-27\t25") +tcD = textConnection(D) +Dchain = read.table(tcD, sep="\t", header=TRUE) +PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) +close(tcD) + + +J = c("v.name\tchr.orderJ\nIGHJ1\t1\nIGHJ2\t2\nIGHJ3\t3\nIGHJ4\t4\nIGHJ5\t5\nIGHJ6\t6") +tcJ = textConnection(J) +Jchain = read.table(tcJ, sep="\t", header=TRUE) +PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) +close(tcJ) + +setwd(outDir) + +write.table(PRODF, "allUnique.tsv", sep="\t",quote=F,row.names=F,col.names=T) + +pV = ggplot(PRODFV) +pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pV = pV + xlab("Summary of V gene") + 
ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + +png("VPlot.png",width = 1280, height = 720) +pV +dev.off(); + +pD = ggplot(PRODFD) +pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + +png("DPlot.png",width = 800, height = 600) +pD +dev.off(); + +pJ = ggplot(PRODFJ) +pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + +png("JPlot.png",width = 800, height = 600) +pJ +dev.off(); + +revVchain = Vchain +revDchain = Dchain +revVchain$chr.orderV = rev(revVchain$chr.orderV) +revDchain$chr.orderD = rev(revDchain$chr.orderD) + +plotVD <- function(dat){ + if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("D genes") + + ylab("V Genes") + + png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) + print(img) + dev.off() +} + +VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) + +VandDCount$l = log(VandDCount$Length) +maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) +VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) 
+VandDCount$relLength = VandDCount$l / VandDCount$max + +cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample)) + +completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) +completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) +completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) +VDList = split(completeVD, f=completeVD[,"Sample"]) + +lapply(VDList, FUN=plotVD) + + + +plotVJ <- function(dat){ + if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("J genes") + + ylab("V Genes") + + png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) + print(img) + dev.off() +} + +VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) + +VandJCount$l = log(VandJCount$Length) +maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) +VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) +VandJCount$relLength = VandJCount$l / VandJCount$max + +cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) + +completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) +completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) +completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) +VJList = split(completeVJ, f=completeVJ[,"Sample"]) +lapply(VJList, FUN=plotVJ) + +plotDJ <- function(dat){ + 
if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("J genes") + + ylab("D Genes") + + png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) + print(img) + dev.off() +} + +DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) + +DandJCount$l = log(DandJCount$Length) +maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) +DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) +DandJCount$relLength = DandJCount$l / DandJCount$max + +cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) + +completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) +completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) +completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) +DJList = split(completeDJ, f=completeDJ[,"Sample"]) +lapply(DJList, FUN=plotDJ) + + +sampleFile <- file("samples.txt") +un = unique(test$Sample) +un = paste(un, sep="\n") +writeLines(un, sampleFile) +close(sampleFile) + + +if("Replicate" %in% colnames(test)) +{ + clonalityFrame = PROD + clonalityFrame$ReplicateConcat = do.call(paste, c(clonalityFrame[c("VDJCDR3", "Sample", "Replicate")], sep = ":")) + clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] + write.table(clonalityFrame, "clonalityComplete.tsv", sep="\t",quote=F,row.names=F,col.names=T) + + ClonalitySampleReplicatePrint <- function(dat){ + write.table(dat, 
paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".tsv", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,c("Sample", "Replicate")]) + lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint) + + ClonalitySamplePrint <- function(dat){ + write.table(dat, paste("clonality_", unique(dat$Sample) , ".tsv", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,"Sample"]) + lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint) + + clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "VDJCDR3")]) + clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) + clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count + clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") + + ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') + tcct = textConnection(ct) + CT = read.table(tcct, sep="\t", header=TRUE) + close(tcct) + clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) + clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight + + ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "VDJCDR3")]) + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) + clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads) + ReplicateReads$squared = ReplicateReads$Reads * ReplicateReads$Reads + + ReplicatePrint <- function(dat){ + write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSplit, 
FUN=ReplicatePrint) + + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) + + + ReplicateSumPrint <- function(dat){ + write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) + + clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) + clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow + clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) + clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) + + ClonalityScorePrint <- function(dat){ + write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityScore = clonalFreqCount[c("Sample", "Result")] + clonalityScore = unique(clonalityScore) + + clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) + lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) + + clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] + + + + ClonalityOverviewPrint <- function(dat){ + write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) + lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) +} diff -r 000000000000 -r 7d97fa9a0423 
RScript.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RScript.r Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,452 @@ +#options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +args <- commandArgs(trailingOnly = TRUE) + +inFile = args[1] +outFile = args[2] +outDir = args[3] +clonalType = args[4] +species = args[5] +locus = args[6] +selection = args[7] + + + +if (!("gridExtra" %in% rownames(installed.packages()))) { + install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") +} +library(gridExtra) +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +require(ggplot2) +if (!("plyr" %in% rownames(installed.packages()))) { + install.packages("plyr", repos="http://cran.xl-mirror.nl/") +} +require(plyr) + +if (!("data.table" %in% rownames(installed.packages()))) { + install.packages("data.table", repos="http://cran.xl-mirror.nl/") +} +library(data.table) + +if (!("reshape2" %in% rownames(installed.packages()))) { + install.packages("reshape2", repos="http://cran.xl-mirror.nl/") +} +library(reshape2) + + +test = read.table(inFile, sep="\t", header=TRUE, fill=T, comment.char="") + +test = test[test$Sample != "",] + +test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) +test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) +test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene) + +#test$VDJCDR3 = do.call(paste, c(test[c("Top.V.Gene", "Top.D.Gene", "Top.J.Gene","CDR3.Seq.DNA")], sep = ":")) +test$VDJCDR3 = do.call(paste, c(test[unlist(strsplit(clonalType, ","))], sep = ":")) + +PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ] +if("Functionality" %in% colnames(test)) { + PROD = test[test$Functionality == "productive" | test$Functionality == "productive (see comment)", ] +} + +NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | 
test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ] + +#PRODF = PROD[ -1] + +PRODF = PROD + +#PRODF = unique(PRODF) + + + +if(selection == "unique"){ + PRODF = PRODF[!duplicated(PRODF$VDJCDR3), ] +} + +PRODFV = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.V.Gene")]) +PRODFV$Length = as.numeric(PRODFV$Length) +Total = 0 +Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) + +PRODFD = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.D.Gene")]) +PRODFD$Length = as.numeric(PRODFD$Length) +Total = 0 +Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) + +PRODFJ = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Sample", "Top.J.Gene")]) +PRODFJ$Length = as.numeric(PRODFJ$Length) +Total = 0 +Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) +PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) +PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) + + +if(species == "human"){ + if(locus == "igh"){ + cat("human igh") + } else if (locus == "igk"){ + cat("human igk") + } else if (locus == "igl"){ + cat("human igl") + } +} else if (species == "mouse"){ + if(locus == "igh"){ + cat("mouse igh") + } else if (locus == "igk"){ + cat("mouse igk") + } else if (locus == "igl"){ + cat("mouse igl") + } +} + +V = 
c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53") +D = c("v.name\tchr.orderD\nIGHD1-1\t1\nIGHD2-2\t2\nIGHD3-3\t3\nIGHD6-6\t4\nIGHD1-7\t5\nIGHD2-8\t6\nIGHD3-9\t7\nIGHD3-10\t8\nIGHD4-11\t9\nIGHD5-12\t10\nIGHD6-13\t11\nIGHD1-14\t12\nIGHD2-15\t13\nIGHD3-16\t14\nIGHD4-17\t15\nIGHD5-18\t16\nIGHD6-19\t17\nIGHD1-20\t18\nIGHD2-21\t19\nIGHD3-22\t20\nIGHD4-23\t21\nIGHD5-24\t22\nIGHD6-25\t23\nIGHD1-26\t24\nIGHD7-27\t25") +J = c("v.name\tchr.orderJ\nIGHJ1\t1\nIGHJ2\t2\nIGHJ3\t3\nIGHJ4\t4\nIGHJ5\t5\nIGHJ6\t6") + + + + +tcV = textConnection(V) +Vchain = read.table(tcV, sep="\t", header=TRUE) +PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) +close(tcV) + + +tcD = textConnection(D) +Dchain = read.table(tcD, sep="\t", header=TRUE) +PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) +close(tcD) + + + +tcJ = textConnection(J) +Jchain = read.table(tcJ, sep="\t", header=TRUE) +PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) +close(tcJ) + +setwd(outDir) + +write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) + +pV = ggplot(PRODFV) +pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pV = pV + xlab("Summary of V gene") + 
ylab("Frequency") + ggtitle("Relative frequency of V gene usage") +write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +png("VPlot.png",width = 1280, height = 720) +pV +dev.off(); + +pD = ggplot(PRODFD) +pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") +write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +png("DPlot.png",width = 800, height = 600) +pD +dev.off(); + +pJ = ggplot(PRODFJ) +pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") +write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +png("JPlot.png",width = 800, height = 600) +pJ +dev.off(); + +VGenes = PRODF[,c("Sample", "Top.V.Gene")] +VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) +VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")]) +TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) +VGenes = merge(VGenes, TotalPerSample, by="Sample") +VGenes$Frequency = VGenes$Count * 100 / VGenes$total +VPlot = ggplot(VGenes) +VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of V gene families") + + ylab("Percentage of sequences") +png("VFPlot.png") +VPlot +dev.off(); +write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +DGenes = PRODF[,c("Sample", "Top.D.Gene")] 
+DGenes$Top.D.Gene = gsub("-.*", "", DGenes$Top.D.Gene) +DGenes = data.frame(data.table(DGenes)[, list(Count=.N), by=c("Sample", "Top.D.Gene")]) +TotalPerSample = data.frame(data.table(DGenes)[, list(total=sum(.SD$Count)), by=Sample]) +DGenes = merge(DGenes, TotalPerSample, by="Sample") +DGenes$Frequency = DGenes$Count * 100 / DGenes$total +DPlot = ggplot(DGenes) +DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of D gene families") + + ylab("Percentage of sequences") +png("DFPlot.png") +DPlot +dev.off(); +write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +JGenes = PRODF[,c("Sample", "Top.J.Gene")] +JGenes$Top.J.Gene = gsub("-.*", "", JGenes$Top.J.Gene) +JGenes = data.frame(data.table(JGenes)[, list(Count=.N), by=c("Sample", "Top.J.Gene")]) +TotalPerSample = data.frame(data.table(JGenes)[, list(total=sum(.SD$Count)), by=Sample]) +JGenes = merge(JGenes, TotalPerSample, by="Sample") +JGenes$Frequency = JGenes$Count * 100 / JGenes$total +JPlot = ggplot(JGenes) +JPlot = JPlot + geom_bar(aes( x = Top.J.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Distribution of J gene families") + + ylab("Percentage of sequences") +png("JFPlot.png") +JPlot +dev.off(); +write.table(x=JGenes, file="JFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) + +CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length.DNA")]) +TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) +CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") +CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total +CDR3LengthPlot = ggplot(CDR3Length) +CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), 
stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + ggtitle("Length distribution of CDR3") + + xlab("CDR3 Length") + + ylab("Percentage of sequences") +png("CDR3LengthPlot.png",width = 1280, height = 720) +CDR3LengthPlot +dev.off() +write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) + +revVchain = Vchain +revDchain = Dchain +revVchain$chr.orderV = rev(revVchain$chr.orderV) +revDchain$chr.orderD = rev(revDchain$chr.orderD) + +plotVD <- function(dat){ + if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("D genes") + + ylab("V Genes") + + png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) + print(img) + + dev.off() + write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) +} + +VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) + +VandDCount$l = log(VandDCount$Length) +maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) +VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) +VandDCount$relLength = VandDCount$l / VandDCount$max + +cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(test$Sample)) + +completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) +completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) +completeVD = 
merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) +VDList = split(completeVD, f=completeVD[,"Sample"]) + +lapply(VDList, FUN=plotVD) + + + +plotVJ <- function(dat){ + if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("J genes") + + ylab("V Genes") + + png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) + print(img) + dev.off() + write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) +} + +VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) + +VandJCount$l = log(VandJCount$Length) +maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) +VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) +VandJCount$relLength = VandJCount$l / VandJCount$max + +cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) + +completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) +completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) +completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) +VJList = split(completeVJ, f=completeVJ[,"Sample"]) +lapply(VJList, FUN=plotVJ) + +plotDJ <- function(dat){ + if(length(dat[,1]) == 0){ + return() + } + img = ggplot() + + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), 
fill=relLength)) + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + + xlab("J genes") + + ylab("D Genes") + + png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) + print(img) + dev.off() + write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) +} + +DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) + +DandJCount$l = log(DandJCount$Length) +maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) +DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) +DandJCount$relLength = DandJCount$l / DandJCount$max + +cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(test$Sample)) + +completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) +completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) +completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) +DJList = split(completeDJ, f=completeDJ[,"Sample"]) +lapply(DJList, FUN=plotDJ) + + +sampleFile <- file("samples.txt") +un = unique(test$Sample) +un = paste(un, sep="\n") +writeLines(un, sampleFile) +close(sampleFile) + + +if("Replicate" %in% colnames(test)) +{ + clonalityFrame = PROD + clonalityFrame$ReplicateConcat = do.call(paste, c(clonalityFrame[c("VDJCDR3", "Sample", "Replicate")], sep = ":")) + clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] + write.table(clonalityFrame, "clonalityComplete.csv", sep=",",quote=F,row.names=F,col.names=T) + + ClonalitySampleReplicatePrint <- function(dat){ + write.table(dat, 
paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".csv", sep=""), sep=",",quote=F,row.names=F,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,c("Sample", "Replicate")]) + #lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint) + + ClonalitySamplePrint <- function(dat){ + write.table(dat, paste("clonality_", unique(dat$Sample) , ".csv", sep=""), sep=",",quote=F,row.names=F,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,"Sample"]) + #lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint) + + clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "VDJCDR3")]) + clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) + clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count + clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") + + ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') + tcct = textConnection(ct) + CT = read.table(tcct, sep="\t", header=TRUE) + close(tcct) + clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) + clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight + + ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "VDJCDR3")]) + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) + clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads) + ReplicateReads$squared = ReplicateReads$Reads * ReplicateReads$Reads + + ReplicatePrint <- function(dat){ + write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSplit, 
FUN=ReplicatePrint) + + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) + + + ReplicateSumPrint <- function(dat){ + write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) + + clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) + clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow + clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) + clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) + + ClonalityScorePrint <- function(dat){ + write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityScore = clonalFreqCount[c("Sample", "Result")] + clonalityScore = unique(clonalityScore) + + clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) + lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) + + clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] + + + + ClonalityOverviewPrint <- function(dat){ + write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) + lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) +} + +if("Functionality" %in% 
colnames(test)) +{ + newData = data.frame(data.table(PROD)[,list(unique=.N, + VH.DEL=mean(X3V.REGION.trimmed.nt.nb), + P1=mean(P3V.nt.nb), + N1=mean(N1.REGION.nt.nb), + P2=mean(P5D.nt.nb), + DEL.DH=mean(X5D.REGION.trimmed.nt.nb), + DH.DEL=mean(X3D.REGION.trimmed.nt.nb), + P3=mean(P3D.nt.nb), + N2=mean(N2.REGION.nt.nb), + P4=mean(P5J.nt.nb), + DEL.JH=mean(X5J.REGION.trimmed.nt.nb), + Total.Del=( mean(X3V.REGION.trimmed.nt.nb) + + mean(X5D.REGION.trimmed.nt.nb) + + mean(X3D.REGION.trimmed.nt.nb) + + mean(X5J.REGION.trimmed.nt.nb)), + + Total.N=( mean(N1.REGION.nt.nb) + + mean(N2.REGION.nt.nb)), + + Total.P=( mean(P3V.nt.nb) + + mean(P5D.nt.nb) + + mean(P3D.nt.nb) + + mean(P5J.nt.nb))), + by=c("Sample")]) + write.table(newData, "junctionAnalysis.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) +} diff -r 000000000000 -r 7d97fa9a0423 asc.gif Binary file asc.gif has changed diff -r 000000000000 -r 7d97fa9a0423 bg.gif Binary file bg.gif has changed diff -r 000000000000 -r 7d97fa9a0423 complete.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/complete.sh Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,204 @@ +#!/bin/bash + +clonalType=${@:(-6):1} +html=${@:(-5):1} +imageDir=${@:(-4):1} +species=${@:(-3):1} +locus=${@:(-2):1} +selection=${@:(-1):1} +dataCount=`expr $# - 6` +inputData=${@:(1):dataCount} +echo ${inputData[@]} +dir="$(cd "$(dirname "$0")" && pwd)" +array=("$@") +echo "

Progress

" > $html +echo "" >> $html + +mkdir $PWD/igblastdatabase +unzip $dir/database.zip -d $PWD/igblastdatabase/ +export IGDATA=$PWD/igblastdatabase/ + +function blastAndParse { + echo "" >> $html + echo "igblastn -germline_db_V $PWD/igblastdatabase/database/human_gl_V -germline_db_J $PWD/igblastdatabase/database/human_gl_J -germline_db_D $PWD/igblastdatabase/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $PWD/igblastdatabase/optional_file/human_gl.aux -show_translation -outfmt 3 > $PWD/$4" + /home/galaxy/galaxy/igblast/igblastn -germline_db_V $PWD/igblastdatabase/database/human_gl_V -germline_db_J $PWD/igblastdatabase/database/human_gl_J -germline_db_D $PWD/igblastdatabase/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $PWD/igblastdatabase/optional_file/human_gl.aux -show_translation -outfmt 3 > $PWD/$4 + echo "" >> $html + + echo "" >> $html + perl $dir/igparse.pl $PWD/$4 0 | grep -v "D:" | cut -f2- > $5 + echo "" >> $html +} + +function imgtConvert { + echo "" >> $html + bash $dir/imgtconvert.sh $1 $2 $3 $4 + echo "" >> $html + +} + +id=${inputData[0]} +forwardSlash="/" +mergerInput=() +count=0 +for current in $inputData; do + if [[ "$current" != *"$forwardSlash"* ]]; then + id=$current + count=0 + mergerInput+=($id) + continue + fi + fileName=$(basename $current) + parsedFileName="${fileName%.*}" + parsedFileName="$PWD/$parsedFileName.parsed" + f=$(file $current) + zipType="Zip archive" + if [[ "$f" == *"$zipType"* ]] + then + echo "" >> $html + fileName=$(basename $current) + imgtConvert $current $id $count $parsedFileName & + else + echo "" >> $html + blastAndParse $current $id $count $fileName $parsedFileName & + fi + mergerInput+=($parsedFileName) + count=$((count+1)) +done +wait + + + +echo "" >> $html +echo "" >> $html + +python $dir/igblastmerge.py ${mergerInput[*]} --output $PWD/merged.txt + +echo "" >> $html +echo "" >> $html +echo "" >> $html + + +inputFile=$PWD/merged.txt +outputDir=$imageDir 
+outputFile=$outputDir/index.html +mkdir $outputDir +Rscript --verbose $dir/RScript.r $inputFile $outputDir $outputDir $clonalType $species $locus $selection 2>&1 +cp $dir/tabber.js $outputDir +cp $dir/style.css $outputDir +cp $dir/script.js $outputDir +echo "

Click here for the results

Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)
" > $html +#load the sample names before the first loop that uses \$samples; the assignment further down came too late for this loop +samples=`cat $outputDir/samples.txt` +echo "Report on:" >> $outputFile +for sample in $samples; do + echo " $sample" >> $outputFile +done +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile + +echo "
" >> $outputFile +echo "
" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "
" >> $outputFile + +samples=`cat $outputDir/samples.txt` +count=1 +echo "
" >> $outputFile +for sample in $samples; do + echo "
info
-----------------------------------
Starting blast of sample $3 of patient $2
Finished blast of sample $3 of patient $2
Starting parse of sample $3 of patient $2
Finished parse of sample $3 of patient $2
Starting imgt convert of sample $3 of patient $2
Finished conversion of sample $3 of patient $2
Sample $count of patient $id is a zip file, using IMGT Loader
Sample $count of patient $id is not a zip file, using igBLASTn
-----------------------------------
merging
done
-----------------------------------
plotting
" >> $outputFile + + mv "$outputDir/HeatmapVD_$sample.png" "$outputDir/VD_$sample.png" + echo "" >> $outputFile + mv "$outputDir/HeatmapVJ_$sample.png" "$outputDir/VJ_$sample.png" + echo "" >> $outputFile + mv "$outputDir/HeatmapDJ_$sample.png" "$outputDir/DJ_$sample.png" + echo "
" >> $outputFile + count=$((count+1)) +done +echo "" >> $outputFile + + +hasReplicateColumn="$(if head -n 1 $inputFile | grep -q 'Replicate'; then echo 'Yes'; else echo 'No'; fi)" +#if its a 'new' merged file with replicate info +if [[ "$hasReplicateColumn" == "Yes" ]] ; then + echo "
" >> $outputFile + for sample in $samples; do + clonalityScore="$(cat $outputDir/ClonalityScore_$sample.csv)" + echo "
" >> $outputFile + echo "" >> $outputFile + + #replicate,reads,squared + echo "" >> $outputFile + while IFS=, read replicate reads squared + do + + echo "" >> $outputFile + done < $outputDir/ReplicateReads_$sample.csv + + #sum of reads and reads squared + while IFS=, read readsSum squaredSum + do + echo "" >> $outputFile + done < $outputDir/ReplicateSumReads_$sample.csv + + #overview + echo "" >> $outputFile + while IFS=, read type count weight weightedCount + do + echo "" >> $outputFile + done < $outputDir/ClonalityOverView_$sample.csv + echo "
Clonality Score: $clonalityScore
Replicate IDNumber of ReadsReads Squared
$replicate$reads$squared
Sum$readsSum$squaredSum
Coincidence TypeRaw Coincidence FreqCoincidence WeightCoincidences, Weighted
$type$count$weight$weightedCount
" >> $outputFile + done + echo "
" >> $outputFile +fi + +hasJunctionData="$(if head -n 1 $inputFile | grep -q '3V-REGION trimmed-nt nb'; then echo 'Yes'; else echo 'No'; fi)" + +if [[ "$hasJunctionData" == "Yes" ]] ; then + echo "
" >> $outputFile + while IFS=, read Sample unique VHDEL P1 N1 P2 DELDH DHDEL P3 N2 P4 DELJH TotalDel TotalN TotalP + do + echo "" >> $outputFile + done < $outputDir/junctionAnalysis.csv + echo "
SampleuniqueVH.DELP1N1P2DEL.DHDH.DELP3N2P4DEL.JHTotal.DelTotal.NTotal.P
$Sample$unique$VHDEL$P1$N1$P2$DELDH$DHDEL$P3$N2$P4$DELJH$TotalDel$TotalN$TotalP
" >> $outputFile +fi + +echo "
" >> $outputFile +for sample in $samples; do + echo "" >> $outputFile +done +echo "
IDInclude
$sample
" >> $outputFile +echo "
" >> $outputFile +echo "
" >> $outputFile +echo "
" >> $outputFile +echo "
" >> $outputFile + +echo "
" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile + +echo "" >> $outputFile + +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile + +echo "" >> $outputFile +echo "" >> $outputFile +echo "" >> $outputFile + +for sample in $samples; do + echo "" >> $outputFile + echo "" >> $outputFile + echo "" >> $outputFile +done + +echo "
DescriptionLink
The dataset used to generate the frequency graphs and the heatmaps (Unique based on clonaltype, $clonalType)Download
The dataset used to calculate clonality score (Unique based on clonaltype, $clonalType)Download
The dataset used to generate the CDR3 length frequency graphDownload
The dataset used to generate the V gene family frequency graphDownload
The dataset used to generate the D gene family frequency graphDownload
The dataset used to generate the J gene family frequency graphDownload
The dataset used to generate the V gene frequency graphDownload
The dataset used to generate the D gene frequency graphDownload
The dataset used to generate the J gene frequency graphDownload
The data used to generate the VD heatmap for $sample.Download
The data used to generate the VJ heatmap for $sample.Download
The data used to generate the DJ heatmap for $sample.Download
" >> $outputFile +echo "
" >> $outputFile diff -r 000000000000 -r 7d97fa9a0423 complete_immunerepertoire.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/complete_immunerepertoire.xml Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,54 @@ + + + + complete.sh + #for $i, $f in enumerate($patients) + ${f.id} + #for $j, $g in enumerate($f.samples) + ${g.sample} + #end for + #end for + "$clonaltype_select" $out_file $out_file.files_path $species $locus $selection + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The entire Immune Repertoire pipeline as a single tool, input several FASTA files, give them an ID and it will BLAST, parse, merge and plot them. + + + igBlastn + + + diff -r 000000000000 -r 7d97fa9a0423 database.zip Binary file database.zip has changed diff -r 000000000000 -r 7d97fa9a0423 desc.gif Binary file desc.gif has changed diff -r 000000000000 -r 7d97fa9a0423 igblastmerge.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/igblastmerge.py Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,44 @@ +import sys +import pandas as pd + +def main(): + patients = {} + files = [] + sample_id = sys.argv[1] + imgt_files = 0 + blast_files = 0 + #organize files + for arg in sys.argv[2:-2]: + if arg.find("/") is -1: + patients[sample_id] = files + files = [] + sample_id = arg + else: + df = pd.read_csv(arg, sep="\t") + if "Functionality" in list(df.columns.values): + df["VDJ Frame"][df["Functionality"] != "productive"] = "In-frame with stop codon" + imgt_files += 1 + else: + blast_files += 1 + files.append(df) + patients[sample_id] = files + columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 
'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb', u'Sample', u'Replicate'] + if blast_files is not 0: + print "Has a parsed blastn file, using limited columns." + columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Sample', u'Replicate'] + + result = None + for patient_id, samples in patients.iteritems(): + count = 1 + for sample in samples: + sample['Sample'] = patient_id + sample['Replicate'] = str(count) + count += 1 + if result is None: + result = sample[columns] + else: + result = result.append(sample[columns]) + result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index") + +if __name__ == "__main__": + main() diff -r 000000000000 -r 7d97fa9a0423 igparse.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/igparse.pl Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,1252 @@ +#!/usr/bin/perl +=head1 IGBLAST_simple.pl + +This version (1.4) has been heavily adapted since the original program was first created back in October 2012. +Bas Horsman (EMC, Rotterdam, The Netherlands) has contributed with minor - though important - code changes. + +From V 1.2 onwards a 'Change Log' is included at the end of the program + +=head2 Usage + +Requires no modules in general use; the Data::Dumper (supplied as part of the Perl Core module set) might be useful for debugging/adjustment +as it allows inspection of the data stores. + +The program takes a text file of the + + ./IGBLAST_simple.pl igBLASTOutput.txt <-optional: index of record to process-> + +Supply the text version of the igBLAST report in the format as in the example below. +The extra command line arugment is the record number (aka. BLAST report) to process. 
+If 0 or absent all are processed, if supplied that record (base 1) is processed and the program dies afterwards. + +=head2 Example Input + +A standard igBLAST record or set of them in a file; this being typical: + + BLASTN 2.2.27+ + + +Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro A. +Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. +Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of +protein database search programs", Nucleic Acids Res. 25:3389-3402. + + + +Database: human_gl_V; human_gl_D; human_gl_J + 674 sequences; 179,480 total letters + + + +Query= HL67IUI01D26LR length=433 xy=1559_1437 region=1 +run=R_2012_04_10_11_57_56_ + +Length=433 + Score E +Sequences producing significant alignments: (Bits) Value + +lcl|IGHV3-30*04 330 2e-92 +lcl|IGHV3-30-3*01 330 2e-92 +lcl|IGHV3-30*01 327 2e-91 +lcl|IGHD3-16*01 14.4 11 +lcl|IGHD3-16*02 14.4 11 +lcl|IGHD1-14*01 12.4 43 +lcl|IGHJ4*02 78.3 1e-18 +lcl|IGHJ5*02 70.3 4e-16 +lcl|IGHJ4*01 68.3 2e-15 + + +Domain classification requested: imgt + + +V(D)J rearrangement summary for query sequence (Top V gene match, Top D gene match, Top J gene match, Chain type, V-J Frame, Strand): +IGHV3-30*04 IGHD3-16*01 IGHJ4*02 VH In-frame + + +V(D)J junction details (V end, V-D junction, D region, D-J junction, J start). 
Note that possible overlapping nucleotides at VDJ junction (i.e, nucleotides that could be assigned to either joining gene segment) are indicated in parentheses (i.e., (TACT)) but are not included under V, D, or J gene itself +AGAGA TATGAGCCCCATCATGACA ACGTTTG CCGGAA ACTAC + +Alignment summary between query and top germline V gene hit (from, to, length, matches, mismatches, gaps, percent identity) +FWR1 27 38 12 11 1 0 91.7 +CDR1 39 62 24 22 2 0 91.7 +FWR2 63 113 51 50 1 0 98 +CDR2 114 137 24 23 1 0 95.8 +FWR3 138 251 114 109 5 0 95.6 +CDR3 (V region only) 252 259 8 7 1 0 87.5 +Total N/A N/A 233 222 11 0 95.3 + + +Alignments + + <----FWR1--><----------CDR1--------><-----------------------FWR2------ + W A A S G F T F N T Y A V H W V R Q A P G K G + Query_1 27 TGGGCAGCCTCTGGATTCACCTTCAATACCTATGCTGTGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGC 96 +V 95.3% (222/233) IGHV3-30*04 64 ..T......................G..G.......A................................. 133 + C A A S G F T F S S Y A M H W V R Q A P G K G +V 95.7% (221/231) IGHV3-30-3*01 64 ..T......................G..G.......A................................. 133 +V 94.8% (221/233) IGHV3-30*01 64 ..T......................G..G.......A................................. 133 + + ----------------><----------CDR2--------><---------------------------- + L E W V A V I S Y D G S N K N Y A D S V K G R F + Query_1 97 TGGAGTGGGTGGCAGTTATATCATATGATGGAAGCAATAAAAACTACGCAGACTCCGTGAAGGGCCGATT 166 +V 95.3% (222/233) IGHV3-30*04 134 ..................................T......T............................ 203 + L E W V A V I S Y D G S N K Y Y A D S V K G R F +V 95.7% (221/231) IGHV3-30-3*01 134 .........................................T............................ 203 +V 94.8% (221/233) IGHV3-30*01 134 .A................................T......T............................ 
203 + + ---------------------------FWR3--------------------------------------- + T I S R D N S K N T L Y L Q M N S L R V E D T + Query_1 167 CACCATCTCCAGAGACAATTCCAAGAACACGTTATATCTGCAAATGAACAGCCTGAGAGTTGAGGACACG 236 +V 95.3% (222/233) IGHV3-30*04 204 ...............................C.G.........................C.......... 273 + T I S R D N S K N T L Y L Q M N S L R A E D T +V 95.7% (221/231) IGHV3-30-3*01 204 ...............................C.G.........................C.......... 273 +V 94.8% (221/233) IGHV3-30*01 204 ...............................C.G.........................C.......... 273 + + --------------> + A V Y Y C T R D M S P I M T T F A G N Y W G Q + Query_1 237 GCTGTTTATTACTGTACGAGAGATATGAGCCCCATCATGACAACGTTTGCCGGAAACTACTGGGGCCAGG 306 +V 95.3% (222/233) IGHV3-30*04 274 .....G.........G.......----------------------------------------------- 296 + A V Y Y C A R +V 95.7% (221/231) IGHV3-30-3*01 274 .....G.........G.....------------------------------------------------- 294 +V 94.8% (221/233) IGHV3-30*01 274 .....G.........G.......----------------------------------------------- 296 +D 100.0% (7/7) IGHD3-16*01 12 ------------------------------------------.......--------------------- 18 +D 100.0% (7/7) IGHD3-16*02 12 ------------------------------------------.......--------------------- 18 +D 100.0% (6/6) IGHD1-14*01 8 -------------------------------------------------......--------------- 13 +J 100.0% (39/39) IGHJ4*02 10 -------------------------------------------------------............... 24 +J 100.0% (35/35) IGHJ5*02 17 -----------------------------------------------------------........... 27 +J 97.4% (38/39) IGHJ4*01 10 -------------------------------------------------------.............A. 24 + + + G T L V T V S S + Query_1 307 GAACCCTGGTCACCGTCTCCTCAG 330 +J 100.0% (39/39) IGHJ4*02 25 ........................ 48 +J 100.0% (35/35) IGHJ5*02 28 ........................ 51 +J 97.4% (38/39) IGHJ4*01 25 ........................ 
48 + + +Lambda K H + 1.10 0.333 0.549 + +Gapped +Lambda K H + 1.08 0.280 0.540 + +Effective search space used: 64847385 + + +Query= HL67IUI01EQMLY length=609 xy=1826_1636 region=1 +run=R_2012_04_10_11_57_56_ + + +...etc... + +=head2 Example Output + + +Example output from the data above sent: + $ ./IGBLAST_simple.pl igBLASTOutput.txt 1 + D: Request to process just record '1' received + D: printOUTPUTData: Running + D: printOUTPUTData: HEADER Printout requested 'ID VDJ Frame Top V Gene Top D Gene Top J Gene CDR1 Seq CDR1 Length CDR2 Seq CDR2 Length CDR3 Seq CDR3 Length CDR3 Found How' + OUTPUT: # ID VDJ Frame Top V Gene Top D Gene Top J Gene CDR1 Seq CDR1 Length CDR2 Seq CDR2 Length CDR3 Seq CDR3 Length CDR3 Found How + D: ID is: 'HL67IUI01D26LR' + D: Minimum base marked-up (27) - aka. $AlignmentStart; maximum: (259) + D: Starting Search for CDR3 + D: markUpCDR3: Passed Parameters '251, 27, TGGGG....GG., WG.G' (& AA & DNA sequence) + D: markUpCDR3: returning: 223, 282, MOTIF_FOUND_IN_BOTH, (3) [NB: offset of :'+ 27' + D: CDR3 was found by pattern matching: 'MOTIF_FOUND_IN_BOTH' (250, 309) + D: Top Hits (raw)= 'IGHV3-30*04 IGHD3-16*01 IGHJ4*02 VH In-frame +' + D: Top Hits (parsed)= 'IGHV3-30*04, IGHD3-16*01, IGHJ4*02, VH, In-frame, +' + D: printOUTPUTData: Running + OUTPUT: HL67IUI01D26LR In-frame IGHV3-30*04 IGHD3-16*01 IGHJ4*02 GFTFNTYA 23 ISYDGSNK 23 CTRDMSPIMTTFAGNYWGQG 59 MOTIF_FOUND_IN_BOTH + +=head4 Usage notes: + +Designed to be easy to "grep -v D:" or "grep OUTPUT:" for to select the parts you need: + + ./IGBLAST_simple.pl igBLASTOutput.txt 1 | grep OUTPUT: + + OUTPUT: # ID VDJ Frame Top V Gene Top D Gene Top J Gene CDR1 Seq CDR1 Length CDR2 Seq CDR2 Length CDR3 Seq CDR3 Length CDR3 Found How + OUTPUT: HL67IUI01D26LR In-frame IGHV3-30*04 IGHD3-16*01 IGHJ4*02 GFTFNTYA 23 ISYDGSNK 23 CTRDMSPIMTTFAGNYWGQG 59 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01EQMLY In-frame IGHV4-39*01 IGHD2-8*01 IGHJ3*02 GGSISSSSYY 29 IYHSGST 20 CARDATYYSNGFDIWGQG 53 MOTIF_FOUND_IN_BOTH + 
OUTPUT: HL67IUI01CDCLP Out-of-frame IGHV3-23*01 IGHD3-3*01 IGHJ4*02 FSNYAM 16 SGSGDRTY 23 AKAD*FLEWLFRIGDGERLLGPGN 72 MOTIF_FOUND_IN_DNA + OUTPUT: HL67IUI01AHRNH N/A IGHV3-33*01 N/A N/A WIHLQ*LW 23 YGMMEVI 23 NOT_FOUND + OUTPUT: HL67IUI01DZZ1V Out-of-frame IGHV3-23*01 IGHD5-12*01 IGHJ4*02 GFTFDKYA 23 ILASG 20 LYCASEGDIVASELLSTGARV 62 MOTIF_FOUND_IN_DNA + OUTPUT: HL67IUI01DTR2Y Out-of-frame IGHV3-23*01 IGHD5-12*01 IGHJ4*02 LDSPLTNM 23 LYLPVV 20 TVRVRGT*WLRSF*VLGPG 59 MOTIF_FOUND_IN_DNA + OUTPUT: HL67IUI01EQL3S In-frame IGHV7-4-1*02 IGHD6-19*01 IGHJ6*02 GYTFRTFT 23 INTNTGTP 23 CAKESGTGSAHFFYGMDVWGQG 65 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01AFG46 In-frame IGLV2-34*01 N/A IGHJ4*02 NOT_FOUND + OUTPUT: HL67IUI01EFFKO In-frame IGHV3-11*01 IGHD6-6*01 IGHJ4*02 GFTFSDYY 23 ISYSGGTI 23 CARASGAARHRPLDYWGQG 56 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01B18SG In-frame IGHV3-33*01 IGHD5-12*01 IGHJ4*02 VRQA 11 KYYANSVK 23 RLGGFDYWGQGTLVTVSS 53 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01D6LER In-frame IGHV1-24*01 IGHD3-22*01 IGHJ4*02 GYSLNELS 23 PDPEDDE 23 TVQPSRITMMAVVITRIHWGASGARE 76 MOTIF_FOUND_IN_DNA + OUTPUT: HL67IUI01CYCLF N/A IGHV4-39*01 N/A N/A GGSISSSSYY 29 IYYSGST 20 NOT_FOUND + OUTPUT: HL67IUI01B4LEE In-frame IGHV7-4-1*02 IGHD6-19*01 IGHJ6*02 GYTFRTFT 23 INTNTGTP 23 CAKESGTGSAHFFYGMDVWGQG 65 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01A4KW4 Out-of-frame IGHV3-23*01 IGHD5-12*01 IGHJ4*02 LDSPLTNM 23 LYLPVV 20 TVRVRGT*WLRSF*IWGQG 58 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01E05BV In-frame IGHV1-24*01 IGHD3-22*01 IGHJ2*01 GYSLNELS 23 PDPEDDE 23 NOT_FOUND + OUTPUT: HL67IUI01CVVKY In-frame IGHV1-3*01 IGHD2-15*01 IGHJ1*01 NOT_FOUND + OUTPUT: HL67IUI01CN5P2 In-frame IGHV7-4-1*02 IGHD2-21*02 IGHJ5*02 GYSITDYG 23 LNTRTGNP 23 CAVKDARDFVSWGQG 44 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01DUUJ5 In-frame IGHV3-21*01 IGHD1-7*01 IGHJ4*02 GYTFSTYS 23 ISSSSAYR 23 CARDIRLELRDWGQG 44 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01E1AIR Out-of-frame IGHV4-39*01 N/A IGHJ3*01 WGLHRRW**L 29 FVS*RAPR 23 NOT_FOUND + 
OUTPUT: HL67IUI01CCZ8D Out-of-frame IGHV3-23*01 IGHD5-12*01 IGHJ4*02 GFTFDKYA 23 ILASGR 20 YCASEGDIVASELLSTGARE 58 MOTIF_FOUND_IN_DNA + OUTPUT: HL67IUI01BT9IR N/A IGHV3-21*02 N/A N/A NOT_FOUND + OUTPUT: HL67IUI01COTO0 Out-of-frame IGHV4-39*01 N/A IGHJ3*01 GGFIGGGDNF 29 LYHDGRPA 23 NOT_FOUND + OUTPUT: HL67IUI01D994O In-frame IGHV7-4-1*02 IGHD2-21*02 IGHJ5*02 GYSITDYG 23 LNTRTGNP 23 CAVKDARDFVSWGQG 44 MOTIF_FOUND_IN_BOTH + OUTPUT: HL67IUI01A08CJ In-frame IGHV4-39*01 IGHD6-13*01 IGHJ5*02 GGSISSSSYY 29 IYYTWEH 21 CERARRGSSWGQLVRPLGPG 62 MOTIF_FOUN + + + + OUTPUT: # ID VDJ Frame Top V Gene Top D Gene Top J Gene CDR1 Seq CDR1 Length CDR2 Seq CDR2 Length CDR3 Seq CDR3 Length CDR3 Found How + OUTPUT: HL67IUI01D26LR In-frame IGHV3-30*04 IGHD3-16*01 IGHJ4*02 GFTFNTYA 23 ISYDGSNK 23 CTRDMSPIMTTFAGNYWGQG 59 MOTIF_FOUND_IN_BOTH + ...etc... + +=head4 Also, combined grep & sed: + + $ ./IGBLAST_simple.pl igBLASTOutput.txt | grep OUTPUT: | sed 's/OUTPUT:\t//' + +=cut + +=head3 CDR3 Patterns: + +We use these two variables to try to identify the end of the CDR3 region if igBLAST doesn't report it directly: + + my $DNACDR3_Pat = "TGGGG....GG."; + my $AASequenceMotifPattern = "WG.G"; + +They are treated as regex's when tested (so use "." to mean any DNA base, rather than 'N' or 'X'). + +[NB: These are original patterns used for testing, check the code for the current ones.] + +=cut + +my $DNACDR3_Pat = "TGGGG....GG."; +my $AACDR3_Pat = "WG.G"; + +use strict; +use Data::Dumper; +# Set this as to number of the result (aka "record") you want to process or 0 for all: +my $ProcessRecord =0; +if (defined $ARGV[1]) { $ProcessRecord = pop @ARGV; } #Also accept from the command line: +if ($ProcessRecord != 0) { print "D: Request to process just record '$ProcessRecord' received\n"; } + +#Adjust the record separator: +$/="Query= "; +my $Record=0; # A simple counter, that we might not use. 
+#Force-loaded header / version information: +my $Header = <>; +#At the moment we don't use this - so dump it immediately: +$Header = undef; +#print "D: Force-loaded header / version information: '$Header'\n"; + +#Print the Header for the output line (we need this once, at the start) +print &printOUTPUTData ({"HEADER" => 1})."\n"; + +while (<>) + { +=head4 First check - should we be processing this record at all? + +=cut + $Record++; #Increment the record counter: + #Do we process this record - or all records? + if ($ProcessRecord != $Record && $ProcessRecord != 0) + { next; } #We need to increment the record counter before we increment + +=head4 Setup the output line storage and print the header: + +We enter this initially and work to change it: + + $DomainBoundaries{"CDR3"}{"FoundHow"} = "NOT_FOUND"; + +=cut + + my %OUTPUT_Data; #To collect data for the output line in + #Assume the first and work to find better: + $OUTPUT_Data{"CDR3 Found How"} = "NOT_FOUND"; + #The whole record - one per read - is now stored in $_ + my @Lines =split (/[\r\n]+/,$_); # split on windows/linux/mac new lines + + #If you are interested enable either of the next lines depending on how curious you are as to how the splitting went: + #print "D: Record #$Record\n"; print $_; print "\n---------\n"; + print "D: ''$Lines[0]'\nD: ...etc...'\nD: ############\n"; + +=head3 Get the ID + +Quite easy: the first field on the first line: + + Query= HL67IUI01DTR2Y length=577 xy=1452_0984 region=1 + +=cut + + (my $ID) = $Lines[0]=~ m/^(\S+)/; + unless (defined $ID && $ID ne "") + { # So a near total failure...? 
+ $OUTPUT_Data{"ID"} = "Unknown"; + print &printOUTPUTData (\%OUTPUT_Data)."\n"; + next; #No ID is terminal for this record + } + else + { + print "D: ID is: '$ID'\n"; + $OUTPUT_Data{"ID"} = $ID; + } +=head3 Declare the variables we will need here in the next few sections to store data + +=cut + + my $CurrentRegion; + my $RegionMarkup; + + #So we can sync the coordinated of the alignment up to the domains found: + my $Query_Start = -1; my $Query_End = -1; + + #Where on the Query Sequence (i.e. the 454 read) does the alignment start & stop? + my $ThisQueryStart =-1; my $ThisQueryEnd =-1; #Think $ThisQueryEnd isn't used at the moment. + my $DNAQuerySequence =""; #The actual DNA Query sequence... + my $AAQuerySequence = ""; + + #As this changes with the alleles identified: + my $CurrentAASequence; + #The main storage variables + + my %Alginments; my %Alleles; + my %DomainBoundaries; + +=head2 Stanza 1: Get the general structure of the sequence identified + +=head3 Method 1: Use the table supplied + +Technically this valid for the top hit...realistically this is the only information we have reported to us +so we use this or nothing. This is fine for the top hit which is likely what we are interested in....but for the 2nd or 3rd? Who knows! 
+ +Targets this block: + + Alignment summary between query and top germline V gene hit (from, to, length, matches, mismatches, gaps, percent identity) + FWR1 167 240 75 72 2 1 96 + CDR1 241 264 24 20 4 0 83.3 + FWR2 265 315 51 48 3 0 94.1 + CDR2 316 336 24 15 6 3 62.5 + FWR3 337 450 114 106 8 0 93 + CDR3 (V region only) 451 454 4 4 0 0 100 + Total N/A N/A 292 265 23 4 90.8 + +Then we split out the lines inside it in a second scanning step - less optimal but easier to read: + + FWR1 167 240 75 72 2 1 96 + CDR1 241 264 24 20 4 0 83.3 + FWR2 265 315 51 48 3 0 94.1 + CDR2 316 336 24 15 6 3 62.5 + FWR3 337 450 114 106 8 0 93 + CDR3 (V region only) 451 454 4 4 0 0 100 + +into: + + (Section, from, to, length, matches, mismatches, gaps, percent identity) + +=head3 Method 2: Use the table supplied + +The other way to do this is to split the graphical markup out of the alignment. +This works for _any_ reported alignment, not just the top hits: + +In the main alignment table processing section collect the information, collect the information: + + #Is region mark-up: + if ($#InfoColumns == -1 && $#AlignmentColumns ==0) + { +# print ": Region Markup detected\n"; + $RegionMarkup = $RegionMarkup.$AlignmentPanel; #Collect the information, then re-synthesise it at the end of record + next; + } + +Then afterwards when all the region was collected, process it like this: +#Pad the CDER3 region: + + #Remove the trailing spaces: + $RegionMarkup =~ s/ *$//g; + #Calculate the length of the CDR3 region so we can add it in: + my $CDR3PaddingNeeded = ($Query_End-$Query_Start)-length ($RegionMarkup) -length ("<-CDR3>")+1; + #Build up the CDR3 region, the 'x' operator is very helpful here (implict foreach loop): + $RegionMarkup = $RegionMarkup."<-CDR3"."-" x $CDR3PaddingNeeded. 
">"; + #print "D: Need to pad with:'$CDR3PaddingNeeded' characters\n"; + + #Now really process it: + my $C_Pos = 0; + my @Domains = split (/(<*-*...[123]-*>*)/,$RegionMarkup); # + foreach my $C_Domain (@Domains) + { + if (length ($C_Domain) <=0) {next;} + my $DomainStart= $C_Pos; + my $DomainEnd = $DomainStart + length ($C_Domain)-1; + my ($DomainType) = $C_Domain =~ m/(...[123])/; +# print "D: $DomainType \t($DomainStart-$DomainEnd=",$DomainEnd-$DomainStart,"):\t$C_Domain\n"; + $DomainBoundaries{$DomainType}{"Start"} = $DomainStart; + $DomainBoundaries{$DomainType}{"End"} = $DomainEnd; + $C_Pos = $DomainEnd+1; + } + +The two pieces of code are interchangable; the table version as used below, is neater, easier to understand and works nicely. +Why stress? + + +=head3 The end of the FWR3 is the start of CDR3? + +This is an assumption made. Hence the two variables: + + my $MaxDomainReported =0 ; # In nts / bps + my $FWR3_Found_Flag = 0; # Did we find the end of the FWR3 - which is the start of the CDR3. Set to 'false' initially. + + $MaxDomainBaseFound + +=cut + my $MaxDomainBaseFound =0 ; # In nts / bps + my $AlignmentStart ; # In nts /bp #Alternative name would be: '$MinDomainBaseFound'; set to null until primed +# my $FWR3_Found_Flag = 0; # Did we find the end of the FWR3 - which is the start of the CDR3. Set to 'false' initially. 
+ + (my @StructureSummaryTable) = returnLinesBetween (\@Lines, "Alignment summary", "Total" ); +#Enable the next line if you want the raw data we are going to parse in this section: + #print Dumper @StructureSummaryTable; + foreach my $C_Section (@StructureSummaryTable) + { + my ($DomainType, $DomainStart, $DomainEnd, $SectLength, $Matches, $Mismatches, $Gaps, $PID) = split (/\t+/,$C_Section); + #print "D: Domain type: '$DomainType'\n"; + #$DomainType =~ s/ .*$//g; + $DomainBoundaries{$DomainType}{"Start"} = $DomainStart; + $DomainBoundaries{$DomainType}{"End"} = $DomainEnd; + +#So we can do a reality check on the length / start of the CDR3 if we have to go looking: + if ($MaxDomainBaseFound <= $DomainEnd) + { $MaxDomainBaseFound = $DomainEnd; } #Store the maximum base found + if ($AlignmentStart eq undef or $AlignmentStart >= $DomainStart) + { $AlignmentStart = $DomainStart; } + } +#print Dumper %DomainBoundaries; +#die "HIT BLOCK\n"; + +=head3 Did we find the CDR3 region specifically? + +If we did fine; otherwise try to find it using the FWR3 region if we found that; otherwise give up. + +=cut + print "D: Minimum base marked-up ($AlignmentStart) - aka. \$AlignmentStart; maximum: ($MaxDomainBaseFound)\n"; + +#my @WantedSections = qw (V D J); + +=head2 Second Stanza: Parse the main Alignment Table + +=head3 Get the table, then determine the character at which to split the 'Info' & 'Alignment' panels. + +As this is a little involved and comparamentalises nicely we sub-contract this to two functions"" + + (my @Table) = returnLinesBetween (\@Lines, "Alignment", "Lambda" ); + my $PanelSplitPoint = findSplitPoint (\@Table); #Why can't they just use a fixed field width or a tab as a delimiter? + +=cut + (my @Table) = returnLinesBetween (\@Lines, "Alignment", "Lambda" ); + my $PanelSplitPoint = findSplitPoint (\@Table); #Why can't they just use a fixed field width or a tab as a delimiter? 
+#If you are interested, enable this line: +# print "D: The info panel was detected at: '$splitPoint'\n"; + +=head3 + +=cut + + +foreach my $C_Line (0..$#Table) + { + +=head3 Call the line type we find: There are 4: + +These are distinguished by the number of fields (one or mores spacer is a field separator) in the Info & Alignment Panels (see values in brackets) + + | <- This split is ~40 chars. from the start of the line + * InfoPanel * | * Alignment Panel * + : is a "Blank" line (-1,-1) + <----FWR1--><----------CDR1--------><-----------------------FWR2------ : is "Region Markup" (-1,0) + W A A S G F T F N T Y A V H W V R Q A P G K G : is "AA Sequence" (-1, >=0) + Query_1 27 TGGGCAGCCTCTGGATTCACCTTCAATACCTATGCTGTGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGC 96 : is "DNA Sequence" (2,1) + V 95.3% (222/233) IGHV3-30*04 64 ..T......................G..G.......A................................. 133 : is "" " + +So we split 40 chars in and then the two parts on spaces. + + +=cut + +# print "D: (sub) Line in parsed table: '$C_Line': \n"; + + my ($InfoPanel, $AlignmentPanel) = $Table[$C_Line] =~ /^(.{$PanelSplitPoint})(.*)$/; + + my @InfoColumns = split (/\s+/,$InfoPanel); + my @AlignmentColumns = split (/\s+/,$AlignmentPanel); + +#If you want to see how the line is being split enable either of these next two lines; the 2nd is more detailed than the first +# print "D: Line: $C_Line/t Number of Columns (Info, Alignment): \t$#InfoColumns \t $#AlignmentColumns\n"; +# print "D: For '$C_Line' \t line in the table there are parts: '$InfoPanel' [$#InfoColumns], '$AlignmentPanel [$#AlignmentColumns]'\n"; + +#Populate this so we can step through it + +=head4 Is a blank line: +=cut + if ($#InfoColumns == -1 && $#AlignmentColumns == -1) + { +# print ": Blank\n"; + next; + } #For now I think we just skip - is not needed (though might be implict mark-up) + +=head4 Is region mark-up: +=cut + if ($#InfoColumns == -1 && $#AlignmentColumns ==0) + { +# print ": Region Markup detected\n"; + 
$RegionMarkup = $RegionMarkup.$AlignmentPanel; #Collect the information, then re-synthesise it at the end of record + next; + } +=head4 Is query DNA Sequence: +=cut + if ($#InfoColumns == 2 && $#AlignmentColumns ==1) + { +# print ": DNA Query Sequence\n"; + #Detect the two coordinatates of alignment against the query sequence: (last two numbers of the two 'panels') + ($ThisQueryStart) = $InfoPanel =~ / (\d+) *$/; + ($ThisQueryEnd) = $AlignmentPanel =~ / (\d+) *$/; + my ($ThisDNASeq) = $AlignmentPanel =~ /^(.*?) /; + #If you want to know what we just found: + #print "D: This DNA Sequence: '$ThisDNASeq'\n"; + $DNAQuerySequence = $DNAQuerySequence. $ThisDNASeq; #Add it on to whatever we already have. + #Move the needle if there are smaller / greater; otherwise prime the 'needles': + if ($ThisQueryStart < $Query_Start or $Query_Start == -1) + { $Query_Start = $ThisQueryStart; } + if ($ThisQueryEnd > $Query_End or $Query_End == -1) + { $Query_End = $ThisQueryEnd; } +# print ": Query DNA Sequence detected This line: ($ThisQueryStart, $ThisQueryEnd) & Maximally: ($Query_Start, $Query_End)\n"; + next; + } +=head4 Is AA Sequence: + +This is complicated as it Need to decide whether this is the sequence of the read or that of the original V / D / J regions: + --------------> + A V Y Y C T R D M S P I M T T F A G N Y W G Q << Want this + Query_1 237 GCTGTTTATTACTGTACGAGAGATATGAGCCCCATCATGACAACGTTTGCCGGAAACTACTGGGGCCAGG 306 + V 95.3% (222/233) IGHV3-30*04 274 .....G.........G.......----------------------------------------------- 296 + A V Y Y C A R + V 95.7% (221/231) IGHV3-30-3*01 274 .....G.........G.....------------------------------------------------- 294 + + ...etc... + G T L V T V S S << Want this + Query_1 307 GAACCCTGGTCACCGTCTCCTCAG 330 + +To solve this we peak at the next line that it has the tag "Query" in it (we assume the line exists...) 
+ +=cut + + if ($#InfoColumns == -1 && $#AlignmentColumns >=-1) + { + unless ($Table[$C_Line+1] =~ /Query/) { next; } #Is the next line the DNA sequence ? + # +# print ": AA sequence\n"; + + + $CurrentAASequence = $AlignmentPanel; + #print "D: Panel Split Point = $PanelSplitPoint, '$AlignmentPanel'\n"; + $CurrentAASequence =~ s/^ {$PanelSplitPoint}//; + #print "D: '$AAQuerySequence'\n"; +# print "D: Current AA Sequence: \t'$CurrentAASequence'\n"; + $AAQuerySequence = $AAQuerySequence.$CurrentAASequence; #Store the elongating AA Sequence as well + next; + } +=head4 Is Alignment: +=cut + if ($#InfoColumns == 4 && $#AlignmentColumns ==1) + { + #Not acutally interesting to us for this version of the parser. Delete ultimately? + next; + } + +#Is weird! Don't recognise it! + + warn "Weird! Don't recongnise this: '$ID' [$#InfoColumns,$#AlignmentColumns]// '",$Lines[$C_Line],15,"...'\n"; + } #End main iteration loop for alignment parsing. + + +=head2 The CDR3 is noted as problematic. Can we identify it? + +=cut + print "D: Starting Search for CDR3\n"; + #Do have the end of the FWR3 but not the CDR3? If so then it is worth trying to find the CDR3, otherwise...nothing we can do at this point + if (exists ($DomainBoundaries{"FWR3"}{"End"}) + && $AlignmentStart !=0 + && not (exists $DomainBoundaries{"CDR3"}{"End"}) ) #Guess we need to go looking for the end then... 
+ { + #print "D: Placing call to markUpCDR3\n"; + my ($CDR3_Start, my $CDR3_End, my $CDR3_Found_Tag) = markUpCDR3 ($DNAQuerySequence, $AAQuerySequence, + $DomainBoundaries{"FWR3"}{"End"}, $AlignmentStart, + $DNACDR3_Pat, $AACDR3_Pat); + if ($CDR3_Start !=0 && $CDR3_End !=0) + { + $DomainBoundaries{"CDR3"}{"Start"} = $CDR3_Start; + $DomainBoundaries{"CDR3"}{"End"} = $CDR3_End ; + $DomainBoundaries{"CDR3"}{"FoundHow"} = $CDR3_Found_Tag; + print "D: CDR3 was found by pattern matching: '$CDR3_Found_Tag' ($CDR3_Start, $CDR3_End)\n"; + } + else + { print "D: CDR3 was not found [either by igBLAST or by pattern matching]\n"; + $DomainBoundaries{"CDR3"}{"FoundHow"} = "NOT_FOUND"; + } + } + else + { #Was reported by igBLAST + print "D: Found the FWR3 from the Domain Boundary Table\n"; + $DomainBoundaries{"CDR3"}{"FoundHow"} = "IGBLAST_NATIVE"; + } + +#print Dumper %DomainBoundaries; + +=head2 Get the top VDJ regions: + +=cut + +=head2 Extract General Features: + +=cut + (my $TopHit) = $_ =~ m/V-J Frame, Strand\):\n(.*?)\n/s; + print "D: Top Hits (raw)= '$TopHit' \n"; + my ($Top_V_gene_match, $Top_D_gene_match, $Top_J_gene_match, $Chain, $VJFrame, $Strand) = split (/\t/,$TopHit); + print "D: Top Hits (parsed)= '$Top_V_gene_match, $Top_D_gene_match, $Top_J_gene_match, $Chain, $VJFrame, $Strand'\n"; + +=head2 Store the V / D / J Genes used + +=cut + + if (defined $Top_V_gene_match && $Top_V_gene_match ne "") + { $OUTPUT_Data{"Top V Gene"} = $Top_V_gene_match; } + + if (defined $Top_D_gene_match && $Top_D_gene_match ne "") + { $OUTPUT_Data{"Top D Gene"} = $Top_D_gene_match; } + + if (defined $Top_J_gene_match && $Top_J_gene_match ne "") + { $OUTPUT_Data{"Top J Gene"} = $Top_J_gene_match; } + + if (defined $Strand && $Strand ne "") + { $OUTPUT_Data{"Strand"} = $Strand;} + +=head4 Preamble: ID, Frame, and V / D / J used: + +=cut + #Do a reality check: if we didn't get an ID, then skip: + unless (defined (defined $ID) && $ID ne "" && + defined $VJFrame && $VJFrame ne "") + { + 
print &printOUTPUTData (\%OUTPUT_Data)."\n"; + next; + } + +#Ok, so we have data...most likely: + #print "OUTPUT:\t",join ("\t", $ID, $VJFrame, $Top_V_gene_match, $Top_D_gene_match, $Top_J_gene_match); + + if (defined $VJFrame && defined $ID && $VJFrame ne "" && $ID ne "") + { $OUTPUT_Data{"VDJ Frame"} = $VJFrame;} + else + { + print &printOUTPUTData (\%OUTPUT_Data)."\n"; + next; + }#REALLY? We didn't find anything? Oh well, move to next record + +=head4 CDR1 + +=cut + #Remember that the alignment starts at the FWR1 start, not nt =0 on the read, hence we substract this off all future AA (& DNA coordinates) + + my $AlignmentOffset = $DomainBoundaries{"FWR1"}{"Start"}; + +# print "D: AA Seqeunce is: '$AAQuerySequence'\n"; + if (exists $DomainBoundaries{"CDR1"}{"Start"}) #It is very possible that it doesn't; assume the End does though if we find the Start + { +# my $VRegion = $Alginments{"V"}{$C_VRegion}; #Convenience.... + my $CDR1Start = $DomainBoundaries{"CDR1"}{"Start"}; + my $CDR1End = $DomainBoundaries{"CDR1"}{"End"}; + my $CDR1_Length = $CDR1End - $CDR1Start; +# print "D: CDR1 $CDR1Start $CDR1End = $CDR1_Length\n"; + #Remember that the alignment starts at the FWR1 start, not nt =0 on the read + my $CDR1_Seq_AA = substr ($AAQuerySequence, $CDR1Start - $AlignmentOffset, $CDR1_Length); +# print "D: '$CDR1_Seq_AA'\n"; + $CDR1_Seq_AA =~ s/ //g; + my $CDR1_Seq_AA_Length = length ($CDR1_Seq_AA); + #Add this data to the output store specifically: + $OUTPUT_Data{"CDR1 Seq"} = $CDR1_Seq_AA; + $OUTPUT_Data{"CDR1 Length"} = $CDR1_Length; + } + #What happens if there is no CDR1 found? Leave blank - the output routine can handle this + +=head4 CDR2 + +=cut + + if (exists $DomainBoundaries{"CDR2"}{"Start"}) #It is very possible that it doesn't; assume the End does though if we find the Start + { +# my $VRegion = $Alginments{"V"}{$C_VRegion}; #Convenience.... 
+ my $CDR2Start = $DomainBoundaries{"CDR2"}{"Start"}; + my $CDR2End = $DomainBoundaries{"CDR2"}{"End"}; + my $CDR2_Length = $CDR2End - $CDR2Start; + my $CDR2_Seq_AA = substr ($AAQuerySequence, $CDR2Start - $AlignmentOffset , $CDR2_Length); + $CDR2_Seq_AA =~ s/ //g; + my $CDR2_Seq_AA_Length = length ($CDR2_Seq_AA); + #Add this data to the output store specifically: + $OUTPUT_Data{"CDR2 Seq"} = $CDR2_Seq_AA; + $OUTPUT_Data{"CDR2 Length"} = $CDR2_Length; + } + #What happens if there is no CDR2 found? Leave blank - the output routine can handle this. + +=head4 CDR3 + +=cut + if (exists $DomainBoundaries{"CDR3"}{"Start"}) #It is very possible that it doesn't; assume the End does though if we find the Start + { +# my $VRegion = $Alginments{"V"}{$C_VRegion}; #Convenience.... + my $CDR3Start = $DomainBoundaries{"CDR3"}{"Start"}; + my $CDR3End = $DomainBoundaries{"CDR3"}{"End"}; + my $CDR3_Length = $CDR3End - $CDR3Start; # This variable isn't used - delete it when safe to do so + my $CDR3_Seq_AA = substr ($AAQuerySequence, $CDR3Start - $AlignmentOffset, $CDR3_Length); + my $CDR3_Seq_DNA = substr ($DNAQuerySequence, $CDR3Start - $AlignmentOffset, $CDR3_Length); + $CDR3_Seq_AA =~ s/ //g; + $CDR3_Seq_DNA =~ s/ //g; + my $CDR3_Seq_AA_Length = length ($CDR3_Seq_AA); + my $CDR3_Seq_DNA_Length = length ($CDR3_Seq_DNA); + #Add this data to the output store specifically: + $OUTPUT_Data{"CDR3 Seq"} = $CDR3_Seq_AA; + $OUTPUT_Data{"CDR3 Length"} = $CDR3_Seq_AA_Length; + $OUTPUT_Data{"CDR3 Seq DNA"} = $CDR3_Seq_DNA; + $OUTPUT_Data{"CDR3 Length DNA"} = $CDR3_Seq_DNA_Length; + #And in the case of the CDR3 how we found it: + $OUTPUT_Data{"CDR3 Found How"} = $DomainBoundaries{"CDR3"}{"FoundHow"}; + } + #What happens if there is no CDR3 found? Leave blank - the output routine can handle this. +#die "HIT BLOCK\n"; +#End of the record; output the data we have collected and move on. 
+print &printOUTPUTData (\%OUTPUT_Data)."\n";
+}
+
+
+
+############
+sub returnLinesBetween {
+=head3 SUB: returnLinesBetween ({reference to array Index array}, {regex for top of section}, {regex for bottom of section})
+
+When passed a reference to an array and two strings - interpreted as REGEX's - will return the lines of the Array
+that are bounded by these tags. The two tag lines themselves are not returned.
+
+If either of the tags are not found - or are found in the wrong order - then a null list is returned.
+
+The search stops at the first line by which both tags have been seen. A tag that matches several lines
+before that point is recorded at the last of them.
+
+=cut
+
+my ($Text_ref, $TopTag, $BotTag) = @_;
+
+#Where each tag was (last) seen. 'undef' means "not seen yet": 0 can't be used for that as it is a valid line index.
+my ($AlignmentLine_Top, $AlignmentLine_Bot);
+
+#Iterate through until we find what we are looking for or run out of text to search.
+#The range stops at the last valid index, so we never test an element beyond the end of the array:
+foreach my $LineIndex (0 .. $#{$Text_ref})
+    {
+    last if (defined $AlignmentLine_Top && defined $AlignmentLine_Bot);
+    my $C_Line = $$Text_ref[$LineIndex];
+    next unless (defined $C_Line);
+
+    if ($C_Line =~ m/$TopTag/)
+        { $AlignmentLine_Top = $LineIndex; }
+    if ($C_Line =~ m/$BotTag/)
+        { $AlignmentLine_Bot = $LineIndex; }
+    }
+
+#Reality check: we need both tags, with the top one above the bottom one. If not then we return null.
+unless (defined $AlignmentLine_Top && defined $AlignmentLine_Bot)
+    { return; }
+if ($AlignmentLine_Top >= $AlignmentLine_Bot)
+    { return; }
+
+#We want the lines one down and one up from the tags - return those as an array slice:
+return (@$Text_ref[$AlignmentLine_Top + 1 .. $AlignmentLine_Bot - 1]);
+}
+############
+
+sub findSplitPoint
+{
+=head2 sub: $PanelBoundaryCharacter = findSplitPoint (\@Table)
+
+When passed a table with the alignment in it makes an educated guess as to the precise split point to
+separate the 'info' and 'alignment' panels.
+This is a right olde faff because the field / panel boundaries change.
+
+ ' Query_6 167 GAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGCACAGCC-GGGGGGTCCCTGAGACTCTCCTGTGCAG 235'
+ ' Query_6 236 CCTCTGGATTCACCTTTGACAAATATGCCATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGTCTGGAGTG 305'
+ ' Query_6 306 GGTCTCAACTATACTTGCCAGTGGTCG---CACAGACGACGCAGACTCCGTGAAGGGCCGGTTTGCCATC 372'
+ ' Query_6 373 TCCAGAGACAATTCCAAGAACACTCTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCCCTTT 442'
+ ' Query_6 443 ATTACTGTGCGAGTGAGGGGGACATAGTGGCTTCGGAGCTTTTGAGTACTGGGGCCAGGGAAACCTGGTC 512'
+MOTIF_FOUND_IN_AA
+i.e to contain just ATGC + "X" bases & the gap "-" character but not the "." character (found in the alignment proper) and have 4 fields in total
+
+Returns either -1 or the location of the panel boundary, issues a warning and returns -1 if is the most frequent boundary
+because the pattern match has been failing more often than it succeeded.
+
+=cut
+#A rough guess is 38 for normal sequences, 48 for reversed ones.
+
+my $SplitPos = -1; #What we return if no panel boundary can be identified
+
+(my $Table_ref) = @_; #Get the reference to the table
+my %PanelBounds; #Key: a candidate panel boundary position; value: how many DNA query lines put the boundary there
+
+foreach my $C_Line (@{$Table_ref})
+    {
+    next unless (defined $C_Line);
+    #Split on consecutive tabs or spaces:
+    my @LineFields = split (/[\t\s]+/,$C_Line);
+    #Keep only the lines we think are the DNA Query strings. The field count is tested first so that
+    #$LineFields[3] is known to exist before we pattern match on it:
+    unless ( $#LineFields==4
+        && $LineFields[3] =~ m/[^\.]/
+        && $LineFields[3] =~ m/[ATGCX]{20,}/)
+        { next; }
+
+    #Match the DNA string and the index number after it; gap and "X" characters are allowed in the DNA.
+    #Only count the line if the match succeeded: after a failed match $-[0] still holds the offset of an earlier one.
+    unless ($C_Line =~ m/[ATGCX-]+\s+\d+\s*$/)
+        { next; }
+    $PanelBounds{$-[0]}++; # $-[0] is the offset at which the match - i.e. the DNA panel - starts
+    }
+
+#Return the most frequent position (this offers some resistance to the occasional pattern failure);
+#if two positions are equally frequent the smaller one wins.
+#The brackets around "($SplitPos)" are really necessary: they pick the first element of the sorted list.
+if (%PanelBounds)
+    {
+    ($SplitPos) = sort { $PanelBounds{$b} <=> $PanelBounds{$a} || $a <=> $b } keys %PanelBounds;
+    }
+else
+    { warn "Couldn't identify the panel boundaries\n"; } #Tell people if we are having difficulty
+return $SplitPos;
+}
+
+
+##
+#
+#
+###
+
+
+
+
+
+#####
+#
+#
+#####
+sub markUpCDR3
+{
+=head3 Sub: (Start, End, Found How) = markUpCDR3 (DNASeq, AASeq, FWR3 End, FWR1 Offset, DNA Regex, AA Regex)
+
+Tries to identify the end of the CDR3 using the DNA and AA sequence patterns supplied. The CDR3 is assumed to start
+at the end of the FWR3.
+To reduce FP matches only the sequences (DNA & AA) after the FWR3 are tested with the pattern.
+The position of the first matching pattern is reported.
+
+=head4 Fuller Usage:
+
+my ($CDR3_Start, my $CDR3_End) = markUpCDR3 ($DNAQuerySequence, $AAQuerySequence,
+ $DomainBoundaries{"FWR3"}{"End"}, $DomainBoundaries{"FWR1"}{"Start"},
+ $DNACDR3_Pat, $AACDR3_Pat);
+
+
+
+=head4 Returned Values
+
+If the CDR3 was found then we signal like this:
+
+ $MotifFound ==0 : Nope, didn't find either motif
+ $MotifFound ==1 : Found at the DNA level, not the AA level
+ $MotifFound ==2 : Found at the the AA level, not the DNA level
+ $MotifFound ==3 : Found at the the AA level & the DNA level
+
+(Also remember that if the FWR3 region couldn't be identified in the sequence there is a 4th option: not tested; this routine isn't called therefore)
+
+The Start and Ends returned are from the first successful match (MotifFound==3): though hopefully they are the same.
+Formally the test order is:
+
+ 1) DNA
+ 2) AA
+
+i.e. DNA bp locations have priority.
+
+Technically the locations are determined by a regex match then the $+[0] array (i.e. the end of the pattern match).
+See pages like this: http://stackoverflow.com/questions/87380/how-can-i-find-the-location-of-a-regex-match-in-perl for an explanation.
+
+=head3 Manipulation of AA patterns
+
+Note that patterns are assumed to require white space inserting in them between the letters.
+This could be a serious limitation
+
+
+=cut
+
+#Get the parameters passed:
+my ($DNA, $AA, $FWR3_End, $FWR1_Start, $DNAPat, $AAPat) = @_;
+print "D: markUpCDR3: Passed Parameters '$FWR3_End, $FWR1_Start, $DNAPat, $AAPat' (& AA & DNA sequence)\n";
+
+
+#Setup our return values:
+my $Start = 0; my $End =0;
+my $MotifFound = 0; #So we can record which patterns we found: +1 for the DNA pattern, +2 for the AA pattern
+my $How; #Literally How the motif was found (or not if blank)
+
+
+=head4 Prepare the sequences and the patterns for use
+
+Specifically: trim off the start of the AA & DNA string already allocated to other CDRs or FWRs
+
+Add in spaces into the AA regex pattern because we can't get regex-ex freespacing mode i.e. "$Var =~ m/$AAPat/x" working.
+
+
+We take the "-1" as the CropPoint position to include the previous 3 nucleotides / AAs; remember to add this back on
+in position calculations.
+
+
+=cut
+
+#Because igBLAST doesn't always report from the start of the read (primers and things are upstream):
+my $CropPoint = $FWR3_End - $FWR1_Start - 1 ;
+
+#Keep only the part of each string from the crop point onwards - that is where we look for the CDR3 motifs:
+my $AA_Trimmed = substr ($AA, $CropPoint);
+my $DNA_Trimmed = substr ($DNA ,$CropPoint);
+
+#This lovely hack is to account for the spaces in the AA sequence as we can't get the "$Var =~ m/$AAPat/x" working:
+#every character of the AA pattern gets a '\s+' put in front of it.
+$AAPat = join ('', map { '\s+'.$_ } split (//, $AAPat));
+
+my $MotifPositionDNA =-1;
+my $MotifPositionAA =-1;
+
+=head4 At DNA level: "TGG GGx xxx GGx" [+1]
+
+=cut
+
+if ($DNA_Trimmed =~ m/$DNAPat/)
+    {
+    $MotifPositionDNA = $+[0]; #The offset just past the end of the match
+    $MotifFound = $MotifFound + 1;
+    #Any more matches further on? We only report them, the first match is the one that is used:
+    my $LaterString = substr ($DNA_Trimmed, $MotifPositionDNA);
+    if ($LaterString =~ m/$DNAPat/)
+        { print "D: markUPCDR3: Also got a match further down the DNA String: at ", $-[0] ," to ", $+[0], " - which might be worrying\n"; }
+    }
+
+=head4 At AA level: "WGxG" [+2]
+
+=cut
+
+if ($AA_Trimmed=~ m/$AAPat/)
+    {
+    $MotifPositionAA = $+[0]; #Again the offset just past the end of the match
+    $MotifFound = $MotifFound + 2;
+    }
+
+=head4 Assess the results of motif position finding
+
+=cut
+
+if ($MotifFound ==0)
+    { return ($Start, $End, $MotifFound); } #The easy one really: return we didn't find the CDR3
+
+if ($MotifFound ==1) #Just found in DNA
+    {
+    $End = $MotifPositionDNA;
+    $How = "MOTIF_FOUND_IN_DNA";
+    }
+elsif ($MotifFound ==2) #Just found in AA
+    {
+    $End = $MotifPositionAA;
+    $How = "MOTIF_FOUND_IN_AA";
+    }
+else #Found in both, DNA has priority
+    {
+    $End = $MotifPositionDNA;
+    $How = "MOTIF_FOUND_IN_BOTH";
+    }
+
+=head4 Put back the offsets we removed
+
+$End was measured on the trimmed strings so the crop point is added back on. The FWR1 offset is then added to both
+values on return: remember the main coordinate system is defined such that the start of FWR1 is unlikely to be at nt 1.
+
+=cut
+
+$Start = $FWR3_End - $FWR1_Start -1; #i.e. the crop point: we assume the CDR3 starts where the FWR3 ends
+$End = $End + $CropPoint;
+
+print "D: markUpCDR3: returning: $Start, $End, $How, ($MotifFound) [NB: offset of :'+ $FWR1_Start'\n";
+return ($Start + $FWR1_Start, $End + $FWR1_Start, $How);
+}
+
+
+sub printOUTPUTData {
+=head2 sub: $OutputDataString = printOUTPUTData {\%OutputData}
+
+When passed a reference to a hash containing the appropriate CDR, Top V / D/ J genes and the sequence ID.
+This is prepared and then returned as a text string that can then be printed to STDOUT:
+
+ print (printOUTPUTData (\%OutputData));
+
+Any missing data in the Hash array is politely ignored and a null string printed in place.
+The text field is tab delimited; there are no extra trailing tabs or carriage returns in place.
+
+Actually the fields printed out are stored in an index array.
+
+=head3 Header output
+
+If the routine is passed a key 'HEADER' then the header columns are returned as that string.
+This is tested first - so don't add this unless you mean to.
+
+=cut
+
+my ($Data_ref) = @_;
+
+#The output columns, in the order in which they are written out:
+my @HeaderFields = ("ID", "VDJ Frame", "Top V Gene", "Top D Gene", "Top J Gene",
+    "CDR1 Seq", "CDR1 Length",
+    "CDR2 Seq", "CDR2 Length",
+    "CDR3 Seq", "CDR3 Length", "CDR3 Seq DNA", "CDR3 Length DNA", "Strand",
+    "CDR3 Found How");
+
+=head4 Print Header & Exit?
+
+=cut
+
+if (exists $Data_ref->{"HEADER"})
+    {
+    print "D: printOUTPUTData: HEADER Printout requested '@HeaderFields'\n";
+    #The tag, then the column names: tab delimited with no trailing tab
+    return (join ("\t", "OUTPUT:", @HeaderFields));
+    }
+
+=head3 Assemble whatever data we have - and tab delimit the null fields
+
+=cut
+
+#One cell per column; a key that is missing from the hash gives an empty cell, so the columns stay aligned:
+my @Cells = map { exists ($Data_ref->{$_}) ? $Data_ref->{$_} : "" } @HeaderFields;
+
+return (join ("\t", "OUTPUT:", @Cells));
+}
+
+
+######################################### Code Junk ########################
+
+
+=head2 Code Junk Attic
+
+=head3 Demonstrates how to reverse translate an amino acid sequence into DNA:
+
+use Bio::Tools::CodonTable;
+use Bio::Seq;
+
+# print possible codon tables
+ my $tables = Bio::Tools::CodonTable->tables;
+ while ( (my $id, my $name) = each %{$tables} ) {
+ print "$id = $name\n";
+ }
+ my $CodonTable = Bio::Tools::CodonTable->new();
+
+ my $ExampleSeq = Bio::PrimarySeq->new(-seq=>"WGxG", -alphabet => 'protein') or die "Cannot create sequence object\n";
+
+
+my $rvSeq = $CodonTable->reverse_translate_all($ExampleSeq);
+print "D: '$rvSeq'\n";
+die "TEST OVER\n";
+
+=cut
+
+
+=head3 For processing the 'Alignment lines' section of the alginment table
+
+ #If we are ever interested; then enable the code below:
+# print ": Alignment\n";
+# $InfoPanel =~ s/^ +//; $InfoPanel =~ s/ +$//; #Clean off trailing spaces
+# my ($Germclass, $PID, $PID_Counts, $Allele) = split (/\s+/,$InfoPanel); #Split on spaces
+##Enable if you need to know what we just found:
+# #print "D: Fields are (Germclass, PID, PID_Counts, Allele) \t$Germclass, $PID, $PID_Counts, $Allele\n";
+# #A reality check: we should have an Allele - or some text here.
+# unless (defined $Allele && $Allele ne "")
+# { warn "Cannot get Allele for Line '$C_Line' - implies improper parsing: '",substr ($Lines[$C_Line],0,15),"...'\n"; }
+# if (exists ($Alginments {$Germclass}{$Allele}))
+# { $Alginments {$Germclass}{$Allele} = $Alginments {$Germclass}{$Allele}.$CurrentAASequence; } #Carry on adding
+# else #more work needed as we need to 'pad' the sequence with fake gap characters)
+# {
+##Do we still need this padding? 
I don't think so +# +# +# my $PaddingChars = ($ThisQueryStart-$Query_Start); +# print "D: New gene found: need to pad it with ($ThisQueryStart-$Query_Start) i.e. '$PaddingChars' characters\n"; +# #To help testing, calculate this first: +# my $PaddingString = " "x $PaddingChars; +# $Alginments {$Germclass}{$Allele} = $CurrentAASequence; +# } +# next + +=head3 Demonstration of Pattern match positions + +my $Text = "12345TTT TTAAAAA"; +my $TestPat = "TTT\\s+TT"; +(my $Result)= $Text =~ m/$TestPat/; +print "D: Two vars are: - = ",$-[0], " & + =", $+[0]," for test pattern '$TestPat'\n"; + +sub printCDR3 { + +=head3 Subroutine: printCDR3 ($CDR3_Start, $CDR3_End, "SUMMARY_TABLE", $AAQuerySequence, $DNAQuerySequence); + +???? IS THIS FUNCTION IN USE ????? + +Handles the printing of the output when passed information about the CDR3 region. + + +The result is returned as a text string in this version hence use it like this if you want to send it to STDOUT: + + print printCDR3 ($CDR3_Start, $CDR3_End, "SUMMARY_TABLE", $AAQuerySequence, $DNAQuerySequence), "\n"; + +#=cut + +#Despite the similarity in names, these are all local copies passed to us: + +my ($Start, $End, $Tag, $FullAAQuerySequence, $FullDNAQuerySequence) = @_; + +#For DNA: +my ($CDR_DNA_Seq) = substr ($FullDNAQuerySequence, $Start, $Start+$End); +my ($CDR_DNA_Length) = length ($CDR_DNA_Seq); + +#For AA: +my ($CDR_AA_Seq) = substr ($FullAAQuerySequence, $Start, $Start+$End); +my ($CDR_AA_Length) = length ($CDR_AA_Seq); + +my $ReturnString = join ("\t", $CDR_DNA_Seq, $CDR_DNA_Length, $CDR_AA_Seq, $CDR_AA_Length, $Tag); #Create here so we can inspect it / post process it if needed: +print "D: SUB: printCDR3: As returned: '$ReturnString'\n"; +return ($ReturnString); + +} + +=cut + + + +=head2 Change Log + +=head3 Version 1.2 + + 1) Fixed the 'Process record request' feature [was failed increment in $Record] + 2) Deleted / Deactivated the function 'printCDR3' [wasn't in use; kept if useful for parts]. 
+ This function is replaced by the more general printOUTPUTData()
+ 3) A tag for the CDR3 status is now output for every record / read.
+ Initially this is set to "NOT_FOUND" and changed if evidence for the CDR3 is found.
+
+=head4 Version 1.3
+
+ 1) The tophit line was split on whitespace, however sometimes the VJFrame is something like “In-frame with stop codon”,
+ which means the line is also split on the spaces therein. It now splits on tabs only, and this seems to work properly.
+ - found by Bas Horsman.
+
+=head4 Version 1.3a
+
+ 1) "MOTIF_FOUND_IN_AA" reported correctly (was impossible previously due to addition error to the $MotifFound var (never could == 3)
+
+=cut
+
+=head4 Version 1.4
+
+ 1) Now processes files using Mac/Unix/MS-DOS newline characters:
+
+ $_ =~ s/\r\n/\n/g; #In case line ends are MS-DOS
+ $_ =~ s/\r/\n/g; #In case line ends are Mac
+ #The whole record - one per read - is now stored in $_
+ my @Lines =split (/\R/,$_); #Split on new lines
+
+=head4 Version 1.4a
+
+1) Fixed the length of the CDR3 AA string being reported correctly:
+
+ $OUTPUT_Data{"CDR3 Length"} = $CDR3_Length;
+ to:
+ $OUTPUT_Data{"CDR3 Length"} = $CDR3_Seq_AA_Length;
+
diff -r 000000000000 -r 7d97fa9a0423 imgtconvert.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/imgtconvert.py Fri May 09 09:35:32 2014 -0400
@@ -0,0 +1,127 @@
+import pandas as pd
+try:
+    pd.options.mode.chained_assignment = None  # default='warn'
+except Exception:
+    # This option is not available in every pandas release; carry on without it.
+    pass
+import re
+import argparse
+import os
+import sys
+
+
+def stop_err(msg, ret=1):
+    """Write msg to stderr and exit the script with status ret."""
+    sys.stderr.write(msg)
+    sys.exit(ret)
+
+#docs.python.org/dev/library/argparse.html
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Input folder with files")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+old_summary_columns = [u'Sequence ID', u'JUNCTION frame', u'V-GENE and allele', u'D-GENE and allele', u'J-GENE and allele', u'CDR1-IMGT length', u'CDR2-IMGT length', u'CDR3-IMGT length', u'Orientation']
+old_sequence_columns = [u'CDR1-IMGT', u'CDR2-IMGT', u'CDR3-IMGT']
+old_junction_columns = [u'JUNCTION']
+
+added_summary_columns = [u'Functionality', u'V-REGION identity %', u'V-REGION identity nt', u'D-REGION reading frame', u'AA JUNCTION', u'Functionality comment', u'Sequence']
+added_sequence_columns = [u'FR1-IMGT', u'FR2-IMGT', u'FR3-IMGT', u'CDR3-IMGT', u'JUNCTION', u'J-REGION', u'FR4-IMGT']
+added_junction_columns = [u"P3'V-nt nb", u'N1-REGION-nt nb', u"P5'D-nt nb", u"P3'D-nt nb", u'N2-REGION-nt nb', u"P5'J-nt nb", u"3'V-REGION trimmed-nt nb", u"5'D-REGION trimmed-nt nb", u"3'D-REGION trimmed-nt nb", u"5'J-REGION trimmed-nt nb"]
+
+inputFolder = args.input
+
+# If the input folder holds exactly one entry and that entry is a directory, work on its contents instead.
+dirContents = os.listdir(inputFolder)
+if len(dirContents) == 1:
+    onlyEntry = os.path.join(inputFolder, dirContents[0])
+    if os.path.isdir(onlyEntry):
+        print("is dir")
+        inputFolder = onlyEntry
+        dirContents = os.listdir(inputFolder)
+files = sorted([os.path.join(inputFolder, f) for f in dirContents])
+
+# We need at least one complete set of three files ('!=' compares the value, 'is not' compared object identity).
+if len(files) == 0 or len(files) % 3 != 0:
+    stop_err("Files in zip not a multiple of 3, it should contain the all the 1_, 5_ and 6_ files for a sample")
+
+# The sorted list is cut into three equally long runs; entry i of each run together forms triplet i.
+triplets = []
+step = len(files) // 3
+for i in range(0, step):
+    triplets.append((files[i], files[i + step], files[i + step + step]))
+
+outFile = args.output
+
+
+def loadTriplet(triple):
+    """Read one triplet of tab-separated files and combine the wanted columns into one frame.
+
+    triple holds three file paths; they are read as the summary, sequence and junction tables, in that order.
+    Returns a DataFrame whose columns are in the order expected by the renaming further down.
+    """
+    fSummary = pd.read_csv(triple[0], sep="\t")
+    fSequence = pd.read_csv(triple[1], sep="\t")
+    fJunction = pd.read_csv(triple[2], sep="\t")
+    # Copy, so that the assignments below add columns to our own frame and not to a selection of fSummary.
+    tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]].copy()
+
+    tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"]
+    tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"]
+
+    tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"]
+    tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"]
+
+    tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"]
+    tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"]
+
+    tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"]
+    tmp["CDR3 Length DNA"] = '1'  # placeholder, overwritten with the real length once all triplets are combined
+    tmp["Strand"] = fSummary["Orientation"]
+    tmp["CDR3 Found How"] = 'a'  # constant filler value
+
+    for col in added_summary_columns:
+        tmp[col] = fSummary[col]
+
+    for col in added_sequence_columns:
+        tmp[col] = fSequence[col]
+
+    for col in added_junction_columns:
+        tmp[col] = fJunction[col]
+
+    return tmp
+
+outFrame = pd.concat([loadTriplet(triple) for triple in triplets])
+
+outFrame.columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb']
+
+vPattern = re.compile(r"IGHV[1-9]-[0-9ab]+-?[1-9]?")
+dPattern = re.compile(r"IGHD[1-9]-[0-9ab]+")
+jPattern = re.compile(r"IGHJ[1-9]")
+
+def filterGenes(s, pattern):
+    """Return the first part of s that matches pattern, or "NA" if s is not a str or nothing matches."""
+    if type(s) is not str:
+        return "NA"
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return "NA"
+
+
+outFrame["Top V Gene"] = outFrame["Top V Gene"].apply(lambda x: filterGenes(x, vPattern))
+outFrame["Top D Gene"] = outFrame["Top D Gene"].apply(lambda x: filterGenes(x, dPattern))
+outFrame["Top J Gene"] = outFrame["Top J Gene"].apply(lambda x: filterGenes(x, jPattern))
+
+
+
+tmp = outFrame["VDJ Frame"]
+tmp = tmp.replace("in-frame", "In-frame")
+tmp = tmp.replace("null", "Out-of-frame")
+tmp = tmp.replace("out-of-frame", "Out-of-frame")
+outFrame["VDJ Frame"] = tmp
+outFrame["CDR3 Length DNA"] = outFrame["CDR3 Seq DNA"].map(str).map(len)
+safeLength = lambda x: len(x) if type(x) == str else 0
+outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows?
+outFrame.to_csv(outFile, sep="\t", index=False, index_label="index")
diff -r 000000000000 -r 7d97fa9a0423 imgtconvert.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/imgtconvert.sh Fri May 09 09:35:32 2014 -0400
@@ -0,0 +1,57 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+mkdir $PWD/$2_$3
+
+
+#!/bin/bash
+f=$(file $1)
+zip7Type="7-zip archive"
+tarType="tar archive"
+bzip2Type="bzip2 compressed"
+gzipType="gzip compressed"
+zipType="Zip archive"
+rarType="RAR archive"
+
+if [[ "$f" == *"$zip7Type"* ]]; then
+ echo "7-zip"
+ echo "Trying: 7za e $1 -o$PWD/$2_$3/"
+ 7za e $1 -o$PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$tarType"* ]]
+then
+ echo "tar archive"
+ echo "Trying: tar xvf $1 -C $PWD/$2_$3/"
+ tar xvf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$bzip2Type"* ]]
+then
+ echo "bzip2 compressed data"
+ echo "Trying: tar jxf $1 -C $PWD/$2_$3/"
+ tar jxf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$gzipType"* ]]
+then
+ echo "gzip compressed data"
+ echo "Trying: tar xvzf $1 -C $PWD/$2_$3/"
+ tar xvzf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" 
== *"$zipType"* ]] +then + echo "Zip archive" + echo "Trying: unzip $1 -d $PWD/$2_$3/" + unzip $1 -d $PWD/$2_$3/ +fi + +if [[ "$f" == *"$rarType"* ]] +then + echo "RAR archive" + echo "Trying: unrar e $1 $PWD/$2_$3/" + unrar e $1 $PWD/$2_$3/ +fi +find $PWD/$2_$3/ -type f | grep -v "1_Summary_\|5_AA-sequences_\|6_Junction_" | xargs rm -f +python $dir/imgtconvert.py --input $PWD/$2_$3 --output $4 + diff -r 000000000000 -r 7d97fa9a0423 jquery-1.11.0.min.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jquery-1.11.0.min.js Fri May 09 09:35:32 2014 -0400 @@ -0,0 +1,4 @@ +/*! jQuery v1.11.0 | (c) 2005, 2014 jQuery Foundation, Inc. | jquery.org/license */ +!function(a,b){"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){var c=[],d=c.slice,e=c.concat,f=c.push,g=c.indexOf,h={},i=h.toString,j=h.hasOwnProperty,k="".trim,l={},m="1.11.0",n=function(a,b){return new n.fn.init(a,b)},o=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,p=/^-ms-/,q=/-([\da-z])/gi,r=function(a,b){return b.toUpperCase()};n.fn=n.prototype={jquery:m,constructor:n,selector:"",length:0,toArray:function(){return d.call(this)},get:function(a){return null!=a?0>a?this[a+this.length]:this[a]:d.call(this)},pushStack:function(a){var b=n.merge(this.constructor(),a);return b.prevObject=this,b.context=this.context,b},each:function(a,b){return n.each(this,a,b)},map:function(a){return this.pushStack(n.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(d.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(0>a?b:0);return this.pushStack(c>=0&&b>c?[this[c]]:[])},end:function(){return this.prevObject||this.constructor(null)},push:f,sort:c.sort,splice:c.splice},n.extend=n.fn.extend=function(){var 
a,b,c,d,e,f,g=arguments[0]||{},h=1,i=arguments.length,j=!1;for("boolean"==typeof g&&(j=g,g=arguments[h]||{},h++),"object"==typeof g||n.isFunction(g)||(g={}),h===i&&(g=this,h--);i>h;h++)if(null!=(e=arguments[h]))for(d in e)a=g[d],c=e[d],g!==c&&(j&&c&&(n.isPlainObject(c)||(b=n.isArray(c)))?(b?(b=!1,f=a&&n.isArray(a)?a:[]):f=a&&n.isPlainObject(a)?a:{},g[d]=n.extend(j,f,c)):void 0!==c&&(g[d]=c));return g},n.extend({expando:"jQuery"+(m+Math.random()).replace(/\D/g,""),isReady:!0,error:function(a){throw new Error(a)},noop:function(){},isFunction:function(a){return"function"===n.type(a)},isArray:Array.isArray||function(a){return"array"===n.type(a)},isWindow:function(a){return null!=a&&a==a.window},isNumeric:function(a){return a-parseFloat(a)>=0},isEmptyObject:function(a){var b;for(b in a)return!1;return!0},isPlainObject:function(a){var b;if(!a||"object"!==n.type(a)||a.nodeType||n.isWindow(a))return!1;try{if(a.constructor&&!j.call(a,"constructor")&&!j.call(a.constructor.prototype,"isPrototypeOf"))return!1}catch(c){return!1}if(l.ownLast)for(b in a)return j.call(a,b);for(b in a);return void 0===b||j.call(a,b)},type:function(a){return null==a?a+"":"object"==typeof a||"function"==typeof a?h[i.call(a)]||"object":typeof a},globalEval:function(b){b&&n.trim(b)&&(a.execScript||function(b){a.eval.call(a,b)})(b)},camelCase:function(a){return a.replace(p,"ms-").replace(q,r)},nodeName:function(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()},each:function(a,b,c){var d,e=0,f=a.length,g=s(a);if(c){if(g){for(;f>e;e++)if(d=b.apply(a[e],c),d===!1)break}else for(e in a)if(d=b.apply(a[e],c),d===!1)break}else if(g){for(;f>e;e++)if(d=b.call(a[e],e,a[e]),d===!1)break}else for(e in a)if(d=b.call(a[e],e,a[e]),d===!1)break;return a},trim:k&&!k.call("\ufeff\xa0")?function(a){return null==a?"":k.call(a)}:function(a){return null==a?"":(a+"").replace(o,"")},makeArray:function(a,b){var c=b||[];return null!=a&&(s(Object(a))?n.merge(c,"string"==typeof 
a?[a]:a):f.call(c,a)),c},inArray:function(a,b,c){var d;if(b){if(g)return g.call(b,a,c);for(d=b.length,c=c?0>c?Math.max(0,d+c):c:0;d>c;c++)if(c in b&&b[c]===a)return c}return-1},merge:function(a,b){var c=+b.length,d=0,e=a.length;while(c>d)a[e++]=b[d++];if(c!==c)while(void 0!==b[d])a[e++]=b[d++];return a.length=e,a},grep:function(a,b,c){for(var d,e=[],f=0,g=a.length,h=!c;g>f;f++)d=!b(a[f],f),d!==h&&e.push(a[f]);return e},map:function(a,b,c){var d,f=0,g=a.length,h=s(a),i=[];if(h)for(;g>f;f++)d=b(a[f],f,c),null!=d&&i.push(d);else for(f in a)d=b(a[f],f,c),null!=d&&i.push(d);return e.apply([],i)},guid:1,proxy:function(a,b){var c,e,f;return"string"==typeof b&&(f=a[b],b=a,a=f),n.isFunction(a)?(c=d.call(arguments,2),e=function(){return a.apply(b||this,c.concat(d.call(arguments)))},e.guid=a.guid=a.guid||n.guid++,e):void 0},now:function(){return+new Date},support:l}),n.each("Boolean Number String Function Array Date RegExp Object Error".split(" "),function(a,b){h["[object "+b+"]"]=b.toLowerCase()});function s(a){var b=a.length,c=n.type(a);return"function"===c||n.isWindow(a)?!1:1===a.nodeType&&b?!0:"array"===c||0===b||"number"==typeof b&&b>0&&b-1 in a}var t=function(a){var b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s="sizzle"+-new Date,t=a.document,u=0,v=0,w=eb(),x=eb(),y=eb(),z=function(a,b){return a===b&&(j=!0),0},A="undefined",B=1<<31,C={}.hasOwnProperty,D=[],E=D.pop,F=D.push,G=D.push,H=D.slice,I=D.indexOf||function(a){for(var b=0,c=this.length;c>b;b++)if(this[b]===a)return b;return-1},J="checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped",K="[\\x20\\t\\r\\n\\f]",L="(?:\\\\.|[\\w-]|[^\\x00-\\xa0])+",M=L.replace("w","w#"),N="\\["+K+"*("+L+")"+K+"*(?:([*^$|!~]?=)"+K+"*(?:(['\"])((?:\\\\.|[^\\\\])*?)\\3|("+M+")|)|)"+K+"*\\]",O=":("+L+")(?:\\(((['\"])((?:\\\\.|[^\\\\])*?)\\3|((?:\\\\.|[^\\\\()[\\]]|"+N.replace(3,8)+")*)|.*)\\)|)",P=new RegExp("^"+K+"+|((?:^|[^\\\\])(?:\\\\.)*)"+K+"+$","g"),Q=new 
RegExp("^"+K+"*,"+K+"*"),R=new RegExp("^"+K+"*([>+~]|"+K+")"+K+"*"),S=new RegExp("="+K+"*([^\\]'\"]*?)"+K+"*\\]","g"),T=new RegExp(O),U=new RegExp("^"+M+"$"),V={ID:new RegExp("^#("+L+")"),CLASS:new RegExp("^\\.("+L+")"),TAG:new RegExp("^("+L.replace("w","w*")+")"),ATTR:new RegExp("^"+N),PSEUDO:new RegExp("^"+O),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+K+"*(even|odd|(([+-]|)(\\d*)n|)"+K+"*(?:([+-]|)"+K+"*(\\d+)|))"+K+"*\\)|)","i"),bool:new RegExp("^(?:"+J+")$","i"),needsContext:new RegExp("^"+K+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+K+"*((?:-\\d)?\\d*)"+K+"*\\)|)(?=[^-]|$)","i")},W=/^(?:input|select|textarea|button)$/i,X=/^h\d$/i,Y=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,$=/[+~]/,_=/'|\\/g,ab=new RegExp("\\\\([\\da-f]{1,6}"+K+"?|("+K+")|.)","ig"),bb=function(a,b,c){var d="0x"+b-65536;return d!==d||c?b:0>d?String.fromCharCode(d+65536):String.fromCharCode(d>>10|55296,1023&d|56320)};try{G.apply(D=H.call(t.childNodes),t.childNodes),D[t.childNodes.length].nodeType}catch(cb){G={apply:D.length?function(a,b){F.apply(a,H.call(b))}:function(a,b){var c=a.length,d=0;while(a[c++]=b[d++]);a.length=c-1}}}function db(a,b,d,e){var f,g,h,i,j,m,p,q,u,v;if((b?b.ownerDocument||b:t)!==l&&k(b),b=b||l,d=d||[],!a||"string"!=typeof a)return d;if(1!==(i=b.nodeType)&&9!==i)return[];if(n&&!e){if(f=Z.exec(a))if(h=f[1]){if(9===i){if(g=b.getElementById(h),!g||!g.parentNode)return d;if(g.id===h)return d.push(g),d}else if(b.ownerDocument&&(g=b.ownerDocument.getElementById(h))&&r(b,g)&&g.id===h)return d.push(g),d}else{if(f[2])return G.apply(d,b.getElementsByTagName(a)),d;if((h=f[3])&&c.getElementsByClassName&&b.getElementsByClassName)return G.apply(d,b.getElementsByClassName(h)),d}if(c.qsa&&(!o||!o.test(a))){if(q=p=s,u=b,v=9===i&&a,1===i&&"object"!==b.nodeName.toLowerCase()){m=ob(a),(p=b.getAttribute("id"))?q=p.replace(_,"\\$&"):b.setAttribute("id",q),q="[id='"+q+"'] 
",j=m.length;while(j--)m[j]=q+pb(m[j]);u=$.test(a)&&mb(b.parentNode)||b,v=m.join(",")}if(v)try{return G.apply(d,u.querySelectorAll(v)),d}catch(w){}finally{p||b.removeAttribute("id")}}}return xb(a.replace(P,"$1"),b,d,e)}function eb(){var a=[];function b(c,e){return a.push(c+" ")>d.cacheLength&&delete b[a.shift()],b[c+" "]=e}return b}function fb(a){return a[s]=!0,a}function gb(a){var b=l.createElement("div");try{return!!a(b)}catch(c){return!1}finally{b.parentNode&&b.parentNode.removeChild(b),b=null}}function hb(a,b){var c=a.split("|"),e=a.length;while(e--)d.attrHandle[c[e]]=b}function ib(a,b){var c=b&&a,d=c&&1===a.nodeType&&1===b.nodeType&&(~b.sourceIndex||B)-(~a.sourceIndex||B);if(d)return d;if(c)while(c=c.nextSibling)if(c===b)return-1;return a?1:-1}function jb(a){return function(b){var c=b.nodeName.toLowerCase();return"input"===c&&b.type===a}}function kb(a){return function(b){var c=b.nodeName.toLowerCase();return("input"===c||"button"===c)&&b.type===a}}function lb(a){return fb(function(b){return b=+b,fb(function(c,d){var e,f=a([],c.length,b),g=f.length;while(g--)c[e=f[g]]&&(c[e]=!(d[e]=c[e]))})})}function mb(a){return a&&typeof a.getElementsByTagName!==A&&a}c=db.support={},f=db.isXML=function(a){var b=a&&(a.ownerDocument||a).documentElement;return b?"HTML"!==b.nodeName:!1},k=db.setDocument=function(a){var b,e=a?a.ownerDocument||a:t,g=e.defaultView;return e!==l&&9===e.nodeType&&e.documentElement?(l=e,m=e.documentElement,n=!f(e),g&&g!==g.top&&(g.addEventListener?g.addEventListener("unload",function(){k()},!1):g.attachEvent&&g.attachEvent("onunload",function(){k()})),c.attributes=gb(function(a){return a.className="i",!a.getAttribute("className")}),c.getElementsByTagName=gb(function(a){return a.appendChild(e.createComment("")),!a.getElementsByTagName("*").length}),c.getElementsByClassName=Y.test(e.getElementsByClassName)&&gb(function(a){return a.innerHTML="
",a.firstChild.className="i",2===a.getElementsByClassName("i").length}),c.getById=gb(function(a){return m.appendChild(a).id=s,!e.getElementsByName||!e.getElementsByName(s).length}),c.getById?(d.find.ID=function(a,b){if(typeof b.getElementById!==A&&n){var c=b.getElementById(a);return c&&c.parentNode?[c]:[]}},d.filter.ID=function(a){var b=a.replace(ab,bb);return function(a){return a.getAttribute("id")===b}}):(delete d.find.ID,d.filter.ID=function(a){var b=a.replace(ab,bb);return function(a){var c=typeof a.getAttributeNode!==A&&a.getAttributeNode("id");return c&&c.value===b}}),d.find.TAG=c.getElementsByTagName?function(a,b){return typeof b.getElementsByTagName!==A?b.getElementsByTagName(a):void 0}:function(a,b){var c,d=[],e=0,f=b.getElementsByTagName(a);if("*"===a){while(c=f[e++])1===c.nodeType&&d.push(c);return d}return f},d.find.CLASS=c.getElementsByClassName&&function(a,b){return typeof b.getElementsByClassName!==A&&n?b.getElementsByClassName(a):void 0},p=[],o=[],(c.qsa=Y.test(e.querySelectorAll))&&(gb(function(a){a.innerHTML="",a.querySelectorAll("[t^='']").length&&o.push("[*^$]="+K+"*(?:''|\"\")"),a.querySelectorAll("[selected]").length||o.push("\\["+K+"*(?:value|"+J+")"),a.querySelectorAll(":checked").length||o.push(":checked")}),gb(function(a){var b=e.createElement("input");b.setAttribute("type","hidden"),a.appendChild(b).setAttribute("name","D"),a.querySelectorAll("[name=d]").length&&o.push("name"+K+"*[*^$|!~]?="),a.querySelectorAll(":enabled").length||o.push(":enabled",":disabled"),a.querySelectorAll("*,:x"),o.push(",.*:")})),(c.matchesSelector=Y.test(q=m.webkitMatchesSelector||m.mozMatchesSelector||m.oMatchesSelector||m.msMatchesSelector))&&gb(function(a){c.disconnectedMatch=q.call(a,"div"),q.call(a,"[s!='']:x"),p.push("!=",O)}),o=o.length&&new RegExp(o.join("|")),p=p.length&&new RegExp(p.join("|")),b=Y.test(m.compareDocumentPosition),r=b||Y.test(m.contains)?function(a,b){var c=9===a.nodeType?a.documentElement:a,d=b&&b.parentNode;return 
a===d||!(!d||1!==d.nodeType||!(c.contains?c.contains(d):a.compareDocumentPosition&&16&a.compareDocumentPosition(d)))}:function(a,b){if(b)while(b=b.parentNode)if(b===a)return!0;return!1},z=b?function(a,b){if(a===b)return j=!0,0;var d=!a.compareDocumentPosition-!b.compareDocumentPosition;return d?d:(d=(a.ownerDocument||a)===(b.ownerDocument||b)?a.compareDocumentPosition(b):1,1&d||!c.sortDetached&&b.compareDocumentPosition(a)===d?a===e||a.ownerDocument===t&&r(t,a)?-1:b===e||b.ownerDocument===t&&r(t,b)?1:i?I.call(i,a)-I.call(i,b):0:4&d?-1:1)}:function(a,b){if(a===b)return j=!0,0;var c,d=0,f=a.parentNode,g=b.parentNode,h=[a],k=[b];if(!f||!g)return a===e?-1:b===e?1:f?-1:g?1:i?I.call(i,a)-I.call(i,b):0;if(f===g)return ib(a,b);c=a;while(c=c.parentNode)h.unshift(c);c=b;while(c=c.parentNode)k.unshift(c);while(h[d]===k[d])d++;return d?ib(h[d],k[d]):h[d]===t?-1:k[d]===t?1:0},e):l},db.matches=function(a,b){return db(a,null,null,b)},db.matchesSelector=function(a,b){if((a.ownerDocument||a)!==l&&k(a),b=b.replace(S,"='$1']"),!(!c.matchesSelector||!n||p&&p.test(b)||o&&o.test(b)))try{var d=q.call(a,b);if(d||c.disconnectedMatch||a.document&&11!==a.document.nodeType)return d}catch(e){}return db(b,l,null,[a]).length>0},db.contains=function(a,b){return(a.ownerDocument||a)!==l&&k(a),r(a,b)},db.attr=function(a,b){(a.ownerDocument||a)!==l&&k(a);var e=d.attrHandle[b.toLowerCase()],f=e&&C.call(d.attrHandle,b.toLowerCase())?e(a,b,!n):void 0;return void 0!==f?f:c.attributes||!n?a.getAttribute(b):(f=a.getAttributeNode(b))&&f.specified?f.value:null},db.error=function(a){throw new Error("Syntax error, unrecognized expression: "+a)},db.uniqueSort=function(a){var b,d=[],e=0,f=0;if(j=!c.detectDuplicates,i=!c.sortStable&&a.slice(0),a.sort(z),j){while(b=a[f++])b===a[f]&&(e=d.push(f));while(e--)a.splice(d[e],1)}return i=null,a},e=db.getText=function(a){var b,c="",d=0,f=a.nodeType;if(f){if(1===f||9===f||11===f){if("string"==typeof a.textContent)return 
a.textContent;for(a=a.firstChild;a;a=a.nextSibling)c+=e(a)}else if(3===f||4===f)return a.nodeValue}else while(b=a[d++])c+=e(b);return c},d=db.selectors={cacheLength:50,createPseudo:fb,match:V,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(a){return a[1]=a[1].replace(ab,bb),a[3]=(a[4]||a[5]||"").replace(ab,bb),"~="===a[2]&&(a[3]=" "+a[3]+" "),a.slice(0,4)},CHILD:function(a){return a[1]=a[1].toLowerCase(),"nth"===a[1].slice(0,3)?(a[3]||db.error(a[0]),a[4]=+(a[4]?a[5]+(a[6]||1):2*("even"===a[3]||"odd"===a[3])),a[5]=+(a[7]+a[8]||"odd"===a[3])):a[3]&&db.error(a[0]),a},PSEUDO:function(a){var b,c=!a[5]&&a[2];return V.CHILD.test(a[0])?null:(a[3]&&void 0!==a[4]?a[2]=a[4]:c&&T.test(c)&&(b=ob(c,!0))&&(b=c.indexOf(")",c.length-b)-c.length)&&(a[0]=a[0].slice(0,b),a[2]=c.slice(0,b)),a.slice(0,3))}},filter:{TAG:function(a){var b=a.replace(ab,bb).toLowerCase();return"*"===a?function(){return!0}:function(a){return a.nodeName&&a.nodeName.toLowerCase()===b}},CLASS:function(a){var b=w[a+" "];return b||(b=new RegExp("(^|"+K+")"+a+"("+K+"|$)"))&&w(a,function(a){return b.test("string"==typeof a.className&&a.className||typeof a.getAttribute!==A&&a.getAttribute("class")||"")})},ATTR:function(a,b,c){return function(d){var e=db.attr(d,a);return null==e?"!="===b:b?(e+="","="===b?e===c:"!="===b?e!==c:"^="===b?c&&0===e.indexOf(c):"*="===b?c&&e.indexOf(c)>-1:"$="===b?c&&e.slice(-c.length)===c:"~="===b?(" "+e+" ").indexOf(c)>-1:"|="===b?e===c||e.slice(0,c.length+1)===c+"-":!1):!0}},CHILD:function(a,b,c,d,e){var f="nth"!==a.slice(0,3),g="last"!==a.slice(-4),h="of-type"===b;return 1===d&&0===e?function(a){return!!a.parentNode}:function(b,c,i){var 
j,k,l,m,n,o,p=f!==g?"nextSibling":"previousSibling",q=b.parentNode,r=h&&b.nodeName.toLowerCase(),t=!i&&!h;if(q){if(f){while(p){l=b;while(l=l[p])if(h?l.nodeName.toLowerCase()===r:1===l.nodeType)return!1;o=p="only"===a&&!o&&"nextSibling"}return!0}if(o=[g?q.firstChild:q.lastChild],g&&t){k=q[s]||(q[s]={}),j=k[a]||[],n=j[0]===u&&j[1],m=j[0]===u&&j[2],l=n&&q.childNodes[n];while(l=++n&&l&&l[p]||(m=n=0)||o.pop())if(1===l.nodeType&&++m&&l===b){k[a]=[u,n,m];break}}else if(t&&(j=(b[s]||(b[s]={}))[a])&&j[0]===u)m=j[1];else while(l=++n&&l&&l[p]||(m=n=0)||o.pop())if((h?l.nodeName.toLowerCase()===r:1===l.nodeType)&&++m&&(t&&((l[s]||(l[s]={}))[a]=[u,m]),l===b))break;return m-=e,m===d||m%d===0&&m/d>=0}}},PSEUDO:function(a,b){var c,e=d.pseudos[a]||d.setFilters[a.toLowerCase()]||db.error("unsupported pseudo: "+a);return e[s]?e(b):e.length>1?(c=[a,a,"",b],d.setFilters.hasOwnProperty(a.toLowerCase())?fb(function(a,c){var d,f=e(a,b),g=f.length;while(g--)d=I.call(a,f[g]),a[d]=!(c[d]=f[g])}):function(a){return e(a,0,c)}):e}},pseudos:{not:fb(function(a){var b=[],c=[],d=g(a.replace(P,"$1"));return d[s]?fb(function(a,b,c,e){var f,g=d(a,null,e,[]),h=a.length;while(h--)(f=g[h])&&(a[h]=!(b[h]=f))}):function(a,e,f){return b[0]=a,d(b,null,f,c),!c.pop()}}),has:fb(function(a){return function(b){return db(a,b).length>0}}),contains:fb(function(a){return function(b){return(b.textContent||b.innerText||e(b)).indexOf(a)>-1}}),lang:fb(function(a){return U.test(a||"")||db.error("unsupported lang: "+a),a=a.replace(ab,bb).toLowerCase(),function(b){var c;do if(c=n?b.lang:b.getAttribute("xml:lang")||b.getAttribute("lang"))return c=c.toLowerCase(),c===a||0===c.indexOf(a+"-");while((b=b.parentNode)&&1===b.nodeType);return!1}}),target:function(b){var c=a.location&&a.location.hash;return c&&c.slice(1)===b.id},root:function(a){return a===m},focus:function(a){return a===l.activeElement&&(!l.hasFocus||l.hasFocus())&&!!(a.type||a.href||~a.tabIndex)},enabled:function(a){return 
a.disabled===!1},disabled:function(a){return a.disabled===!0},checked:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&!!a.checked||"option"===b&&!!a.selected},selected:function(a){return a.parentNode&&a.parentNode.selectedIndex,a.selected===!0},empty:function(a){for(a=a.firstChild;a;a=a.nextSibling)if(a.nodeType<6)return!1;return!0},parent:function(a){return!d.pseudos.empty(a)},header:function(a){return X.test(a.nodeName)},input:function(a){return W.test(a.nodeName)},button:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&"button"===a.type||"button"===b},text:function(a){var b;return"input"===a.nodeName.toLowerCase()&&"text"===a.type&&(null==(b=a.getAttribute("type"))||"text"===b.toLowerCase())},first:lb(function(){return[0]}),last:lb(function(a,b){return[b-1]}),eq:lb(function(a,b,c){return[0>c?c+b:c]}),even:lb(function(a,b){for(var c=0;b>c;c+=2)a.push(c);return a}),odd:lb(function(a,b){for(var c=1;b>c;c+=2)a.push(c);return a}),lt:lb(function(a,b,c){for(var d=0>c?c+b:c;--d>=0;)a.push(d);return a}),gt:lb(function(a,b,c){for(var d=0>c?c+b:c;++db;b++)d+=a[b].value;return d}function qb(a,b,c){var d=b.dir,e=c&&"parentNode"===d,f=v++;return b.first?function(b,c,f){while(b=b[d])if(1===b.nodeType||e)return a(b,c,f)}:function(b,c,g){var h,i,j=[u,f];if(g){while(b=b[d])if((1===b.nodeType||e)&&a(b,c,g))return!0}else while(b=b[d])if(1===b.nodeType||e){if(i=b[s]||(b[s]={}),(h=i[d])&&h[0]===u&&h[1]===f)return j[2]=h[2];if(i[d]=j,j[2]=a(b,c,g))return!0}}}function rb(a){return a.length>1?function(b,c,d){var e=a.length;while(e--)if(!a[e](b,c,d))return!1;return!0}:a[0]}function sb(a,b,c,d,e){for(var f,g=[],h=0,i=a.length,j=null!=b;i>h;h++)(f=a[h])&&(!c||c(f,d,e))&&(g.push(f),j&&b.push(h));return g}function tb(a,b,c,d,e,f){return d&&!d[s]&&(d=tb(d)),e&&!e[s]&&(e=tb(e,f)),fb(function(f,g,h,i){var 
j,k,l,m=[],n=[],o=g.length,p=f||wb(b||"*",h.nodeType?[h]:h,[]),q=!a||!f&&b?p:sb(p,m,a,h,i),r=c?e||(f?a:o||d)?[]:g:q;if(c&&c(q,r,h,i),d){j=sb(r,n),d(j,[],h,i),k=j.length;while(k--)(l=j[k])&&(r[n[k]]=!(q[n[k]]=l))}if(f){if(e||a){if(e){j=[],k=r.length;while(k--)(l=r[k])&&j.push(q[k]=l);e(null,r=[],j,i)}k=r.length;while(k--)(l=r[k])&&(j=e?I.call(f,l):m[k])>-1&&(f[j]=!(g[j]=l))}}else r=sb(r===g?r.splice(o,r.length):r),e?e(null,g,r,i):G.apply(g,r)})}function ub(a){for(var b,c,e,f=a.length,g=d.relative[a[0].type],i=g||d.relative[" "],j=g?1:0,k=qb(function(a){return a===b},i,!0),l=qb(function(a){return I.call(b,a)>-1},i,!0),m=[function(a,c,d){return!g&&(d||c!==h)||((b=c).nodeType?k(a,c,d):l(a,c,d))}];f>j;j++)if(c=d.relative[a[j].type])m=[qb(rb(m),c)];else{if(c=d.filter[a[j].type].apply(null,a[j].matches),c[s]){for(e=++j;f>e;e++)if(d.relative[a[e].type])break;return tb(j>1&&rb(m),j>1&&pb(a.slice(0,j-1).concat({value:" "===a[j-2].type?"*":""})).replace(P,"$1"),c,e>j&&ub(a.slice(j,e)),f>e&&ub(a=a.slice(e)),f>e&&pb(a))}m.push(c)}return rb(m)}function vb(a,b){var c=b.length>0,e=a.length>0,f=function(f,g,i,j,k){var m,n,o,p=0,q="0",r=f&&[],s=[],t=h,v=f||e&&d.find.TAG("*",k),w=u+=null==t?1:Math.random()||.1,x=v.length;for(k&&(h=g!==l&&g);q!==x&&null!=(m=v[q]);q++){if(e&&m){n=0;while(o=a[n++])if(o(m,g,i)){j.push(m);break}k&&(u=w)}c&&((m=!o&&m)&&p--,f&&r.push(m))}if(p+=q,c&&q!==p){n=0;while(o=b[n++])o(r,s,g,i);if(f){if(p>0)while(q--)r[q]||s[q]||(s[q]=E.call(j));s=sb(s)}G.apply(j,s),k&&!f&&s.length>0&&p+b.length>1&&db.uniqueSort(j)}return k&&(u=w,h=t),r};return c?fb(f):f}g=db.compile=function(a,b){var c,d=[],e=[],f=y[a+" "];if(!f){b||(b=ob(a)),c=b.length;while(c--)f=ub(b[c]),f[s]?d.push(f):e.push(f);f=y(a,vb(e,d))}return f};function wb(a,b,c){for(var d=0,e=b.length;e>d;d++)db(a,b[d],c);return c}function xb(a,b,e,f){var 
h,i,j,k,l,m=ob(a);if(!f&&1===m.length){if(i=m[0]=m[0].slice(0),i.length>2&&"ID"===(j=i[0]).type&&c.getById&&9===b.nodeType&&n&&d.relative[i[1].type]){if(b=(d.find.ID(j.matches[0].replace(ab,bb),b)||[])[0],!b)return e;a=a.slice(i.shift().value.length)}h=V.needsContext.test(a)?0:i.length;while(h--){if(j=i[h],d.relative[k=j.type])break;if((l=d.find[k])&&(f=l(j.matches[0].replace(ab,bb),$.test(i[0].type)&&mb(b.parentNode)||b))){if(i.splice(h,1),a=f.length&&pb(i),!a)return G.apply(e,f),e;break}}}return g(a,m)(f,b,!n,e,$.test(a)&&mb(b.parentNode)||b),e}return c.sortStable=s.split("").sort(z).join("")===s,c.detectDuplicates=!!j,k(),c.sortDetached=gb(function(a){return 1&a.compareDocumentPosition(l.createElement("div"))}),gb(function(a){return a.innerHTML="","#"===a.firstChild.getAttribute("href")})||hb("type|href|height|width",function(a,b,c){return c?void 0:a.getAttribute(b,"type"===b.toLowerCase()?1:2)}),c.attributes&&gb(function(a){return a.innerHTML="",a.firstChild.setAttribute("value",""),""===a.firstChild.getAttribute("value")})||hb("value",function(a,b,c){return c||"input"!==a.nodeName.toLowerCase()?void 0:a.defaultValue}),gb(function(a){return null==a.getAttribute("disabled")})||hb(J,function(a,b,c){var d;return c?void 0:a[b]===!0?b.toLowerCase():(d=a.getAttributeNode(b))&&d.specified?d.value:null}),db}(a);n.find=t,n.expr=t.selectors,n.expr[":"]=n.expr.pseudos,n.unique=t.uniqueSort,n.text=t.getText,n.isXMLDoc=t.isXML,n.contains=t.contains;var u=n.expr.match.needsContext,v=/^<(\w+)\s*\/?>(?:<\/\1>|)$/,w=/^.[^:#\[\.,]*$/;function x(a,b,c){if(n.isFunction(b))return n.grep(a,function(a,d){return!!b.call(a,d,a)!==c});if(b.nodeType)return n.grep(a,function(a){return a===b!==c});if("string"==typeof b){if(w.test(b))return n.filter(b,a,c);b=n.filter(b,a)}return n.grep(a,function(a){return n.inArray(a,b)>=0!==c})}n.filter=function(a,b,c){var d=b[0];return 
c&&(a=":not("+a+")"),1===b.length&&1===d.nodeType?n.find.matchesSelector(d,a)?[d]:[]:n.find.matches(a,n.grep(b,function(a){return 1===a.nodeType}))},n.fn.extend({find:function(a){var b,c=[],d=this,e=d.length;if("string"!=typeof a)return this.pushStack(n(a).filter(function(){for(b=0;e>b;b++)if(n.contains(d[b],this))return!0}));for(b=0;e>b;b++)n.find(a,d[b],c);return c=this.pushStack(e>1?n.unique(c):c),c.selector=this.selector?this.selector+" "+a:a,c},filter:function(a){return this.pushStack(x(this,a||[],!1))},not:function(a){return this.pushStack(x(this,a||[],!0))},is:function(a){return!!x(this,"string"==typeof a&&u.test(a)?n(a):a||[],!1).length}});var y,z=a.document,A=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]*))$/,B=n.fn.init=function(a,b){var c,d;if(!a)return this;if("string"==typeof a){if(c="<"===a.charAt(0)&&">"===a.charAt(a.length-1)&&a.length>=3?[null,a,null]:A.exec(a),!c||!c[1]&&b)return!b||b.jquery?(b||y).find(a):this.constructor(b).find(a);if(c[1]){if(b=b instanceof n?b[0]:b,n.merge(this,n.parseHTML(c[1],b&&b.nodeType?b.ownerDocument||b:z,!0)),v.test(c[1])&&n.isPlainObject(b))for(c in b)n.isFunction(this[c])?this[c](b[c]):this.attr(c,b[c]);return this}if(d=z.getElementById(c[2]),d&&d.parentNode){if(d.id!==c[2])return y.find(a);this.length=1,this[0]=d}return this.context=z,this.selector=a,this}return a.nodeType?(this.context=this[0]=a,this.length=1,this):n.isFunction(a)?"undefined"!=typeof y.ready?y.ready(a):a(n):(void 0!==a.selector&&(this.selector=a.selector,this.context=a.context),n.makeArray(a,this))};B.prototype=n.fn,y=n(z);var C=/^(?:parents|prev(?:Until|All))/,D={children:!0,contents:!0,next:!0,prev:!0};n.extend({dir:function(a,b,c){var d=[],e=a[b];while(e&&9!==e.nodeType&&(void 0===c||1!==e.nodeType||!n(e).is(c)))1===e.nodeType&&d.push(e),e=e[b];return d},sibling:function(a,b){for(var c=[];a;a=a.nextSibling)1===a.nodeType&&a!==b&&c.push(a);return c}}),n.fn.extend({has:function(a){var b,c=n(a,this),d=c.length;return 
this.filter(function(){for(b=0;d>b;b++)if(n.contains(this,c[b]))return!0})},closest:function(a,b){for(var c,d=0,e=this.length,f=[],g=u.test(a)||"string"!=typeof a?n(a,b||this.context):0;e>d;d++)for(c=this[d];c&&c!==b;c=c.parentNode)if(c.nodeType<11&&(g?g.index(c)>-1:1===c.nodeType&&n.find.matchesSelector(c,a))){f.push(c);break}return this.pushStack(f.length>1?n.unique(f):f)},index:function(a){return a?"string"==typeof a?n.inArray(this[0],n(a)):n.inArray(a.jquery?a[0]:a,this):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(a,b){return this.pushStack(n.unique(n.merge(this.get(),n(a,b))))},addBack:function(a){return this.add(null==a?this.prevObject:this.prevObject.filter(a))}});function E(a,b){do a=a[b];while(a&&1!==a.nodeType);return a}n.each({parent:function(a){var b=a.parentNode;return b&&11!==b.nodeType?b:null},parents:function(a){return n.dir(a,"parentNode")},parentsUntil:function(a,b,c){return n.dir(a,"parentNode",c)},next:function(a){return E(a,"nextSibling")},prev:function(a){return E(a,"previousSibling")},nextAll:function(a){return n.dir(a,"nextSibling")},prevAll:function(a){return n.dir(a,"previousSibling")},nextUntil:function(a,b,c){return n.dir(a,"nextSibling",c)},prevUntil:function(a,b,c){return n.dir(a,"previousSibling",c)},siblings:function(a){return n.sibling((a.parentNode||{}).firstChild,a)},children:function(a){return n.sibling(a.firstChild)},contents:function(a){return n.nodeName(a,"iframe")?a.contentDocument||a.contentWindow.document:n.merge([],a.childNodes)}},function(a,b){n.fn[a]=function(c,d){var e=n.map(this,b,c);return"Until"!==a.slice(-5)&&(d=c),d&&"string"==typeof d&&(e=n.filter(d,e)),this.length>1&&(D[a]||(e=n.unique(e)),C.test(a)&&(e=e.reverse())),this.pushStack(e)}});var F=/\S+/g,G={};function H(a){var b=G[a]={};return n.each(a.match(F)||[],function(a,c){b[c]=!0}),b}n.Callbacks=function(a){a="string"==typeof a?G[a]||H(a):n.extend({},a);var 
b,c,d,e,f,g,h=[],i=!a.once&&[],j=function(l){for(c=a.memory&&l,d=!0,f=g||0,g=0,e=h.length,b=!0;h&&e>f;f++)if(h[f].apply(l[0],l[1])===!1&&a.stopOnFalse){c=!1;break}b=!1,h&&(i?i.length&&j(i.shift()):c?h=[]:k.disable())},k={add:function(){if(h){var d=h.length;!function f(b){n.each(b,function(b,c){var d=n.type(c);"function"===d?a.unique&&k.has(c)||h.push(c):c&&c.length&&"string"!==d&&f(c)})}(arguments),b?e=h.length:c&&(g=d,j(c))}return this},remove:function(){return h&&n.each(arguments,function(a,c){var d;while((d=n.inArray(c,h,d))>-1)h.splice(d,1),b&&(e>=d&&e--,f>=d&&f--)}),this},has:function(a){return a?n.inArray(a,h)>-1:!(!h||!h.length)},empty:function(){return h=[],e=0,this},disable:function(){return h=i=c=void 0,this},disabled:function(){return!h},lock:function(){return i=void 0,c||k.disable(),this},locked:function(){return!i},fireWith:function(a,c){return!h||d&&!i||(c=c||[],c=[a,c.slice?c.slice():c],b?i.push(c):j(c)),this},fire:function(){return k.fireWith(this,arguments),this},fired:function(){return!!d}};return k},n.extend({Deferred:function(a){var b=[["resolve","done",n.Callbacks("once memory"),"resolved"],["reject","fail",n.Callbacks("once memory"),"rejected"],["notify","progress",n.Callbacks("memory")]],c="pending",d={state:function(){return c},always:function(){return e.done(arguments).fail(arguments),this},then:function(){var a=arguments;return n.Deferred(function(c){n.each(b,function(b,f){var g=n.isFunction(a[b])&&a[b];e[f[1]](function(){var a=g&&g.apply(this,arguments);a&&n.isFunction(a.promise)?a.promise().done(c.resolve).fail(c.reject).progress(c.notify):c[f[0]+"With"](this===d?c.promise():this,g?[a]:arguments)})}),a=null}).promise()},promise:function(a){return null!=a?n.extend(a,d):d}},e={};return d.pipe=d.then,n.each(b,function(a,f){var g=f[2],h=f[3];d[f[1]]=g.add,h&&g.add(function(){c=h},b[1^a][2].disable,b[2][2].lock),e[f[0]]=function(){return 
e[f[0]+"With"](this===e?d:this,arguments),this},e[f[0]+"With"]=g.fireWith}),d.promise(e),a&&a.call(e,e),e},when:function(a){var b=0,c=d.call(arguments),e=c.length,f=1!==e||a&&n.isFunction(a.promise)?e:0,g=1===f?a:n.Deferred(),h=function(a,b,c){return function(e){b[a]=this,c[a]=arguments.length>1?d.call(arguments):e,c===i?g.notifyWith(b,c):--f||g.resolveWith(b,c)}},i,j,k;if(e>1)for(i=new Array(e),j=new Array(e),k=new Array(e);e>b;b++)c[b]&&n.isFunction(c[b].promise)?c[b].promise().done(h(b,k,c)).fail(g.reject).progress(h(b,j,i)):--f;return f||g.resolveWith(k,c),g.promise()}});var I;n.fn.ready=function(a){return n.ready.promise().done(a),this},n.extend({isReady:!1,readyWait:1,holdReady:function(a){a?n.readyWait++:n.ready(!0)},ready:function(a){if(a===!0?!--n.readyWait:!n.isReady){if(!z.body)return setTimeout(n.ready);n.isReady=!0,a!==!0&&--n.readyWait>0||(I.resolveWith(z,[n]),n.fn.trigger&&n(z).trigger("ready").off("ready"))}}});function J(){z.addEventListener?(z.removeEventListener("DOMContentLoaded",K,!1),a.removeEventListener("load",K,!1)):(z.detachEvent("onreadystatechange",K),a.detachEvent("onload",K))}function K(){(z.addEventListener||"load"===event.type||"complete"===z.readyState)&&(J(),n.ready())}n.ready.promise=function(b){if(!I)if(I=n.Deferred(),"complete"===z.readyState)setTimeout(n.ready);else if(z.addEventListener)z.addEventListener("DOMContentLoaded",K,!1),a.addEventListener("load",K,!1);else{z.attachEvent("onreadystatechange",K),a.attachEvent("onload",K);var c=!1;try{c=null==a.frameElement&&z.documentElement}catch(d){}c&&c.doScroll&&!function e(){if(!n.isReady){try{c.doScroll("left")}catch(a){return setTimeout(e,50)}J(),n.ready()}}()}return I.promise(b)};var L="undefined",M;for(M in n(l))break;l.ownLast="0"!==M,l.inlineBlockNeedsLayout=!1,n(function(){var 
a,b,c=z.getElementsByTagName("body")[0];c&&(a=z.createElement("div"),a.style.cssText="border:0;width:0;height:0;position:absolute;top:0;left:-9999px;margin-top:1px",b=z.createElement("div"),c.appendChild(a).appendChild(b),typeof b.style.zoom!==L&&(b.style.cssText="border:0;margin:0;width:1px;padding:1px;display:inline;zoom:1",(l.inlineBlockNeedsLayout=3===b.offsetWidth)&&(c.style.zoom=1)),c.removeChild(a),a=b=null)}),function(){var a=z.createElement("div");if(null==l.deleteExpando){l.deleteExpando=!0;try{delete a.test}catch(b){l.deleteExpando=!1}}a=null}(),n.acceptData=function(a){var b=n.noData[(a.nodeName+" ").toLowerCase()],c=+a.nodeType||1;return 1!==c&&9!==c?!1:!b||b!==!0&&a.getAttribute("classid")===b};var N=/^(?:\{[\w\W]*\}|\[[\w\W]*\])$/,O=/([A-Z])/g;function P(a,b,c){if(void 0===c&&1===a.nodeType){var d="data-"+b.replace(O,"-$1").toLowerCase();if(c=a.getAttribute(d),"string"==typeof c){try{c="true"===c?!0:"false"===c?!1:"null"===c?null:+c+""===c?+c:N.test(c)?n.parseJSON(c):c}catch(e){}n.data(a,b,c)}else c=void 0}return c}function Q(a){var b;for(b in a)if(("data"!==b||!n.isEmptyObject(a[b]))&&"toJSON"!==b)return!1;return!0}function R(a,b,d,e){if(n.acceptData(a)){var f,g,h=n.expando,i=a.nodeType,j=i?n.cache:a,k=i?a[h]:a[h]&&h;if(k&&j[k]&&(e||j[k].data)||void 0!==d||"string"!=typeof b)return k||(k=i?a[h]=c.pop()||n.guid++:h),j[k]||(j[k]=i?{}:{toJSON:n.noop}),("object"==typeof b||"function"==typeof b)&&(e?j[k]=n.extend(j[k],b):j[k].data=n.extend(j[k].data,b)),g=j[k],e||(g.data||(g.data={}),g=g.data),void 0!==d&&(g[n.camelCase(b)]=d),"string"==typeof b?(f=g[b],null==f&&(f=g[n.camelCase(b)])):f=g,f +}}function S(a,b,c){if(n.acceptData(a)){var d,e,f=a.nodeType,g=f?n.cache:a,h=f?a[n.expando]:n.expando;if(g[h]){if(b&&(d=c?g[h]:g[h].data)){n.isArray(b)?b=b.concat(n.map(b,n.camelCase)):b in d?b=[b]:(b=n.camelCase(b),b=b in d?[b]:b.split(" ")),e=b.length;while(e--)delete d[b[e]];if(c?!Q(d):!n.isEmptyObject(d))return}(c||(delete 
g[h].data,Q(g[h])))&&(f?n.cleanData([a],!0):l.deleteExpando||g!=g.window?delete g[h]:g[h]=null)}}}n.extend({cache:{},noData:{"applet ":!0,"embed ":!0,"object ":"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000"},hasData:function(a){return a=a.nodeType?n.cache[a[n.expando]]:a[n.expando],!!a&&!Q(a)},data:function(a,b,c){return R(a,b,c)},removeData:function(a,b){return S(a,b)},_data:function(a,b,c){return R(a,b,c,!0)},_removeData:function(a,b){return S(a,b,!0)}}),n.fn.extend({data:function(a,b){var c,d,e,f=this[0],g=f&&f.attributes;if(void 0===a){if(this.length&&(e=n.data(f),1===f.nodeType&&!n._data(f,"parsedAttrs"))){c=g.length;while(c--)d=g[c].name,0===d.indexOf("data-")&&(d=n.camelCase(d.slice(5)),P(f,d,e[d]));n._data(f,"parsedAttrs",!0)}return e}return"object"==typeof a?this.each(function(){n.data(this,a)}):arguments.length>1?this.each(function(){n.data(this,a,b)}):f?P(f,a,n.data(f,a)):void 0},removeData:function(a){return this.each(function(){n.removeData(this,a)})}}),n.extend({queue:function(a,b,c){var d;return a?(b=(b||"fx")+"queue",d=n._data(a,b),c&&(!d||n.isArray(c)?d=n._data(a,b,n.makeArray(c)):d.push(c)),d||[]):void 0},dequeue:function(a,b){b=b||"fx";var c=n.queue(a,b),d=c.length,e=c.shift(),f=n._queueHooks(a,b),g=function(){n.dequeue(a,b)};"inprogress"===e&&(e=c.shift(),d--),e&&("fx"===b&&c.unshift("inprogress"),delete f.stop,e.call(a,g,f)),!d&&f&&f.empty.fire()},_queueHooks:function(a,b){var c=b+"queueHooks";return n._data(a,c)||n._data(a,c,{empty:n.Callbacks("once memory").add(function(){n._removeData(a,b+"queue"),n._removeData(a,c)})})}}),n.fn.extend({queue:function(a,b){var c=2;return"string"!=typeof a&&(b=a,a="fx",c--),arguments.lengthh;h++)b(a[h],c,g?d:d.call(a[h],h,b(a[h],c)));return e?a:j?b.call(a):i?b(a[0],c):f},X=/^(?:checkbox|radio)$/i;!function(){var a=z.createDocumentFragment(),b=z.createElement("div"),c=z.createElement("input");if(b.setAttribute("className","t"),b.innerHTML="
a",l.leadingWhitespace=3===b.firstChild.nodeType,l.tbody=!b.getElementsByTagName("tbody").length,l.htmlSerialize=!!b.getElementsByTagName("link").length,l.html5Clone="<:nav>"!==z.createElement("nav").cloneNode(!0).outerHTML,c.type="checkbox",c.checked=!0,a.appendChild(c),l.appendChecked=c.checked,b.innerHTML="",l.noCloneChecked=!!b.cloneNode(!0).lastChild.defaultValue,a.appendChild(b),b.innerHTML="",l.checkClone=b.cloneNode(!0).cloneNode(!0).lastChild.checked,l.noCloneEvent=!0,b.attachEvent&&(b.attachEvent("onclick",function(){l.noCloneEvent=!1}),b.cloneNode(!0).click()),null==l.deleteExpando){l.deleteExpando=!0;try{delete b.test}catch(d){l.deleteExpando=!1}}a=b=c=null}(),function(){var b,c,d=z.createElement("div");for(b in{submit:!0,change:!0,focusin:!0})c="on"+b,(l[b+"Bubbles"]=c in a)||(d.setAttribute(c,"t"),l[b+"Bubbles"]=d.attributes[c].expando===!1);d=null}();var Y=/^(?:input|select|textarea)$/i,Z=/^key/,$=/^(?:mouse|contextmenu)|click/,_=/^(?:focusinfocus|focusoutblur)$/,ab=/^([^.]*)(?:\.(.+)|)$/;function bb(){return!0}function cb(){return!1}function db(){try{return z.activeElement}catch(a){}}n.event={global:{},add:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,o,p,q,r=n._data(a);if(r){c.handler&&(i=c,c=i.handler,e=i.selector),c.guid||(c.guid=n.guid++),(g=r.events)||(g=r.events={}),(k=r.handle)||(k=r.handle=function(a){return typeof n===L||a&&n.event.triggered===a.type?void 
0:n.event.dispatch.apply(k.elem,arguments)},k.elem=a),b=(b||"").match(F)||[""],h=b.length;while(h--)f=ab.exec(b[h])||[],o=q=f[1],p=(f[2]||"").split(".").sort(),o&&(j=n.event.special[o]||{},o=(e?j.delegateType:j.bindType)||o,j=n.event.special[o]||{},l=n.extend({type:o,origType:q,data:d,handler:c,guid:c.guid,selector:e,needsContext:e&&n.expr.match.needsContext.test(e),namespace:p.join(".")},i),(m=g[o])||(m=g[o]=[],m.delegateCount=0,j.setup&&j.setup.call(a,d,p,k)!==!1||(a.addEventListener?a.addEventListener(o,k,!1):a.attachEvent&&a.attachEvent("on"+o,k))),j.add&&(j.add.call(a,l),l.handler.guid||(l.handler.guid=c.guid)),e?m.splice(m.delegateCount++,0,l):m.push(l),n.event.global[o]=!0);a=null}},remove:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,o,p,q,r=n.hasData(a)&&n._data(a);if(r&&(k=r.events)){b=(b||"").match(F)||[""],j=b.length;while(j--)if(h=ab.exec(b[j])||[],o=q=h[1],p=(h[2]||"").split(".").sort(),o){l=n.event.special[o]||{},o=(d?l.delegateType:l.bindType)||o,m=k[o]||[],h=h[2]&&new RegExp("(^|\\.)"+p.join("\\.(?:.*\\.|)")+"(\\.|$)"),i=f=m.length;while(f--)g=m[f],!e&&q!==g.origType||c&&c.guid!==g.guid||h&&!h.test(g.namespace)||d&&d!==g.selector&&("**"!==d||!g.selector)||(m.splice(f,1),g.selector&&m.delegateCount--,l.remove&&l.remove.call(a,g));i&&!m.length&&(l.teardown&&l.teardown.call(a,p,r.handle)!==!1||n.removeEvent(a,o,r.handle),delete k[o])}else for(o in k)n.event.remove(a,o+b[j],c,d,!0);n.isEmptyObject(k)&&(delete r.handle,n._removeData(a,"events"))}},trigger:function(b,c,d,e){var f,g,h,i,k,l,m,o=[d||z],p=j.call(b,"type")?b.type:b,q=j.call(b,"namespace")?b.namespace.split("."):[];if(h=l=d=d||z,3!==d.nodeType&&8!==d.nodeType&&!_.test(p+n.event.triggered)&&(p.indexOf(".")>=0&&(q=p.split("."),p=q.shift(),q.sort()),g=p.indexOf(":")<0&&"on"+p,b=b[n.expando]?b:new n.Event(p,"object"==typeof b&&b),b.isTrigger=e?2:3,b.namespace=q.join("."),b.namespace_re=b.namespace?new RegExp("(^|\\.)"+q.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,b.result=void 
0,b.target||(b.target=d),c=null==c?[b]:n.makeArray(c,[b]),k=n.event.special[p]||{},e||!k.trigger||k.trigger.apply(d,c)!==!1)){if(!e&&!k.noBubble&&!n.isWindow(d)){for(i=k.delegateType||p,_.test(i+p)||(h=h.parentNode);h;h=h.parentNode)o.push(h),l=h;l===(d.ownerDocument||z)&&o.push(l.defaultView||l.parentWindow||a)}m=0;while((h=o[m++])&&!b.isPropagationStopped())b.type=m>1?i:k.bindType||p,f=(n._data(h,"events")||{})[b.type]&&n._data(h,"handle"),f&&f.apply(h,c),f=g&&h[g],f&&f.apply&&n.acceptData(h)&&(b.result=f.apply(h,c),b.result===!1&&b.preventDefault());if(b.type=p,!e&&!b.isDefaultPrevented()&&(!k._default||k._default.apply(o.pop(),c)===!1)&&n.acceptData(d)&&g&&d[p]&&!n.isWindow(d)){l=d[g],l&&(d[g]=null),n.event.triggered=p;try{d[p]()}catch(r){}n.event.triggered=void 0,l&&(d[g]=l)}return b.result}},dispatch:function(a){a=n.event.fix(a);var b,c,e,f,g,h=[],i=d.call(arguments),j=(n._data(this,"events")||{})[a.type]||[],k=n.event.special[a.type]||{};if(i[0]=a,a.delegateTarget=this,!k.preDispatch||k.preDispatch.call(this,a)!==!1){h=n.event.handlers.call(this,a,j),b=0;while((f=h[b++])&&!a.isPropagationStopped()){a.currentTarget=f.elem,g=0;while((e=f.handlers[g++])&&!a.isImmediatePropagationStopped())(!a.namespace_re||a.namespace_re.test(e.namespace))&&(a.handleObj=e,a.data=e.data,c=((n.event.special[e.origType]||{}).handle||e.handler).apply(f.elem,i),void 0!==c&&(a.result=c)===!1&&(a.preventDefault(),a.stopPropagation()))}return k.postDispatch&&k.postDispatch.call(this,a),a.result}},handlers:function(a,b){var c,d,e,f,g=[],h=b.delegateCount,i=a.target;if(h&&i.nodeType&&(!a.button||"click"!==a.type))for(;i!=this;i=i.parentNode||this)if(1===i.nodeType&&(i.disabled!==!0||"click"!==a.type)){for(e=[],f=0;h>f;f++)d=b[f],c=d.selector+" ",void 0===e[c]&&(e[c]=d.needsContext?n(c,this).index(i)>=0:n.find(c,this,null,[i]).length),e[c]&&e.push(d);e.length&&g.push({elem:i,handlers:e})}return 
h]","i"),ib=/^\s+/,jb=/<(?!area|br|col|embed|hr|img|input|link|meta|param)(([\w:]+)[^>]*)\/>/gi,kb=/<([\w:]+)/,lb=/\s*$/g,sb={option:[1,""],legend:[1,"
","
"],area:[1,"",""],param:[1,"",""],thead:[1,"","
"],tr:[2,"","
"],col:[2,"","
"],td:[3,"","
"],_default:l.htmlSerialize?[0,"",""]:[1,"X
","
"]},tb=eb(z),ub=tb.appendChild(z.createElement("div"));sb.optgroup=sb.option,sb.tbody=sb.tfoot=sb.colgroup=sb.caption=sb.thead,sb.th=sb.td;function vb(a,b){var c,d,e=0,f=typeof a.getElementsByTagName!==L?a.getElementsByTagName(b||"*"):typeof a.querySelectorAll!==L?a.querySelectorAll(b||"*"):void 0;if(!f)for(f=[],c=a.childNodes||a;null!=(d=c[e]);e++)!b||n.nodeName(d,b)?f.push(d):n.merge(f,vb(d,b));return void 0===b||b&&n.nodeName(a,b)?n.merge([a],f):f}function wb(a){X.test(a.type)&&(a.defaultChecked=a.checked)}function xb(a,b){return n.nodeName(a,"table")&&n.nodeName(11!==b.nodeType?b:b.firstChild,"tr")?a.getElementsByTagName("tbody")[0]||a.appendChild(a.ownerDocument.createElement("tbody")):a}function yb(a){return a.type=(null!==n.find.attr(a,"type"))+"/"+a.type,a}function zb(a){var b=qb.exec(a.type);return b?a.type=b[1]:a.removeAttribute("type"),a}function Ab(a,b){for(var c,d=0;null!=(c=a[d]);d++)n._data(c,"globalEval",!b||n._data(b[d],"globalEval"))}function Bb(a,b){if(1===b.nodeType&&n.hasData(a)){var c,d,e,f=n._data(a),g=n._data(b,f),h=f.events;if(h){delete g.handle,g.events={};for(c in h)for(d=0,e=h[c].length;e>d;d++)n.event.add(b,c,h[c][d])}g.data&&(g.data=n.extend({},g.data))}}function Cb(a,b){var c,d,e;if(1===b.nodeType){if(c=b.nodeName.toLowerCase(),!l.noCloneEvent&&b[n.expando]){e=n._data(b);for(d in e.events)n.removeEvent(b,d,e.handle);b.removeAttribute(n.expando)}"script"===c&&b.text!==a.text?(yb(b).text=a.text,zb(b)):"object"===c?(b.parentNode&&(b.outerHTML=a.outerHTML),l.html5Clone&&a.innerHTML&&!n.trim(b.innerHTML)&&(b.innerHTML=a.innerHTML)):"input"===c&&X.test(a.type)?(b.defaultChecked=b.checked=a.checked,b.value!==a.value&&(b.value=a.value)):"option"===c?b.defaultSelected=b.selected=a.defaultSelected:("input"===c||"textarea"===c)&&(b.defaultValue=a.defaultValue)}}n.extend({clone:function(a,b,c){var 
d,e,f,g,h,i=n.contains(a.ownerDocument,a);if(l.html5Clone||n.isXMLDoc(a)||!hb.test("<"+a.nodeName+">")?f=a.cloneNode(!0):(ub.innerHTML=a.outerHTML,ub.removeChild(f=ub.firstChild)),!(l.noCloneEvent&&l.noCloneChecked||1!==a.nodeType&&11!==a.nodeType||n.isXMLDoc(a)))for(d=vb(f),h=vb(a),g=0;null!=(e=h[g]);++g)d[g]&&Cb(e,d[g]);if(b)if(c)for(h=h||vb(a),d=d||vb(f),g=0;null!=(e=h[g]);g++)Bb(e,d[g]);else Bb(a,f);return d=vb(f,"script"),d.length>0&&Ab(d,!i&&vb(a,"script")),d=h=e=null,f},buildFragment:function(a,b,c,d){for(var e,f,g,h,i,j,k,m=a.length,o=eb(b),p=[],q=0;m>q;q++)if(f=a[q],f||0===f)if("object"===n.type(f))n.merge(p,f.nodeType?[f]:f);else if(mb.test(f)){h=h||o.appendChild(b.createElement("div")),i=(kb.exec(f)||["",""])[1].toLowerCase(),k=sb[i]||sb._default,h.innerHTML=k[1]+f.replace(jb,"<$1>")+k[2],e=k[0];while(e--)h=h.lastChild;if(!l.leadingWhitespace&&ib.test(f)&&p.push(b.createTextNode(ib.exec(f)[0])),!l.tbody){f="table"!==i||lb.test(f)?""!==k[1]||lb.test(f)?0:h:h.firstChild,e=f&&f.childNodes.length;while(e--)n.nodeName(j=f.childNodes[e],"tbody")&&!j.childNodes.length&&f.removeChild(j)}n.merge(p,h.childNodes),h.textContent="";while(h.firstChild)h.removeChild(h.firstChild);h=o.lastChild}else p.push(b.createTextNode(f));h&&o.removeChild(h),l.appendChecked||n.grep(vb(p,"input"),wb),q=0;while(f=p[q++])if((!d||-1===n.inArray(f,d))&&(g=n.contains(f.ownerDocument,f),h=vb(o.appendChild(f),"script"),g&&Ab(h),c)){e=0;while(f=h[e++])pb.test(f.type||"")&&c.push(f)}return h=null,o},cleanData:function(a,b){for(var d,e,f,g,h=0,i=n.expando,j=n.cache,k=l.deleteExpando,m=n.event.special;null!=(d=a[h]);h++)if((b||n.acceptData(d))&&(f=d[i],g=f&&j[f])){if(g.events)for(e in g.events)m[e]?n.event.remove(d,e):n.removeEvent(d,e,g.handle);j[f]&&(delete j[f],k?delete d[i]:typeof d.removeAttribute!==L?d.removeAttribute(i):d[i]=null,c.push(f))}}}),n.fn.extend({text:function(a){return W(this,function(a){return void 
0===a?n.text(this):this.empty().append((this[0]&&this[0].ownerDocument||z).createTextNode(a))},null,a,arguments.length)},append:function(){return this.domManip(arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=xb(this,a);b.appendChild(a)}})},prepend:function(){return this.domManip(arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=xb(this,a);b.insertBefore(a,b.firstChild)}})},before:function(){return this.domManip(arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this)})},after:function(){return this.domManip(arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this.nextSibling)})},remove:function(a,b){for(var c,d=a?n.filter(a,this):this,e=0;null!=(c=d[e]);e++)b||1!==c.nodeType||n.cleanData(vb(c)),c.parentNode&&(b&&n.contains(c.ownerDocument,c)&&Ab(vb(c,"script")),c.parentNode.removeChild(c));return this},empty:function(){for(var a,b=0;null!=(a=this[b]);b++){1===a.nodeType&&n.cleanData(vb(a,!1));while(a.firstChild)a.removeChild(a.firstChild);a.options&&n.nodeName(a,"select")&&(a.options.length=0)}return this},clone:function(a,b){return a=null==a?!1:a,b=null==b?a:b,this.map(function(){return n.clone(this,a,b)})},html:function(a){return W(this,function(a){var b=this[0]||{},c=0,d=this.length;if(void 0===a)return 1===b.nodeType?b.innerHTML.replace(gb,""):void 0;if(!("string"!=typeof a||nb.test(a)||!l.htmlSerialize&&hb.test(a)||!l.leadingWhitespace&&ib.test(a)||sb[(kb.exec(a)||["",""])[1].toLowerCase()])){a=a.replace(jb,"<$1>");try{for(;d>c;c++)b=this[c]||{},1===b.nodeType&&(n.cleanData(vb(b,!1)),b.innerHTML=a);b=0}catch(e){}}b&&this.empty().append(a)},null,a,arguments.length)},replaceWith:function(){var a=arguments[0];return this.domManip(arguments,function(b){a=this.parentNode,n.cleanData(vb(this)),a&&a.replaceChild(b,this)}),a&&(a.length||a.nodeType)?this:this.remove()},detach:function(a){return 
this.remove(a,!0)},domManip:function(a,b){a=e.apply([],a);var c,d,f,g,h,i,j=0,k=this.length,m=this,o=k-1,p=a[0],q=n.isFunction(p);if(q||k>1&&"string"==typeof p&&!l.checkClone&&ob.test(p))return this.each(function(c){var d=m.eq(c);q&&(a[0]=p.call(this,c,d.html())),d.domManip(a,b)});if(k&&(i=n.buildFragment(a,this[0].ownerDocument,!1,this),c=i.firstChild,1===i.childNodes.length&&(i=c),c)){for(g=n.map(vb(i,"script"),yb),f=g.length;k>j;j++)d=i,j!==o&&(d=n.clone(d,!0,!0),f&&n.merge(g,vb(d,"script"))),b.call(this[j],d,j);if(f)for(h=g[g.length-1].ownerDocument,n.map(g,zb),j=0;f>j;j++)d=g[j],pb.test(d.type||"")&&!n._data(d,"globalEval")&&n.contains(h,d)&&(d.src?n._evalUrl&&n._evalUrl(d.src):n.globalEval((d.text||d.textContent||d.innerHTML||"").replace(rb,"")));i=c=null}return this}}),n.each({appendTo:"append",prependTo:"prepend",insertBefore:"before",insertAfter:"after",replaceAll:"replaceWith"},function(a,b){n.fn[a]=function(a){for(var c,d=0,e=[],g=n(a),h=g.length-1;h>=d;d++)c=d===h?this:this.clone(!0),n(g[d])[b](c),f.apply(e,c.get());return this.pushStack(e)}});var Db,Eb={};function Fb(b,c){var d=n(c.createElement(b)).appendTo(c.body),e=a.getDefaultComputedStyle?a.getDefaultComputedStyle(d[0]).display:n.css(d[0],"display");return d.detach(),e}function Gb(a){var b=z,c=Eb[a];return c||(c=Fb(a,b),"none"!==c&&c||(Db=(Db||n("