# HG changeset patch # User davidvanzessen # Date 1461835454 14400 # Node ID 14ea4c464435c544434644740462d8045ef2ac4d # Parent 1349412c89c649db8fe378df9853dc84b09b8ecf Uploaded diff -r 1349412c89c6 -r 14ea4c464435 report_clonality/RScript.r --- a/report_clonality/RScript.r Fri Apr 08 09:25:14 2016 -0400 +++ b/report_clonality/RScript.r Thu Apr 28 05:24:14 2016 -0400 @@ -49,19 +49,27 @@ inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") +print(paste("nrows: ", nrow(inputdata))) + setwd(outdir) # remove weird rows inputdata = inputdata[inputdata$Sample != "",] +print(paste("nrows: ", nrow(inputdata))) + #remove the allele from the V,D and J genes inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) +print(paste("nrows: ", nrow(inputdata))) + #filter uniques inputdata.removed = inputdata[NULL,] +print(paste("nrows: ", nrow(inputdata))) + inputdata$clonaltype = 1:nrow(inputdata) #keep track of the count of sequences in samples or samples/replicates for the front page overview @@ -144,25 +152,28 @@ #create the table on the overview page with the productive/unique counts per sample/replicate #first for sample -sample.count = merge(input.sample.count, prod.sample.count, by="Sample") +sample.count = merge(input.sample.count, prod.sample.count, by="Sample", all.x=T) sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100) -sample.count = merge(sample.count, prod.unique.sample.count, by="Sample") +sample.count = merge(sample.count, prod.unique.sample.count, by="Sample", all.x=T) sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100) -sample.count = merge(sample.count , unprod.sample.count, by="Sample") +sample.count = merge(sample.count , unprod.sample.count, by="Sample", all.x=T) +print(sample.count) sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100) -sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample") +sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample", all.x=T) sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100) +print(sample.count) + #then sample/replicate -rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate")) +rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"), all.x=T) rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100) -rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate")) +rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"), all.x=T) rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100) -rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate")) +rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"), all.x=T) rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100) -rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate")) +rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"), all.x=T) rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100) rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_") @@ -170,6 +181,8 @@ count = rbind(sample.count, rep.count) + + write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) # ---------------------- Frequency calculation for V, D and J ---------------------- @@ -627,43 +640,44 @@ } } +bak = PRODF + imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") if(all(imgtcolumns %in% colnames(inputdata))) { print("found IMGT columns, running junction analysis") if(locus %in% c("IGK","IGL", "TRA", "TRG")){ - print("VJ recombination, using N column for junction analysis") - print(names(PRODF)) - print(head(PRODF$N.REGION.nt.nb, 30)) - PRODF$N1.REGION.nt.nb = PRODF$N.REGION.nt.nb + print("VJ recombination, no filtering on absent D") + } else { + print("VDJ recombination, using N column for junction analysis") + fltr = nchar(PRODF$Top.D.Gene) < 4 + print(paste("Removing", sum(fltr), "sequences without a identified D")) + PRODF = PRODF[!fltr,] } num_median = function(x, na.rm) { as.numeric(median(x, na.rm=na.rm)) } + if(locus %in% c("IGH", "TRB", "TRD")){ + PRODF$new.n = PRODF$N1.REGION.nt.nb + PRODF$N2.REGION.nt.nb + } else { + PRODF$new.n = PRODF$N.REGION.nt.nb + } + newData = data.frame(data.table(PRODF)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(.SD$N1.REGION.nt.nb, na.rm=T), + N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), P2=mean(.SD$P5D.nt.nb, na.rm=T), DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=mean(.SD$P3D.nt.nb, na.rm=T), - N2=mean(.SD$N2.REGION.nt.nb, na.rm=T), + N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), P4=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=( mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T)), - - Total.N=( mean(.SD$N1.REGION.nt.nb, na.rm=T) + - mean(.SD$N2.REGION.nt.nb, na.rm=T)), - - Total.P=( mean(.SD$P3V.nt.nb, na.rm=T) + - mean(.SD$P5D.nt.nb, na.rm=T) + - mean(.SD$P3D.nt.nb, na.rm=T) + - mean(.SD$P5J.nt.nb, na.rm=T))), + Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -671,24 +685,17 @@ newData = data.frame(data.table(PRODF)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(.SD$N1.REGION.nt.nb, na.rm=T), + N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), P2=num_median(.SD$P5D.nt.nb, na.rm=T), DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=num_median(.SD$P3D.nt.nb, na.rm=T), - N2=num_median(.SD$N2.REGION.nt.nb, na.rm=T), + N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), P4=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=num_median(c(.SD$X3V.REGION.trimmed.nt.nb, - .SD$X5D.REGION.trimmed.nt.nb, - .SD$X3D.REGION.trimmed.nt.nb, - .SD$X5J.REGION.trimmed.nt.nb), na.rm=T), - Total.N=num_median( c(.SD$N1.REGION.nt.nb, - .SD$N2.REGION.nt.nb), na.rm=T), - Total.P=num_median(c(.SD$P3V.nt.nb, - .SD$P5D.nt.nb, - .SD$P3D.nt.nb, - .SD$P5J.nt.nb), na.rm=T)), + Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -696,24 +703,17 @@ newData = data.frame(data.table(UNPROD)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(.SD$N1.REGION.nt.nb, na.rm=T), + N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), P2=mean(.SD$P5D.nt.nb, na.rm=T), DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=mean(.SD$P3D.nt.nb, na.rm=T), - N2=mean(.SD$N2.REGION.nt.nb, na.rm=T), + N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), P4=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=(mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T) + - mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T)), - Total.N=( mean(.SD$N1.REGION.nt.nb, na.rm=T) + - mean(.SD$N2.REGION.nt.nb, na.rm=T)), - Total.P=( mean(.SD$P3V.nt.nb, na.rm=T) + - mean(.SD$P5D.nt.nb, na.rm=T) + - mean(.SD$P3D.nt.nb, na.rm=T) + - mean(.SD$P5J.nt.nb, na.rm=T))), + Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -721,30 +721,25 @@ newData = data.frame(data.table(UNPROD)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(.SD$N1.REGION.nt.nb, na.rm=T), + N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), P2=num_median(.SD$P5D.nt.nb, na.rm=T), DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=num_median(.SD$P3D.nt.nb, na.rm=T), - N2=num_median(.SD$N2.REGION.nt.nb, na.rm=T), + N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), P4=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=num_median(c(.SD$X3V.REGION.trimmed.nt.nb, - .SD$X5D.REGION.trimmed.nt.nb, - .SD$X3D.REGION.trimmed.nt.nb, - .SD$X5J.REGION.trimmed.nt.nb), na.rm=T), - Total.N=num_median( c(.SD$N1.REGION.nt.nb, - .SD$N2.REGION.nt.nb), na.rm=T), - Total.P=num_median(c(.SD$P3V.nt.nb, - .SD$P5D.nt.nb, - .SD$P3D.nt.nb, - .SD$P5J.nt.nb), na.rm=T)), + Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) } +PRODF = bak + # ---------------------- AA composition in CDR3 ---------------------- AACDR3 = PRODF[,c("Sample", "CDR3.Seq")]