# HG changeset patch # User davidvanzessen # Date 1461845464 14400 # Node ID bd6fb6c03948201a194ea80e529a85c7815fd162 # Parent 14ea4c464435c544434644740462d8045ef2ac4d Uploaded diff -r 14ea4c464435 -r bd6fb6c03948 report_clonality/RScript.r --- a/report_clonality/RScript.r Thu Apr 28 05:24:14 2016 -0400 +++ b/report_clonality/RScript.r Thu Apr 28 08:11:04 2016 -0400 @@ -656,28 +656,44 @@ PRODF = PRODF[!fltr,] } - num_median = function(x, na.rm) { as.numeric(median(x, na.rm=na.rm)) } - if(locus %in% c("IGH", "TRB", "TRD")){ - PRODF$new.n = PRODF$N1.REGION.nt.nb + PRODF$N2.REGION.nt.nb - } else { - PRODF$new.n = PRODF$N.REGION.nt.nb + print(names(PRODF)) + #ensure certain columns are in the data (files generated with older versions of IMGT Loader) + col.checks = c("N3.REGION.nt.nb", "N4.REGION.nt.nb") + for(col.check in col.checks){ + if(!(col.check %in% names(PRODF))){ + print(paste(col.check, "not found adding new column")) + if(nrow(PRODF) > 0){ #because R is anoying... + PRODF[,col.check] = 0 + } else { + PRODF = cbind(PRODF, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0))) + } + if(nrow(UNPROD) > 0){ + UNPROD[,col.check] = 0 + } else { + UNPROD = cbind(UNPROD, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0))) + } + } } + print(names(PRODF)) + + num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } + newData = data.frame(data.table(PRODF)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), + N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), P2=mean(.SD$P5D.nt.nb, na.rm=T), DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=mean(.SD$P3D.nt.nb, na.rm=T), - N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), P4=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), - Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), + Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -685,17 +701,17 @@ newData = data.frame(data.table(PRODF)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), + N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), P2=num_median(.SD$P5D.nt.nb, na.rm=T), DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=num_median(.SD$P3D.nt.nb, na.rm=T), - N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), P4=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), - Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), + Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -703,17 +719,17 @@ newData = data.frame(data.table(UNPROD)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), + N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), P2=mean(.SD$P5D.nt.nb, na.rm=T), DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=mean(.SD$P3D.nt.nb, na.rm=T), - N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), P4=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), - Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), + Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) @@ -721,17 +737,17 @@ newData = data.frame(data.table(UNPROD)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), + N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), P2=num_median(.SD$P5D.nt.nb, na.rm=T), DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), P3=num_median(.SD$P3D.nt.nb, na.rm=T), - N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), + N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), P4=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), - Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), - Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), + Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)