comparison report_clonality/RScript.r @ 63:bd6fb6c03948 draft

Uploaded
author davidvanzessen
date Thu, 28 Apr 2016 08:11:04 -0400
parents 14ea4c464435
children 55f18bf19d72
comparison
equal deleted inserted replaced
62:14ea4c464435 63:bd6fb6c03948
654 fltr = nchar(PRODF$Top.D.Gene) < 4 654 fltr = nchar(PRODF$Top.D.Gene) < 4
655 print(paste("Removing", sum(fltr), "sequences without a identified D")) 655 print(paste("Removing", sum(fltr), "sequences without a identified D"))
656 PRODF = PRODF[!fltr,] 656 PRODF = PRODF[!fltr,]
657 } 657 }
658 658
659 num_median = function(x, na.rm) { as.numeric(median(x, na.rm=na.rm)) } 659
660 660 print(names(PRODF))
661 if(locus %in% c("IGH", "TRB", "TRD")){ 661 #ensure certain columns are in the data (files generated with older versions of IMGT Loader)
662 PRODF$new.n = PRODF$N1.REGION.nt.nb + PRODF$N2.REGION.nt.nb 662 col.checks = c("N3.REGION.nt.nb", "N4.REGION.nt.nb")
663 } else { 663 for(col.check in col.checks){
664 PRODF$new.n = PRODF$N.REGION.nt.nb 664 if(!(col.check %in% names(PRODF))){
665 } 665 print(paste(col.check, "not found adding new column"))
666 if(nrow(PRODF) > 0){ #because R is anoying...
667 PRODF[,col.check] = 0
668 } else {
669 PRODF = cbind(PRODF, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0)))
670 }
671 if(nrow(UNPROD) > 0){
672 UNPROD[,col.check] = 0
673 } else {
674 UNPROD = cbind(UNPROD, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0)))
675 }
676 }
677 }
678
679 print(names(PRODF))
680
681 num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) }
666 682
667 newData = data.frame(data.table(PRODF)[,list(unique=.N, 683 newData = data.frame(data.table(PRODF)[,list(unique=.N,
668 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 684 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
669 P1=mean(.SD$P3V.nt.nb, na.rm=T), 685 P1=mean(.SD$P3V.nt.nb, na.rm=T),
670 N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), 686 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
671 P2=mean(.SD$P5D.nt.nb, na.rm=T), 687 P2=mean(.SD$P5D.nt.nb, na.rm=T),
672 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 688 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
673 DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), 689 DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T),
674 P3=mean(.SD$P3D.nt.nb, na.rm=T), 690 P3=mean(.SD$P3D.nt.nb, na.rm=T),
675 N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 691 N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
676 P4=mean(.SD$P5J.nt.nb, na.rm=T), 692 P4=mean(.SD$P5J.nt.nb, na.rm=T),
677 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 693 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
678 Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 694 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
679 Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 695 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
680 Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), 696 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))),
681 by=c("Sample")]) 697 by=c("Sample")])
682 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 698 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
683 write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 699 write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
684 700
685 newData = data.frame(data.table(PRODF)[,list(unique=.N, 701 newData = data.frame(data.table(PRODF)[,list(unique=.N,
686 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 702 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
687 P1=num_median(.SD$P3V.nt.nb, na.rm=T), 703 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
688 N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), 704 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
689 P2=num_median(.SD$P5D.nt.nb, na.rm=T), 705 P2=num_median(.SD$P5D.nt.nb, na.rm=T),
690 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 706 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
691 DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), 707 DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T),
692 P3=num_median(.SD$P3D.nt.nb, na.rm=T), 708 P3=num_median(.SD$P3D.nt.nb, na.rm=T),
693 N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 709 N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
694 P4=num_median(.SD$P5J.nt.nb, na.rm=T), 710 P4=num_median(.SD$P5J.nt.nb, na.rm=T),
695 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 711 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
696 Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 712 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
697 Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 713 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
698 Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), 714 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))),
699 by=c("Sample")]) 715 by=c("Sample")])
700 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 716 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
701 write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 717 write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
702 718
703 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 719 newData = data.frame(data.table(UNPROD)[,list(unique=.N,
704 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 720 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
705 P1=mean(.SD$P3V.nt.nb, na.rm=T), 721 P1=mean(.SD$P3V.nt.nb, na.rm=T),
706 N1=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), 722 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
707 P2=mean(.SD$P5D.nt.nb, na.rm=T), 723 P2=mean(.SD$P5D.nt.nb, na.rm=T),
708 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 724 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
709 DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), 725 DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T),
710 P3=mean(.SD$P3D.nt.nb, na.rm=T), 726 P3=mean(.SD$P3D.nt.nb, na.rm=T),
711 N2=mean(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 727 N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
712 P4=mean(.SD$P5J.nt.nb, na.rm=T), 728 P4=mean(.SD$P5J.nt.nb, na.rm=T),
713 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 729 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
714 Total.Del=mean(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 730 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
715 Total.N=mean(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 731 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
716 Total.P=mean(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), 732 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))),
717 by=c("Sample")]) 733 by=c("Sample")])
718 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 734 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
719 write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 735 write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
720 736
721 newData = data.frame(data.table(UNPROD)[,list(unique=.N, 737 newData = data.frame(data.table(UNPROD)[,list(unique=.N,
722 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), 738 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
723 P1=num_median(.SD$P3V.nt.nb, na.rm=T), 739 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
724 N1=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb, na.rm=T), 740 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)),
725 P2=num_median(.SD$P5D.nt.nb, na.rm=T), 741 P2=num_median(.SD$P5D.nt.nb, na.rm=T),
726 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), 742 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T),
727 DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), 743 DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T),
728 P3=num_median(.SD$P3D.nt.nb, na.rm=T), 744 P3=num_median(.SD$P3D.nt.nb, na.rm=T),
729 N2=num_median(.SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 745 N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
730 P4=num_median(.SD$P5J.nt.nb, na.rm=T), 746 P4=num_median(.SD$P5J.nt.nb, na.rm=T),
731 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 747 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
732 Total.Del=num_median(.SD$X3V.REGION.trimmed.nt.nb + .SD$X5D.REGION.trimmed.nt.nb + .SD$X3D.REGION.trimmed.nt.nb + .SD$X5J.REGION.trimmed.nt.nb, na.rm=T), 748 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
733 Total.N=num_median(.SD$N.REGION.nt.nb + .SD$N1.REGION.nt.nb + .SD$N2.REGION.nt.nb + .SD$N3.REGION.nt.nb + .SD$N4.REGION.nt.nb, na.rm=T), 749 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)),
734 Total.P=num_median(.SD$P3V.nt.nb + .SD$P5D.nt.nb + .SD$P3D.nt.nb + .SD$P5J.nt.nb, na.rm=T)), 750 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T))),
735 by=c("Sample")]) 751 by=c("Sample")])
736 752
737 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) 753 newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
738 write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) 754 write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)
739 } 755 }