Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

--- a/RScript.r	Fri Feb 20 04:14:24 2015 -0500
+++ b/RScript.r	Mon Feb 23 10:22:59 2015 -0500
@@ -51,7 +51,7 @@

 dat$paste = paste(dat$Sample, dat$Clone_Sequence)

-patients = split(dat, dat$Patient, drop=T)
+patients = split(dat, dat$Patient, drop=T)[1:10]
 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
@@ -257,9 +257,25 @@
   patientMerge = merge(patientMerge, patient3, by="merge")
   colnames(patientMerge)[30:length(colnames(patientMerge))] = paste(colnames(patientMerge)[30:length(colnames(patientMerge))], ".z", sep="")
   patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
+
+  patientMerge12 = merge(patient1, patient2, by="merge")
+  patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
+  patientMerge13 = merge(patient1, patient3, by="merge")
+  patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
+  patientMerge23 = merge(patient2, patient3, by="merge")
+  patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
+
+  scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count")
+  scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
+  scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$Clone_Sequence),]
+  scatterplot_data$type = factor(x="single", levels=c("In one", "In two", "In three"))
+
   res1 = vector()
   res2 = vector()
   res3 = vector()
+  res12 = vector()
+  res13 = vector()
+  res23 = vector()
   resAll = vector()
   read1Count = vector()
   read2Count = vector()
@@ -278,6 +294,10 @@
     two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$Clone_Sequence %in% patientMerge[all,]$merge))
     three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$Clone_Sequence %in% patientMerge[all,]$merge))

+    one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold)
+    one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold)
+    two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold)
+
     read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
     read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
     read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
@@ -285,41 +305,90 @@
     res2 = append(res2, sum(two))
     res3 = append(res3, sum(three))
     resAll = append(resAll, sum(all))
+    res12 = append(res12, sum(one_two))
+    res13 = append(res13, sum(one_three))
+    res23 = append(res23, sum(two_three))
     #threshhold = 0
     if(threshhold != 0){
       if(sum(one) > 0){
-        dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
+        colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
         filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
       if(sum(two) > 0){
-        dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
+        colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
         filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
       if(sum(three) > 0){
-        dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
+        colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
         filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
-    }
+      if(sum(one_two) > 0){
+        dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
+        colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
+        filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
+        write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
+      }
+      if(sum(one_three) > 0){
+        dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
+        colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
+        filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
+        write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
+      }
+      if(sum(two_three) > 0){
+        dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
+        colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
+        filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
+        write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
+      }
+    } else { #scatterplot data
+      scatterplot_locus_data = scatterplot_data
+      scatterplot_locus_data$type = ifelse(scatterplot_locus_data$Clone_Sequence %in% patientMerge12[one_two,]$Clone_Sequence.x, "In two", "In one")
+      scatterplot_locus_data$type = ifelse(scatterplot_locus_data$Clone_Sequence %in% patientMerge13[one_three,]$Clone_Sequence.x, "In two", "In one")
+      scatterplot_locus_data$type = ifelse(scatterplot_locus_data$Clone_Sequence %in% patientMerge23[two_three,]$Clone_Sequence.x, "In two", "In one")
+      scatterplot_locus_data$type = ifelse(scatterplot_locus_data$type == "In two", ifelse(scatterplot_locus_data$Clone_Sequence %in% patientMerge[all,]$Clone_Sequence.x, "In three", "In two"), "In one")
+      scatterplot_locus_data$type = ifelse(scatterplot_locus_data$type == "In one", "In one", "In multiple")
+
+      p = NULL
+      if(nrow(scatterplot_locus_data) != 0){
+        if(on == "normalized_read_count"){
+          scales = 10^(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
+          p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
+        } else {
+          p = ggplot(scatterplot_locus_data, aes(type, Frequency))
+        }
+        p = p + geom_point(aes(colour=type), position="jitter")
+        p = p + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
+      } else {
+        p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
+      }
+      png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
+      print(p)
+      dev.off()
+    }
     if(sum(all) > 0){
-      dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "CDR3_Sense_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
-      colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"CDR3_Sense_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
+      dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
+      colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
       filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
       write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
     }
   }
-  patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
+  #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
+  patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
   colnames(patientResult)[6] = oneSample
-  colnames(patientResult)[8] = twoSample
-  colnames(patientResult)[10] = threeSample
+  colnames(patientResult)[7] = twoSample
+  colnames(patientResult)[8] = threeSample
+  colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
+  colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
+  colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")

   colnamesBak = colnames(patientResult)
-  colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample), paste("Normalized Read Count", twoSample), paste("Number of sequences", threeSample), paste("Normalized Read Count", threeSample))
+  colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
   write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
   colnames(patientResult) = colnamesBak