Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

--- a/RScript.r	Mon Aug 31 09:28:38 2015 -0400
+++ b/RScript.r	Fri Sep 11 08:26:18 2015 -0400
@@ -162,88 +162,100 @@
     patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
     patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)

-    merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
+    merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
     merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]

     patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
     patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]

-    while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
-      current.merge.1 = patient1.fuzzy[1,"merge"]
-      current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
-      current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
-
-      #agrep/adist the two samples
-      agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 9, costs=list(insertions=1, deletions=1, substitutions=10))
+    patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
+    patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
+
+    while(nrow(patient.fuzzy) > 1){
+      first.merge = patient.fuzzy[1,"merge"]
+      first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
+
+      merge.filter = first.merge == patient.fuzzy$merge
+
+      match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence)

-
-      if(length(agrep.match) == 1){
-				current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
-        patientMerge.new.row = data.frame(merge=current.clone.seq.1,
-                                          min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
-                                          Patient.x=patient1.fuzzy[1,"Patient"],
-                                          Receptor.x=patient1.fuzzy[1,"Receptor"],
-                                          Sample.x=patient1.fuzzy[1,"Sample"],
-                                          Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
-                                          Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
-                                          Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
-                                          Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
-                                          dsPerM.x=patient1.fuzzy[1,"dsPerM"],
-                                          J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
-                                          V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
-                                          Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
-                                          CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
-                                          Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
-                                          Frequency.x=patient1.fuzzy[1,"Frequency"],
-                                          locus_V.x=patient1.fuzzy[1,"locus_V"],
-                                          locus_J.x=patient1.fuzzy[1,"locus_J"],
-                                          min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
-                                          normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
-                                          paste.x=patient1.fuzzy[1,"paste"],
-                                          min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
-                                          Patient.y=patient2.fuzzy[agrep.match,"Patient"],
-                                          Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
-                                          Sample.y=patient2.fuzzy[agrep.match,"Sample"],
-                                          Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
-                                          Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
-                                          Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
-                                          Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
-                                          dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
-                                          J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
-                                          V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
-                                          Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
-                                          CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
-                                          Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
-                                          Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
-                                          locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
-                                          locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
-                                          min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
-                                          normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
-                                          paste.y=patient2.fuzzy[agrep.match,"paste"])
-				#add to patientMerge
-        patientMerge = rbind(patientMerge, patientMerge.new.row)
-        #remove from patient*.fuzzy
+      if(sum(match.filter) == 2){
+        second.match = which(match.filter)[2]
+        second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
+        first.sample = patient.fuzzy[1,"Sample"]
+        second.sample = patient.fuzzy[second.match,"Sample"]

-
-        #remove the fuzzy merged clone sequences from the original datasets
-        patient1 = patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
-        patient2 = patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
-
-        scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
-
-        patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
-
-
-      } else if (length(agrep.match) > 1){
-        #multiple matches, whatdo?
-        cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, ", ", current.clone.seq.1, " in ", patient, ", ", oneSample, "</td></tr>", sep=""), file=logfile, append=T)
+        if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
+          first.match.row = patient.fuzzy[1,]
+          second.match.row = patient.fuzzy[second.match,]
+          print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
+          patientMerge.new.row = data.frame(merge=first.clone.sequence,
+                                            min_cell_paste.x=first.match.row[1,"min_cell_paste"],
+                                            Patient.x=first.match.row[1,"Patient"],
+                                            Receptor.x=first.match.row[1,"Receptor"],
+                                            Sample.x=first.match.row[1,"Sample"],
+                                            Cell_Count.x=first.match.row[1,"Cell_Count"],
+                                            Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
+                                            Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
+                                            Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
+                                            dsPerM.x=first.match.row[1,"dsPerM"],
+                                            J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
+                                            V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
+                                            Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
+                                            CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
+                                            Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
+                                            Frequency.x=first.match.row[1,"Frequency"],
+                                            locus_V.x=first.match.row[1,"locus_V"],
+                                            locus_J.x=first.match.row[1,"locus_J"],
+                                            min_cell_count.x=first.match.row[1,"min_cell_count"],
+                                            normalized_read_count.x=first.match.row[1,"normalized_read_count"],
+                                            paste.x=first.match.row[1,"paste"],
+                                            min_cell_paste.y=second.match.row[1,"min_cell_paste"],
+                                            Patient.y=second.match.row[1,"Patient"],
+                                            Receptor.y=second.match.row[1,"Receptor"],
+                                            Sample.y=second.match.row[1,"Sample"],
+                                            Cell_Count.y=second.match.row[1,"Cell_Count"],
+                                            Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
+                                            Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
+                                            Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
+                                            dsPerM.y=second.match.row[1,"dsPerM"],
+                                            J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
+                                            V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
+                                            Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
+                                            CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
+                                            Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
+                                            Frequency.y=second.match.row[1,"Frequency"],
+                                            locus_V.y=second.match.row[1,"locus_V"],
+                                            locus_J.y=second.match.row[1,"locus_J"],
+                                            min_cell_count.y=second.match.row[1,"min_cell_count"],
+                                            normalized_read_count.y=second.match.row[1,"normalized_read_count"],
+                                            paste.y=first.match.row[1,"paste"])
+
+
+          patientMerge = rbind(patientMerge, patientMerge.new.row)
+          patient.fuzzy = patient.fuzzy[-match.filter,]
+
+          patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
+          patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
+
+          scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
+
+        } else {
+					patient.fuzzy = patient.fuzzy[-1,]
+        }
+
+      } else if (sum(match.filter) > 2){
+        print(paste("Multiple matches found for", first.merge, "in", patient))
+      } else {
+        patient.fuzzy = patient.fuzzy[-1,]
       }
-      patient1.fuzzy = patient1.fuzzy[-1,]
+
+
     }

-    #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
   }

+
   patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
   res1 = vector()
   res2 = vector()