# HG changeset patch # User davidvanzessen # Date 1441974378 14400 # Node ID 623bbe972363ddafd7fd977f012d9cf4aa635dda # Parent d592dab2fca14a502ee559edbb9008501c3bb913 Uploaded diff -r d592dab2fca1 -r 623bbe972363 RScript.r --- a/RScript.r Mon Aug 31 09:28:38 2015 -0400 +++ b/RScript.r Fri Sep 11 08:26:18 2015 -0400 @@ -162,88 +162,100 @@ patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence) patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence) - merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge))) + merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] - while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){ - current.merge.1 = patient1.fuzzy[1,"merge"] - current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"] - current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,] - - #agrep/adist the two samples - agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 9, costs=list(insertions=1, deletions=1, substitutions=10)) + patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy) + patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),] + + while(nrow(patient.fuzzy) > 1){ + first.merge = patient.fuzzy[1,"merge"] + first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] + + merge.filter = first.merge == patient.fuzzy$merge + + match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) - - if(length(agrep.match) == 1){ - current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"] - patientMerge.new.row = data.frame(merge=current.clone.seq.1, - min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"], - Patient.x=patient1.fuzzy[1,"Patient"], - Receptor.x=patient1.fuzzy[1,"Receptor"], - Sample.x=patient1.fuzzy[1,"Sample"], - Cell_Count.x=patient1.fuzzy[1,"Cell_Count"], - Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"], - Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"], - Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"], - dsPerM.x=patient1.fuzzy[1,"dsPerM"], - J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"], - V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"], - Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"], - CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"], - Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"], - Frequency.x=patient1.fuzzy[1,"Frequency"], - locus_V.x=patient1.fuzzy[1,"locus_V"], - locus_J.x=patient1.fuzzy[1,"locus_J"], - min_cell_count.x=patient1.fuzzy[1,"min_cell_count"], - normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"], - paste.x=patient1.fuzzy[1,"paste"], - min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"], - Patient.y=patient2.fuzzy[agrep.match,"Patient"], - Receptor.y=patient2.fuzzy[agrep.match,"Receptor"], - Sample.y=patient2.fuzzy[agrep.match,"Sample"], - Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"], - Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"], - Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"], - Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"], - dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"], - J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"], - V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"], - Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"], - CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"], - Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"], - Frequency.y=patient2.fuzzy[agrep.match,"Frequency"], - locus_V.y=patient2.fuzzy[agrep.match,"locus_V"], - locus_J.y=patient2.fuzzy[agrep.match,"locus_J"], - min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"], - normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"], - paste.y=patient2.fuzzy[agrep.match,"paste"]) - #add to patientMerge - patientMerge = rbind(patientMerge, patientMerge.new.row) - #remove from patient*.fuzzy + if(sum(match.filter) == 2){ + second.match = which(match.filter)[2] + second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"] + first.sample = patient.fuzzy[1,"Sample"] + second.sample = patient.fuzzy[second.match,"Sample"] - - #remove the fuzzy merged clone sequences from the original datasets - patient1 = patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],] - patient2 = patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],] - - scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,] - - patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),] - - - } else if (length(agrep.match) > 1){ - #multiple matches, whatdo? - cat(paste("", "Multiple matches found for ", current.merge.1, ", ", current.clone.seq.1, " in ", patient, ", ", oneSample, "", sep=""), file=logfile, append=T) + if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){ + first.match.row = patient.fuzzy[1,] + second.match.row = patient.fuzzy[second.match,] + print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence)) + patientMerge.new.row = data.frame(merge=first.clone.sequence, + min_cell_paste.x=first.match.row[1,"min_cell_paste"], + Patient.x=first.match.row[1,"Patient"], + Receptor.x=first.match.row[1,"Receptor"], + Sample.x=first.match.row[1,"Sample"], + Cell_Count.x=first.match.row[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"], + Log10_Frequency.x=first.match.row[1,"Log10_Frequency"], + Total_Read_Count.x=first.match.row[1,"Total_Read_Count"], + dsPerM.x=first.match.row[1,"dsPerM"], + J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"], + Clone_Sequence.x=first.match.row[1,"Clone_Sequence"], + CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"], + Frequency.x=first.match.row[1,"Frequency"], + locus_V.x=first.match.row[1,"locus_V"], + locus_J.x=first.match.row[1,"locus_J"], + min_cell_count.x=first.match.row[1,"min_cell_count"], + normalized_read_count.x=first.match.row[1,"normalized_read_count"], + paste.x=first.match.row[1,"paste"], + min_cell_paste.y=second.match.row[1,"min_cell_paste"], + Patient.y=second.match.row[1,"Patient"], + Receptor.y=second.match.row[1,"Receptor"], + Sample.y=second.match.row[1,"Sample"], + Cell_Count.y=second.match.row[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"], + Log10_Frequency.y=second.match.row[1,"Log10_Frequency"], + Total_Read_Count.y=second.match.row[1,"Total_Read_Count"], + dsPerM.y=second.match.row[1,"dsPerM"], + J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"], + Clone_Sequence.y=second.match.row[1,"Clone_Sequence"], + CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"], + Frequency.y=second.match.row[1,"Frequency"], + locus_V.y=second.match.row[1,"locus_V"], + locus_J.y=second.match.row[1,"locus_J"], + min_cell_count.y=second.match.row[1,"min_cell_count"], + normalized_read_count.y=second.match.row[1,"normalized_read_count"], + paste.y=first.match.row[1,"paste"]) + + + patientMerge = rbind(patientMerge, patientMerge.new.row) + patient.fuzzy = patient.fuzzy[-match.filter,] + + patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),] + patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),] + + scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,] + + } else { + patient.fuzzy = patient.fuzzy[-1,] + } + + } else if (sum(match.filter) > 2){ + print(paste("Multiple matches found for", first.merge, "in", patient)) + } else { + patient.fuzzy = patient.fuzzy[-1,] } - patient1.fuzzy = patient1.fuzzy[-1,] + + } - #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1)) } + patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) res1 = vector() res2 = vector()