# HG changeset patch # User davidvanzessen # Date 1440580795 14400 # Node ID 37d9074ef2c63c9d1b5a28f9a54dd762cb781c7c # Parent 642b4593f0a416969d4ddf2ec30091612528ea34 Uploaded diff -r 642b4593f0a4 -r 37d9074ef2c6 RScript.r --- a/RScript.r Fri Jul 24 05:33:02 2015 -0400 +++ b/RScript.r Wed Aug 26 05:19:55 2015 -0400 @@ -17,6 +17,8 @@ #require(xtable) cat("Reading input", file=logfile, append=T) dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) +dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")] +dat$dsPerM = 0 dat = dat[!is.na(dat$Patient),] dat$Related_to_leukemia_clone = F @@ -43,33 +45,34 @@ min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] - +print(paste("rows:", nrow(dat))) dat = merge(dat, min_cell_count, by="min_cell_paste") - +print(paste("rows:", nrow(dat))) dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 dat = dat[dat$normalized_read_count >= min_cells,] dat$paste = paste(dat$Sample, dat$Clone_Sequence) -cat("Adding duplicate V+J+CDR3 sequences", file=logfile, append=T) #remove duplicate V+J+CDR3, add together numerical values -dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), - Cell_Count=unique(.SD$Cell_Count), - Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), - Total_Read_Count=sum(.SD$Total_Read_Count), - dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), - Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), - Frequency=sum(.SD$Frequency), - locus_V=unique(.SD$locus_V), - locus_J=unique(.SD$locus_J), - min_cell_count=unique(.SD$min_cell_count), - normalized_read_count=sum(.SD$normalized_read_count), - Log10_Frequency=sum(.SD$Log10_Frequency), - Clone_Sequence=.SD$Clone_Sequence[1], - min_cell_paste=.SD$min_cell_paste[1], - paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) - +if(mergeOn != "Clone_Sequence"){ + cat("Adding duplicate V+J+CDR3 sequences", file=logfile, append=T) + dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor), + Cell_Count=unique(.SD$Cell_Count), + Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), + Total_Read_Count=sum(.SD$Total_Read_Count), + dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0), + Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone), + Frequency=sum(.SD$Frequency), + locus_V=unique(.SD$locus_V), + locus_J=unique(.SD$locus_J), + min_cell_count=unique(.SD$min_cell_count), + normalized_read_count=sum(.SD$normalized_read_count), + Log10_Frequency=sum(.SD$Log10_Frequency), + Clone_Sequence=.SD$Clone_Sequence[1], + min_cell_paste=.SD$min_cell_paste[1], + paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")]) +} patients = split(dat, dat$Patient, drop=T) intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) @@ -86,7 +89,8 @@ if (!is.data.frame(x) & is.list(x)){ x = x[[1]] } - x$Sample = factor(x$Sample, levels=unique(x$Sample)) + #x$Sample = factor(x$Sample, levels=unique(x$Sample)) + x = data.frame(x,stringsAsFactors=F) onShort = "reads" if(on == "Frequency"){ onShort = "freq" @@ -143,6 +147,100 @@ scatterplot_data$on = onShort patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") + + + #fuzzy matching here... + if(mergeOn == "Clone_Sequence"){ + merge.list = patientMerge$merge + + patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] + patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] + + patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) + patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) + + merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge))) + merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] + + patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] + patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] + + while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){ + current.merge.1 = patient1.fuzzy[1,"merge"] + current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"] + current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,] + + #agrep/adist the two samples + agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1)) + + + if(length(agrep.match) == 1){ + current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"] + patientMerge.new.row = data.frame(merge=current.clone.seq.1, + min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"], + Patient.x=patient1.fuzzy[1,"Patient"], + Receptor.x=patient1.fuzzy[1,"Receptor"], + Sample.x=patient1.fuzzy[1,"Sample"], + Cell_Count.x=patient1.fuzzy[1,"Cell_Count"], + Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"], + Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"], + Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"], + dsPerM.x=patient1.fuzzy[1,"dsPerM"], + J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"], + V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"], + Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"], + CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"], + Frequency.x=patient1.fuzzy[1,"Frequency"], + locus_V.x=patient1.fuzzy[1,"locus_V"], + locus_J.x=patient1.fuzzy[1,"locus_J"], + min_cell_count.x=patient1.fuzzy[1,"min_cell_count"], + normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"], + paste.x=patient1.fuzzy[1,"paste"], + min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"], + Patient.y=patient2.fuzzy[agrep.match,"Patient"], + Receptor.y=patient2.fuzzy[agrep.match,"Receptor"], + Sample.y=patient2.fuzzy[agrep.match,"Sample"], + Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"], + Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"], + Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"], + Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"], + dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"], + J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"], + V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"], + Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"], + CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"], + Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"], + Frequency.y=patient2.fuzzy[agrep.match,"Frequency"], + locus_V.y=patient2.fuzzy[agrep.match,"locus_V"], + locus_J.y=patient2.fuzzy[agrep.match,"locus_J"], + min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"], + normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"], + paste.y=patient2.fuzzy[agrep.match,"paste"]) + #add to patientMerge + patientMerge = rbind(patientMerge, patientMerge.new.row) + #remove from patient*.fuzzy + + + #remove the fuzzy merged clone sequences from the original datasets + patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],] + patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],] + + scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,] + + patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),] + + + } else if (length(agrep.match) > 1){ + #multiple matches, whatdo? + cat(paste("", "Multiple matches found for ", current.merge.1, " in ", patient, "", sep=""), file=logfile, append=T) + } + patient1.fuzzy = patient1.fuzzy[-1,] + } + + #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1)) + } + patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) res1 = vector() res2 = vector() @@ -187,15 +285,16 @@ if(nrow(scatterplot_locus_data) > 0){ scatterplot_locus_data$Rearrangement = product[iter, titleIndex] } + in_one = (scatterplot_locus_data$merge %in% patient1$merge) + in_two = (scatterplot_locus_data$merge %in% patient2$merge) + not_in_one = !in_one + if(any(in_two)){ + scatterplot_locus_data[not_in_one,]$type = twoSample + } in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge) if(any(in_both)){ scatterplot_locus_data[in_both,]$type = "In Both" } - in_one = (scatterplot_locus_data$merge %in% patient1$merge) - not_in_one = !in_one - if(any(not_in_one)){ - scatterplot_locus_data[not_in_one,]$type = twoSample - } if(type == "single"){ single_patients <<- rbind(single_patients, scatterplot_locus_data) } @@ -203,7 +302,7 @@ if(nrow(scatterplot_locus_data) != 0){ if(on == "normalized_read_count"){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) - p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) + p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) } else { p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) }