# HG changeset patch
# User davidvanzessen
# Date 1440580795 14400
# Node ID 37d9074ef2c63c9d1b5a28f9a54dd762cb781c7c
# Parent 642b4593f0a416969d4ddf2ec30091612528ea34
Uploaded
diff -r 642b4593f0a4 -r 37d9074ef2c6 RScript.r
--- a/RScript.r Fri Jul 24 05:33:02 2015 -0400
+++ b/RScript.r Wed Aug 26 05:19:55 2015 -0400
@@ -17,6 +17,8 @@
#require(xtable)
cat("
Reading input |
", file=logfile, append=T)
dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
+dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
+dat$dsPerM = 0
dat = dat[!is.na(dat$Patient),]
dat$Related_to_leukemia_clone = F
@@ -43,33 +45,34 @@
min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
-
+print(paste("rows:", nrow(dat)))
dat = merge(dat, min_cell_count, by="min_cell_paste")
-
+print(paste("rows:", nrow(dat)))
dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
dat = dat[dat$normalized_read_count >= min_cells,]
dat$paste = paste(dat$Sample, dat$Clone_Sequence)
-cat("Adding duplicate V+J+CDR3 sequences |
", file=logfile, append=T)
#remove duplicate V+J+CDR3, add together numerical values
-dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
- Cell_Count=unique(.SD$Cell_Count),
- Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
- Total_Read_Count=sum(.SD$Total_Read_Count),
- dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
- Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
- Frequency=sum(.SD$Frequency),
- locus_V=unique(.SD$locus_V),
- locus_J=unique(.SD$locus_J),
- min_cell_count=unique(.SD$min_cell_count),
- normalized_read_count=sum(.SD$normalized_read_count),
- Log10_Frequency=sum(.SD$Log10_Frequency),
- Clone_Sequence=.SD$Clone_Sequence[1],
- min_cell_paste=.SD$min_cell_paste[1],
- paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
-
+if(mergeOn != "Clone_Sequence"){
+ cat("Adding duplicate V+J+CDR3 sequences |
", file=logfile, append=T)
+ dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
+ Cell_Count=unique(.SD$Cell_Count),
+ Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
+ Total_Read_Count=sum(.SD$Total_Read_Count),
+ dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
+ Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
+ Frequency=sum(.SD$Frequency),
+ locus_V=unique(.SD$locus_V),
+ locus_J=unique(.SD$locus_J),
+ min_cell_count=unique(.SD$min_cell_count),
+ normalized_read_count=sum(.SD$normalized_read_count),
+ Log10_Frequency=sum(.SD$Log10_Frequency),
+ Clone_Sequence=.SD$Clone_Sequence[1],
+ min_cell_paste=.SD$min_cell_paste[1],
+ paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
+}
patients = split(dat, dat$Patient, drop=T)
intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
@@ -86,7 +89,8 @@
if (!is.data.frame(x) & is.list(x)){
x = x[[1]]
}
- x$Sample = factor(x$Sample, levels=unique(x$Sample))
+ #x$Sample = factor(x$Sample, levels=unique(x$Sample))
+ x = data.frame(x,stringsAsFactors=F)
onShort = "reads"
if(on == "Frequency"){
onShort = "freq"
@@ -143,6 +147,100 @@
scatterplot_data$on = onShort
patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
+
+
+             #Fuzzy matching: approximately match remaining unmerged Clone_Sequences (via agrep) between the two samples, keyed on shared V+J+CDR3
+ if(mergeOn == "Clone_Sequence"){
+ merge.list = patientMerge$merge
+
+ patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
+ patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
+
+ patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
+ patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
+
+ merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
+ merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
+
+ patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
+ patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
+
+ while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
+ current.merge.1 = patient1.fuzzy[1,"merge"]
+ current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
+ current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
+
+ #agrep/adist the two samples
+ agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 1, costs=list(insertions=0.1, deletions=0.1, substitutions=1))
+
+
+ if(length(agrep.match) == 1){
+ current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
+ patientMerge.new.row = data.frame(merge=current.clone.seq.1,
+ min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
+ Patient.x=patient1.fuzzy[1,"Patient"],
+ Receptor.x=patient1.fuzzy[1,"Receptor"],
+ Sample.x=patient1.fuzzy[1,"Sample"],
+ Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
+ Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
+ Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
+ Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
+ dsPerM.x=patient1.fuzzy[1,"dsPerM"],
+ J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
+ V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
+ Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
+ CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
+ Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
+ Frequency.x=patient1.fuzzy[1,"Frequency"],
+ locus_V.x=patient1.fuzzy[1,"locus_V"],
+ locus_J.x=patient1.fuzzy[1,"locus_J"],
+ min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
+ normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
+ paste.x=patient1.fuzzy[1,"paste"],
+ min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
+ Patient.y=patient2.fuzzy[agrep.match,"Patient"],
+ Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
+ Sample.y=patient2.fuzzy[agrep.match,"Sample"],
+ Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
+ Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
+ Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
+ Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
+ dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
+ J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
+ V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
+ Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
+ CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
+ Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
+ Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
+ locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
+ locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
+ min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
+ normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
+ paste.y=patient2.fuzzy[agrep.match,"paste"])
+ #add to patientMerge
+ patientMerge = rbind(patientMerge, patientMerge.new.row)
+ #remove from patient*.fuzzy
+
+
+ #remove the fuzzy merged clone sequences from the original datasets
+ patient1 <<- patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
+ patient2 <<- patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
+
+ scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
+
+ patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
+
+
+ } else if (length(agrep.match) > 1){
+					#Multiple approximate matches found: ambiguous, so log a warning and skip without merging
+ cat(paste("", "Multiple matches found for ", current.merge.1, " in ", patient, " |
", sep=""), file=logfile, append=T)
+ }
+ patient1.fuzzy = patient1.fuzzy[-1,]
+ }
+
+ #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
+ }
+
patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
res1 = vector()
res2 = vector()
@@ -187,15 +285,16 @@
if(nrow(scatterplot_locus_data) > 0){
scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
}
+ in_one = (scatterplot_locus_data$merge %in% patient1$merge)
+ in_two = (scatterplot_locus_data$merge %in% patient2$merge)
+ not_in_one = !in_one
+ if(any(in_two)){
+ scatterplot_locus_data[not_in_one,]$type = twoSample
+ }
in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
if(any(in_both)){
scatterplot_locus_data[in_both,]$type = "In Both"
}
- in_one = (scatterplot_locus_data$merge %in% patient1$merge)
- not_in_one = !in_one
- if(any(not_in_one)){
- scatterplot_locus_data[not_in_one,]$type = twoSample
- }
if(type == "single"){
single_patients <<- rbind(single_patients, scatterplot_locus_data)
}
@@ -203,7 +302,7 @@
if(nrow(scatterplot_locus_data) != 0){
if(on == "normalized_read_count"){
scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
- p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
+ p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
} else {
p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
}