# HG changeset patch
# User davidvanzessen
# Date 1421928733 18000
# Node ID 576de7c96c4f097a9f3d82e199de711239118dfd
# Parent eb5b569b44dd76f88c1221e8802d80b830d9717d
Uploaded
diff -r eb5b569b44dd -r 576de7c96c4f RScript.r
--- a/RScript.r Wed Dec 10 09:49:16 2014 -0500
+++ b/RScript.r Thu Jan 22 07:12:13 2015 -0500
@@ -15,56 +15,41 @@
library(parallel)
#require(xtable)
cat("
Reading input |
", file=logfile, append=T)
-dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F)
+dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
dat = dat[!is.na(dat$Patient),]
-dat = dat[!duplicated(dat$Clone_Sequence), ]
+dat$Related_to_leukemia_clone = F
setwd(outDir)
cat("Selecting first V/J Genes |
", file=logfile, append=T)
dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
-dat$Frequency = ((10^dat$Log10_Frequency)*100)
-
-cat("Deduplication |
", file=logfile, append=T)
-#dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
-
-most.common = function(x, ret="V"){
- past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";")
- ux = unique(past)
- if(length(ux) > 1){
- xtdf = data.frame(table(past))
- #print(xtdf)
- res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";"))
- #print(res)
- if(ret == "V"){
- return(res[1])
- } else {
- return(res[2])
- }
- }
-
- if(ret == "V"){
- return(unique(x$V_Segment_Major_Gene))
- } else {
- return(unique(x$J_Segment_Major_Gene))
- }
-}
-
-dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")])
-dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])
-
cat("Calculating Frequency |
", file=logfile, append=T)
+dat$Frequency = ((10^dat$Log10_Frequency)*100)
dat = dat[dat$Frequency >= min_freq,]
-#cat("Normalizing cell count to 1.000.000 |
", file=logfile, append=T)
-#dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
-dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes
+triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
+
+cat("Normalizing to lowest cell count within locus |
", file=logfile, append=T)
+
+dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
+dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
+min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
+
+dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
+min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
+
+min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
+
+dat = merge(dat, min_cell_count, by="min_cell_paste")
+
+dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
+
dat = dat[dat$normalized_read_count >= min_cells,]
-dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
-triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
+
+dat$paste = paste(dat$Sample, dat$Clone_Sequence)
patients = split(dat, dat$Patient, drop=T)
intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
@@ -118,8 +103,8 @@
#patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
#patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
- patient1$merge = paste(patient1$CDR3_Sense_Sequence)
- patient2$merge = paste(patient2$CDR3_Sense_Sequence)
+ patient1$merge = paste(patient1$Clone_Sequence)
+ patient2$merge = paste(patient2$Clone_Sequence)
#patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
@@ -235,6 +220,7 @@
cat("