# HG changeset patch # User davidvanzessen # Date 1421928733 18000 # Node ID 576de7c96c4f097a9f3d82e199de711239118dfd # Parent eb5b569b44dd76f88c1221e8802d80b830d9717d Uploaded diff -r eb5b569b44dd -r 576de7c96c4f RScript.r --- a/RScript.r Wed Dec 10 09:49:16 2014 -0500 +++ b/RScript.r Thu Jan 22 07:12:13 2015 -0500 @@ -15,56 +15,41 @@ library(parallel) #require(xtable) cat("Reading input", file=logfile, append=T) -dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) +dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) dat = dat[!is.na(dat$Patient),] -dat = dat[!duplicated(dat$Clone_Sequence), ] +dat$Related_to_leukemia_clone = F setwd(outDir) cat("Selecting first V/J Genes", file=logfile, append=T) dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) -dat$Frequency = ((10^dat$Log10_Frequency)*100) - -cat("Deduplication", file=logfile, append=T) -#dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) - -most.common = function(x, ret="V"){ - past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") - ux = unique(past) - if(length(ux) > 1){ - xtdf = data.frame(table(past)) - #print(xtdf) - res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) - #print(res) - if(ret == "V"){ - return(res[1]) - } else { - return(res[2]) - } - } - - if(ret == "V"){ - return(unique(x$V_Segment_Major_Gene)) - } else { - return(unique(x$J_Segment_Major_Gene)) - } -} - -dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) -dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) - cat("Calculating Frequency", file=logfile, append=T) +dat$Frequency = ((10^dat$Log10_Frequency)*100) dat = dat[dat$Frequency >= min_freq,] -#cat("Normalizing cell count to 1.000.000", file=logfile, append=T) -#dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) -dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes +triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] + +cat("Normalizing to lowest cell count within locus", file=logfile, append=T) + +dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4) +dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4) +min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")]) + +dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) +min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) + +min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] + +dat = merge(dat, min_cell_count, by="min_cell_paste") + +dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 + dat = dat[dat$normalized_read_count >= min_cells,] -dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) -triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] + +dat$paste = paste(dat$Sample, dat$Clone_Sequence) patients = split(dat, dat$Patient, drop=T) intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) @@ -118,8 +103,8 @@ #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) - patient1$merge = paste(patient1$CDR3_Sense_Sequence) - patient2$merge = paste(patient2$CDR3_Sense_Sequence) + patient1$merge = paste(patient1$Clone_Sequence) + patient2$merge = paste(patient2$Clone_Sequence) #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") @@ -235,6 +220,7 @@ cat("", file=logfile, append=T) + tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ onShort = "reads" if(on == "Frequency"){ @@ -262,7 +248,7 @@ patientMerge = merge(patient1, patient2, by="merge") patientMerge = merge(patientMerge, patient3, by="merge") - colnames(patientMerge)[28:length(colnames(patientMerge))] = paste(colnames(patientMerge)[28:length(colnames(patientMerge))], ".z", sep="") + colnames(patientMerge)[30:length(colnames(patientMerge))] = paste(colnames(patientMerge)[30:length(colnames(patientMerge))], ".z", sep="") res1 = vector() res2 = vector() res3 = vector() @@ -294,19 +280,19 @@ if(threshhold != 0){ if(sum(one) > 0){ dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") + colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) } if(sum(two) > 0){ dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") + colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) } if(sum(three) > 0){ dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") + colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) } @@ -360,7 +346,6 @@ dev.off() } - triplets$uniqueID = "ID" triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" @@ -373,10 +358,24 @@ triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" -triplets = data.frame(data.table(triplets)[, list(Patient=unique(.SD$uniqueID), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) +triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4) +triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4) +min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")]) + +triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J) +min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J) + +min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] -triplets$Frequency = (10^as.numeric(triplets$Log10_Frequency))*100 -triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * 1000000 / 2) +triplets = merge(triplets, min_cell_count, by="min_cell_paste") + +triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 + +triplets = triplets[triplets$normalized_read_count >= min_cells,] + +column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste") + +triplets = triplets[,!(colnames(triplets) %in% column_drops)] interval = intervalReads intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))