Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples
diff RScript.r @ 12:eb5b569b44dd draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 10 Dec 2014 09:49:16 -0500 |
parents | bc4612998d50 |
children | 576de7c96c4f |
line wrap: on
line diff
--- a/RScript.r Mon Oct 06 05:57:55 2014 -0400 +++ b/RScript.r Wed Dec 10 09:49:16 2014 -0500 @@ -17,35 +17,51 @@ cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F) dat = dat[!is.na(dat$Patient),] +dat = dat[!duplicated(dat$Clone_Sequence), ] setwd(outDir) cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) -str(dat) +dat$Frequency = ((10^dat$Log10_Frequency)*100) + cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T) #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) -most.common = function(x){ - ux = unique(x) +most.common = function(x, ret="V"){ + past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";") + ux = unique(past) if(length(ux) > 1){ - xtdf = data.frame(table(x)) - return(xtdf$Var1[which.max(xtdf$Freq)]) + xtdf = data.frame(table(past)) #print(xtdf) + res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";")) + #print(res) + if(ret == "V"){ + return(res[1]) + } else { + return(res[2]) + } } - return(unique(x)) + + if(ret == "V"){ + return(unique(x$V_Segment_Major_Gene)) + } else { + return(unique(x$J_Segment_Major_Gene)) + } } -dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene=most.common(.SD$V_Segment_Major_Gene), J_Segment_Major_Gene=most.common(.SD$J_Segment_Major_Gene), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) +dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")]) +dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")]) cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) -dat$Frequency = ((10^dat$Log10_Frequency)*100) + dat = dat[dat$Frequency >= min_freq,] -cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) -dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) +#cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T) +#dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2) +dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes dat = dat[dat$normalized_read_count >= min_cells,] dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence) triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] @@ -100,9 +116,12 @@ } cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T) - patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) - patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) + #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) + #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) + patient1$merge = paste(patient1$CDR3_Sense_Sequence) + patient2$merge = paste(patient2$CDR3_Sense_Sequence) + #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") res1 = vector() res2 = vector() @@ -233,9 +252,13 @@ twoSample = paste(patient2[1,sampleIndex], sep="") threeSample = paste(patient3[1,sampleIndex], sep="") - patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) - patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) - patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) + #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) + #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) + #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) + + patient1$merge = paste(patient1$CDR3_Sense_Sequence) + patient2$merge = paste(patient2$CDR3_Sense_Sequence) + patient3$merge = paste(patient3$CDR3_Sense_Sequence) patientMerge = merge(patient1, patient2, by="merge") patientMerge = merge(patientMerge, patient3, by="merge") @@ -271,19 +294,19 @@ if(threshhold != 0){ if(sum(one) > 0){ dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") + colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) } if(sum(two) > 0){ dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") + colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) } if(sum(three) > 0){ dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")] - colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone") + colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone") filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) }