Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

--- a/ALL.xml	Tue May 19 08:13:49 2015 -0400
+++ b/ALL.xml	Fri May 22 09:06:04 2015 -0400
@@ -1,12 +1,16 @@
 <tool id="vct_clonal_sequences" name="Clonal Sequences in paired samples" version="1.0">
 	<description>Comparison of clonal sequences in paired samples</description>
 	<command interpreter="bash">
-		wrapper.sh $in_file $out_file $out_file.files_path $min_freq $min_cells
+		wrapper.sh $in_file $out_file $out_file.files_path $min_freq $min_cells $merge_on
 	</command>
 	<inputs>
 		<param name="in_file" format="tabular" type="data" label="Data to Process" />
 		<param name="min_freq" type="text" label="Minimum Frequency, between 0 and 100 in percentage" value='0'/>
 		<param name="min_cells" type="text" label="Minimum cell count" value='0'/>
+		<param name="merge_on" type="select" label="Merge On">
+				<option value="Clone_Sequence">Clone_Sequence</option>
+				<option value="V_J_CDR3">V+J+CDR3</option>
+		</param>
 	</inputs>
 	<outputs>
 		<data format="html" name="out_file" />
--- a/RScript.r	Tue May 19 08:13:49 2015 -0400
+++ b/RScript.r	Fri May 22 09:06:04 2015 -0400
@@ -5,6 +5,7 @@
 logfile = args[3]
 min_freq = as.numeric(args[4])
 min_cells = as.numeric(args[5])
+mergeOn = args[6]

 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)

@@ -51,6 +52,25 @@

 dat$paste = paste(dat$Sample, dat$Clone_Sequence)

+cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
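+#collapse rows that share Patient, Sample, V gene, J gene and CDR3 into one row: counts and frequencies are summed, Related_to_leukemia_clone is TRUE only if all rows are, the first Clone_Sequence is kept. NOTE(review): Log10_Frequency is summed too; if it holds log10(Frequency) the sum is not the log10 of the summed Frequency - confirm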
+dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
+                                        Cell_Count=unique(.SD$Cell_Count),
+                                        Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
+                                        Total_Read_Count=sum(.SD$Total_Read_Count),
+                                        dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
+                                        Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
+                                        Frequency=sum(.SD$Frequency),
+                                        locus_V=unique(.SD$locus_V),
+                                        locus_J=unique(.SD$locus_J),
+                                        min_cell_count=unique(.SD$min_cell_count),
+                                        normalized_read_count=sum(.SD$normalized_read_count),
+                                        Log10_Frequency=sum(.SD$Log10_Frequency),
+                                        Clone_Sequence=.SD$Clone_Sequence[1],
+                                        min_cell_paste=.SD$min_cell_paste[1],
+                                        paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
+
+
 patients = split(dat, dat$Patient, drop=T)
 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
@@ -60,6 +80,8 @@
 Titles = factor(Titles, levels=Titles)
 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))

+single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
+
 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
   if (!is.data.frame(x) & is.list(x)){
     x = x[[1]]
@@ -106,12 +128,20 @@
   }
   cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)

-  #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
-  #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
-  patient1$merge = paste(patient1$Clone_Sequence)
-  patient2$merge = paste(patient2$Clone_Sequence)
+  if(mergeOn == "Clone_Sequence"){ # join key: the clone sequence itself
+    patient1$merge = paste(patient1$Clone_Sequence)
+    patient2$merge = paste(patient2$Clone_Sequence)
+  } else { # any other mergeOn value: join key is "V gene J gene CDR3" pasted with spaces
+    patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
+    patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
+  }

-  #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
+  scatterplot_data_columns = c("Patient", "Sample", "Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene")
+  scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
+  scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$Clone_Sequence),]
+  scatterplot_data$type = factor(x="In one", levels=c("In one", "In Both"))
+  scatterplot_data$on = onShort
+
   patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
   patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
   res1 = vector()
@@ -122,14 +152,13 @@
   locussum1 = vector()
   locussum2 = vector()

-  print(patient)
   #for(iter in 1){
   for(iter in 1:length(product[,1])){
     threshhold = product[iter,threshholdIndex]
     V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
     J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
     #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
-    both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both higher than threshold
+    both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
     one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$Clone_Sequence %in% patientMerge[both,]$merge))
     two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$Clone_Sequence %in% patientMerge[both,]$merge))
     read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
@@ -153,13 +182,41 @@
         filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
+    } else {
+      scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
+      if(nrow(scatterplot_locus_data) > 0){
+        scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
+      }
+      in_two = (scatterplot_locus_data$Clone_Sequence %in% patientMerge[both,]$Clone_Sequence.x)
+      if(any(in_two)){
+        scatterplot_locus_data[in_two,]$type = "In Both"
+      }
+      if(type == "single"){
+        single_patients <<- rbind(single_patients, scatterplot_locus_data)
+      }
+      p = NULL
+      if(nrow(scatterplot_locus_data) != 0){
+        if(on == "normalized_read_count"){
+          scales = 10^(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
+          p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
+        } else {
+          p = ggplot(scatterplot_locus_data, aes(type, Frequency))
+        }
+        p = p + geom_point(aes(colour=type), position="jitter")
+        p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
+      } else {
+        p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
+      }
+      png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
+      print(p)
+      dev.off()
     }
     if(sum(both) > 0){
       dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
       colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
       filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
       write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
-    }
+    }
   }
   patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
   if(sum(is.na(patientResult$percentage)) > 0){
@@ -215,18 +272,33 @@
 interval = intervalFreq
 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
-mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
+lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)

 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)

 interval = intervalReads
 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
-mclapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
+lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")

 cat("</table></html>", file=logfile, append=T)

+scales = 10^(0:ceiling(log10(max(single_patients$normalized_read_count))))
+p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
+p = p + geom_point(aes(colour=type), position="jitter")
+p = p + xlab("In one or both samples") + ylab("Reads")
+p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
+png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)), height=1080)
+print(p)
+dev.off()

+p = ggplot(single_patients, aes(Rearrangement, Frequency))
+p = p + geom_point(aes(colour=type), position="jitter")
+p = p + xlab("In one or both samples") + ylab("Frequency")
+p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
+png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)), height=1080)
+print(p)
+dev.off()

 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
   onShort = "reads"
@@ -248,13 +320,16 @@
   twoSample = paste(patient2[1,sampleIndex], sep="")
   threeSample = paste(patient3[1,sampleIndex], sep="")

-  #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
-  #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
-  #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
-
-  patient1$merge = paste(patient1$Clone_Sequence)
-  patient2$merge = paste(patient2$Clone_Sequence)
-  patient3$merge = paste(patient3$Clone_Sequence)
+  if(mergeOn == "Clone_Sequence"){ # join key: the clone sequence itself
+    patient1$merge = paste(patient1$Clone_Sequence)
+    patient2$merge = paste(patient2$Clone_Sequence)
+    patient3$merge = paste(patient3$Clone_Sequence)
+
+  } else { # any other mergeOn value: join key is "V gene J gene CDR3" pasted with spaces
+    patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
+    patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
+    patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
+  }

   patientMerge = merge(patient1, patient2, by="merge")
   patientMerge = merge(patientMerge, patient3, by="merge")
@@ -465,6 +540,19 @@

 triplets = triplets[,!(colnames(triplets) %in% column_drops)]

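+#collapse rows that share Patient, Sample, V gene, J gene and CDR3 into one row: counts and frequencies are summed, Related_to_leukemia_clone is TRUE only if all rows are, the first Clone_Sequence is kept. NOTE(review): Log10_Frequency is summed too; if it holds log10(Frequency) the sum is not the log10 of the summed Frequency - confirm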
+triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
+                                                 Cell_Count=unique(.SD$Cell_Count),
+                                                 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
+                                                 Total_Read_Count=sum(.SD$Total_Read_Count),
+                                                 dsPerM=ifelse("dsPerM" %in% names(triplets), sum(.SD$dsPerM), 0), # dsPerM is optional: look for it in the table being aggregated
+                                                 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
+                                                 Frequency=sum(.SD$Frequency),
+                                                 normalized_read_count=sum(.SD$normalized_read_count),
+                                                 Log10_Frequency=sum(.SD$Log10_Frequency),
+                                                 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
+
+
 interval = intervalReads
 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
--- a/wrapper.sh	Tue May 19 08:13:49 2015 -0400
+++ b/wrapper.sh	Fri May 22 09:06:04 2015 -0400
@@ -5,12 +5,13 @@
 outputDir=$3
 min_freq=$4
 min_cells=$5
+merge_on="$6"

 dir="$(cd "$(dirname "$0")" && pwd)"
 mkdir $outputDir


-Rscript --verbose $dir/RScript.r $inputFile $outputDir $outputFile $min_freq $min_cells 2>&1
+Rscript --verbose $dir/RScript.r $inputFile $outputDir $outputFile $min_freq $min_cells "${merge_on}" 2>&1
 cp $dir/jquery-1.11.0.min.js $outputDir
 cp $dir/script.js $outputDir
 cp $dir/style.css $outputDir
@@ -51,6 +52,7 @@
 	echo "<table border = 1 class='result_table summary_table' id='summary_table_${patient}_freq'>" >> "$html"
 	echo "<thead><th>Ig/TCR gene rearrangement type</th><th>Proximal gene segment</th><th>Distal gene segment</th><th>Cut off value</th><th>Number of sequences ${patient}_Both</th><th>Number of sequences_$sample1</th><th>Read Count $sample1</th><th>Number of sequences_$sample2</th><th>Read Count $sample2</th><th>Sum number of sequences $patient</th><th>Percentage of sequences ${patient}_both</th></thead>" >> "$html"
 	echo "<tbody>" >> "$html"
+	scatterplot_tab="<div class='tabbertab' title='Scatter Plots Frequency'>" # collects the *_freq_*_scatter.png <img> tags appended in the loop below
 	while read locus j_segment v_segment cut_off_value both one read_count1 two read_count2 sum percent locusreadsum1 locusreadsum2
 	do
 		if [ "$locus" != "$oldLocus" ] ; then
@@ -82,7 +84,10 @@
 		echo "<td>$sum</td>" >> "$html"
 		echo "<td>${percent}&#37;</td>" >> "$html"
 		echo "</tr>" >> "$html"
-		oldLocus="$locus"
+		oldLocus="$locus"
+		if [ "${cut_off_value}" == "0" ] ; then
+			scatterplot_tab="${scatterplot_tab}<img src='${patient}_${sample1}_${sample2}_freq_${locus}_scatter.png' /><br />"
+		fi
 	done < tmp.txt
 	echo "</tbody></table>" >> "$html"
 	echo "</td><td style='vertical-align:top;'><div id='result_div_${patient}_freq'></div></td></tr></table>" >> "$html"
@@ -91,6 +96,7 @@
 	echo "<a href='${patient}_freq.png'><img src='${patient}_freq.png' width='1280' height='720' /></a><br />" >> "$html"
 	echo "<a href='${patient}_freq_both.png'><img src='${patient}_freq_both.png' width='1280' height='720' /></a><br />" >> "$html"
 	echo "<a href='${patient}_percent_freq.png'><img src='${patient}_percent_freq.png' width='1280' height='720' /></a></div>" >> "$html"
+	echo "${scatterplot_tab}</div>" >> "$html"

 	tail -n+2 ${patient}_reads.txt | sed "s/>//" > tmp.txt
 	echo "<div class='tabbertab' title='Data reads'>" >> "$html"
@@ -98,6 +104,7 @@
 	echo "<table border = 1 class='result_table summary_table' id='summary_table_${patient}_reads'>" >> "$html"
 	echo "<thead><th>Ig/TCR gene rearrangement type</th><th>Proximal gene segment</th><th>Distal gene segment</th><th>Cut off value</th><th>Number of sequences ${patient}_Both</th><th>Number of sequences_$sample1</th><th>Read Count $sample1</th><th>Number of sequences_$sample2</th><th>Read Count $sample2</th><th>Sum number of sequences $patient</th><th>Percentage of sequences ${patient}_both</th></thead>" >> "$html"
 	echo "<tbody>" >> "$html"
+	scatterplot_tab="<div class='tabbertab' title='Scatter Plots Reads'>" # collects the *_reads_*_scatter.png <img> tags appended in the loop below
 	while read locus j_segment v_segment cut_off_value both one read_count1 two read_count2 sum percent locusreadsum1 locusreadsum2
 	do
 		if [ "$locus" != "$oldLocus" ] ; then
@@ -130,6 +137,9 @@
 		echo "<td>${percent}&#37;</td>" >> "$html"
 		echo "</tr>" >> "$html"
 		oldLocus="$locus"
+		if [ "${cut_off_value}" == "0" ] ; then
+			scatterplot_tab="${scatterplot_tab}<img src='${patient}_${sample1}_${sample2}_reads_${locus}_scatter.png' /><br />"
+		fi
 	done < tmp.txt
 	echo "</tbody></table>" >> "$html"
 	echo "</td><td style='vertical-align:top;'><div id='result_div_${patient}_reads'></div></td></tr></table>" >> "$html"
@@ -138,6 +148,7 @@
 	echo "<a href='${patient}_reads.png'><img src='${patient}_reads.png' width='1280' height='720' /></a><br />" >> "$html"
 	echo "<a href='${patient}_reads_both.png'><img src='${patient}_reads_both.png' width='1280' height='720' /></a><br />" >> "$html"
 	echo "<a href='${patient}_percent_reads.png'><img src='${patient}_percent_reads.png' width='1280' height='720' /></a></div>" >> "$html"
+	echo "${scatterplot_tab}</div>" >> "$html"
 	echo "</div>" >> "$html"
 	echo "</div>" >> "$html"
 	echo "</html>" >> "$html"
@@ -146,7 +157,7 @@
 html="index.html"
 echo "<html>" > $html
 echo "<table>" >> "$html"
-echo "<tr><td><b>Singles:</b></td></tr>" >> "$html"
+echo "<tr><td><b>Singles (<a href='singles_freq_scatterplot.png'>Frequency scatterplot</a>, <a href='singles_reads_scatterplot.png'>Reads scatterplot</a>):</b></td></tr>" >> "$html"
 for patient in "${singles[@]}"
 do
 	echo "<tr><td><a href='${patient}.html'>$patient</a></td></tr>" >> "$html"