Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ALL.xml	Wed Dec 10 09:49:16 2014 -0500
@@ -0,0 +1,42 @@
+<tool id="vct_clonal_sequences" name="Clonal Sequences in paired samples" version="1.0">
+	<description>Comparison of clonal sequences in paired samples</description>
+	<command interpreter="bash">
+		wrapper.sh $in_file $out_file $out_file.files_path $min_freq $min_cells
+	</command>
+	<inputs>
+		<param name="in_file" format="tabular" type="data" label="Data to Process" />
+		<param name="min_freq" type="text" label="Minimum Frequency, between 0 and 100 in percentage" value='0'/>
+		<param name="min_cells" type="text" label="Minimum cell count" value='0'/>
+	</inputs>
+	<outputs>
+		<data format="html" name="out_file" />
+	</outputs>
+	<help>
+Takes a tabular file as input, it needs to following columns:
+
++----------------------------------+----------------------------------------------+
+| **Column name**                  | **Column contents**                          |
++----------------------------------+----------------------------------------------+
+| Patient                          | The patient ID                               |
++----------------------------------+----------------------------------------------+
+| Sample                           | The Sample ID, one, two or three per Patient |
++----------------------------------+----------------------------------------------+
+| Cell_Count                       | The cell count within a sample/loci          |
++----------------------------------+----------------------------------------------+
+| Clone_Molocule_Count_From_Spikes | The count of a clone                         |
++----------------------------------+----------------------------------------------+
+| Log10_Frequency                  | The frequency of a clone in log10            |
++----------------------------------+----------------------------------------------+
+| J_Segment_Major_Gene             | The J Gene of this clone                     |
++----------------------------------+----------------------------------------------+
+| V_Segment_Major_Gene             | The V Gene of this clone                     |
++----------------------------------+----------------------------------------------+
+| Clone_Sequence                   | The entire sequence                          |
++----------------------------------+----------------------------------------------+
+| CDR3_Sense_Sequence              | The CDR3 sequence region.                    |
++----------------------------------+----------------------------------------------+
+
+
+And generate a detailed HTML report on the sequences found in indiviual samples and in both samples.
+	</help>
+</tool>
--- a/CLL.xml	Mon Oct 06 05:57:55 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-<tool id="vct_clonal_sequences" name="Clonal Sequences in paired samples" version="1.0">
-	<description>Comparison of clonal sequences in paired samples</description>
-	<command interpreter="bash">
-		wrapper.sh $in_file $out_file $out_file.files_path $min_freq $min_cells
-	</command>
-	<inputs>
-		<param name="in_file" format="tabular" type="data" label="Data to Process" />
-		<param name="min_freq" type="text" label="Minimum Frequency, between 0 and 100 in percentage" value='0'/>
-		<param name="min_cells" type="text" label="Minimum cell count" value='0'/>
-	</inputs>
-	<outputs>
-		<data format="html" name="out_file" />
-	</outputs>
-	<help>
-		stuff and words
-	</help>
-</tool>
--- a/RScript.r	Mon Oct 06 05:57:55 2014 -0400
+++ b/RScript.r	Wed Dec 10 09:49:16 2014 -0500
@@ -17,35 +17,51 @@
 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
 dat = read.table(inFile, header=T, sep="\t", dec=",", fill=T, stringsAsFactors=F)
 dat = dat[!is.na(dat$Patient),]
+dat = dat[!duplicated(dat$Clone_Sequence), ]

 setwd(outDir)
 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))

-str(dat)
+dat$Frequency = ((10^dat$Log10_Frequency)*100)
+
 cat("<tr><td>Deduplication</td></tr>", file=logfile, append=T)
 #dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])

-most.common = function(x){
-  ux = unique(x)
+most.common = function(x, ret="V"){
+  past = paste(x$V_Segment_Major_Gene, x$J_Segment_Major_Gene, sep=";")
+  ux = unique(past)
   if(length(ux) > 1){
-    xtdf = data.frame(table(x))
-    return(xtdf$Var1[which.max(xtdf$Freq)])
+    xtdf = data.frame(table(past))
     #print(xtdf)
+    res = unlist(strsplit(as.character(xtdf$past[which.max(xtdf$Freq)]), ";"))
+    #print(res)
+    if(ret == "V"){
+      return(res[1])
+    } else {
+      return(res[2])
+    }
   }
-  return(unique(x))
+
+  if(ret == "V"){
+    return(unique(x$V_Segment_Major_Gene))
+  } else {
+    return(unique(x$J_Segment_Major_Gene))
+  }
 }

-dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene=most.common(.SD$V_Segment_Major_Gene), J_Segment_Major_Gene=most.common(.SD$J_Segment_Major_Gene), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")])
+dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), V_Segment_Major_Gene= as.character(most.common(.SD, ret="V")), J_Segment_Major_Gene= as.character(most.common(.SD, ret="J")), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "CDR3_Sense_Sequence")])
+dat = data.frame(data.table(dat)[, list(Patient=unique(.SD$Patient), Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes), Log10_Frequency=sum(.SD$Log10_Frequency), Frequency=sum(.SD$Frequency), Total_Read_Count=sum(.SD$Total_Read_Count), Related_to_leukemia_clone=any(.SD$Related_to_leukemia_clone)), by=c("Sample", "Cell_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence")])

 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
-dat$Frequency = ((10^dat$Log10_Frequency)*100)
+

 dat = dat[dat$Frequency >= min_freq,]

-cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T)
-dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
+#cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T)
+#dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
+dat$normalized_read_count = dat$Clone_Molecule_Count_From_Spikes
 dat = dat[dat$normalized_read_count >= min_cells,]
 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
@@ -100,9 +116,12 @@
   }
   cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)

-  patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
-  patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
+  #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
+  #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
+  patient1$merge = paste(patient1$CDR3_Sense_Sequence)
+  patient2$merge = paste(patient2$CDR3_Sense_Sequence)

+  #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
   patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
   res1 = vector()
   res2 = vector()
@@ -233,9 +252,13 @@
   twoSample = paste(patient2[1,sampleIndex], sep="")
   threeSample = paste(patient3[1,sampleIndex], sep="")

-  patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
-  patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
-  patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
+  #patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
+  #patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
+  #patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
+
+  patient1$merge = paste(patient1$CDR3_Sense_Sequence)
+  patient2$merge = paste(patient2$CDR3_Sense_Sequence)
+  patient3$merge = paste(patient3$CDR3_Sense_Sequence)

   patientMerge = merge(patient1, patient2, by="merge")
   patientMerge = merge(patientMerge, patient3, by="merge")
@@ -271,19 +294,19 @@
     if(threshhold != 0){
       if(sum(one) > 0){
         dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
         filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
       if(sum(two) > 0){
         dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
         filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
       if(sum(three) > 0){
         dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "CDR3_Sense_Sequence", "Related_to_leukemia_clone")]
-        colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Sequence", "Related_to_leukemia_clone")
+        colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "CDR3 Sequence", "Related_to_leukemia_clone")
         filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
         write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
       }
--- a/script.js	Mon Oct 06 05:57:55 2014 -0400
+++ b/script.js	Wed Dec 10 09:49:16 2014 -0500
@@ -7,7 +7,11 @@
 	var tr = document.createElement('tr');
 	tr.className = "evenrowcolor";
 	var cells = lines[0].split("\t");
+	var cdr3column = 0;
 	for(var a = 0;a < cells.length;++a){
+		if(cells[a] == "CDR3 Sequence" || cells[a] == "CDR3_Sense_Sequence"){
+			cdr3column = a;
+		}
 		var td = document.createElement('td');
 		td.appendChild(document.createTextNode(cells[a]));
 		tr.appendChild(td);
@@ -16,7 +20,6 @@
 	tbl.appendChild(thead);
 	var tbdy = document.createElement('tbody');

-
 	for(var a = 1;a < lines.length;++a){
 		tr = document.createElement('tr');
 		var cells = lines[a].split("\t");
@@ -26,6 +29,9 @@
 		for(var b = 0;b < cells.length;++b){
 			td = document.createElement('td');
 			td.appendChild(document.createTextNode(cells[b]));
+			if(b == cdr3column){
+				td.setAttribute('style', 'text-align:right');
+			}
 			tr.appendChild(td)
 		}
 		if(a % 2 == 0){
@@ -40,6 +46,7 @@
 }

 function loadfile(file, patient, type){
+	patient = patient.replace(".", "\\.");
 	$('#hidden_div').load(file, function(){
 		$('#result_div_' + patient + '_' + type).html(tsvToTable($('#hidden_div').html()));
 		$('#result_div_' + patient + '_' + type + ' tr').hover(function() {