Mercurial > repos > davidvanzessen > clonal_sequences_in_paired_samples

--- a/CLL.xml	Mon Aug 25 03:38:07 2014 -0400
+++ b/CLL.xml	Tue Aug 26 09:53:22 2014 -0400
@@ -1,10 +1,12 @@
 <tool id="vct_clonal_sequences" name="Clonal Sequences in paired samples" version="1.0">
 	<description>Comparison of clonal sequences in paired samples</description>
 	<command interpreter="bash">
-		wrapper.sh $in_file $out_file $out_file.files_path
+		wrapper.sh $in_file $out_file $out_file.files_path $min_freq $min_cells
 	</command>
 	<inputs>
-	<param name="in_file" format="tabular" type="data" label="Data to Process" />
+		<param name="in_file" format="tabular" type="data" label="Data to Process" />
+		<param name="min_freq" type="text" label="Minimum Frequency, between 0 and 100 in percentage" value='0'/>
+		<param name="min_cells" type="text" label="Minimum cell count" value='0'/>
 	</inputs>
 	<outputs>
 		<data format="html" name="out_file" />
--- a/RScript.r	Mon Aug 25 03:38:07 2014 -0400
+++ b/RScript.r	Tue Aug 26 09:53:22 2014 -0400
@@ -2,22 +2,35 @@

 inFile = args[1]
 outDir = args[2]
+logfile = args[3]
+min_freq = as.numeric(args[4])
+min_cells = as.numeric(args[5])
+
+cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)

 require(ggplot2)
 require(reshape2)
 require(data.table)
 require(grid)
 #require(xtable)
+cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
 dat = read.csv(inFile, sep="\t")
 #dat = data.frame(fread(inFile)) #faster but with a dep
 setwd(outDir)
+cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))

+cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
 dat$Frequency = ((10^dat$Log10_Frequency)*100)

+dat = dat[dat$Frequency >= min_freq,]
+
+cat("<tr><td>Normalizing cell count to 1.000.000</td></tr>", file=logfile, append=T)
 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * 1000000 / 2)
+dat = dat[dat$normalized_read_count >= min_cells,]
 dat$paste = paste(dat$Sample, dat$V_Segment_Major_Gene, dat$J_Segment_Major_Gene, dat$CDR3_Sense_Sequence)
+cat("<tr><td>Removing duplicates</td></tr>", file=logfile, append=T)
 dat = dat[!duplicated(dat$paste),]
 patients = split(dat, dat$Patient, drop=T)
 intervalReads = rev(c(0,2,10,100,1000,10000))
@@ -36,8 +49,8 @@
   }
   splt = split(x, x$Sample, drop=T)
   if(length(splt) == 1){
-    print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample, skipping"))
-    return()
+    print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
+    splt[[2]] = data.frame("Patient" = 'NA', "Receptor" = 'NA', "Sample" = 'NA', "Cell_Count" = 100, "Clone_Molecule_Count_From_Spikes" = 10, "Log10_Frequency" = 1, "Total_Read_Count" = 100, "dsMol_per_1e6_cells" = 100, "J_Segment_Major_Gene" = 'NA', "V_Segment_Major_Gene" = 'NA', "Clone_Sequence" = 'NA', "CDR3_Sense_Sequence" = 'NA', "Related_to_leukemia_clone" = FALSE, "Frequency"= 0, "normalized_read_count" = 0, "paste" = 'a')
   }
   patient1 = splt[[1]]
   patient2 = splt[[2]]
@@ -51,10 +64,7 @@
   oneSample = paste(patient1[1,sampleIndex], sep="")
   twoSample = paste(patient2[1,sampleIndex], sep="")
   patient = paste(x[1,patientIndex])
-
-  #print(c(length(grep(".*_Right$", twoSample)) == 1, "Right"))
-  #print(c(length(grep(".*_Dx_BM$", twoSample)) == 1, "Dx_BM"))
-  #print(c(length(grep(".*_Dx$", twoSample)) == 1, "Dx"))
+
   switched = F
   if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
     tmp = twoSample
@@ -68,6 +78,7 @@
   if(appendtxt){
     cat(paste(patient, oneSample, twoSample, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
   }
+  cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
   patientMerge = merge(patient1, patient2, by="Clone_Sequence")
   res1 = vector()
   res2 = vector()
@@ -164,6 +175,8 @@
   dev.off()
 }

+cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
+
 interval = intervalFreq
 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6))
@@ -174,6 +187,8 @@
 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)

+cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
+
 interval = intervalReads
 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
 product = data.frame("Titles"=rep(Titles, each=6), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=6), "J_Segments"=rep(J_Segments, each=6))
@@ -183,3 +198,5 @@
 #lapply(patients[c(6)], FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")
 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Clone_Molecule_Count_From_Spikes")

+cat("</table></html>", file=logfile, append=T)
+
--- a/wrapper.sh	Mon Aug 25 03:38:07 2014 -0400
+++ b/wrapper.sh	Tue Aug 26 09:53:22 2014 -0400
@@ -3,11 +3,15 @@
 inputFile=$1
 outputFile=$2
 outputDir=$3
+min_freq=$4
+min_cells=$5

 dir="$(cd "$(dirname "$0")" && pwd)"
 mkdir $outputDir

-Rscript --verbose $dir/RScript.r $inputFile $outputDir 2>&1
+echo "testtestsetset" > $outputFile
+
+Rscript --verbose $dir/RScript.r $inputFile $outputDir $outputFile $min_freq $min_cells 2>&1
 cp $dir/jquery-1.11.0.min.js $outputDir
 cp $dir/script.js $outputDir
 cp $dir/style.css $outputDir