Mercurial > repos > davidvanzessen > baseline_bayesian_estimation

--- a/baseline.xml	Wed Aug 13 07:32:38 2014 -0400
+++ b/baseline.xml	Wed Aug 13 09:07:54 2014 -0400
@@ -11,7 +11,7 @@
  ${input.id}
 		#end for
 		"
-    $reference $out_file
+    $reference $out_file "$selection"
 	</command>
 	<inputs>
 		<repeat name="inputs" title="inputs" min="1" default="1">
@@ -50,6 +50,14 @@
 			<option value="1:26:38:55:65:104:-">IMGT® No CDR3</option>
 			<option value="1:26:38:55:65:104:116">IMGT®</option>
 		</param>
+		<!-- Each option value is a comma-separated list of column names; filter.r pastes
+		     those columns together and keeps only the first row for every distinct key. -->
+		<param name="selection" type="select" label="Unique Selection Definition">
+			<option value="VGene,AA.JUNCTION">VGene, AA CDR3</option>
+			<option value="VGene,JGene,AA.JUNCTION">VGene, JGene, AA CDR3</option>
+			<option value="VGene,DGene,JGene,AA.JUNCTION">VGene, DGene, JGene, AA CDR3</option>
+			<option value="Sequence.ID">None</option>
+		</param>
 	</inputs>
 	<outputs>
 		<data format="pdf" name="out_file" label = "Baseline on ${on_string}"/>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.r	Wed Aug 13 09:07:54 2014 -0400
@@ -0,0 +1,51 @@
+# Drop rows that are duplicates with respect to a caller-selected set of columns.
+#
+# Usage: Rscript filter.r <summaryfile> <gappedfile> <selection> <output>
+#   summaryfile  tab-separated table; only Sequence.ID and AA.JUNCTION are used
+#   gappedfile   tab-separated table with Sequence.ID and the V/D/J "GENE.and.allele" columns
+#   selection    comma-separated column names; rows equal in all of them are duplicates
+#   output       path of the tab-separated table to write
+arg = commandArgs(TRUE)
+if (length(arg) < 4) {
+  stop("usage: filter.r <summaryfile> <gappedfile> <selection> <output>", call.=FALSE)
+}
+summaryfile = arg[1]
+gappedfile = arg[2]
+selection = arg[3]
+output = arg[4]
+print(paste("-----", selection, "------"))
+
+summarydat = read.table(summaryfile, header=TRUE, sep="\t", fill=TRUE, stringsAsFactors=FALSE)[,c("Sequence.ID", "AA.JUNCTION")]
+gappeddat = read.table(gappedfile, header=TRUE, sep="\t", fill=TRUE, stringsAsFactors=FALSE)
+head(summarydat)
+head(gappeddat)
+
+dat = merge(gappeddat, summarydat, by="Sequence.ID")
+# merge() moves the key column (Sequence.ID) to the front, which can change column
+# positions relative to the gapped file. wrapper.sh selects columns by position
+# (cut -f2,4,7), so restore the gapped file's order and append the merged-in columns.
+gappedcols = intersect(names(gappeddat), names(dat))
+dat = dat[, c(gappedcols, setdiff(names(dat), gappedcols)), drop=FALSE]
+head(dat)
+
+# Gene names: strip a leading "Homsap " and everything from the first "*" onwards.
+dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
+dat$VGene = gsub("[*].*", "", dat$VGene)
+
+dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele)
+dat$DGene = gsub("[*].*", "", dat$DGene)
+
+dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
+dat$JGene = gsub("[*].*", "", dat$JGene)
+
+# Build one ":"-joined key per row from the selected columns; fail early on unknown names.
+selectioncols = unlist(strsplit(selection, ","))
+missingcols = setdiff(selectioncols, names(dat))
+if (length(missingcols) > 0) {
+  stop(paste("unknown selection column(s):", paste(missingcols, collapse=", ")), call.=FALSE)
+}
+dat$past = do.call(paste, c(dat[selectioncols], sep = ":"))
+
+dat = dat[!duplicated(dat$past), ]
+
+write.table(x=dat, file=output, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
--- a/wrapper.sh	Wed Aug 13 07:32:38 2014 -0400
+++ b/wrapper.sh	Wed Aug 13 09:07:54 2014 -0400
@@ -14,6 +14,7 @@
 IDs=($IDs)
 ref=${10}
 output=${11}
+selection=${12}
 outID="result"

 echo "testID = $testID"
@@ -42,9 +43,16 @@
 		id=${IDs[$count]}
 		echo "id=$id"
 		unzip $current -d $PWD/$id/ >> $PWD/unziplog.log
-		filename="$PWD/gappednt_${id}.txt"
-		cat $PWD/$id/*/2_* | cut -f2,4,7 > $filename
-		python $dir/script_imgt.py --input $filename --ref $ref --output $fasta --id $id
+		summaryfile="$PWD/summary_${id}.txt"
+		gappedfile="$PWD/gappednt_${id}.txt"
+		filtered="$PWD/filtered_${id}.txt"
+		cat $PWD/$id/*/1_* > $summaryfile
+		cat $PWD/$id/*/2_* > $gappedfile
+		Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered
+
+		final="$PWD/final_${id}.txt"
+		cat $filtered | cut -f2,4,7 > $final
+		python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id
 	else
 		python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
 	fi