Mercurial > repos > davidvanzessen > baseline_fasta_generator

--- a/baseline_generator.xml	Wed Jul 23 10:13:05 2014 -0400
+++ b/baseline_generator.xml	Thu Aug 07 10:19:53 2014 -0400
@@ -1,14 +1,27 @@
 <tool id="baseline_fasta_generator" name="Baseline generator" version="1.0">
 	<description>Generate baseline fasta file</description>
 	<command interpreter="bash">
-		wrapper.sh $in_file $reference $out_file
+		wrapper.sh "
+		#for $i, $input in enumerate($inputs)
+ ${input.in_file}
+		#end for
+		"
+		"
+		#for $i, $input in enumerate($inputs)
+ ${input.id}
+		#end for
+		"
+    $reference $out_file
 	</command>
 	<inputs>
-		<param name="in_file" type="data" label="Input excel or IMGT zip file" />
+		<repeat name="inputs" title="inputs" min="1" default="1">
+			<param name="in_file" type="data" label="Input excel or IMGT zip file" />
+			<param name="id" type="text" label="ID (alpha-numeric, no spaces)" />
+		</repeat>
 		<param name="reference" type="data" format="fasta" label="Reference fasta file" />
 	</inputs>
 	<outputs>
-		<data format="fasta" name="out_file" label = "Baseline generator on $in_file.name with $reference.name"/>
+		<data format="fasta" name="out_file" label = "Baseline generator on ${on_string} with $reference.name"/>
 	</outputs>
 	<help>
 			Gur Yaari; Mohamed Uduman; Steven H. Kleinstein. Quantifying selection in high-throughput Immunoglobulin sequencing data sets. Nucleic Acids Res. 2012 May 27.
--- a/script_imgt.py	Wed Jul 23 10:13:05 2014 -0400
+++ b/script_imgt.py	Thu Aug 07 10:19:53 2014 -0400
@@ -6,6 +6,7 @@
 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
 parser.add_argument("--ref", help="Reference file")
 parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")

 args = parser.parse_args()

@@ -48,8 +49,8 @@
 currentSeq = ""
 currentId = ""
 with open(args.input, 'r') as i:
-	with open(args.output, 'w') as o:
-		o.write(">>>IMGT\n")
+	with open(args.output, 'a') as o:
+		o.write(">>>" + args.id + "\n")
 		outputdic = dict()
 		for line in i.readlines()[1:]:
 			linesplt = line.split("\t")
--- a/script_xlsx.py	Wed Jul 23 10:13:05 2014 -0400
+++ b/script_xlsx.py	Thu Aug 07 10:19:53 2014 -0400
@@ -31,7 +31,7 @@
 currentSeq = ""
 currentId = ""
 with xlrd.open_workbook(args.input, 'r') as wb:
-	with open(args.output, 'w') as o:
+	with open(args.output, 'a') as o:
 		for sheet in wb.sheets():
 			if sheet.cell(1,gene_column).value.find("IGHV") < 0:
 				print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
--- a/wrapper.sh	Wed Jul 23 10:13:05 2014 -0400
+++ b/wrapper.sh	Thu Aug 07 10:19:53 2014 -0400
@@ -2,17 +2,31 @@
 dir="$(cd "$(dirname "$0")" && pwd)"

 input=$1
-ref=$2
-output=$3
-f=$(file $input)
-zipType="Zip archive"
-if [[ "$f" == *"$zipType"* ]]
-then
-	echo "Zip archive, assuming IMGT output file"
-	echo "Trying: unzip $input -d $PWD/files/"
-	unzip $input -d $PWD/files/ > $PWD/unziplog.log
-	cat $PWD/files/*/2_* | cut -f2,4,7 > $PWD/gappednt.txt
-	python $dir/script_imgt.py --input $PWD/gappednt.txt --ref $ref --output $output
-else
-	python $dir/script_xlsx.py --input $input --ref $ref --output $output
-fi
+ids=$2
+ref=$3
+output=$4
+
+input=($input)
+
+ids=($ids)
+
+first="${input[0]}"
+
+count=0
+for current in ${input[@]}
+do
+	f=$(file $current)
+	zipType="Zip archive"
+	if [[ "$f" == *"$zipType"* ]]
+	then
+		id=${ids[$count]}
+		unzip $current -d $PWD/$id/ >> $PWD/unziplog.log
+		filename="$PWD/gappednt_${id}.txt"
+		cat $PWD/$id/*/2_* | cut -f2,4,7 > $filename
+		python $dir/script_imgt.py --input $filename --ref $ref --output $output --id $id
+	else
+		python $dir/script_xlsx.py --input $current --ref $ref --output $output
+	fi
+	count=$((count+1))
+done
+