Mercurial > repos > davidvanzessen > baseline_fasta_generator
changeset 5:048f8cdbb6d2 draft default tip
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 07 Aug 2014 10:19:53 -0400 |
parents | aa4b95abef11 |
children | |
files | baseline_generator.xml script_imgt.py script_xlsx.py wrapper.sh |
diffstat | 4 files changed, 48 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
--- a/baseline_generator.xml Wed Jul 23 10:13:05 2014 -0400
+++ b/baseline_generator.xml Thu Aug 07 10:19:53 2014 -0400
@@ -1,14 +1,27 @@
 <tool id="baseline_fasta_generator" name="Baseline generator" version="1.0">
     <description>Generate baseline fasta file</description>
     <command interpreter="bash">
-        wrapper.sh $in_file $reference $out_file
+        wrapper.sh "
+        #for $i, $input in enumerate($inputs)
+            ${input.in_file}
+        #end for
+        "
+        "
+        #for $i, $input in enumerate($inputs)
+            ${input.id}
+        #end for
+        "
+        $reference $out_file
     </command>
     <inputs>
-        <param name="in_file" type="data" label="Input excel or IMGT zip file" />
+        <repeat name="inputs" title="inputs" min="1" default="1">
+            <param name="in_file" type="data" label="Input excel or IMGT zip file" />
+            <param name="id" type="text" label="ID (alpha-numeric, no spaces)" />
+        </repeat>
         <param name="reference" type="data" format="fasta" label="Reference fasta file" />
     </inputs>
     <outputs>
-        <data format="fasta" name="out_file" label = "Baseline generator on $in_file.name with $reference.name"/>
+        <data format="fasta" name="out_file" label = "Baseline generator on ${on_string} with $reference.name"/>
     </outputs>
     <help>
         Gur Yaari; Mohamed Uduman; Steven H. Kleinstein. Quantifying selection in high-throughput Immunoglobulin sequencing data sets. Nucleic Acids Res. 2012 May 27.
--- a/script_imgt.py Wed Jul 23 10:13:05 2014 -0400
+++ b/script_imgt.py Thu Aug 07 10:19:53 2014 -0400
@@ -6,6 +6,7 @@
 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
 parser.add_argument("--ref", help="Reference file")
 parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
 
 args = parser.parse_args()
 
@@ -48,8 +49,8 @@
 currentSeq = ""
 currentId = ""
 with open(args.input, 'r') as i:
-    with open(args.output, 'w') as o:
-        o.write(">>>IMGT\n")
+    with open(args.output, 'a') as o:
+        o.write(">>>" + args.id + "\n")
         outputdic = dict()
         for line in i.readlines()[1:]:
             linesplt = line.split("\t")
--- a/script_xlsx.py Wed Jul 23 10:13:05 2014 -0400
+++ b/script_xlsx.py Thu Aug 07 10:19:53 2014 -0400
@@ -31,7 +31,7 @@
 currentSeq = ""
 currentId = ""
 with xlrd.open_workbook(args.input, 'r') as wb:
-    with open(args.output, 'w') as o:
+    with open(args.output, 'a') as o:
         for sheet in wb.sheets():
             if sheet.cell(1,gene_column).value.find("IGHV") < 0:
                 print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
--- a/wrapper.sh Wed Jul 23 10:13:05 2014 -0400
+++ b/wrapper.sh Thu Aug 07 10:19:53 2014 -0400
@@ -2,17 +2,31 @@
 
 dir="$(cd "$(dirname "$0")" && pwd)"
 input=$1
-ref=$2
-output=$3
-f=$(file $input)
-zipType="Zip archive"
-if [[ "$f" == *"$zipType"* ]]
-then
-    echo "Zip archive, assuming IMGT output file"
-    echo "Trying: unzip $input -d $PWD/files/"
-    unzip $input -d $PWD/files/ > $PWD/unziplog.log
-    cat $PWD/files/*/2_* | cut -f2,4,7 > $PWD/gappednt.txt
-    python $dir/script_imgt.py --input $PWD/gappednt.txt --ref $ref --output $output
-else
-    python $dir/script_xlsx.py --input $input --ref $ref --output $output
-fi
+ids=$2
+ref=$3
+output=$4
+
+input=($input)
+
+ids=($ids)
+
+first="${input[0]}"
+
+count=0
+for current in ${input[@]}
+do
+    f=$(file $current)
+    zipType="Zip archive"
+    if [[ "$f" == *"$zipType"* ]]
+    then
+        id=${ids[$count]}
+        unzip $current -d $PWD/$id/ >> $PWD/unziplog.log
+        filename="$PWD/gappednt_${id}.txt"
+        cat $PWD/$id/*/2_* | cut -f2,4,7 > $filename
+        python $dir/script_imgt.py --input $filename --ref $ref --output $output --id $id
+    else
+        python $dir/script_xlsx.py --input $current --ref $ref --output $output
+    fi
+    count=$((count+1))
+done
+