Mercurial > repos > bgruening > augustus
changeset 1:3c5116448979 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 06 Jun 2013 12:20:31 -0400 |
parents | 633039f94425 |
children | b8ccbad1b062 |
files | augustus.xml extract_features.py |
diffstat | 2 files changed, 125 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/augustus.xml Thu Jun 06 09:52:32 2013 -0400 +++ b/augustus.xml Thu Jun 06 12:20:31 2013 -0400 @@ -3,26 +3,35 @@ <requirements> <requirement type="package" version="2.7">augustus</requirement> </requirements> - <command>augustus - --strand=$strand - $noInFrameStop - $gff - $protein - $introns - $start - $stop - $cds - $codingseq - $singlestrand - $input_genome - $mea - $utr - --genemodel=$genemodel - --species=$organism - --outfile=$output + <command> + ## please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config + ## or use the --AUGUSTUS_CONFIG_PATH=path if you are not installing through the toolshed + ## Augustus writes the protein and coding sequences as comment into the gff/gtf file an external script is used to extract the sequences into additional files - #please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config - #or use the --AUGUSTUS_CONFIG_PATH=path switch + augustus + --strand=$strand + $noInFrameStop + $gff + $protein + $introns + $start + $stop + $cds + $codingseq + $singlestrand + $input_genome + $mea + $utr + --genemodel=$genemodel + --species=$organism + ##--outfile=$output + | tee $output + #if $protein or $codingseq: + | python extract_features.py + #if $protein: + --protein $protein_output + #if $codingseq: + --codingseq $codingseq_output </command> <inputs> @@ -109,12 +118,12 @@ <option value="bacterium">bacterium (beta version)</option> </param> - <param name="protein" type="boolean" label="Output predicted protein sequences (--protein)" truevalue="--protein=on" falsevalue="--protein=off" checked="false" /> + <param name="protein" type="boolean" label="Output predicted protein sequences (--protein)" truevalue="--protein=on" falsevalue="--protein=off" checked="true" /> + <param name="codingseq" type="boolean" label="Output coding sequence as comment in the output file (codingseq)" truevalue="--codingseq=on" falsevalue="--codingseq=off" checked="true" /> <param name="introns" type="boolean" label="Output predicted intron sequences (--introns)" truevalue="--introns=on" falsevalue="--introns=off" checked="false" /> <param name="start" type="boolean" label="Output predicted start codons (--start)" truevalue="--start=on" falsevalue="--start=off" checked="false" /> <param name="stop" type="boolean" label="Output predicted stop codons (--stop)" truevalue="--stop=on" falsevalue="--stop=off" checked="false" /> <param name="cds" type="boolean" label="Output CDS region (--cds)" truevalue="--cds=on" falsevalue="--cds=off" checked="true" /> - <param name="codingseq" type="boolean" label="Output coding sequence as comment in the output file (codingseq)" truevalue="--codingseq=on" falsevalue="--codingseq=off" checked="false" /> <param name="gff" type="boolean" label="GFF formated output, standard is GTF (--gff3)" truevalue="--gff3=on" falsevalue="--gff3=off" checked="false" /> </inputs> @@ -124,6 +133,12 @@ <when input="gff" value="--gff3=on" format="gff" /> </change_format> </data> + <data format="fasta" name="protein_output"> + <filter>protein == "--protein=on"</filter> + </data> + <data format="fasta" name="codingseq_output"> + <filter>codingseq == "--codingseq=on"</filter> + </data> </outputs> <tests> <test>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_features.py Thu Jun 06 12:20:31 2013 -0400 @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +import os +import sys +import argparse +import textwrap + +def main( args ): + """ + Extract the protein and coding section from an augustus gff, gtf file + Example file: +HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 +HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 +# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL +# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD +# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG +# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH +# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE +# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV +# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] +# end gene g1 +### +# +# ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- +# +# Predicted genes for sequence number 2 on both strands +# start gene g2 +HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 +HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 +HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 +HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 +HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 + """ + protein_seq = '' + coding_seq = '' + if args.protein: + po = open( args.protein, 'w+' ) + if args.codingseq: + co = open( args.codingseq, 'w+' ) + + for line in sys.stdin: + # protein- and coding-sequence are stored as comments + if line.startswith('#'): + line = line[2:].strip() + if line.startswith('start gene'): + gene_name = line[11:].strip() + + if args.protein and line.startswith('protein sequence = ['): + if line.endswith(']'): + line = line[20:-1] + protein_seq = line + else: + line = line[20:] + protein_seq = line + + if args.codingseq and line.startswith('coding sequence = ['): + if line.endswith(']'): + coding_seq = line[19:-1] + else: + coding_seq = line[19:] + + if protein_seq: + if line.endswith(']'): + protein_seq += line[:-1] + po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) + protein_seq = '' + else: + protein_seq += line + + if coding_seq: + if line.endswith(']'): + coding_seq += line[:-1] + co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) + coding_seq = '' + else: + coding_seq += line + if args.codingseq: + co.close() + if args.protein: + po.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--protein', help='Path to the protein file.') + parser.add_argument('-c', '--codingseq', help='Path to the coding file.') + + args = parser.parse_args() + main( args ) +