Mercurial > repos > bgruening > augustus
changeset 3:702d9e042295 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 06 Jun 2013 13:26:37 -0400 |
parents | b8ccbad1b062 |
children | c243e17fb224 |
files | augustus.xml extract_features.py test-data/human_augustus_protein_codingseq_introns_cds_codingseq.fasta test-data/human_augustus_protein_codingseq_introns_cds_main.gtf test-data/human_augustus_protein_codingseq_introns_cds_protein.fasta |
diffstat | 5 files changed, 170 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/augustus.xml Thu Jun 06 12:51:27 2013 -0400 +++ b/augustus.xml Thu Jun 06 13:26:37 2013 -0400 @@ -2,6 +2,7 @@ <description>gene prediction for eukaryotic genomes</description> <requirements> <requirement type="package" version="2.7">augustus</requirement> + <requirement type="set_environment">SCRIPT_PATH</requirement> </requirements> <command> ## please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config @@ -27,7 +28,7 @@ ##--outfile=$output | tee $output #if $protein or $codingseq: - | python extract_features.py + | python \$SCRIPT_PATH/extract_features.py #if $protein: --protein $protein_output #end if @@ -136,10 +137,10 @@ </change_format> </data> <data format="fasta" name="protein_output"> - <filter>protein == "--protein=on"</filter> + <filter>protein == True</filter> </data> <data format="fasta" name="codingseq_output"> - <filter>codingseq == "--codingseq=on"</filter> + <filter>codingseq == True</filter> </data> </outputs> <tests> @@ -163,6 +164,17 @@ <param name="mea" value="--mea=1" /> <output name="output" file="arabidopsis_augustus_utr-off_singlestrand-on_mea-on" ftype="gtf" /> </test> + <test> + <param name="input_genome" value="HS04636_augustus.fa" ftype="fasta" /> + <param name="organism" value="human" /> + <param name="protein" value="--protein=on" /> + <param name="codingseq" value="--codingseq=on" /> + <param name="introns" value="--introns=on" /> + <param name="cds" value="--cds=on" /> + <output name="output" file="human_augustus_protein_codingseq_introns_cds_main.gtf" ftype="gff" /> + <output name="codingseq_output" file="human_augustus_protein_codingseq_introns_cds_codingseq.fasta" ftype="fasta" /> + <output name="protein_output" file="human_augustus_protein_codingseq_introns_cds_protein.fasta" ftype="fasta" /> + </test> </tests> <help>
--- a/extract_features.py Thu Jun 06 12:51:27 2013 -0400 +++ b/extract_features.py Thu Jun 06 13:26:37 2013 -0400 @@ -55,9 +55,11 @@ if args.codingseq and line.startswith('coding sequence = ['): if line.endswith(']'): - coding_seq = line[19:-1] + line = line[19:-1] + coding_seq = line else: - coding_seq = line[19:] + line = line[19:] + coding_seq = line if protein_seq: if line.endswith(']'):
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/human_augustus_protein_codingseq_introns_cds_codingseq.fasta Thu Jun 06 13:26:37 2013 -0400 @@ -0,0 +1,35 @@ +>g1 +atgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtca +tgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtcaa +aaccgaggtgtatgtatgagtgtgggatttgaccagtataagtgcgattgtacccggacaggattctatggagaaaactg +ctcaacaccggaatttttgacaagaataaaattatttctgaaacccactccaaacacagtgcactacatacttacccact +tcaagggattttggaacgttgtgaataacattcccttccttcgaaatgcaattatgagttatgtcttgacatccagatca +catttgattgacagtccaccaacttacaatgctgactatggctacaaaagctgggaagccttctctaacctctcctatta +tactagagcccttcctcctgtgcctgatgattgcccgactcccttgggtgtcaaaggtaaaaagcagcttcctgattcaa +atgagattgtggaaaaattgcttctaagaagaaagttcatccctgatccccagggctcaaacatgatgtttgcattcttt +gcccagcacttcacgcatcagtttttcaagacagatcataagcgagggccagctttcaccaacgggctgggccatggggt +ggacttaaatcatatttacggtgaaactctggctagacagcgtaaactgcgccttttcaaggatggaaaaatgaaatatc +agataattgatggagagatgtatcctcccacagtcaaagatactcaggcagagatgatctaccctcctcaagtccctgag +catctacggtttgctgtggggcaggaggtctttggtctggtgcctggtctgatgatgtatgccacaatctggctgcggga +acacaacagagtatgcgatgtgcttaaacaggagcatcctgaatggggtgatgagcagttgttccagacaagcaggctaa +tactgataggagagactattaagattgtgattgaagattatgtgcaacacttgagtggctatcacttcaaactgaaattt +gacccagaactacttttcaacaaacaattccagtaccaaaatcgtattgctgctgaatttaacaccctctatcactggca +tccccttctgcctgacacctttcaaattcatgaccagaaatacaactatcaacagtttatctacaacaactctatattgc +tggaacatggaattacccagtttgttgaatcattcaccaggcaaattgctggcagggttgctggtggtaggaatgttcca +cccgcagtacagaaagtatcacaggcttccattgaccagagcaggcagatgaaataccagtcttttaatgagtaccgcaa +acgctttatgctgaagccctatgaatcatttgaagaacttacaggagaaaaggaaatgtctgcagagttggaagcactct +atggtgacatcgatgctgtggagctgtatcctgcccttctggtagaaaagcctcggccagatgccatctttggtgaaacc +atggtagaagttggagcaccattctccttgaaaggacttatgggtaatgttatatgttctcctgcctactggaagccaag +cacttttggtggagaagtgggttttcaaatcatcaacactgcctcaattcagtctctcatctgcaataacgtgaagggct +gtccctttacttcattcagtgttccagatccagagctcattaaaacagtcaccatcaatgcaagttcttcccgctccgga +ctagatgatatcaatcccacagtactactaaaagaacgttcgactgaactgtag +>g2 +atgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccaca +tgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccacgc +cggcccgcatcccccatcagcaccatccagcccaaggccaattttgatgcgcagcaggagcagggccaccgggccgaggc +caccacactgcatgtggctccccagggcacagccatggctgtcagtaccttccgaaagctggatgggatctgctggcagg +tgcgccagctctatggagacacaggggtcctcggccgcttcctgcttcaagcccgaggcgcccgaggggctgtgcacgtg +gttgtcgctgagaccgactaccagagtttcgctgtcctgtacctggagcgggcggggcagctgtcagtgaagctctacgc +ccgctcgctccctgtgagcgactcggtcctgagtgggtttgagcagcgggtccaggaggcccacctgactgaggaccaga +tcttctacttccccaagtacggcttctgcgaggctgcagaccagttccacgtcctggacggtgagtgcacagcgggggca +agcatggcggcgtggtga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/human_augustus_protein_codingseq_introns_cds_main.gtf Thu Jun 06 13:26:37 2013 -0400 @@ -0,0 +1,101 @@ +# This output was generated with AUGUSTUS (version 2.7). +# AUGUSTUS is a gene prediction tool for eukaryotes written by Mario Stanke (mario.stanke@uni-greifswald.de) +# and Oliver Keller (keller@cs.uni-goettingen.de). +# Please cite: Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008), +# Using native and syntenically mapped cDNA alignments to improve de novo gene finding +# Bioinformatics 24: 637-644, doi 10.1093/bioinformatics/btn013 +# No extrinsic information on sequences given. +# Initialising the parameters ... +# human version. Using default transition matrix. +# Looks like /home/bag/projects/galaxy/galaxy-central/database/files/001/dataset_1149.dat is in fasta format. +# We have hints for 0 sequences and for 0 of the sequences in the input set. +# +# ----- prediction on sequence number 1 (length = 9453, name = HS04636) ----- +# +# Constraints/Hints: +# (none) +# Predicted genes for sequence number 1 on both strands +# start gene g1 +HS04636 AUGUSTUS gene 966 6903 1 + . g1 +HS04636 AUGUSTUS transcript 966 6903 . + . g1.t1 +HS04636 AUGUSTUS intron 1018 1817 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 1935 2054 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 2199 2851 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 2996 3425 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 3608 4339 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 4424 4542 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 4790 5071 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 5359 5859 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS intron 6008 6493 . + . transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 966 1017 . + 0 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 1818 1934 . + 2 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 2055 2198 . + 2 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 2852 2995 . + 2 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 3426 3607 . + 2 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 4340 4423 . + 0 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 4543 4789 . + 0 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 5072 5358 . + 2 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 5860 6007 . + 0 transcript_id "g1.t1"; gene_id "g1"; +HS04636 AUGUSTUS CDS 6494 6903 . + 2 transcript_id "g1.t1"; gene_id "g1"; +# coding sequence = [atgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtc +# aaaaccgaggtgtatgtatgagtgtgggatttgaccagtataagtgcgattgtacccggacaggattctatggagaaaactgctcaacaccggaattt +# ttgacaagaataaaattatttctgaaacccactccaaacacagtgcactacatacttacccacttcaagggattttggaacgttgtgaataacattcc +# cttccttcgaaatgcaattatgagttatgtcttgacatccagatcacatttgattgacagtccaccaacttacaatgctgactatggctacaaaagct +# gggaagccttctctaacctctcctattatactagagcccttcctcctgtgcctgatgattgcccgactcccttgggtgtcaaaggtaaaaagcagctt +# cctgattcaaatgagattgtggaaaaattgcttctaagaagaaagttcatccctgatccccagggctcaaacatgatgtttgcattctttgcccagca +# cttcacgcatcagtttttcaagacagatcataagcgagggccagctttcaccaacgggctgggccatggggtggacttaaatcatatttacggtgaaa +# ctctggctagacagcgtaaactgcgccttttcaaggatggaaaaatgaaatatcagataattgatggagagatgtatcctcccacagtcaaagatact +# caggcagagatgatctaccctcctcaagtccctgagcatctacggtttgctgtggggcaggaggtctttggtctggtgcctggtctgatgatgtatgc +# cacaatctggctgcgggaacacaacagagtatgcgatgtgcttaaacaggagcatcctgaatggggtgatgagcagttgttccagacaagcaggctaa +# tactgataggagagactattaagattgtgattgaagattatgtgcaacacttgagtggctatcacttcaaactgaaatttgacccagaactacttttc +# aacaaacaattccagtaccaaaatcgtattgctgctgaatttaacaccctctatcactggcatccccttctgcctgacacctttcaaattcatgacca +# gaaatacaactatcaacagtttatctacaacaactctatattgctggaacatggaattacccagtttgttgaatcattcaccaggcaaattgctggca +# gggttgctggtggtaggaatgttccacccgcagtacagaaagtatcacaggcttccattgaccagagcaggcagatgaaataccagtcttttaatgag +# taccgcaaacgctttatgctgaagccctatgaatcatttgaagaacttacaggagaaaaggaaatgtctgcagagttggaagcactctatggtgacat +# cgatgctgtggagctgtatcctgcccttctggtagaaaagcctcggccagatgccatctttggtgaaaccatggtagaagttggagcaccattctcct +# tgaaaggacttatgggtaatgttatatgttctcctgcctactggaagccaagcacttttggtggagaagtgggttttcaaatcatcaacactgcctca +# attcagtctctcatctgcaataacgtgaagggctgtccctttacttcattcagtgttccagatccagagctcattaaaacagtcaccatcaatgcaag +# ttcttcccgctccggactagatgatatcaatcccacagtactactaaaagaacgttcgactgaactgtag] +# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL +# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD +# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG +# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH +# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE +# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV +# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] +# end gene g1 +### +# +# ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- +# +# Constraints/Hints: +# (none) +# Predicted genes for sequence number 2 on both strands +# start gene g2 +HS08198 AUGUSTUS gene 445 1848 1 + . g2 +HS08198 AUGUSTUS transcript 445 1848 . + . g2.t1 +HS08198 AUGUSTUS intron 583 811 . + . transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS intron 895 1052 . + . transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS intron 1124 1207 . + . transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS intron 1316 1586 . + . transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS intron 1689 1771 . + . transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 445 582 . + 0 transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 812 894 . + 0 transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 1053 1123 . + 1 transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 1208 1315 . + 2 transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 1587 1688 . + 2 transcript_id "g2.t1"; gene_id "g2"; +HS08198 AUGUSTUS CDS 1772 1848 . + 2 transcript_id "g2.t1"; gene_id "g2"; +# coding sequence = [atgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccac +# gccggcccgcatcccccatcagcaccatccagcccaaggccaattttgatgcgcagcaggagcagggccaccgggccgaggccaccacactgcatgtg +# gctccccagggcacagccatggctgtcagtaccttccgaaagctggatgggatctgctggcaggtgcgccagctctatggagacacaggggtcctcgg +# ccgcttcctgcttcaagcccgaggcgcccgaggggctgtgcacgtggttgtcgctgagaccgactaccagagtttcgctgtcctgtacctggagcggg +# cggggcagctgtcagtgaagctctacgcccgctcgctccctgtgagcgactcggtcctgagtgggtttgagcagcgggtccaggaggcccacctgact +# gaggaccagatcttctacttccccaagtacggcttctgcgaggctgcagaccagttccacgtcctggacggtgagtgcacagcgggggcaagcatggc +# ggcgtggtga] +# protein sequence = [MLPPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGIC +# WQVRQLYGDTGVLGRFLLQARGARGAVHVVVAETDYQSFAVLYLERAGQLSVKLYARSLPVSDSVLSGFEQRVQEAHLTEDQIFYFPKYGFCEAADQF +# HVLDGECTAGASMAAW] +# end gene g2 +### +# command line: +# augustus --strand=both --noInFrameStop=false --gff3=off --protein=on --introns=on --start=off --stop=off --cds=on --codingseq=on --singlestrand=false /home/bag/projects/galaxy/galaxy-central/database/files/001/dataset_1149.dat --UTR=off --genemodel=complete --species=human
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/human_augustus_protein_codingseq_introns_cds_protein.fasta Thu Jun 06 13:26:37 2013 -0400 @@ -0,0 +1,15 @@ +>g1 +MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYILML +ARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYILTHFK +GFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNE +IVEKLLLRRKFIPDPQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQI +IDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVGQEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLIL +IGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYHWHPLLPDTFQIHDQKYNYQQFIYNNSILLE +HGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGEKEMSAELEALYG +DIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCP +FTSFSVPDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL +>g2 +MLPPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGICML +PPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGICWQVR +QLYGDTGVLGRFLLQARGARGAVHVVVAETDYQSFAVLYLERAGQLSVKLYARSLPVSDSVLSGFEQRVQEAHLTEDQIF +YFPKYGFCEAADQFHVLDGECTAGASMAAW