changeset 3:702d9e042295 draft

Uploaded
author bgruening
date Thu, 06 Jun 2013 13:26:37 -0400
parents b8ccbad1b062
children c243e17fb224
files augustus.xml extract_features.py test-data/human_augustus_protein_codingseq_introns_cds_codingseq.fasta test-data/human_augustus_protein_codingseq_introns_cds_main.gtf test-data/human_augustus_protein_codingseq_introns_cds_protein.fasta
diffstat 5 files changed, 170 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/augustus.xml	Thu Jun 06 12:51:27 2013 -0400
+++ b/augustus.xml	Thu Jun 06 13:26:37 2013 -0400
@@ -2,6 +2,7 @@
     <description>gene prediction for eukaryotic genomes</description>
     <requirements>
         <requirement type="package" version="2.7">augustus</requirement>
+        <requirement type="set_environment">SCRIPT_PATH</requirement>
     </requirements>
     <command>
         ## please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config
@@ -27,7 +28,7 @@
             ##--outfile=$output
         | tee $output 
         #if $protein or $codingseq:
-            | python extract_features.py 
+            | python \$SCRIPT_PATH/extract_features.py
                 #if $protein:
                     --protein $protein_output
                 #end if
@@ -136,10 +137,10 @@
             </change_format>
         </data>
         <data format="fasta" name="protein_output">
-            <filter>protein == "--protein=on"</filter>
+            <filter>protein == True</filter>
         </data>
         <data format="fasta" name="codingseq_output">
-            <filter>codingseq == "--codingseq=on"</filter>
+            <filter>codingseq == True</filter>
         </data>
     </outputs>
     <tests>
@@ -163,6 +164,17 @@
             <param name="mea" value="--mea=1" />
             <output name="output" file="arabidopsis_augustus_utr-off_singlestrand-on_mea-on" ftype="gtf" />
         </test>
+        <test>
+            <param name="input_genome" value="HS04636_augustus.fa" ftype="fasta" />
+            <param name="organism" value="human" />
+            <param name="protein" value="--protein=on" />
+            <param name="codingseq" value="--codingseq=on" />
+            <param name="introns" value="--introns=on" />
+            <param name="cds" value="--cds=on" />
+            <output name="output" file="human_augustus_protein_codingseq_introns_cds_main.gtf" ftype="gff" />
+            <output name="codingseq_output" file="human_augustus_protein_codingseq_introns_cds_codingseq.fasta" ftype="fasta" />
+            <output name="protein_output" file="human_augustus_protein_codingseq_introns_cds_protein.fasta" ftype="fasta" />
+        </test>
     </tests>
     <help>
 
--- a/extract_features.py	Thu Jun 06 12:51:27 2013 -0400
+++ b/extract_features.py	Thu Jun 06 13:26:37 2013 -0400
@@ -55,9 +55,11 @@
 
             if args.codingseq and line.startswith('coding sequence = ['):
                 if line.endswith(']'):
-                    coding_seq = line[19:-1]
+                    line = line[19:-1]
+                    coding_seq = line
                 else:
-                    coding_seq = line[19:]
+                    line = line[19:]
+                    coding_seq = line
 
             if protein_seq:
                 if line.endswith(']'):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/human_augustus_protein_codingseq_introns_cds_codingseq.fasta	Thu Jun 06 13:26:37 2013 -0400
@@ -0,0 +1,35 @@
+>g1
+atgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtca
+tgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtcaa
+aaccgaggtgtatgtatgagtgtgggatttgaccagtataagtgcgattgtacccggacaggattctatggagaaaactg
+ctcaacaccggaatttttgacaagaataaaattatttctgaaacccactccaaacacagtgcactacatacttacccact
+tcaagggattttggaacgttgtgaataacattcccttccttcgaaatgcaattatgagttatgtcttgacatccagatca
+catttgattgacagtccaccaacttacaatgctgactatggctacaaaagctgggaagccttctctaacctctcctatta
+tactagagcccttcctcctgtgcctgatgattgcccgactcccttgggtgtcaaaggtaaaaagcagcttcctgattcaa
+atgagattgtggaaaaattgcttctaagaagaaagttcatccctgatccccagggctcaaacatgatgtttgcattcttt
+gcccagcacttcacgcatcagtttttcaagacagatcataagcgagggccagctttcaccaacgggctgggccatggggt
+ggacttaaatcatatttacggtgaaactctggctagacagcgtaaactgcgccttttcaaggatggaaaaatgaaatatc
+agataattgatggagagatgtatcctcccacagtcaaagatactcaggcagagatgatctaccctcctcaagtccctgag
+catctacggtttgctgtggggcaggaggtctttggtctggtgcctggtctgatgatgtatgccacaatctggctgcggga
+acacaacagagtatgcgatgtgcttaaacaggagcatcctgaatggggtgatgagcagttgttccagacaagcaggctaa
+tactgataggagagactattaagattgtgattgaagattatgtgcaacacttgagtggctatcacttcaaactgaaattt
+gacccagaactacttttcaacaaacaattccagtaccaaaatcgtattgctgctgaatttaacaccctctatcactggca
+tccccttctgcctgacacctttcaaattcatgaccagaaatacaactatcaacagtttatctacaacaactctatattgc
+tggaacatggaattacccagtttgttgaatcattcaccaggcaaattgctggcagggttgctggtggtaggaatgttcca
+cccgcagtacagaaagtatcacaggcttccattgaccagagcaggcagatgaaataccagtcttttaatgagtaccgcaa
+acgctttatgctgaagccctatgaatcatttgaagaacttacaggagaaaaggaaatgtctgcagagttggaagcactct
+atggtgacatcgatgctgtggagctgtatcctgcccttctggtagaaaagcctcggccagatgccatctttggtgaaacc
+atggtagaagttggagcaccattctccttgaaaggacttatgggtaatgttatatgttctcctgcctactggaagccaag
+cacttttggtggagaagtgggttttcaaatcatcaacactgcctcaattcagtctctcatctgcaataacgtgaagggct
+gtccctttacttcattcagtgttccagatccagagctcattaaaacagtcaccatcaatgcaagttcttcccgctccgga
+ctagatgatatcaatcccacagtactactaaaagaacgttcgactgaactgtag
+>g2
+atgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccaca
+tgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccacgc
+cggcccgcatcccccatcagcaccatccagcccaaggccaattttgatgcgcagcaggagcagggccaccgggccgaggc
+caccacactgcatgtggctccccagggcacagccatggctgtcagtaccttccgaaagctggatgggatctgctggcagg
+tgcgccagctctatggagacacaggggtcctcggccgcttcctgcttcaagcccgaggcgcccgaggggctgtgcacgtg
+gttgtcgctgagaccgactaccagagtttcgctgtcctgtacctggagcgggcggggcagctgtcagtgaagctctacgc
+ccgctcgctccctgtgagcgactcggtcctgagtgggtttgagcagcgggtccaggaggcccacctgactgaggaccaga
+tcttctacttccccaagtacggcttctgcgaggctgcagaccagttccacgtcctggacggtgagtgcacagcgggggca
+agcatggcggcgtggtga
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/human_augustus_protein_codingseq_introns_cds_main.gtf	Thu Jun 06 13:26:37 2013 -0400
@@ -0,0 +1,101 @@
+# This output was generated with AUGUSTUS (version 2.7).
+# AUGUSTUS is a gene prediction tool for eukaryotes written by Mario Stanke (mario.stanke@uni-greifswald.de)
+# and Oliver Keller (keller@cs.uni-goettingen.de).
+# Please cite: Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008),
+# Using native and syntenically mapped cDNA alignments to improve de novo gene finding
+# Bioinformatics 24: 637-644, doi 10.1093/bioinformatics/btn013
+# No extrinsic information on sequences given.
+# Initialising the parameters ...
+# human version. Using default transition matrix.
+# Looks like /home/bag/projects/galaxy/galaxy-central/database/files/001/dataset_1149.dat is in fasta format.
+# We have hints for 0 sequences and for 0 of the sequences in the input set.
+#
+# ----- prediction on sequence number 1 (length = 9453, name = HS04636) -----
+#
+# Constraints/Hints:
+# (none)
+# Predicted genes for sequence number 1 on both strands
+# start gene g1
+HS04636	AUGUSTUS	gene	966	6903	1	+	.	g1
+HS04636	AUGUSTUS	transcript	966	6903	.	+	.	g1.t1
+HS04636	AUGUSTUS	intron	1018	1817	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	1935	2054	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	2199	2851	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	2996	3425	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	3608	4339	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	4424	4542	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	4790	5071	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	5359	5859	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	intron	6008	6493	.	+	.	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	966	1017	.	+	0	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	1818	1934	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	2055	2198	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	2852	2995	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	3426	3607	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	4340	4423	.	+	0	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	4543	4789	.	+	0	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	5072	5358	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	5860	6007	.	+	0	transcript_id "g1.t1"; gene_id "g1";
+HS04636	AUGUSTUS	CDS	6494	6903	.	+	2	transcript_id "g1.t1"; gene_id "g1";
+# coding sequence = [atgctcgcccgcgccctgctgctgtgcgcggtcctggcgctcagccatacagcaaatccttgctgttcccacccatgtc
+# aaaaccgaggtgtatgtatgagtgtgggatttgaccagtataagtgcgattgtacccggacaggattctatggagaaaactgctcaacaccggaattt
+# ttgacaagaataaaattatttctgaaacccactccaaacacagtgcactacatacttacccacttcaagggattttggaacgttgtgaataacattcc
+# cttccttcgaaatgcaattatgagttatgtcttgacatccagatcacatttgattgacagtccaccaacttacaatgctgactatggctacaaaagct
+# gggaagccttctctaacctctcctattatactagagcccttcctcctgtgcctgatgattgcccgactcccttgggtgtcaaaggtaaaaagcagctt
+# cctgattcaaatgagattgtggaaaaattgcttctaagaagaaagttcatccctgatccccagggctcaaacatgatgtttgcattctttgcccagca
+# cttcacgcatcagtttttcaagacagatcataagcgagggccagctttcaccaacgggctgggccatggggtggacttaaatcatatttacggtgaaa
+# ctctggctagacagcgtaaactgcgccttttcaaggatggaaaaatgaaatatcagataattgatggagagatgtatcctcccacagtcaaagatact
+# caggcagagatgatctaccctcctcaagtccctgagcatctacggtttgctgtggggcaggaggtctttggtctggtgcctggtctgatgatgtatgc
+# cacaatctggctgcgggaacacaacagagtatgcgatgtgcttaaacaggagcatcctgaatggggtgatgagcagttgttccagacaagcaggctaa
+# tactgataggagagactattaagattgtgattgaagattatgtgcaacacttgagtggctatcacttcaaactgaaatttgacccagaactacttttc
+# aacaaacaattccagtaccaaaatcgtattgctgctgaatttaacaccctctatcactggcatccccttctgcctgacacctttcaaattcatgacca
+# gaaatacaactatcaacagtttatctacaacaactctatattgctggaacatggaattacccagtttgttgaatcattcaccaggcaaattgctggca
+# gggttgctggtggtaggaatgttccacccgcagtacagaaagtatcacaggcttccattgaccagagcaggcagatgaaataccagtcttttaatgag
+# taccgcaaacgctttatgctgaagccctatgaatcatttgaagaacttacaggagaaaaggaaatgtctgcagagttggaagcactctatggtgacat
+# cgatgctgtggagctgtatcctgcccttctggtagaaaagcctcggccagatgccatctttggtgaaaccatggtagaagttggagcaccattctcct
+# tgaaaggacttatgggtaatgttatatgttctcctgcctactggaagccaagcacttttggtggagaagtgggttttcaaatcatcaacactgcctca
+# attcagtctctcatctgcaataacgtgaagggctgtccctttacttcattcagtgttccagatccagagctcattaaaacagtcaccatcaatgcaag
+# ttcttcccgctccggactagatgatatcaatcccacagtactactaaaagaacgttcgactgaactgtag]
+# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
+# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
+# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
+# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
+# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
+# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
+# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
+# end gene g1
+###
+#
+# ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
+#
+# Constraints/Hints:
+# (none)
+# Predicted genes for sequence number 2 on both strands
+# start gene g2
+HS08198	AUGUSTUS	gene	445	1848	1	+	.	g2
+HS08198	AUGUSTUS	transcript	445	1848	.	+	.	g2.t1
+HS08198	AUGUSTUS	intron	583	811	.	+	.	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	intron	895	1052	.	+	.	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	intron	1124	1207	.	+	.	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	intron	1316	1586	.	+	.	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	intron	1689	1771	.	+	.	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	445	582	.	+	0	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	812	894	.	+	0	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	1053	1123	.	+	1	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	1208	1315	.	+	2	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	1587	1688	.	+	2	transcript_id "g2.t1"; gene_id "g2";
+HS08198	AUGUSTUS	CDS	1772	1848	.	+	2	transcript_id "g2.t1"; gene_id "g2";
+# coding sequence = [atgctgccccctgggactgcgaccctcttgactctgctcctggcagctggctcgctgggccagaagcctcagaggccac
+# gccggcccgcatcccccatcagcaccatccagcccaaggccaattttgatgcgcagcaggagcagggccaccgggccgaggccaccacactgcatgtg
+# gctccccagggcacagccatggctgtcagtaccttccgaaagctggatgggatctgctggcaggtgcgccagctctatggagacacaggggtcctcgg
+# ccgcttcctgcttcaagcccgaggcgcccgaggggctgtgcacgtggttgtcgctgagaccgactaccagagtttcgctgtcctgtacctggagcggg
+# cggggcagctgtcagtgaagctctacgcccgctcgctccctgtgagcgactcggtcctgagtgggtttgagcagcgggtccaggaggcccacctgact
+# gaggaccagatcttctacttccccaagtacggcttctgcgaggctgcagaccagttccacgtcctggacggtgagtgcacagcgggggcaagcatggc
+# ggcgtggtga]
+# protein sequence = [MLPPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGIC
+# WQVRQLYGDTGVLGRFLLQARGARGAVHVVVAETDYQSFAVLYLERAGQLSVKLYARSLPVSDSVLSGFEQRVQEAHLTEDQIFYFPKYGFCEAADQF
+# HVLDGECTAGASMAAW]
+# end gene g2
+###
+# command line:
+# augustus --strand=both --noInFrameStop=false --gff3=off --protein=on --introns=on --start=off --stop=off --cds=on --codingseq=on --singlestrand=false /home/bag/projects/galaxy/galaxy-central/database/files/001/dataset_1149.dat --UTR=off --genemodel=complete --species=human
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/human_augustus_protein_codingseq_introns_cds_protein.fasta	Thu Jun 06 13:26:37 2013 -0400
@@ -0,0 +1,15 @@
+>g1
+MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYILML
+ARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYILTHFK
+GFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNE
+IVEKLLLRRKFIPDPQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQI
+IDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVGQEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLIL
+IGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYHWHPLLPDTFQIHDQKYNYQQFIYNNSILLE
+HGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGEKEMSAELEALYG
+DIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCP
+FTSFSVPDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL
+>g2
+MLPPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGICML
+PPGTATLLTLLLAAGSLGQKPQRPRRPASPISTIQPKANFDAQQEQGHRAEATTLHVAPQGTAMAVSTFRKLDGICWQVR
+QLYGDTGVLGRFLLQARGARGAVHVVVAETDYQSFAVLYLERAGQLSVKLYARSLPVSDSVLSGFEQRVQEAHLTEDQIF
+YFPKYGFCEAADQFHVLDGECTAGASMAAW