changeset 1:3c5116448979 draft

Uploaded
author bgruening
date Thu, 06 Jun 2013 12:20:31 -0400
parents 633039f94425
children b8ccbad1b062
files augustus.xml extract_features.py
diffstat 2 files changed, 125 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/augustus.xml	Thu Jun 06 09:52:32 2013 -0400
+++ b/augustus.xml	Thu Jun 06 12:20:31 2013 -0400
@@ -3,26 +3,35 @@
     <requirements>
         <requirement type="package" version="2.7">augustus</requirement>
     </requirements>
-    <command>augustus
-        --strand=$strand
-        $noInFrameStop
-        $gff
-        $protein
-        $introns
-        $start
-        $stop
-        $cds
-        $codingseq
-        $singlestrand
-        $input_genome
-        $mea
-        $utr
-        --genemodel=$genemodel
-        --species=$organism
-        --outfile=$output
+    <command>
+        ## please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config
+        ## or use the --AUGUSTUS_CONFIG_PATH=path if you are not installing through the toolshed
+        ## Augustus writes the protein and coding sequences as comment into the gff/gtf file an external script is used to extract the sequences into additional files
 
-        #please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config
-        #or use the --AUGUSTUS_CONFIG_PATH=path switch
+        augustus
+            --strand=$strand
+            $noInFrameStop
+            $gff
+            $protein
+            $introns
+            $start
+            $stop
+            $cds
+            $codingseq
+            $singlestrand
+            $input_genome
+            $mea
+            $utr
+            --genemodel=$genemodel
+            --species=$organism
+            ##--outfile=$output
+        | tee $output 
+        #if $protein or $codingseq:
+            | python extract_features.py 
+                #if $protein:
+                    --protein $protein_output 
+                #if $codingseq:
+                    --codingseq $codingseq_output
 
     </command>
     <inputs>
@@ -109,12 +118,12 @@
             <option value="bacterium">bacterium (beta version)</option>
         </param>
 
-        <param name="protein" type="boolean" label="Output predicted protein sequences (--protein)" truevalue="--protein=on" falsevalue="--protein=off" checked="false" />
+        <param name="protein" type="boolean" label="Output predicted protein sequences (--protein)" truevalue="--protein=on" falsevalue="--protein=off" checked="true" />
+        <param name="codingseq" type="boolean" label="Output coding sequence as comment in the output file (codingseq)" truevalue="--codingseq=on" falsevalue="--codingseq=off" checked="true" />
         <param name="introns" type="boolean" label="Output predicted intron sequences (--introns)" truevalue="--introns=on" falsevalue="--introns=off" checked="false" />
         <param name="start" type="boolean" label="Output predicted start codons (--start)" truevalue="--start=on" falsevalue="--start=off" checked="false" />
         <param name="stop" type="boolean" label="Output predicted stop codons (--stop)" truevalue="--stop=on" falsevalue="--stop=off" checked="false" />
         <param name="cds" type="boolean" label="Output CDS region (--cds)" truevalue="--cds=on" falsevalue="--cds=off" checked="true" />
-        <param name="codingseq" type="boolean" label="Output coding sequence as comment in the output file (codingseq)" truevalue="--codingseq=on" falsevalue="--codingseq=off" checked="false" />
         <param name="gff" type="boolean" label="GFF formated output, standard is GTF (--gff3)" truevalue="--gff3=on" falsevalue="--gff3=off" checked="false" />
 
     </inputs>
@@ -124,6 +133,12 @@
                 <when input="gff" value="--gff3=on" format="gff" />
             </change_format>
         </data>
+        <data format="fasta" name="protein_output">
+            <filter>protein == "--protein=on"</filter>
+        </data>
+        <data format="fasta" name="codingseq_output">
+            <filter>codingseq == "--codingseq=on"</filter>
+        </data>
     </outputs>
     <tests>
         <test>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_features.py	Thu Jun 06 12:20:31 2013 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import textwrap
+
+def main( args ):
+    """
+    Extract the protein and coding section from an augustus gff, gtf file
+    Example file:
+HS04636	AUGUSTUS	stop_codon	6901	6903	.	+	0	Parent=g1.t1
+HS04636	AUGUSTUS	transcription_end_site	8857	8857	.	+	.	Parent=g1.t1
+# protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
+# THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
+# PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
+# QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
+# WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
+# KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
+# PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
+# end gene g1
+###
+#
+# ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
+#
+# Predicted genes for sequence number 2 on both strands
+# start gene g2
+HS08198	AUGUSTUS	gene	86	2344	1	+	.	ID=g2
+HS08198	AUGUSTUS	transcript	86	2344	.	+	.	ID=g2.t1;Parent=g2
+HS08198	AUGUSTUS	transcription_start_site	86	86	.	+	.	Parent=g2.t1
+HS08198	AUGUSTUS	exon	86	582	.	+	.	Parent=g2.t1
+HS08198	AUGUSTUS	start_codon	445	447	.	+	0	Parent=g2.t1
+    """
+    protein_seq = ''
+    coding_seq = ''
+    if args.protein:
+        po = open( args.protein, 'w+' )
+    if args.codingseq:
+        co = open( args.codingseq, 'w+' )
+
+    for line in sys.stdin:
+        # protein- and coding-sequence are stored as comments
+        if line.startswith('#'):
+            line = line[2:].strip()
+            if line.startswith('start gene'):
+                gene_name = line[11:].strip()
+
+            if args.protein and line.startswith('protein sequence = ['):
+                if line.endswith(']'):
+                    line = line[20:-1]
+                    protein_seq = line
+                else:
+                    line = line[20:]
+                    protein_seq = line
+
+            if args.codingseq and line.startswith('coding sequence = ['):
+                if line.endswith(']'):
+                    coding_seq = line[19:-1]
+                else:
+                    coding_seq = line[19:]
+
+            if protein_seq:
+                if line.endswith(']'):
+                    protein_seq += line[:-1]
+                    po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) )
+                    protein_seq = ''
+                else:
+                    protein_seq += line
+
+            if coding_seq:
+                if line.endswith(']'):
+                    coding_seq += line[:-1]
+                    co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) )
+                    coding_seq = ''
+                else:
+                    coding_seq += line
+    if args.codingseq:
+        co.close()
+    if args.protein:
+        po.close()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--protein', help='Path to the protein file.')
+    parser.add_argument('-c', '--codingseq', help='Path to the coding file.')
+
+    args = parser.parse_args()
+    main( args )
+