<!--
view augustus.xml @ 3:702d9e042295 draft

Uploaded
author bgruening
date Thu, 06 Jun 2013 13:26:37 -0400
parents b8ccbad1b062
children c243e17fb224
line wrap: on
line source
-->

<tool id="augustus" name="Augustus" version="0.3">
    <description>gene prediction for eukaryotic genomes</description>
    <!-- Dependencies that must be resolved before the command runs: the augustus
         package (version 2.7) and the SCRIPT_PATH environment variable, which the
         command uses to locate extract_features.py. -->
    <requirements>
        <requirement type="package" version="2.7">augustus</requirement>
        <requirement type="set_environment">SCRIPT_PATH</requirement>
    </requirements>
    <!-- Cheetah template for the shell command line.
         It runs augustus on the input genome with the option strings produced by the
         parameters in the inputs section, and copies the prediction stream to $output
         with tee. When the protein or codingseq parameter is enabled, the same stream
         is piped on to extract_features.py (found via the SCRIPT_PATH environment
         variable), which receives the $protein_output and/or $codingseq_output paths.
         Lines starting with two hash signs are Cheetah comments and are not part of
         the executed command. -->
    <command>
        ## please set export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config
        ## or use the --AUGUSTUS_CONFIG_PATH=path if you are not installing through the toolshed
        ## Augustus writes the protein and coding sequences as comment into the gff/gtf file an external script is used to extract the sequences into additional files

        augustus
            --strand=$strand
            $noInFrameStop
            $gff
            $protein
            $introns
            $start
            $stop
            $cds
            $codingseq
            $singlestrand
            $input_genome
            $mea
            $utr
            --genemodel=$genemodel
            --species=$organism
            ##--outfile=$output
        | tee $output 
        #if $protein or $codingseq:
            | python \$SCRIPT_PATH/extract_features.py
                #if $protein:
                    --protein $protein_output
                #end if
                #if $codingseq:
                    --codingseq $codingseq_output
                #end if
        #end if
    </command>
    <!-- User-facing parameters.
         Boolean parameters: the truevalue/falsevalue string is inserted verbatim into
         the augustus command line.
         Select parameters: the option value is inserted verbatim after the matching
         command-line option (species, strand, genemodel), so each value must be the
         exact identifier AUGUSTUS expects, with no surrounding whitespace and no
         shell metacharacters. -->
    <inputs>
        <param name="input_genome" type="data" format="fasta" label="Genome Sequence"/>
        <param name="noInFrameStop" type="boolean" label="Don't report transcripts with in-frame stop codons (--noInFrameStop)" truevalue="--noInFrameStop=true" falsevalue="--noInFrameStop=false" checked="false" help="Otherwise, intron-spanning stop codons could occur" />
        <param name="singlestrand" type="boolean" label="Predict genes independently on each strand, allow overlapping genes on opposite strands (--singlestrand)" truevalue="--singlestrand=true" falsevalue="--singlestrand=false" checked="false" />
        <param name="mea" type="boolean" label="Using the maximum expected accuracy approach (--mea)" truevalue="--mea=1" falsevalue="" checked="false" help="MEA is an alternative decoding approach." />
        <param name="utr" type="boolean" label="Predict the untranslated regions in addition to the coding sequence (--UTR)" truevalue="--UTR=on" falsevalue="--UTR=off" checked="false" help="This currently works only for human, galdieria, toxoplasma and caenorhabditis." />

        <!-- Species parameter set. Values carry no trailing whitespace. Where several
             options share one scientific name, the identifier is appended to the
             label so the entries can be told apart. -->
        <param name="organism" label="Model Organism" type="select" multiple="false" format="text" help="Choose a specialised trainingset.">
            <option value="human">Homo sapiens</option>
            <option value="fly">Drosophila melanogaster</option>
            <option value="arabidopsis">Arabidopsis thaliana</option>
            <option value="brugia">Brugia malayi</option>
            <option value="aedes">Aedes aegypti</option>
            <option value="tribolium2012">Tribolium castaneum</option>
            <option value="schistosoma">Schistosoma mansoni</option>
            <option value="tetrahymena">Tetrahymena thermophila</option>
            <option value="galdieria">Galdieria sulphuraria</option>
            <option value="maize">Zea mays</option>
            <option value="toxoplasma">Toxoplasma gondii</option>
            <option value="caenorhabditis">Caenorhabditis elegans</option>
            <option value="aspergillus_fumigatus">Aspergillus fumigatus</option>
            <option value="aspergillus_nidulans">Aspergillus nidulans</option>
            <option value="aspergillus_oryzae">Aspergillus oryzae</option>
            <option value="aspergillus_terreus">Aspergillus terreus</option>
            <option value="botrytis_cinerea">Botrytis cinerea</option>
            <option value="candida_albicans">Candida albicans</option>
            <option value="candida_guilliermondii">Candida guilliermondii</option>
            <option value="candida_tropicalis">Candida tropicalis</option>
            <option value="chaetomium_globosum">Chaetomium globosum</option>
            <option value="coccidioides_immitis">Coccidioides immitis</option>
            <option value="coprinus">Coprinus cinereus (coprinus)</option>
            <option value="coprinus_cinereus">Coprinus cinereus (coprinus_cinereus)</option>
            <option value="cryptococcus_neoformans_gattii">Cryptococcus neoformans gattii</option>
            <option value="cryptococcus_neoformans_neoformans_B">Cryptococcus neoformans neoformans (B)</option>
            <option value="cryptococcus_neoformans_neoformans_JEC21">Cryptococcus neoformans neoformans (JEC21)</option>
            <option value="debaryomyces_hansenii">Debaryomyces hansenii</option>
            <option value="encephalitozoon_cuniculi_GB">Encephalitozoon cuniculi</option>
            <option value="eremothecium_gossypii">Eremothecium gossypii</option>
            <option value="fusarium_graminearum">Fusarium graminearum</option>
            <option value="histoplasma_capsulatum">Histoplasma capsulatum (histoplasma_capsulatum)</option>
            <!-- Was "(histoplasma)": unquoted parentheses in the species argument are a
                 shell syntax error. NOTE(review): confirm that "histoplasma" exists in
                 the installed AUGUSTUS config/species directory. -->
            <option value="histoplasma">Histoplasma capsulatum (histoplasma)</option>
            <option value="kluyveromyces_lactis">Kluyveromyces lactis</option>
            <option value="laccaria_bicolor">Laccaria bicolor</option>
            <option value="lamprey">Petromyzon marinus</option>
            <option value="leishmania_tarentolae">Leishmania tarentolae</option>
            <option value="lodderomyces_elongisporus">Lodderomyces elongisporus</option>
            <option value="magnaporthe_grisea">Magnaporthe grisea</option>
            <option value="neurospora_crassa">Neurospora crassa</option>
            <option value="phanerochaete_chrysosporium">Phanerochaete chrysosporium</option>
            <option value="pichia_stipitis">Pichia stipitis</option>
            <option value="rhizopus_oryzae">Rhizopus oryzae</option>
            <option value="saccharomyces_cerevisiae_S288C">Saccharomyces cerevisiae (S288C)</option>
            <option value="saccharomyces_cerevisiae_rm11-1a_1">Saccharomyces cerevisiae (rm11-1a_1)</option>
            <!-- Was "(saccharomyces)": same shell problem as above. NOTE(review): confirm
                 that "saccharomyces" exists in the installed AUGUSTUS config/species
                 directory. -->
            <option value="saccharomyces">Saccharomyces cerevisiae (saccharomyces)</option>
            <option value="schizosaccharomyces_pombe">Schizosaccharomyces pombe</option>
            <option value="trichinella">Trichinella spiralis</option>
            <option value="ustilago_maydis">Ustilago maydis</option>
            <option value="yarrowia_lipolytica">Yarrowia lipolytica</option>
            <option value="nasonia">Nasonia vitripennis</option>
            <option value="tomato">Solanum lycopersicum</option>
            <option value="chlamydomonas">Chlamydomonas reinhardtii</option>
            <option value="amphimedon">Amphimedon queenslandica</option>
            <option value="pneumocystis">Pneumocystis jirovecii</option>
            <option value="chicken">Gallus gallus domesticus (chicken)</option>
            <option value="cacao">Theobroma cacao (cacao)</option>
            <option value="heliconius_melpomene1">Heliconius melpomene</option>
            <option value="xenoturbella">Xenoturbella</option>
        </param>

        <param name="strand" label="Strand (--strand)" type="select" multiple="false" format="text" help="Report predicted genes on both strands, just the forward or just the backward strand.">
            <option value="both">both</option>
            <option value="forward">forward</option>
            <option value="backward">backward</option>
        </param>

        <!-- The first option is the one preselected in the form. -->
        <param name="genemodel" label="Gene Model" type="select" multiple="false" format="text" help="Gene Model to predict, for more information please refer to the help.">
            <option value="complete">complete</option>
            <option value="partial">partial</option>
            <option value="intronless">intronless</option>
            <option value="atleastone">atleastone</option>
            <option value="exactlyone">exactlyone</option>
            <option value="bacterium">bacterium (beta version)</option>
        </param>

        <!-- protein and codingseq also control whether the extra FASTA outputs are
             created (see the filters in the outputs section). -->
        <param name="protein" type="boolean" label="Output predicted protein sequences (--protein)" truevalue="--protein=on" falsevalue="--protein=off" checked="true" />
        <param name="codingseq" type="boolean" label="Output coding sequence as comment in the output file (--codingseq)" truevalue="--codingseq=on" falsevalue="--codingseq=off" checked="true" />
        <param name="introns" type="boolean" label="Output predicted intron sequences (--introns)" truevalue="--introns=on" falsevalue="--introns=off" checked="false" />
        <param name="start" type="boolean" label="Output predicted start codons (--start)" truevalue="--start=on" falsevalue="--start=off" checked="false" />
        <param name="stop" type="boolean" label="Output predicted stop codons (--stop)" truevalue="--stop=on" falsevalue="--stop=off" checked="false" />
        <param name="cds" type="boolean" label="Output CDS region (--cds)" truevalue="--cds=on" falsevalue="--cds=off" checked="true" />
        <param name="gff" type="boolean" label="GFF formatted output, standard is GTF (--gff3)" truevalue="--gff3=on" falsevalue="--gff3=off" checked="false" />

    </inputs>
    <!-- Datasets produced by the tool.
         output:           the prediction written by tee; GTF unless the gff parameter
                           is checked, in which case the datatype is switched to gff3.
         protein_output:   FASTA file, only created when the protein parameter is checked.
         codingseq_output: FASTA file, only created when the codingseq parameter is checked.
         Each output carries its own label so the datasets are distinguishable in the
         history. -->
    <outputs>
        <data format="gtf" name="output" label="${tool.name} on ${on_string}: GTF/GFF">
            <change_format>
                <!-- The gff parameter passes the gff3 switch to augustus, so the
                     resulting dataset is typed gff3 rather than generic gff. -->
                <when input="gff" value="--gff3=on" format="gff3" />
            </change_format>
        </data>
        <data format="fasta" name="protein_output" label="${tool.name} on ${on_string}: Protein sequence">
            <filter>protein == True</filter>
        </data>
        <data format="fasta" name="codingseq_output" label="${tool.name} on ${on_string}: Coding sequence">
            <filter>codingseq == True</filter>
        </data>
    </outputs>
    <!-- Functional tests.
         Boolean parameters are set with "true"; Galaxy maps that to the truevalue
         string declared on the parameter. Parameters that are not listed keep their
         declared defaults. -->
    <tests>
        <!-- Human model with UTR prediction, default GTF output. -->
        <test>
            <param name="input_genome" value="human_augustus.fa" ftype="fasta" />
            <param name="organism" value="human" />
            <param name="utr" value="true" />
            <output name="output" file="human_augustus_utr-on.gtf" ftype="gtf" />
        </test>
        <!-- Human model with UTR prediction and GFF3 output. -->
        <test>
            <param name="input_genome" value="HS04636_augustus.fa" ftype="fasta" />
            <param name="organism" value="human" />
            <param name="utr" value="true" />
            <param name="gff" value="true" />
            <output name="output" file="human_augustus_utr-on.gff" ftype="gff3" />
        </test>
        <!-- Arabidopsis model with single-strand prediction and MEA decoding. -->
        <test>
            <param name="input_genome" value="arabidopsis_augustus.fa" ftype="fasta" />
            <param name="organism" value="arabidopsis" />
            <param name="singlestrand" value="true" />
            <param name="mea" value="true" />
            <output name="output" file="arabidopsis_augustus_utr-off_singlestrand-on_mea-on" ftype="gtf" />
        </test>
        <!-- Human model with protein, coding-sequence, intron and CDS output. The gff
             parameter is left unchecked, so the main output is GTF. -->
        <test>
            <param name="input_genome" value="HS04636_augustus.fa" ftype="fasta" />
            <param name="organism" value="human" />
            <param name="protein" value="true" />
            <param name="codingseq" value="true" />
            <param name="introns" value="true" />
            <param name="cds" value="true" />
            <output name="output" file="human_augustus_protein_codingseq_introns_cds_main.gtf" ftype="gtf" />
            <output name="codingseq_output" file="human_augustus_protein_codingseq_introns_cds_codingseq.fasta" ftype="fasta" />
            <output name="protein_output" file="human_augustus_protein_codingseq_introns_cds_protein.fasta" ftype="fasta" />
        </test>
    </tests>
    <help>

**What it does**

AUGUSTUS is a gene prediction program for eukaryotes written by Mario Stanke and Oliver Keller.
It can be used as an ab initio program, which means it bases its prediction purely on the
sequence. AUGUSTUS may also incorporate hints on the gene structure coming from extrinsic sources
such as EST, MS/MS, protein alignments and syntenic genomic alignments.

-----

**Parameters**

Gene Model::

    partial      : allow prediction of incomplete genes at the sequence boundaries (AUGUSTUS default; this tool preselects "complete")
    intronless   : only predict single-exon genes like in prokaryotes and some eukaryotes
    complete     : only predict complete genes
    atleastone   : predict at least one complete gene
    exactlyone   : predict exactly one complete gene



**Example**

Suppose you have the following DNA formatted sequences::

    >Seq1
    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg

Running this tool will produce this::

    # ----- prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) -----
    #
    # Constraints/Hints:
    # (none)
    # Predicted genes for sequence number 1 on both strands
    # start gene g1
    scaffold1|size1992969	AUGUSTUS	gene	17453	19382	0.11	+	.	g6
    scaffold1|size1992969	AUGUSTUS	transcript	17453	19382	0.11	+	.	g6.t1
    scaffold1|size1992969	AUGUSTUS	start_codon	17453	17455	.	+	0	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	intron	17615	17660	0.38	+	.	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	intron	17708	17772	0.54	+	.	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	intron	17902	18035	0.58	+	.	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	intron	18313	18367	0.99	+	.	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	intron	19014	19080	0.44	+	.	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	17453	17614	0.55	+	0	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	17661	17707	0.38	+	0	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	17773	17901	0.54	+	1	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	18036	18312	0.52	+	1	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	18368	19013	0.99	+	0	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	CDS	19081	19379	0.31	+	2	transcript_id "g6.t1"; gene_id "g6";
    scaffold1|size1992969	AUGUSTUS	stop_codon	19380	19382	.	+	0	transcript_id "g6.t1"; gene_id "g6";


**References**

Mario Stanke and Stephan Waack (2003) 
Gene Prediction with a Hidden-Markov Model and a new Intron Submodel. 
Bioinformatics, Vol. 19, Suppl. 2, pages ii215-ii225


    </help>
</tool>