view glimmer3-main-wrapper.xml @ 1:a6582d591d64

Uploaded
author bgruening
date Fri, 07 Jun 2013 07:45:02 -0400
parents 8624069d7a0e
children 2d0c26885604
line wrap: on
line source

<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1">
    <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
    <requirements>
        <!-- Glimmer package, pinned to version 3.02b. -->
        <requirement type="package" version="3.02b">glimmer</requirement>
        <!-- Biopython 1.61; presumably needed by glimmer_orf_to_seq.py (script not shown here, TODO confirm). -->
        <requirement type="package" version="1.61">biopython</requirement>
        <!-- Environment variable the command section uses to locate glimmer_orf_to_seq.py. -->
        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
    </requirements>
    <command>
    ## Reserve a unique scratch path. It is handed to glimmer3 as its last
    ## positional argument; the steps below expect the results at
    ## PREFIX.predict and PREFIX.detail.
    #import tempfile, os
    #set $temp = tempfile.NamedTemporaryFile( delete=False )
    ## "#silent" evaluates the call without writing anything into the command line.
    #silent $temp.close()
    ## Keep only the path: the file object itself would render as its repr, not a file name.
    #set $temp = $temp.name

    glimmer3
        --max_olap $max_olap
        --gene_len $gene_len
        --threshold $threshold
        ## A GC percentage of 0.0 means "not set": the option is omitted.
        #if float( $gc_percent ) > 0.0:
            --gc_percent $gc_percent
        #end if

        ## Only pass start codons when the field is non-empty.
        #if str( $start_codons ).strip():
            --start_codons "${start_codons}"
        #end if

        ## Stop codons come either from a GenBank translation table or from a free-form list.
        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
            --trans_table "${stop_codon_opts.genbank_gencode}"
        #else:
            --stop_codons "${stop_codon_opts.stop_codons}"
        #end if

        $linear
        $no_indep
        $extend
        $seq_input
        $icm_input
        ## stderr is folded into stdout.
        $temp 2>&amp;1;

    ## convert prediction to FASTA sequences
    ## (the trailing ";" keeps the following mv/rm from becoming arguments of the script)
    \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py $temp".predict" $seq_input $genes_output;

    ## Hand the optional reports over to Galaxy, or discard them.
    #if $report:
        mv $temp".predict" $prediction;
    #else:
        rm $temp".predict";
    #end if

    #if $detailed_report:
        mv $temp".detail"  $detailed;
    #else:
        rm $temp".detail";
    #end if

    ## Remove the placeholder file created by NamedTemporaryFile.
    rm $temp
    </command>
    <inputs>
        <!-- Input datasets: the genome to annotate and the trained model. -->
        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
        <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />

        <!-- Numeric glimmer3 options. -->
        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />

        <!-- Flags: the truevalue is the literal command-line switch, the falsevalue is empty. -->
        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />

        <!-- Stop codons are given either as a GenBank translation table or as an explicit list. -->
        <conditional name="stop_codon_opts">
            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
              <option value="gb" selected="True">Genbank translation table entry</option>
              <option value="free_form">Comma-separated list</option>
            </param>
            <when value="gb">
                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
                    <option value="1" selected="True">1. Standard</option>
                    <option value="2">2. Vertebrate Mitochondrial</option>
                    <option value="3">3. Yeast Mitochondrial</option>
                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
                    <option value="5">5. Invertebrate Mitochondrial</option>
                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
                    <option value="9">9. Echinoderm Mitochondrial</option>
                    <option value="10">10. Euplotid Nuclear</option>
                    <option value="11">11. Bacteria and Archaea</option>
                    <option value="12">12. Alternative Yeast Nuclear</option>
                    <option value="13">13. Ascidian Mitochondrial</option>
                    <option value="14">14. Flatworm Mitochondrial</option>
                    <option value="15">15. Blepharisma Macronuclear</option>
                    <option value="16">16. Chlorophycean Mitochondrial</option>
                    <option value="21">21. Trematode Mitochondrial</option>
                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
                    <option value="23">23. Thraustochytrium Mitochondrial</option>
                    <option value="24">24. Pterobranchia mitochondrial</option>
                </param>
            </when>
            <when value="free_form">
                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
            </when>
        </conditional>

        <!-- These two only toggle optional outputs; they add nothing to the command line, hence the empty true/false values. -->
        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
    </inputs>
    <outputs>
        <!-- Always produced: FASTA written by glimmer_orf_to_seq.py. -->
        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
        <!-- Optional reports, created only when the matching checkbox is ticked.
             "txt" is Galaxy's plain-text datatype extension. -->
        <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
            <filter>report == True</filter>
        </data>
        <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
            <filter>detailed_report == True</filter>
        </data>
    </outputs>
    <tests>
        <!-- Parameter and output names here must match the names declared in the inputs and outputs sections. -->
        <test>
            <param name="seq_input" value='glimmer3/seqTest.fa' />
            <param name="icm_input" value='glimmer3/icmTest.icm' />
            <param name="max_olap" value="50" />
            <param name="gene_len"  value="90" />
            <param name="threshold"  value="30" />
            <param name="linear" value="true" />
            <!-- Both optional reports are switched on so that their outputs exist and can be compared. -->
            <param name="report" value="true" />
            <param name="detailed_report" value="true" />
            <!-- NOTE(review): assumes output1Test.dat is the .predict table and output2Test.dat the
                 .detail report; confirm against the fixture files. -->
            <output name="prediction" file='glimmer3/output1Test.dat' />
            <output name="detailed" file='glimmer3/output2Test.dat' />
        </test>
    </tests>
    <help>


**What it does**

    This is the main program that makes gene predictions based on an interpolated context model (ICM).
    The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.

-----

**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.

-----

**Glimmer Overview**

::

    **************      **************      **************      **************
    *            *      *            *      *            *      *            *
    * long-orfs  * ===> *   Extract  * ===> * build-icm  * ===> *  glimmer3  *
    *            *      *            *      *            *      *            *
    **************      **************      **************      **************

**Example**

* input::
	
	-Genome Sequence

	CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
	.....


	- interpolated context model (ICM) 	92: glimmer3-build-icm on data 89
	- maximum overlap length		50
	- minimum gene length. 			90
	- threshold score			30
	- linear	 			True

* output:: 

	.predict file
	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
	orf00001    40137       52  +2     8.68
	orf00004      603       34  -1     2.91
	orf00006     1289     1095  -3     3.16
	orf00007     1555     1391  -2     2.33
	orf00008     1809     1576  -1     1.02
	orf00010     1953     2066  +3     3.09
	orf00011     2182     2304  +1     0.89
	orf00013     2390     2521  +2     0.60
	orf00018     2570     3073  +2     2.54
	orf00020     3196     3747  +1     2.91
	orf00022     3758     4000  +2     0.83
	orf00023     4399     4157  -2     1.31
	orf00025     4463     4759  +2     2.92
	orf00026     4878     5111  +3     0.78
	orf00027     5468     5166  -3     1.64
	orf00029     5590     5832  +1     0.29
	orf00032     6023     6226  +2     6.02
	orf00033     6217     6336  +1     3.09
	........
	

	.detail file
	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
	Sequence length = 40222

		   ----- Start -----           --- Length ----  ------------- Scores -------------
	 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
	0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
	0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
		+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
		+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
	0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
	0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
		+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
		+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
		+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
	0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
	0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
	0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
	0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
	0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
	.....

-------

**References**

A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics 23(6):673-679 (2007).


    </help>

</tool>