Mercurial > repos > bgruening > glimmer

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer2gff.py	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+"""
+Input: Glimmer3 prediction
+Output: GFF3 file
+Return a GFF3 file with the genes predicted by Glimmer3
+Bjoern Gruening
+
+Note: It's not a full-fledged GFF3 file, it's a really simple one.
+
+"""
+
+import sys, re
+
+def __main__():
+    input_file = open(sys.argv[1], 'r')
+
+    print '##gff-version 3\n'
+    for line in input_file:
+        line = line.strip()
+        if line[0] == '>':
+            header = line[1:]
+        else:
+            (id, start, end, frame, score) = re.split('\s+', line)
+            if int(end) > int(start):
+                strand = '+'
+            else:
+                strand = '-'
+                (start, end) = (end, start)
+
+            rest = 'frame=%s;score=%s' % (frame, score)
+            print '\t'.join([header, 'glimmer_prediction', 'predicted_gene', start, end, '.', strand, '.', rest])
+
+
+if __name__ == "__main__" :  # run the conversion only when executed as a script, not on import
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer2gff.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,63 @@
+<tool id="glimmer2gff" name="Convert Glimmer to GFF" version="0.1">
+    <description>Converts Glimmer Files to GFF Files</description>
+    <command interpreter="python">
+        glimmer2gff.py
+            $input > $output
+    </command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Glimmer Output File"/>
+    </inputs>
+    <outputs>
+        <data name="output" type="data" format="gff"/>
+    </outputs>
+    <tests>
+        <test>
+
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+Converts a Glimmer3 output file to a GFF annotation file.
+
+**Example**
+
+Input::
+    >contig00097 sbe.0.234
+    orf00003     2869      497  -2     5.60
+    orf00005     3894     2875  -1     7.05
+    orf00007     4242     4826  +3     8.04
+    orf00010     4846     5403  +1     8.57
+    orf00012     6858     5413  -1    10.87
+    orf00013     6857     7594  +2     3.61
+    orf00014     7751     9232  +2    11.34
+    orf00015     9374    10357  +2    10.66
+    orf00017    10603    11196  +1    13.39
+    orf00021    11303    11911  +2     8.81
+    orf00025    14791    12050  -2    13.51
+    orf00026    15216    16199  +3     6.37
+    orf00028    16333    16935  +1     8.86
+
+
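+Output::
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	497	2869	.	-	.	frame=-2;score=5.60
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	2875	3894	.	-	.	frame=-1;score=7.05
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	4242	4826	.	+	.	frame=+3;score=8.04
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	4846	5403	.	+	.	frame=+1;score=8.57
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	5413	6858	.	-	.	frame=-1;score=10.87
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	6857	7594	.	+	.	frame=+2;score=3.61
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	7751	9232	.	+	.	frame=+2;score=11.34
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	9374	10357	.	+	.	frame=+2;score=10.66
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	10603	11196	.	+	.	frame=+1;score=13.39
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	11303	11911	.	+	.	frame=+2;score=8.81
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	12050	14791	.	-	.	frame=-2;score=13.51
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	15216	16199	.	+	.	frame=+3;score=6.37
+    contig00097 sbe.0.234	glimmer_prediction	predicted_gene	16333	16935	.	+	.	frame=+1;score=8.86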
+
+
+-----
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer3-build-icm-wrapper.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,119 @@
+<tool id="glimmer_build-icm" name="ICM builder" version="0.1">
+    <description>(glimmer3)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+    </requirements>
+    <command>
+        build-icm
+            --depth $depth
+            #if $no_stops:
+                --no_stops
+            #end if
+            --period $period
+            --width $width
+
+            #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+                --trans_table "${stop_codon_opts.genbank_gencode}"
+            #else:
+                --stop_codons "${stop_codon_opts.stop_codons}"
+            #end if
+
+            $outfile &lt; $infile
+    </command>
+    <inputs>
+        <param name="infile" type="data" format="fasta" label="Training Dataset" help="A set of known genes in FASTA format." />
+        <param name="depth" type="integer" value="7" label="Set the depth of the ICM" help="The depth is the maximum number of positions in the context window that will be used to determine the probability of the predicted position." />
+        <param name="period" type="integer" value="3" label="Set the period of the ICM" help="The period is the number of different submodels for different positions in the text in a cyclic pattern. E.g., if the period is 3, the first submodel will determine positions 1, 4, 7, ..." />
+        <param name="width" type="integer" value="12" label="Set the width of the ICM" help="The width includes the predicted position." />
+        <param name="no_stops" type="boolean" truevalue="--no_stops" falsevalue="" checked="true" label="Do not use any input strings with in-frame stop codons" />
+
+        <conditional name="stop_codon_opts">
+            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
+              <option value="gb" selected="True">Genbank translation table entry</option>
+              <option value="free_form">Comma-separated list</option>
+            </param>
+            <when value="gb">
+                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
+                    <option value="1" selected="True">1. Standard</option>
+                    <option value="2">2. Vertebrate Mitochondrial</option>
+                    <option value="3">3. Yeast Mitochondrial</option>
+                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+                    <option value="5">5. Invertebrate Mitochondrial</option>
+                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+                    <option value="9">9. Echinoderm Mitochondrial</option>
+                    <option value="10">10. Euplotid Nuclear</option>
+                    <option value="11">11. Bacteria and Archaea</option>
+                    <option value="12">12. Alternative Yeast Nuclear</option>
+                    <option value="13">13. Ascidian Mitochondrial</option>
+                    <option value="14">14. Flatworm Mitochondrial</option>
+                    <option value="15">15. Blepharisma Macronuclear</option>
+                    <option value="16">16. Chlorophycean Mitochondrial</option>
+                    <option value="21">21. Trematode Mitochondrial</option>
+                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+                    <option value="23">23. Thraustochytrium Mitochondrial</option>
+                    <option value="24">24. Pterobranchia mitochondrial</option>
+                </param>
+            </when>
+            <when value="free_form">
+                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data format="binary" name="outfile" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value='glimmer3/seqTest.fa'/>
+            <output name="outfile" file='glimmer3/buildICMTestOutput.dat'/>
+        </test>
+    </tests>
+
+    <help>
+
+**What it does**
+
+	This program constructs an interpolated context model (ICM) from an input set of sequences.
+	This model can be used by Glimmer3 to predict genes.
+
+-----
+
+
+**Example**
+
+* input::
+
+	-Genome Sequence
+
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+	.....
+
+* output:
+	interpolated context model (ICM)
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer3-main-wrapper.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,232 @@
+<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1">
+    <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+        <requirement type="package" version="1.61">biopython</requirement>
+        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
+    </requirements>
+    <command>
+    #import tempfile, os
+    #set $temp = tempfile.NamedTemporaryFile( delete=False )
+    #silent $temp.close()
+    ## only the unique path is needed: glimmer3 output is read back from that path plus ".predict" / ".detail"
+    glimmer3
+        --max_olap $max_olap
+        --gene_len $gene_len
+        --threshold $threshold
+        #if float( $gc_percent ) > 0.0:
+            --gc_percent $gc_percent
+        #end if
+
+        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+            --trans_table "${stop_codon_opts.genbank_gencode}"
+        #else:
+            --stop_codons "${stop_codon_opts.stop_codons}"
+        #end if
+
+        $linear
+        $no_indep
+        $extend
+        $seq_input
+        $icm_input
+        ${temp.name} 2>&#38;1;
+
+    ## convert prediction to FASTA sequences (the trailing ';' keeps the following mv/rm from becoming arguments)
+    \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py ${temp.name}".predict" $seq_input $genes_output;
+
+    #if $report:
+        mv ${temp.name}".predict" $prediction;
+    #else:
+        rm ${temp.name}".predict";
+    #end if
+
+    #if $detailed_report:
+        mv ${temp.name}".detail"  $detailed;
+    #else:
+        rm ${temp.name}".detail";
+    #end if
+
+    rm ${temp.name}
+    </command>
+    <inputs>
+        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
+        <param name="icm_input" type="data" format="binary" label="Interpolated context model (ICM)" />
+
+        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
+        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
+        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
+        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
+
+        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
+        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
+        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
+        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
+
+        <conditional name="stop_codon_opts">
+            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
+              <option value="gb" selected="True">Genbank translation table entry</option>
+              <option value="free_form">Comma-separated list</option>
+            </param>
+            <when value="gb">
+                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
+                    <option value="1" selected="True">1. Standard</option>
+                    <option value="2">2. Vertebrate Mitochondrial</option>
+                    <option value="3">3. Yeast Mitochondrial</option>
+                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+                    <option value="5">5. Invertebrate Mitochondrial</option>
+                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+                    <option value="9">9. Echinoderm Mitochondrial</option>
+                    <option value="10">10. Euplotid Nuclear</option>
+                    <option value="11">11. Bacteria and Archaea</option>
+                    <option value="12">12. Alternative Yeast Nuclear</option>
+                    <option value="13">13. Ascidian Mitochondrial</option>
+                    <option value="14">14. Flatworm Mitochondrial</option>
+                    <option value="15">15. Blepharisma Macronuclear</option>
+                    <option value="16">16. Chlorophycean Mitochondrial</option>
+                    <option value="21">21. Trematode Mitochondrial</option>
+                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+                    <option value="23">23. Thraustochytrium Mitochondrial</option>
+                    <option value="24">24. Pterobranchia mitochondrial</option>
+                </param>
+            </when>
+            <when value="free_form">
+                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
+            </when>
+        </conditional>
+
+        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
+        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
+    </inputs>
+    <outputs>
+        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
+        <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)">
+            <filter>report == True</filter>
+        </data>
+        <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)">
+            <filter>detailed_report == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="seqInput" value='glimmer3/seqTest.fa' />
+            <param name="icmInput" value='glimmer3/icmTest.icm' />
+            <param name="overlaplen" value="50" />
+            <param name="genlen"  value="90" />
+            <param name="thresh"  value="30" />
+            <param name="linear" value="-l" />
+            <output name="output1" file='glimmer3/output1Test.dat' />
+            <output name="output2" file='glimmer3/output2Test.dat' />
+        </test>
+    </tests>
+    <help>
+
+
+**What it does**
+
+    This is the main program that makes gene predictions based on an interpolated context model (ICM).
+    The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
+
+-----
+
+**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
+
+-----
+
+**Glimmer Overview**
+
+::
+
+**************		**************		**************		**************
+*            *		*	     *		*            *		*            *
+* long-orfs  *  ===>	*   Extract  *	===>	* build-icm  *  ===>	*  glimmer3  *
+*            *		*	     *		*	     *  	*	     *
+**************		**************		**************		**************
+
+**Example**
+
+* input::
+
+	-Genome Sequence
+
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+	.....
+
+
+	- interpolated context model (ICM) 	92: glimmer3-build-icm on data 89
+	- maximum overlap length		50
+	- minimum gene length. 			90
+	- threshold score			30
+	- linear	 			True
+
+* output::
+
+	.predict file
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+	orf00001    40137       52  +2     8.68
+	orf00004      603       34  -1     2.91
+	orf00006     1289     1095  -3     3.16
+	orf00007     1555     1391  -2     2.33
+	orf00008     1809     1576  -1     1.02
+	orf00010     1953     2066  +3     3.09
+	orf00011     2182     2304  +1     0.89
+	orf00013     2390     2521  +2     0.60
+	orf00018     2570     3073  +2     2.54
+	orf00020     3196     3747  +1     2.91
+	orf00022     3758     4000  +2     0.83
+	orf00023     4399     4157  -2     1.31
+	orf00025     4463     4759  +2     2.92
+	orf00026     4878     5111  +3     0.78
+	orf00027     5468     5166  -3     1.64
+	orf00029     5590     5832  +1     0.29
+	orf00032     6023     6226  +2     6.02
+	orf00033     6217     6336  +1     3.09
+	........
+
+
+	.detail file
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+	Sequence length = 40222
+
+		   ----- Start -----           --- Length ----  ------------- Scores -------------
+	 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
+	0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
+	0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
+		+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
+		+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
+	0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
+	0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
+		+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
+		+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
+		+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
+	0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
+	0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
+	0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
+	0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
+	0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
+	.....
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_acgt_content.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,55 @@
+<tool id="glimmer_acgt-content" name="ACGT Content" version="0.1">
+    <description>of windows in each sequence</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+    </requirements>
+    <command>
+        window-acgt
+            $percentage
+            $input_win_len
+            $input_win_skip
+            &lt; $infile > $output
+
+            ##TODO prettify the output
+    </command>
+    <inputs>
+        <param name="infile" type="data" format="fasta" label="Genome Sequence"/>
+        <param name="input_win_len" type="integer" value="10" label="The width of the sliding window"/>
+        <param name="input_win_skip" type="integer" value="10" label="The number of positions between windows to report"/>
+        <param name="percentage" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Report percentages instead of counts"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value="streptomyces_coelicolor.dna" />
+            <output name="output" file="fasta_tool_convert_from_dna.out" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool calculates the ACGT-Content from a given Sequence, given a sliding window.
+
+-------
+
+**Output**
+
+Output is in the format:
+
+	window-start	window-len	A's	C's	G's	T's	#other	%GC
+
+Note the last window in the sequence can be shorter than *window-len* if the sequence ends prematurely
+
+
+
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_orf_to_seq.py	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""
+Input: DNA FASTA file + Glimmer ORF file
+Output: ORF sequences as FASTA file
+Author: Bjoern Gruening
+"""
+import sys, os
+import Bio.SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ):
+    if len(sys.argv) >= 4:
+        glimmerfile = open( glimmer_prediction, "r")
+        sequence = open( genome_sequence )
+    else:
+        print "Missing input values."
+        sys.exit()
+
+    fastafile = Bio.SeqIO.parse(sequence, "fasta")
+
+    sequences = dict()
+    seq_records = list()
+    for entry in fastafile:
+        sequences[entry.description] = entry
+
+    for line in glimmerfile:
+        if line.startswith('>'):
+            entry = sequences[ line[1:].strip() ]
+        else:
+            orf_start = int(line[8:17])
+            orf_end = int(line[18:26])
+
+            orf_name = line[0:8]
+            if orf_start <= orf_end:
+                seq_records.add( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) )
+            else:
+                seq_records.add( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) )
+
+    SeqIO.write( seq_records, outfile, "fasta" )
+    glimmerfile.close()
+    sequence.close()
+
+if __name__ == "__main__" :  # script entry point; glimmer2seq() then reads its three paths from sys.argv
+    glimmer2seq()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_orf_to_seq.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,30 @@
+<tool id="glimmer_orf-to-sequence" name="ORF to Sequence" version="0.1">
+    <description>assigns ORF to its DNA sequence</description>
+    <requirements>
+        <requirement type="package" version="1.61">biopython</requirement>
+    </requirements>
+    <command interpreter="python">
+        glimmer_orf_to_seq.py
+            $glimmer_orfs
+            $input_fasta
+            $output
+    </command>
+    <inputs>
+        <param name="input_fasta" type="data" format="fasta" label="Genome Sequence"/>
+        <param name="glimmer_orfs" type="data" format="tabular" label="Define Glimmer-ORFs"/>
+    </inputs>
+    <outputs>
+        <data name="output" type="data" format="fasta"/>
+    </outputs>
+    <tests>
+        <test>
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool extracts the sequences of all genes predicted by Glimmer3 from a genome.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_predict.py	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+"""
+Input: DNA Fasta File
+Output: Tabular
+Return Tabular File with predicted ORF's
+Bjoern Gruening
+"""
+import sys, os
+import tempfile
+import subprocess
+import shutil
+from glimmer_orf_to_seq import glimmer2seq
+
+def main():
+    genome_seq_file = sys.argv[1]
+    outfile_classic_glimmer = sys.argv[2]
+    outfile_ext_path = sys.argv[3]
+    oufile_genes = sys.argv[8]
+
+    tag = 'glimmer_non_knowlegde_based_prediction'
+    tempdir = tempfile.gettempdir()
+
+    trainingset = os.path.join( tempdir, tag + ".train" )
+    icm = os.path.join( tempdir, tag + ".icm" )
+
+    longorfs = tempfile.NamedTemporaryFile()
+    trainingset = tempfile.NamedTemporaryFile()
+    icm = tempfile.NamedTemporaryFile()
+
+    #glimmeropts = "-o0 -g110 -t30 -l"
+    glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6])
+    if sys.argv[7] == "true":
+        glimmeropts += " -l"
+
+    """
+        1. Find long, non-overlapping orfs to use as a training set
+    """
+    subprocess.Popen(["long-orfs", "-n", "-t", "1.15",
+        genome_seq_file, "-"], stdout = longorfs,
+        stderr = subprocess.PIPE).communicate()
+
+    """
+        2. Extract the training sequences from the genome file
+    """
+    subprocess.Popen(["extract", "-t",
+        genome_seq_file, longorfs.name], stdout=trainingset,
+        stderr=subprocess.PIPE).communicate()
+
+    """
+        3. Build the icm from the training sequences
+    """
+
+    # the "-" parameter is used to redirect the output to stdout
+    subprocess.Popen(["build-icm", "-r", "-"],
+        stdin=open(trainingset.name), stdout = icm,
+        stderr=subprocess.PIPE).communicate()
+
+    """
+        Run Glimmer3
+    """
+    b = subprocess.Popen(["glimmer3", glimmeropts,
+        genome_seq_file, icm.name, os.path.join(tempdir, tag)],
+        stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+
+    shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer )
+    if outfile_ext_path.strip() != 'None':
+        shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path )
+
+    glimmer2seq( outfile_classic_glimmer, genome_seq_file, oufile_genes )
+
+
+if __name__ == "__main__" :  # run the pipeline only when executed as a script, not on import
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_predict.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,94 @@
+<tool id="glimmer_not-knowlegde-based" name="Glimmer3" version="0.1">
+    <description>Predict ORFs in prokaryotic genomes (not knowledge-based)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+        <requirement type="package" version="1.61">biopython</requirement>
+    </requirements>
+    <command interpreter="python">
+        glimmer_predict.py
+            $input
+            $prediction
+            #if $detailed_report:
+                $detailed
+            #else:
+                "None"
+            #end if
+            $overlap
+            $gene_length
+            $threshold
+            $linear
+            $genes_output
+    </command>
+    <inputs>
+        <param name="input" type="data" format="fasta" label="Genome sequence" />
+        <param name="overlap" type="integer" value="0" label="Set maximum overlap length. Overlaps this short or shorter are ignored." />
+        <param name="gene_length" type="integer" value="110" label="Set minimum gene length." />
+        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene. If the in-frame score >= N, then the region is given a number and considered a potential gene." />
+        <param name="linear" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
+        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
+    </inputs>
+    <outputs>
+        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
+        <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)">
+            <filter>report == True</filter>
+        </data>
+        <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)">
+            <filter>detailed_report == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="streptomyces_coelicolor.dna" />
+            <output name="output" file="fasta_tool_convert_from_dna.out" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool predicts open reading frames (ORFs) in a given DNA sequence. It is not knowledge-based: the model is trained on long ORFs found in the input sequence itself.
+
+The recommended way is to run Glimmer3 with a trained ICM model. Use the knowledge-based version for that and supply or generate a training set.
+
+-----
+
+**Example**
+
+Suppose you have the following DNA formatted sequences::
+
+    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+Running this tool will produce this::
+
+    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+    orf00001      577      699  +1     5.24
+    orf00003      800     1123  +2     5.18
+    orf00004     1144     3813  +1    10.62
+    orf00006     3857     6220  +2     6.07
+    orf00007     6226     7173  +1     1.69
+    orf00008     7187     9307  +2     8.95
+    orf00009     9424    10410  +1     8.29
+    orf00010    10515    11363  +3     7.00
+    orf00011    11812    11964  +1     2.80
+    orf00012    12360    13457  +3     4.80
+    orf00013    14379    14044  -1     7.41
+    orf00015    15029    14739  -3    12.43
+    orf00016    15066    15227  +3     1.91
+    orf00020    16061    15351  -3     2.83
+    orf00021    17513    17391  -3     2.20
+    orf00023    17529    17675  +3     0.11
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,70 @@
+Galaxy wrapper for Glimmer3
+===========================
+
+This wrapper is copyright 2012-2013 by Björn Grüning.
+
+This is a wrapper for the command line tool of Glimmer3.
+http://www.cbcb.umd.edu/software/glimmer/
+
+Glimmer is a system for finding genes in microbial DNA,
+especially the genomes of bacteria, archaea, and viruses.
+Glimmer (Gene Locator and Interpolated Markov ModelER) uses interpolated
+Markov models (IMMs) to identify the coding regions and distinguish them from noncoding DNA.
+
+A.L. Delcher, D. Harmon, S. Kasif, O. White, and S.L. Salzberg. Improved microbial gene identification with GLIMMER, Nucleic Acids Research 27:23 (1999), 4636-4641.
+S. Salzberg, A. Delcher, S. Kasif, and O. White. Microbial gene identification using interpolated Markov models, Nucleic Acids Research 26:2 (1998), 544-548.
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+
+Installation
+============
+
+Since version 0.2 the recommended installation procedure is via the Galaxy Tool Shed.
+
+To install Glimmer3 manually, please download Glimmer3 from http://www.cbcb.umd.edu/software/glimmer/glimmer302.tar.gz
+and follow the installation instructions. You can also use packages from your distribution like http://packages.debian.org/stable/science/tigr-glimmer
+
+To install the wrapper, copy the glimmer3 folder into the Galaxy tools
+folder and modify the tool_conf.xml file to make the tool available to Galaxy.
+For example::
+
+    <tool file="gene_prediction/tools/glimmer3/glimmer3-main-wrapper.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer_predict.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer_orf_to_seq.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer2gff.xml" />
+    <tool file="gene_prediction/tools/glimmer3/gbktoorfWrapper.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer_acgt_content.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer3-build-icm-wrapper.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer3-extract-wrapper.xml" />
+    <tool file="gene_prediction/tools/glimmer3/glimmer3-long-orfs-wrapper.xml" />
+
+
+History
+=======
+
+- v0.1: Initial public release
+- v0.2: Add tool shed integration
+
+
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Fri Jun 07 07:33:02 2013 -0400
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.61">
+        <repository changeset_revision="e87f0c6897a8" name="package_biopython_1_61" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <set_environment version="1.0">
+        <environment_variable action="set_to" name="GLIMMER_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment>
+    <package name="glimmer" version="3.02b">
+        <install version="1.0">
+            <actions>
+                <action type="download_file">http://www.cbcb.umd.edu/software/glimmer/glimmer302b.tar.gz</action>
+                <action type="shell_command">tar xfvz glimmer302b.tar.gz</action>
+                <action type="shell_command">cd ./glimmer3.02/src &amp;&amp; make</action>
+
+                <action type="move_directory_files">
+                    <source_directory>./glimmer3.02/bin</source_directory>
+                    <destination_directory>$INSTALL_DIR/bin</destination_directory>
+                </action>
+                <action type="set_environment">
+                    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>To compile glimmer you need a C compiler (usually gcc).
+Glimmer is a system for finding genes in microbial DNA, especially the genomes of bacteria, archaea, and viruses.
+Glimmer (Gene Locator and Interpolated Markov ModelER) uses interpolated Markov models (IMMs) to identify the coding regions and distinguish them from noncoding DNA.
+http://www.cbcb.umd.edu/software/glimmer/</readme>
+    </package>
+</tool_dependency>