changeset 4:8ddf54417ade

Uploaded
author bgruening
date Fri, 07 Jun 2013 10:02:12 -0400
parents 2d0c26885604
children 5a97ff1a7b12 a07c49839f31
files glimmer2gff.py glimmer2gff.xml glimmer2seq.py glimmer3-build-icm-wrapper.xml glimmer3-main-wrapper.xml glimmer_acgt_content.xml glimmer_build-icm.xml glimmer_orf_to_seq.py glimmer_orf_to_seq.xml glimmer_predict.py glimmer_predict.xml glimmer_w_icm.xml glimmer_wo_icm.py glimmer_wo_icm.xml readme.rst
diffstat 15 files changed, 572 insertions(+), 757 deletions(-) [+]
line wrap: on
line diff
--- a/glimmer2gff.py	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Input: Glimmer3 prediction
-Output: GFF3 file
-Return a GFF3 file with the genes predicted by Glimmer3
-Bjoern Gruening
-
-Note: Its not a full-fledged GFF3 file, its a really simple one.
-
-"""
-
-import sys, re
-
-def __main__():
-    input_file = open(sys.argv[1], 'r')
-
-    print '##gff-version 3\n'
-    for line in input_file:
-        line = line.strip()
-        if line[0] == '>':
-            header = line[1:]
-        else:
-            (id, start, end, frame, score) = re.split('\s+', line)
-            if int(end) > int(start):
-                strand = '+'
-            else:
-                strand = '-'
-                (start, end) = (end, start)
-
-            rest = 'frame=%s;score=%s' % (frame, score)
-            print '\t'.join([header, 'glimmer_prediction', 'predicted_gene', start, end, '.', strand, '.', rest])
-
-
-if __name__ == "__main__" :
-    __main__()
--- a/glimmer2gff.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-<tool id="glimmer2gff" name="Convert Glimmer to GFF" version="0.1">
-    <description>Converts Glimmer Files to GFF Files</description>
-    <command interpreter="python">
-        glimmer2gff.py
-            $input > $output
-    </command>
-    <inputs>
-        <param name="input" type="data" format="tabular" label="Glimmer Output File"/>
-    </inputs>
-    <outputs>
-        <data name="output" type="data" format="gff"/>
-    </outputs>
-    <tests>
-        <test>
-
-        </test>
-    </tests>
-    <help>
-
-**What it does**
-
-Converts a Glimmer3 output File to an GFF Annotation File::
-
-**Example**
-
-Input::
-    >contig00097 sbe.0.234 
-    orf00003     2869      497  -2     5.60
-    orf00005     3894     2875  -1     7.05
-    orf00007     4242     4826  +3     8.04
-    orf00010     4846     5403  +1     8.57
-    orf00012     6858     5413  -1    10.87
-    orf00013     6857     7594  +2     3.61
-    orf00014     7751     9232  +2    11.34
-    orf00015     9374    10357  +2    10.66
-    orf00017    10603    11196  +1    13.39
-    orf00021    11303    11911  +2     8.81
-    orf00025    14791    12050  -2    13.51
-    orf00026    15216    16199  +3     6.37
-    orf00028    16333    16935  +1     8.86
-
-
-Output:
-    contig00097 sbe.0.234	glimmer	gene	497	2869	.	-	.	-2     5.60
-    contig00097 sbe.0.234	glimmer	gene	2875	3894	.	-	.	-1     7.05
-    contig00097 sbe.0.234	glimmer	gene	4242	4826	.	+	.	+3     8.04
-    contig00097 sbe.0.234	glimmer	gene	4846	5403	.	+	.	+1     8.57
-    contig00097 sbe.0.234	glimmer	gene	5413	6858	.	-	.	-1    10.87
-    contig00097 sbe.0.234	glimmer	gene	6857	7594	.	+	.	+2     3.61
-    contig00097 sbe.0.234	glimmer	gene	7751	9232	.	+	.	+2    11.34
-    contig00097 sbe.0.234	glimmer	gene	9374	10357	.	+	.	+2    10.66
-    contig00097 sbe.0.234	glimmer	gene	10603	11196	.	+	.	+1    13.39
-    contig00097 sbe.0.234	glimmer	gene	11303	11911	.	+	.	+2     8.81
-    contig00097 sbe.0.234	glimmer	gene	12050	14791	.	-	.	-2    13.51
-    contig00097 sbe.0.234	glimmer	gene	15216	16199	.	+	.	+3     6.37
-    contig00097 sbe.0.234	glimmer	gene	16333	16935	.	+	.	+1     8.86
-
-
------
-
-
-    </help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer2seq.py	Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""
+Input: DNA FASTA file + Glimmer ORF file
+Output: ORF sequences as FASTA file
+Author: Bjoern Gruening
+"""
+import sys, os
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def glimmer2seq( glimmer_prediction = None, genome_sequence = None, outfile = None ):
+    """Write every ORF of a Glimmer3 prediction as a FASTA record; arguments default to sys.argv[1:4]."""
+    if None in ( glimmer_prediction, genome_sequence, outfile ):
+        # Fall back to argv at call time, so importing this module no longer needs command-line arguments.
+        if len(sys.argv) < 4:
+            sys.exit( "Missing input values." )  # message goes to stderr, exit status is non-zero
+        glimmer_prediction = glimmer_prediction or sys.argv[1]
+        genome_sequence = genome_sequence or sys.argv[2]
+        outfile = outfile or sys.argv[3]
+
+    # Index the genome records by description; the '>' lines of the prediction are looked up in this index.
+    with open( genome_sequence ) as sequence:
+        sequences = dict( (record.description, record) for record in SeqIO.parse( sequence, "fasta" ) )
+
+    seq_records = list()
+    with open( glimmer_prediction ) as glimmerfile:
+        for line in glimmerfile:
+            fields = line.split()
+            if not fields:
+                continue  # tolerate blank lines
+            if line.startswith('>'):
+                entry = sequences[ line[1:].strip() ]
+                continue
+            # Split on whitespace instead of fixed columns, so long ORF ids or coordinates still parse.
+            orf_name, orf_start, orf_end = fields[0], int( fields[1] ), int( fields[2] )
+            if orf_start <= orf_end:
+                orf_seq = entry.seq[ orf_start-1 : orf_end ]
+            else:
+                orf_seq = entry.seq[ orf_end-1 : orf_start ].reverse_complement()  # start > end is read as reverse strand
+            seq_records.append( SeqRecord( orf_seq, id = orf_name, description = entry.description ) )
+    SeqIO.write( seq_records, outfile, "fasta" )
+
+if __name__ == "__main__" :
+    glimmer2seq()
--- a/glimmer3-build-icm-wrapper.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,119 +0,0 @@
-<tool id="glimmer_build-icm" name="ICM builder" version="0.1">
-    <description>(glimmer3)</description>
-    <requirements>
-        <requirement type="package" version="3.02b">glimmer</requirement>
-    </requirements>
-    <command>
-        build-icm
-            --depth $depth
-            #if $no_stops:
-                --no_stops
-            #end if
-            --period $period
-            --width $width
-
-            #if $stop_codon_opts.stop_codon_opts_selector == "gb":
-                --trans_table "${stop_codon_opts.genbank_gencode}"
-            #else:
-                --stop_codons "${stop_codon_opts.stop_codons}"
-            #end if
-
-            $outfile &lt; $infile
-    </command>
-    <inputs>
-        <param name="infile" type="data" format="fasta" label="Trainings Dataset" help="A set of known genes in FASTA format." />
-        <param name="depth" type="integer" value="7" label="Set the depth of the ICM" help="The depth is the maximum number of positions in the context window that will be used to determine the probability of the predicted position." />
-        <param name="period" type="integer" value="3" label="Set the period of the ICM" help="The period is the number of different submodels for different positions in the text in a cyclic pattern. E.g., if the period is 3, the first submodel will determine positions 1, 4, 7, ..." />
-        <param name="width" type="integer" value="12" label="Set the width of the ICM" help="The width includes the predicted position." />
-        <param name="no_stops" type="boolean" truevalue="--no_stops" falsevalue="" checked="true" label="Do not use any input strings with in-frame stop codons" />
-
-        <conditional name="stop_codon_opts">
-            <param name="stop_codon_opts_selector" type="select" label="Specify start codons as">
-              <option value="gb" selected="True">Genbank translation table entry</option>
-              <option value="free_form">Comma-separated list</option>
-            </param>
-            <when value="gb">
-                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
-                    <option value="1" select="True">1. Standard</option>
-                    <option value="2">2. Vertebrate Mitochondrial</option>
-                    <option value="3">3. Yeast Mitochondrial</option>
-                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
-                    <option value="5">5. Invertebrate Mitochondrial</option>
-                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
-                    <option value="9">9. Echinoderm Mitochondrial</option>
-                    <option value="10">10. Euplotid Nuclear</option>
-                    <option value="11">11. Bacteria and Archaea</option>
-                    <option value="12">12. Alternative Yeast Nuclear</option>
-                    <option value="13">13. Ascidian Mitochondrial</option>
-                    <option value="14">14. Flatworm Mitochondrial</option>
-                    <option value="15">15. Blepharisma Macronuclear</option>
-                    <option value="16">16. Chlorophycean Mitochondrial</option>
-                    <option value="21">21. Trematode Mitochondrial</option>
-                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
-                    <option value="23">23. Thraustochytrium Mitochondrial</option>
-                    <option value="24">24. Pterobranchia mitochondrial</option>
-                </param>
-            </when>
-            <when value="free_form">
-                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
-            </when>
-        </conditional>
-
-    </inputs>
-    <outputs>
-        <data format="data" name="outfile" />
-    </outputs>
-    <tests>
-        <test>
-            <param name="infile" value='glimmer3/seqTest.fa'/>
-            <output name="outfile" file='glimmer3/buildICMTestOutput.dat'/>
-        </test>
-    </tests>
-
-    <help>
-
-**What it does**
-
-	This program constructs an interpolated context model (ICM) from an input set of sequences.
-	This model can be used by Glimmer3 to predict genes.
-
------
-
-
-**Example**
-
-* input::
-
-	-Genome Sequence
-
-	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
-	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
-	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
-	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
-	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
-	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
-	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
-	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
-	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
-	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
-	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
-	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
-	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
-	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
-	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
-	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
-	.....
-
-* output:
-	interpolated context model (ICM)
-
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-    </help>
-</tool>
--- a/glimmer3-main-wrapper.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,232 +0,0 @@
-<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1">
-    <description>Predict ORFs in prokaryotic genomes (knowlegde-based)</description>
-    <requirements>
-        <requirement type="package" version="3.02b">glimmer</requirement>
-        <requirement type="package" version="1.61">biopython</requirement>
-        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
-    </requirements>
-    <command>
-    #import tempfile, os
-    #set $temp = tempfile.NamedTemporaryFile( delete=False )
-    # $temp.close()
-
-    glimmer3
-        --max_olap $max_olap
-        --gene_len $gene_len
-        --threshold $threshold
-        #if float( str($gc_percent) ) > 0.0:
-            --gc_percent $gc_percent
-        #end if
-
-        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
-            --trans_table "${stop_codon_opts.genbank_gencode}"
-        #else:
-            --stop_codons "${stop_codon_opts.stop_codons}"
-        #end if
-
-        $linear
-        $no_indep
-        $extend
-        $seq_input
-        $icm_input
-        $temp 2>&#38;1;
-
-    ## convert prediction to FASTA sequences
-    \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py $temp".predict" $seq_input $genes_output
-
-    #if $report:
-        mv $temp".predict" $prediction;
-    #else:
-        rm $temp".predict";
-    #end if
-
-    #if $detailed_report:
-        mv $temp".detail"  $detailed;
-    #else:
-        rm $temp".detail";
-    #end if
-
-    rm $temp
-    </command>
-    <inputs>
-        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
-        <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
-
-        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
-        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" hrlp="This does not include the bases in the stop codon."/>
-        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
-        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
-
-        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
-        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
-        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
-        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
-
-        <conditional name="stop_codon_opts">
-            <param name="stop_codon_opts_selector" type="select" label="Specify start codons as">
-              <option value="gb" selected="True">Genbank translation table entry</option>
-              <option value="free_form">Comma-separated list</option>
-            </param>
-            <when value="gb">
-                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
-                    <option value="1" select="True">1. Standard</option>
-                    <option value="2">2. Vertebrate Mitochondrial</option>
-                    <option value="3">3. Yeast Mitochondrial</option>
-                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
-                    <option value="5">5. Invertebrate Mitochondrial</option>
-                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
-                    <option value="9">9. Echinoderm Mitochondrial</option>
-                    <option value="10">10. Euplotid Nuclear</option>
-                    <option value="11">11. Bacteria and Archaea</option>
-                    <option value="12">12. Alternative Yeast Nuclear</option>
-                    <option value="13">13. Ascidian Mitochondrial</option>
-                    <option value="14">14. Flatworm Mitochondrial</option>
-                    <option value="15">15. Blepharisma Macronuclear</option>
-                    <option value="16">16. Chlorophycean Mitochondrial</option>
-                    <option value="21">21. Trematode Mitochondrial</option>
-                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
-                    <option value="23">23. Thraustochytrium Mitochondrial</option>
-                    <option value="24">24. Pterobranchia mitochondrial</option>
-                </param>
-            </when>
-            <when value="free_form">
-                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
-            </when>
-        </conditional>
-
-        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
-        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
-    </inputs>
-    <outputs>
-        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
-        <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)">
-            <filter>report == True</filter>
-        </data>
-        <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)">
-            <filter>detailed_report == True</filter>
-        </data>
-    </outputs>
-    <tests>
-        <test>
-            <param name="seqInput" value='glimmer3/seqTest.fa' />
-            <param name="icmInput" value='glimmer3/icmTest.icm' />
-            <param name="overlaplen" value="50" />
-            <param name="genlen"  value="90" />
-            <param name="thresh"  value="30" />
-            <param name="linear" value="-l" />
-            <output name="output1" file='glimmer3/output1Test.dat' />
-            <output name="output2" file='glimmer3/output2Test.dat' />
-        </test>
-    </tests>
-    <help>
-
-
-**What it does**
-
-    This is the main program that makes gene preditions based on an interpolated context model (ICM).
-    The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
-
------
-
-**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
-
------
-
-**Glimmer Overview**
-
-::
-
-**************		**************		**************		**************		
-*            *		*	     *		*            *		*            *
-* long-orfs  *  ===>	*   Extract  *	===>	* build-icm  *  ===>	*  glimmer3  *	
-*            *		*	     *		*	     *  	*	     *	
-**************		**************		**************		**************
-
-**Example**
-
-* input::
-	
-	-Genome Sequence
-
-	CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
-	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
-	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
-	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
-	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
-	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
-	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
-	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
-	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
-	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
-	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
-	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
-	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
-	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
-	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
-	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
-	.....
-
-
-	- interpolated context model (ICM) 	92: glimmer3-build-icm on data 89
-	- maximum overlap length		50
-	- minimum gene length. 			90
-	- threshold score			30
-	- linear	 			True
-
-* output:: 
-
-	.predict file
-	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
-	orf00001    40137       52  +2     8.68
-	orf00004      603       34  -1     2.91
-	orf00006     1289     1095  -3     3.16
-	orf00007     1555     1391  -2     2.33
-	orf00008     1809     1576  -1     1.02
-	orf00010     1953     2066  +3     3.09
-	orf00011     2182     2304  +1     0.89
-	orf00013     2390     2521  +2     0.60
-	orf00018     2570     3073  +2     2.54
-	orf00020     3196     3747  +1     2.91
-	orf00022     3758     4000  +2     0.83
-	orf00023     4399     4157  -2     1.31
-	orf00025     4463     4759  +2     2.92
-	orf00026     4878     5111  +3     0.78
-	orf00027     5468     5166  -3     1.64
-	orf00029     5590     5832  +1     0.29
-	orf00032     6023     6226  +2     6.02
-	orf00033     6217     6336  +1     3.09
-	........
-	
-
-	.details file
-	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
-	Sequence length = 40222
-
-		   ----- Start -----           --- Length ----  ------------- Scores -------------
-	 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
-	0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
-	0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
-		+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
-		+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
-	0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
-	0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
-		+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
-		+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
-		+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
-	0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
-	0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
-	0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
-	0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
-	0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
-	.....
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-    </help>
-
-</tool>
--- a/glimmer_acgt_content.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-<tool id="glimmer_acgt-content" name="ACGT Content" version="0.1">
-    <description>of windows in each sequence</description>
-    <requirements>
-        <requirement type="package" version="3.02b">glimmer</requirement>
-    </requirements>
-    <command>
-        window-acgt
-            $percentage
-            $input_win_len
-            $input_win_skip
-            &lt; $infile > $output
-            
-            ##TODO prettify the output
-    </command>
-    <inputs>
-        <param name="infile" type="data" format="fasta" label="Genome Sequence"/>
-        <param name="input_win_len" type="integer" value="10" label="The width of the sliding window"/>
-        <param name="input_win_skip" type="integer" value="10" label="The number of positions between windows to report"/>
-        <param name="percentage" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Report percentages instead of counts"/>
-    </inputs>
-    <outputs>
-        <data name="output" format="tabular"/>
-    </outputs>
-    <tests>
-        <test>
-            <param name="infile" value="streptomyces_coelicolor.dna" />
-            <output name="output" file="fasta_tool_convert_from_dna.out" />
-        </test>
-    </tests>
-    <help>
-
-**What it does**
-
-This tool calculates the ACGT-Content from a given Sequence, given a sliding window.
-
--------
-
-**Output**
-
-Output is in the format:
-
-	window-start	window-len	A's	C's	G's	T's	#other	%GC
-
-Note the last window in the sequence can be shorter than *window-len* if the sequence ends prematurely
-
-
-
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-    </help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_build-icm.xml	Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,119 @@
+<tool id="glimmer_build-icm" name="ICM builder" version="0.1">
+    <description>(glimmer)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+    </requirements>
+    <command>
+        build-icm
+            --depth $depth
+            #if $no_stops:
+                --no_stops
+            #end if
+            --period $period
+            --width $width
+
+            #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+                --trans_table "${stop_codon_opts.genbank_gencode}"
+            #else:
+                --stop_codons "${stop_codon_opts.stop_codons}"
+            #end if
+
+            $outfile &lt; $infile 2>&#38;1;
+    </command>
+    <inputs>
+        <param name="infile" type="data" format="fasta" label="Training Dataset" help="A set of known genes in FASTA format." />
+        <param name="depth" type="integer" value="7" label="Set the depth of the ICM" help="The depth is the maximum number of positions in the context window that will be used to determine the probability of the predicted position." />
+        <param name="period" type="integer" value="3" label="Set the period of the ICM" help="The period is the number of different submodels for different positions in the text in a cyclic pattern. E.g., if the period is 3, the first submodel will determine positions 1, 4, 7, ..." />
+        <param name="width" type="integer" value="12" label="Set the width of the ICM" help="The width includes the predicted position." />
+        <param name="no_stops" type="boolean" truevalue="--no_stops" falsevalue="" checked="false" label="Do not use any input strings with in-frame stop codons" />
+
+        <conditional name="stop_codon_opts"> <!-- stop codons come either from a GenBank translation table or from an explicit list -->
+            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
+              <option value="gb" selected="True">Genbank translation table entry</option>
+              <option value="free_form">Comma-separated list</option>
+            </param>
+            <when value="gb">
+                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
+                    <option value="1" selected="True">1. Standard</option>
+                    <option value="2">2. Vertebrate Mitochondrial</option>
+                    <option value="3">3. Yeast Mitochondrial</option>
+                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+                    <option value="5">5. Invertebrate Mitochondrial</option>
+                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+                    <option value="9">9. Echinoderm Mitochondrial</option>
+                    <option value="10">10. Euplotid Nuclear</option>
+                    <option value="11">11. Bacteria and Archaea</option>
+                    <option value="12">12. Alternative Yeast Nuclear</option>
+                    <option value="13">13. Ascidian Mitochondrial</option>
+                    <option value="14">14. Flatworm Mitochondrial</option>
+                    <option value="15">15. Blepharisma Macronuclear</option>
+                    <option value="16">16. Chlorophycean Mitochondrial</option>
+                    <option value="21">21. Trematode Mitochondrial</option>
+                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+                    <option value="23">23. Thraustochytrium Mitochondrial</option>
+                    <option value="24">24. Pterobranchia mitochondrial</option>
+                </param>
+            </when>
+            <when value="free_form">
+                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data format="data" name="outfile" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value='glimmer3/seqTest.fa'/>
+            <output name="outfile" file='glimmer3/buildICMTestOutput.dat'/>
+        </test>
+    </tests>
+
+    <help>
+
+**What it does**
+
+	This program constructs an interpolated context model (ICM) from an input set of sequences.
+	This model can be used by Glimmer3 to predict genes.
+
+-----
+
+
+**Example**
+
+* input::
+
+	-Genome Sequence
+
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+	.....
+
+* output:
+	interpolated context model (ICM)
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+</tool>
--- a/glimmer_orf_to_seq.py	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-"""
-Input: DNA FASTA file + Glimmer ORF file
-Output: ORF sequences as FASTA file
-Author: Bjoern Gruening
-"""
-import sys, os
-import Bio.SeqIO
-from Bio.SeqRecord import SeqRecord
-
-def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ):
-    if len(sys.argv) >= 4:
-        glimmerfile = open( glimmer_prediction, "r")
-        sequence = open( genome_sequence )
-    else:
-        print "Missing input values."
-        sys.exit()
-
-    fastafile = Bio.SeqIO.parse(sequence, "fasta")
-
-    sequences = dict()
-    seq_records = list()
-    for entry in fastafile:
-        sequences[entry.description] = entry
-
-    for line in glimmerfile:
-        if line.startswith('>'):
-            entry = sequences[ line[1:].strip() ]
-        else:
-            orf_start = int(line[8:17])
-            orf_end = int(line[18:26])
-
-            orf_name = line[0:8]
-            if orf_start <= orf_end:
-                seq_records.add( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) )
-            else:
-                seq_records.add( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) )
-
-    SeqIO.write( seq_records, outfile, "fasta" )
-    glimmerfile.close()
-    sequence.close()
-
-if __name__ == "__main__" :
-    glimmer2seq()
--- a/glimmer_orf_to_seq.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-<tool id="glimmer_orf-to-sequence" name="ORF to Sequence" version="0.1">
-    <description>assigns ORF to its DNA sequence</description>
-    <requirements>
-        <requirement type="package" version="1.61">biopython</requirement>
-    </requirements>
-    <command interpreter="python">
-        glimmer_orf_to_seq.py
-            $glimmer_orfs
-            $input_fasta
-            $output
-    </command>
-    <inputs>
-        <param name="input_fasta" type="data" format="fasta" label="Genome Sequence"/>
-        <param name="glimmer_orfs" type="data" format="tabular" label="Define Glimmer-ORFs"/>
-    </inputs>
-    <outputs>
-        <data name="output" type="data" format="fasta"/>
-    </outputs>
-    <tests>
-        <test>
-        </test>
-    </tests>
-    <help>
-
-**What it does**
-
-This tool extract all gene sequences from a genome, which are predicted with Glimmer3.
-
-    </help>
-</tool>
--- a/glimmer_predict.py	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-#!/usr/bin/env python
-"""
-Input: DNA Fasta File
-Output: Tabular
-Return Tabular File with predicted ORF's
-Bjoern Gruening
-"""
-import sys, os
-import tempfile
-import subprocess
-import shutil
-from glimmer_orf_to_seq import glimmer2seq
-
-def main():
-    genome_seq_file = sys.argv[1]
-    outfile_classic_glimmer = sys.argv[2]
-    outfile_ext_path = sys.argv[3]
-    oufile_genes = sys.argv[8]
-
-    tag = 'glimmer_non_knowlegde_based_prediction'
-    tempdir = tempfile.gettempdir()
-
-    trainingset = os.path.join( tempdir, tag + ".train" )
-    icm = os.path.join( tempdir, tag + ".icm" )
-
-    longorfs = tempfile.NamedTemporaryFile()
-    trainingset = tempfile.NamedTemporaryFile()
-    icm = tempfile.NamedTemporaryFile()
-
-    #glimmeropts = "-o0 -g110 -t30 -l"
-    glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6])
-    if sys.argv[7] == "true":
-        glimmeropts += " -l"
-
-    """
-        1. Find long, non-overlapping orfs to use as a training set
-    """
-    subprocess.Popen(["long-orfs", "-n", "-t", "1.15",
-        genome_seq_file, "-"], stdout = longorfs,
-        stderr = subprocess.PIPE).communicate()
-
-    """
-        2. Extract the training sequences from the genome file
-    """
-    subprocess.Popen(["extract", "-t",
-        genome_seq_file, longorfs.name], stdout=trainingset,
-        stderr=subprocess.PIPE).communicate()
-
-    """
-        3. Build the icm from the training sequences
-    """
-
-    # the "-" parameter is used to redirect the output to stdout
-    subprocess.Popen(["build-icm", "-r", "-"], 
-        stdin=open(trainingset.name), stdout = icm, 
-        stderr=subprocess.PIPE).communicate()
-
-    """
-        Run Glimmer3
-    """
-    b = subprocess.Popen(["glimmer3", glimmeropts, 
-        genome_seq_file, icm.name, os.path.join(tempdir, tag)], 
-        stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate()
-
-    shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer )
-    if outfile_ext_path.strip() != 'None':
-        shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path )
-
-    glimmer2seq( outfile_classic_glimmer, genome_seq_file, oufile_genes )
-
-
-if __name__ == "__main__" :
-    main()
--- a/glimmer_predict.xml	Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-<tool id="glimmer_not-knowlegde-based" name="Glimmer3" version="0.1">
-    <description>Predict ORFs in prokaryotic genomes (not knowlegde-based)</description>
-    <requirements>
-        <requirement type="package" version="3.02b">glimmer</requirement>
-        <requirement type="package" version="1.61">biopython</requirement>
-    </requirements>
-    <command interpreter="python">
-        glimmer_predict.py 
-            $input
-            $prediction
-            #if $detailed_report:
-                $output_ext
-            #else:
-                "None"
-            #end if
-            $overlap
-            $gene_length
-            $threshold
-            $linear
-            $genes_output
-    </command>
-    <inputs>
-        <param name="input" type="data" format="fasta" label="Genome sequence" />
-        <param name="overlap" type="integer" value="0" label="Set maximum overlap length. Overlaps this short or shorter are ignored." />
-        <param name="gene_length" type="integer" value="110" label="Set minimum gene length." />
-        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene. If the in-frame score >= N, then the region is given a number and considered a potential gene." />
-        <param name="linear" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
-
-        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file" />
-        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output" />
-    </inputs>
-    <outputs>
-        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
-        <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)">
-            <filter>report == True</filter>
-        </data>
-        <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)">
-            <filter>detailed_report == True</filter>
-        </data>
-    </outputs>
-    <tests>
-        <test>
-            <param name="input" value="streptomyces_coelicolor.dna" />
-            <output name="output" file="fasta_tool_convert_from_dna.out" />
-        </test>
-    </tests>
-    <help>
-
-**What it does**
-
-This tool predicts open reading frames (orfs) from a given DNA Sequence. That tool is not knowlegde-based.
-
-The recommended way is to use a trained Glimmer3 with ICM model. Use the knowlegde-based version for that and insert/generate a training set.
-
------
-
-**Example**
-
-Suppose you have the following DNA formatted sequences::
-
-    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
-    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
-    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
-    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
-    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
-    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
-
-Running this tool will produce this::
-
-    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
-    orf00001      577      699  +1     5.24
-    orf00003      800     1123  +2     5.18
-    orf00004     1144     3813  +1    10.62
-    orf00006     3857     6220  +2     6.07
-    orf00007     6226     7173  +1     1.69
-    orf00008     7187     9307  +2     8.95
-    orf00009     9424    10410  +1     8.29
-    orf00010    10515    11363  +3     7.00
-    orf00011    11812    11964  +1     2.80
-    orf00012    12360    13457  +3     4.80
-    orf00013    14379    14044  -1     7.41
-    orf00015    15029    14739  -3    12.43
-    orf00016    15066    15227  +3     1.91
-    orf00020    16061    15351  -3     2.83
-    orf00021    17513    17391  -3     2.20
-    orf00023    17529    17675  +3     0.11
-
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-    </help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_w_icm.xml	Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,232 @@
+<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1">
+    <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+        <requirement type="package" version="1.61">biopython</requirement>
+        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
+    </requirements>
+    <command>
+    #import tempfile, os
+    #set $temp = tempfile.NamedTemporaryFile( delete=False )
+    # $temp.close()
+
+    glimmer3
+        --max_olap $max_olap
+        --gene_len $gene_len
+        --threshold $threshold
+        #if float( str($gc_percent) ) > 0.0:
+            --gc_percent $gc_percent
+        #end if
+
+        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+            --trans_table "${stop_codon_opts.genbank_gencode}"
+        #else:
+            --stop_codons "${stop_codon_opts.stop_codons}"
+        #end if
+
+        $linear
+        $no_indep
+        $extend
+        $seq_input
+        $icm_input
+        $temp 2>&#38;1;
+
+    ## convert prediction to FASTA sequences
+    \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output
+
+    #if $report:
+        mv $temp".predict" $prediction;
+    #else:
+        rm $temp".predict";
+    #end if
+
+    #if $detailed_report:
+        mv $temp".detail" $detailed;
+    #else:
+        rm $temp".detail";
+    #end if
+
+    rm $temp
+    </command>
+    <inputs>
+        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
+        <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
+
+        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
+        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
+        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
+        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
+
+        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
+        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
+        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
+        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
+
+        <conditional name="stop_codon_opts">
+            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
+              <option value="gb" selected="True">Genbank translation table entry</option>
+              <option value="free_form">Comma-separated list</option>
+            </param>
+            <when value="gb">
+                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
+                    <option value="1" selected="True">1. Standard</option>
+                    <option value="2">2. Vertebrate Mitochondrial</option>
+                    <option value="3">3. Yeast Mitochondrial</option>
+                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+                    <option value="5">5. Invertebrate Mitochondrial</option>
+                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+                    <option value="9">9. Echinoderm Mitochondrial</option>
+                    <option value="10">10. Euplotid Nuclear</option>
+                    <option value="11">11. Bacteria and Archaea</option>
+                    <option value="12">12. Alternative Yeast Nuclear</option>
+                    <option value="13">13. Ascidian Mitochondrial</option>
+                    <option value="14">14. Flatworm Mitochondrial</option>
+                    <option value="15">15. Blepharisma Macronuclear</option>
+                    <option value="16">16. Chlorophycean Mitochondrial</option>
+                    <option value="21">21. Trematode Mitochondrial</option>
+                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+                    <option value="23">23. Thraustochytrium Mitochondrial</option>
+                    <option value="24">24. Pterobranchia mitochondrial</option>
+                </param>
+            </when>
+            <when value="free_form">
+                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
+            </when>
+        </conditional>
+
+        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
+        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
+    </inputs>
+    <outputs>
+        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
+        <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
+            <filter>report == True</filter>
+        </data>
+        <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
+            <filter>detailed_report == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="seqInput" value='glimmer3/seqTest.fa' />
+            <param name="icmInput" value='glimmer3/icmTest.icm' />
+            <param name="overlaplen" value="50" />
+            <param name="genlen"  value="90" />
+            <param name="thresh"  value="30" />
+            <param name="linear" value="-l" />
+            <output name="output1" file='glimmer3/output1Test.dat' />
+            <output name="output2" file='glimmer3/output2Test.dat' />
+        </test>
+    </tests>
+    <help>
+
+
+**What it does**
+
+    This is the main program that makes gene predictions based on an interpolated context model (ICM).
+    The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
+
+-----
+
+**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
+
+-----
+
+**Glimmer Overview**
+
+::
+
+    **************      **************      **************      **************
+    *            *      *            *      *            *      *            *
+    * long-orfs  * ===> *   Extract  * ===> * build-icm  * ===> *  glimmer3  *
+    *            *      *            *      *            *      *            *
+    **************      **************      **************      **************
+
+**Example**
+
+* input::
+	
+	-Genome Sequence
+
+	CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+	GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+	GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+	TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+	TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+	GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+	ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+	AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+	CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+	TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+	AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+	GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+	AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+	CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+	AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+	GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+	.....
+
+
+	- interpolated context model (ICM) 	92: glimmer3-build-icm on data 89
+	- maximum overlap length		50
+	- minimum gene length. 			90
+	- threshold score			30
+	- linear	 			True
+
+* output:: 
+
+	.predict file
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+	orf00001    40137       52  +2     8.68
+	orf00004      603       34  -1     2.91
+	orf00006     1289     1095  -3     3.16
+	orf00007     1555     1391  -2     2.33
+	orf00008     1809     1576  -1     1.02
+	orf00010     1953     2066  +3     3.09
+	orf00011     2182     2304  +1     0.89
+	orf00013     2390     2521  +2     0.60
+	orf00018     2570     3073  +2     2.54
+	orf00020     3196     3747  +1     2.91
+	orf00022     3758     4000  +2     0.83
+	orf00023     4399     4157  -2     1.31
+	orf00025     4463     4759  +2     2.92
+	orf00026     4878     5111  +3     0.78
+	orf00027     5468     5166  -3     1.64
+	orf00029     5590     5832  +1     0.29
+	orf00032     6023     6226  +2     6.02
+	orf00033     6217     6336  +1     3.09
+	........
+	
+
+	.detail file
+	>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+	Sequence length = 40222
+
+		   ----- Start -----           --- Length ----  ------------- Scores -------------
+	 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
+	0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
+	0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
+		+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
+		+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
+	0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
+	0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
+		+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
+		+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
+		+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
+	0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
+	0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
+	0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
+	0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
+	0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
+	.....
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_wo_icm.py	Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""
+Input: DNA Fasta File
+Output: Tabular
+Return Tabular File with predicted ORF's
+Bjoern Gruening
+"""
+import sys, os
+import tempfile
+import subprocess
+import shutil
+from glimmer2seq import glimmer2seq
+
+def main():
+    genome_seq_file = sys.argv[1]
+    outfile_classic_glimmer = sys.argv[2]
+    outfile_ext_path = sys.argv[3]
+    oufile_genes = sys.argv[8]
+
+    tag = 'glimmer_non_knowlegde_based_prediction'
+    tempdir = tempfile.gettempdir()
+
+    trainingset = os.path.join( tempdir, tag + ".train" )
+    icm = os.path.join( tempdir, tag + ".icm" )
+
+    longorfs = tempfile.NamedTemporaryFile()
+    trainingset = tempfile.NamedTemporaryFile()
+    icm = tempfile.NamedTemporaryFile()
+
+    #glimmeropts = "-o0 -g110 -t30 -l"
+    glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6])
+    if sys.argv[7] == "true":
+        glimmeropts += " -l"
+
+    """
+        1. Find long, non-overlapping orfs to use as a training set
+    """
+    subprocess.Popen(["long-orfs", "-n", "-t", "1.15",
+        genome_seq_file, "-"], stdout = longorfs,
+        stderr = subprocess.PIPE).communicate()
+
+    """
+        2. Extract the training sequences from the genome file
+    """
+    subprocess.Popen(["extract", "-t",
+        genome_seq_file, longorfs.name], stdout=trainingset,
+        stderr=subprocess.PIPE).communicate()
+
+    """
+        3. Build the icm from the training sequences
+    """
+
+    # the "-" parameter is used to redirect the output to stdout
+    subprocess.Popen(["build-icm", "-r", "-"], 
+        stdin=open(trainingset.name), stdout = icm, 
+        stderr=subprocess.PIPE).communicate()
+
+    """
+        Run Glimmer3
+    """
+    b = subprocess.Popen(["glimmer3", glimmeropts, 
+        genome_seq_file, icm.name, os.path.join(tempdir, tag)], 
+        stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+
+    if outfile_classic_glimmer.strip() != 'None':
+        shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer )
+    if outfile_ext_path.strip() != 'None':
+        shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path )
+
+    glimmer2seq( os.path.join( tempdir, tag + ".predict" ), genome_seq_file, oufile_genes )
+
+
+if __name__ == "__main__" :
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_wo_icm.xml	Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,100 @@
+<tool id="glimmer_not-knowlegde-based" name="Glimmer3" version="0.1">
+    <description>Predict ORFs in prokaryotic genomes (not knowledge-based)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+        <requirement type="package" version="1.61">biopython</requirement>
+    </requirements>
+    <command interpreter="python">
+        glimmer_wo_icm.py 
+            $input
+            #if $report:
+                $prediction
+            #else:
+                "None"
+            #end if
+            #if $detailed_report:
+                $detailed
+            #else:
+                "None"
+            #end if
+            $overlap
+            $gene_length
+            $threshold
+            $linear
+            $genes_output
+    </command>
+    <inputs>
+        <param name="input" type="data" format="fasta" label="Genome sequence" />
+        <param name="overlap" type="integer" value="0" label="Set maximum overlap length. Overlaps this short or shorter are ignored." />
+        <param name="gene_length" type="integer" value="110" label="Set minimum gene length." />
+        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene. If the in-frame score >= N, then the region is given a number and considered a potential gene." />
+        <param name="linear" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
+
+        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file" />
+        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output" />
+    </inputs>
+    <outputs>
+        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
+        <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
+            <filter>report == True</filter>
+        </data>
+        <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
+            <filter>detailed_report == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="streptomyces_coelicolor.dna" />
+            <output name="output" file="fasta_tool_convert_from_dna.out" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool predicts open reading frames (ORFs) from a given DNA sequence. This tool is not knowledge-based.
+
+The recommended way is to run Glimmer3 with a trained ICM model. Use the knowledge-based version for that and supply or generate a training set.
+
+-----
+
+**Example**
+
+Suppose you have the following DNA formatted sequences::
+
+    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+    cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+    ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+    cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+    cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+    ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+Running this tool will produce this::
+
+    >SQ   Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+    orf00001      577      699  +1     5.24
+    orf00003      800     1123  +2     5.18
+    orf00004     1144     3813  +1    10.62
+    orf00006     3857     6220  +2     6.07
+    orf00007     6226     7173  +1     1.69
+    orf00008     7187     9307  +2     8.95
+    orf00009     9424    10410  +1     8.29
+    orf00010    10515    11363  +3     7.00
+    orf00011    11812    11964  +1     2.80
+    orf00012    12360    13457  +3     4.80
+    orf00013    14379    14044  -1     7.41
+    orf00015    15029    14739  -3    12.43
+    orf00016    15066    15227  +3     1.91
+    orf00020    16061    15351  -3     2.83
+    orf00021    17513    17391  -3     2.20
+    orf00023    17529    17675  +3     0.11
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+    </help>
+</tool>
--- a/readme.rst	Fri Jun 07 07:51:49 2013 -0400
+++ b/readme.rst	Fri Jun 07 10:02:12 2013 -0400
@@ -29,15 +29,9 @@
 folder and modify the tools_conf.xml file to make the tool available to Galaxy.
 For example:
 
-<tool file="gene_prediction/tools/glimmer3/glimmer3-main-wrapper.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer_predict.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer_orf_to_seq.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer2gff.xml" />
-<tool file="gene_prediction/tools/glimmer3/gbktoorfWrapper.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer_acgt_content.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer3-build-icm-wrapper.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer3-extract-wrapper.xml" />
-<tool file="gene_prediction/tools/glimmer3/glimmer3-long-orfs-wrapper.xml" />
+<tool file="gene_prediction/tools/glimmer3/glimmer_w_icm.xml" />
+<tool file="gene_prediction/tools/glimmer3/glimmer_wo_icm.xml" />
+<tool file="gene_prediction/tools/glimmer3/glimmer_build-icm.xml" />
 
 
 History