comparison glimmer_w_icm.xml @ 4:8ddf54417ade

Uploaded
author bgruening
date Fri, 07 Jun 2013 10:02:12 -0400
parents
children a07c49839f31
comparison
equal deleted inserted replaced
2:2d0c26885604 4:8ddf54417ade
1 <tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1">
2 <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
3 <requirements> <!-- GLIMMER_SCRIPT_PATH is the directory the command section runs glimmer2seq.py from -->
4 <requirement type="package" version="3.02b">glimmer</requirement>
5 <requirement type="package" version="1.61">biopython</requirement>
6 <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
7 </requirements>
8 <command>
9 #import tempfile, os
10 #set $temp = tempfile.NamedTemporaryFile( delete=False )
11 #silent $temp.close()
12 #set $temp = $temp.name
13 glimmer3
14 --max_olap $max_olap
15 --gene_len $gene_len
16 --threshold $threshold
17 #if float( str($gc_percent) ) > 0.0:
18 --gc_percent $gc_percent
19 #end if
20 --start_codons "${start_codons}"
21 #if $stop_codon_opts.stop_codon_opts_selector == "gb":
22 --trans_table "${stop_codon_opts.genbank_gencode}"
23 #else:
24 --stop_codons "${stop_codon_opts.stop_codons}"
25 #end if
26 ## boolean flags render as their truevalue or as nothing; $temp (a file path) is the prefix glimmer3 uses for its .predict and .detail files
27 $linear
28 $no_indep
29 $extend
30 $seq_input
31 $icm_input
32 $temp 2>&#38;1;
33 ## each shell command ends in ';' because Galaxy joins the rendered lines into one shell line
34 ## convert prediction to FASTA sequences
35 \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output;
36 ## keep the .predict table only when the user asked for it
37 #if $report:
38 mv $temp".predict" $prediction;
39 #else:
40 rm $temp".predict";
41 #end if
42 ## keep the .detail report only when the user asked for it
43 #if $detailed_report:
44 mv $temp".detail" $detailed;
45 #else:
46 rm $temp".detail";
47 #end if
48 ## remove the empty placeholder file created by NamedTemporaryFile
49 rm $temp
50 </command>
51 <inputs>
52 <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
53 <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
54 <!-- Numeric options passed to glimmer3 on the command line -->
55 <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
56 <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
57 <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
58 <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
59 <!-- Switches: each one renders its truevalue on the command line when checked and nothing otherwise -->
60 <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
61 <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
62 <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
63 <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
64 <!-- Stop codons come either from a GenBank translation table or from a free-form list -->
65 <conditional name="stop_codon_opts">
66 <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
67 <option value="gb" selected="True">Genbank translation table entry</option>
68 <option value="free_form">Comma-separated list</option>
69 </param>
70 <when value="gb">
71 <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
72 <option value="1" selected="True">1. Standard</option>
73 <option value="2">2. Vertebrate Mitochondrial</option>
74 <option value="3">3. Yeast Mitochondrial</option>
75 <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
76 <option value="5">5. Invertebrate Mitochondrial</option>
77 <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
78 <option value="9">9. Echinoderm Mitochondrial</option>
79 <option value="10">10. Euplotid Nuclear</option>
80 <option value="11">11. Bacteria and Archaea</option>
81 <option value="12">12. Alternative Yeast Nuclear</option>
82 <option value="13">13. Ascidian Mitochondrial</option>
83 <option value="14">14. Flatworm Mitochondrial</option>
84 <option value="15">15. Blepharisma Macronuclear</option>
85 <option value="16">16. Chlorophycean Mitochondrial</option>
86 <option value="21">21. Trematode Mitochondrial</option>
87 <option value="22">22. Scenedesmus obliquus mitochondrial</option>
88 <option value="23">23. Thraustochytrium Mitochondrial</option>
89 <option value="24">24. Pterobranchia mitochondrial</option>
90 </param>
91 </when>
92 <when value="free_form">
93 <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
94 </when>
95 </conditional>
96 <!-- These two checkboxes add nothing to the glimmer3 call; they only decide whether the report files are kept as outputs -->
97 <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
98 <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
99 </inputs>
100 <outputs> <!-- genes_output has no filter; the two report datasets are filtered on their matching checkbox -->
101 <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
102 <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
103 <filter>report == True</filter>
104 </data>
105 <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
106 <filter>detailed_report == True</filter>
107 </data>
108 </outputs>
109 <tests>
110 <test> <!-- param names must match the names declared in the inputs section -->
111 <param name="seq_input" value='glimmer3/seqTest.fa' />
112 <param name="icm_input" value='glimmer3/icmTest.icm' />
113 <param name="max_olap" value="50" />
114 <param name="gene_len" value="90" />
115 <param name="threshold" value="30" />
116 <param name="linear" value="true" />
117 <output name="output1" file='glimmer3/output1Test.dat' /> <!-- NOTE(review): no output is named output1; map to genes_output, prediction or detailed once the fixture contents are confirmed -->
118 <output name="output2" file='glimmer3/output2Test.dat' /> <!-- NOTE(review): same for output2; the report outputs also need report / detailed_report enabled in this test -->
119 </test>
120 </tests>
121 <help>
122
123
124 **What it does**
125
126 This is the main program that makes gene predictions based on an interpolated context model (ICM).
127 The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
128
129 -----
130
131 **TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
132
133 -----
134
135 **Glimmer Overview**
136
137 ::
138
139 ************** ************** ************** **************
140 * * * * * * * *
141 * long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 *
142 * * * * * * * *
143 ************** ************** ************** **************
144
145 **Example**
146
147 * input::
148
149 -Genome Sequence
150
151 CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
152 GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
153 GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
154 TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
155 TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
156 GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
157 ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
158 AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
159 CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
160 TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
161 AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
162 GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
163 AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
164 CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
165 AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
166 GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
167 .....
168
169
170 - interpolated context model (ICM) 92: glimmer3-build-icm on data 89
171 - maximum overlap length 50
172 - minimum gene length 90
173 - threshold score 30
174 - linear True
175
176 * output::
177
178 .predict file
179 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
180 orf00001 40137 52 +2 8.68
181 orf00004 603 34 -1 2.91
182 orf00006 1289 1095 -3 3.16
183 orf00007 1555 1391 -2 2.33
184 orf00008 1809 1576 -1 1.02
185 orf00010 1953 2066 +3 3.09
186 orf00011 2182 2304 +1 0.89
187 orf00013 2390 2521 +2 0.60
188 orf00018 2570 3073 +2 2.54
189 orf00020 3196 3747 +1 2.91
190 orf00022 3758 4000 +2 0.83
191 orf00023 4399 4157 -2 1.31
192 orf00025 4463 4759 +2 2.92
193 orf00026 4878 5111 +3 0.78
194 orf00027 5468 5166 -3 1.64
195 orf00029 5590 5832 +1 0.29
196 orf00032 6023 6226 +2 6.02
197 orf00033 6217 6336 +1 3.09
198 ........
199
200
201 .detail file
202 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
203 Sequence length = 40222
204
205 ----- Start ----- --- Length ---- ------------- Scores -------------
206 ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC
207 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0
208 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0
209 +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41
210 +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5
211 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1
212 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0
213 +1 562 592 762 198 168 -2.54 1 1 - - - - - 98
214 +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11
215 +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3
216 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15
217 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0
218 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1
219 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5
220 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19
221 .....
222
223 -------
224
225 **References**
226
227 A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
228
229
230 </help>
231
232 </tool>