comparison macros.xml @ 0:6f312797c4c8 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hmmer3 commit 76281ba139c693f75a900a42c314e74d9649b0ef-dirty
author lecorguille
date Tue, 01 Nov 2016 17:13:10 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6f312797c4c8
1 <?xml version="1.0"?>
2 <macros>
3 <!-- Dependency macro: requires the hmmer package at version 3.1b2; a yield placeholder sits inside the requirements element. --> <xml name="requirements">
4 <requirements>
5 <requirement version="3.1b2" type="package">hmmer</requirement>
6 <yield/>
7 </requirements>
8 </xml>
9 <!-- Text substituted for the @WRAPPER_VERSION@ placeholder. --> <token name="@WRAPPER_VERSION@">0.1</token>
10 <!-- Error detection macro: exit-code ranges plus two text patterns. --> <xml name="stdio">
11 <stdio>
12 <!-- Any exit code other than zero (positive or negative) is an error -->
13 <exit_code range="1:"/>
14 <exit_code range=":-1"/>
15 <!-- In case the return code has not been set properly, check stderr too -->
16 <regex match="Error:"/>
17 <regex match="Exception:"/>
18 </stdio>
19 </xml>
20 <!-- Threshold flags for the command line. E and domE are always emitted; each optional cutoff is emitted whenever its field is non-empty. The test compares the string form with '' so that an explicit 0 is passed on instead of being dropped by a truthiness check. --> <token name="@THRESHOLDS@">
21 -E $E
22 --domE $domE
23
24 #if str($T) != '':
25 -T $T
26 #end if
27
28 #if str($domT) != '':
29 --domT $domT
30 #end if
31
32 #if str($incE) != '':
33 --incE $incE
34 #end if
35
36 #if str($incT) != '':
37 --incT $incT
38 #end if
39
40 #if str($incdomE) != '':
41 --incdomE $incdomE
42 #end if
43
44 #if str($incdomT) != '':
45 --incdomT $incdomT
46 #end if
47 </token>
48 <!-- Form fields for the threshold flags: E and domE default to 10.0, every other cutoff is optional. --> <xml name="thresholds_xml">
49 <!-- Reporting thresholds -->
50 <param name="E" type="float" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
51 <param name="domE" type="float" value="10.0" min="0" label="report domains &lt;= this E-Value threshold in output" help="(--domE)"/>
52 <param name="T" type="float" optional="True" label="report sequences &gt;= this score threshold in output" help="(-T)"/>
53 <param name="domT" type="float" optional="True" label="report domains &gt;= this score threshold in output" help="(--domT)"/>
54 <!-- Inclusion (significance) thresholds -->
55 <param name="incE" type="float" optional="True" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
56 <param name="incdomE" type="float" optional="True" label="consider domains &lt;= this E-Value threshold as significant" help="(--incdomE)"/>
57 <param name="incT" type="float" optional="True" label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
58 <param name="incdomT" type="float" optional="True" label="consider domains &gt;= this score threshold as significant" help="(--incdomT)"/>
59 </xml>
60 <!-- Per-sequence threshold flags only (no per-domain cutoffs). E is always emitted; each optional cutoff is emitted whenever its field is non-empty. The test compares the string form with '' so that an explicit 0 is passed on instead of being dropped by a truthiness check. --> <token name="@THRESHOLDS_NODOM@">
61 -E $E
62
63 #if str($T) != '':
64 -T $T
65 #end if
66
67 #if str($incE) != '':
68 --incE $incE
69 #end if
70
71 #if str($incT) != '':
72 --incT $incT
73 #end if
74 </token>
75 <!-- Form fields for the per-sequence threshold flags: E defaults to 10.0, the other cutoffs are optional. --> <xml name="thresholds_nodom">
76 <!-- Reporting thresholds -->
77 <param name="E" type="float" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
78 <param name="T" type="float" optional="True" label="report sequences &gt;= this score threshold in output" help="(-T)"/>
79 <!-- Inclusion (significance) thresholds -->
80 <param name="incE" type="float" optional="True" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
81 <param name="incT" type="float" optional="True" label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
82 </xml>
83 <!-- Command fragment: the max and nobias switches (each expands to its flag or to nothing) around the three stage thresholds, which are always emitted. --> <token name="@ACCEL_HEUR@">
84 $max
85 --F1 $F1
86 --F2 $F2
87 --F3 $F3
88 $nobias
89
90 </token>
91 <xml name="accel_heur_xml">
92 <!-- Acceleration heuristics: two switches plus the F1/F2/F3 stage thresholds -->
93 <param name="max" type="boolean" truevalue="--max" falsevalue="" label="Turn all heuristic filters off (less speed, more power)" help="(--max)"/>
94 <param name="F1" type="float" value="0.02" label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1" help="(--F1)"/>
95 <param name="F2" type="float" value="1e-3" label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2" help="(--F2)"/>
96 <param name="F3" type="float" value="1e-5" label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3" help="(--F3)"/>
97 <param name="nobias" type="boolean" truevalue="--nobias" falsevalue="" label="Turn off composition bias filter" help="(--nobias)"/>
98 </xml>
99 <!-- Command fragment: all seven E-value calibration settings are always emitted. --> <token name="@EVAL_CALIB@">
100 --EmL $EmL
101 --EmN $EmN
102 --EvL $EvL
103 --EvN $EvN
104 --EfL $EfL
105 --EfN $EfN
106 --Eft $Eft
107 </token>
108 <xml name="eval_calib_xml">
109 <!-- E-value calibration: six integer settings (minimum 1) and one tail mass between 0 and 1 -->
110 <param name="EmL" type="integer" value="200" min="1" label="Length of sequences for MSV Gumbel mu fit" help="(--EmL)"/>
111 <param name="EmN" type="integer" value="200" min="1" label="Number of sequences for MSV Gumbel mu fit" help="(--EmN)"/>
112 <param name="EvL" type="integer" value="200" min="1" label="Length of sequences for Viterbi Gumbel mu fit" help="(--EvL)"/>
113 <param name="EvN" type="integer" value="200" min="1" label="Number of sequences for Viterbi Gumbel mu fit" help="(--EvN)"/>
114 <param name="EfL" type="integer" value="100" min="1" label="Length of sequences for Forward exp tail tau fit" help="(--EfL)"/>
115 <param name="EfN" type="integer" value="200" min="1" label="Number of sequences for Forward exp tail tau fit" help="(--EfN)"/>
116 <param name="Eft" type="float" value="0.04" min="0" max="1" label="tail mass for Forward exponential tail tau fit" help="(--Eft)"/>
117 </xml>
118 <!-- Output flags for the two-table variant. The selection is split on commas and matched exactly, because a plain substring test for 'tblout' is also true when only 'domtblout' is ticked. --> <token name="@OFORMAT_WITH_OPTS_NOPFAM@">
119 #if 'tblout' in str($oformat).split(','):
120 --tblout $tblout
121 #end if
122
123 #if 'domtblout' in str($oformat).split(','):
124 --domtblout $domtblout
125 #end if
126
127 $acc $noali $notextw
128 </token>
129 <!-- Output controls for the two-table variant: a checkbox list of tables (both ticked by default) and three display switches. --> <xml name="oformat_with_opts_nopfam">
130 <!-- Which tables to write -->
131 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
132 <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
133 <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
134 </param>
135 <param name="acc" type="boolean" truevalue="--acc" falsevalue="" label="Prefer accessions over names in output" help="(--acc)"/>
136 <param name="noali" type="boolean" truevalue="--noali" falsevalue="" label="Don't output alignments, so output is smaller" help="(--noali)"/>
137 <param name="notextw" type="boolean" truevalue="--notextw" falsevalue="" label="Unlimited ASCII text output line width" help="(--notextw)"/>
138 </xml>
139 <!-- Output flags for the three-table variant. The selection is split on commas and matched exactly, because a plain substring test for 'tblout' is also true for 'domtblout' and 'pfamtblout'. --> <token name="@OFORMAT_WITH_OPTS@">
140 #if 'tblout' in str($oformat).split(','):
141 --tblout $tblout
142 #end if
143
144 #if 'domtblout' in str($oformat).split(','):
145 --domtblout $domtblout
146 #end if
147
148 #if 'pfamtblout' in str($oformat).split(','):
149 --pfamtblout $pfamtblout
150 #end if
151
152 $acc $noali $notextw
153 </token>
154 <!-- Output controls for the three-table variant: a checkbox list of tables (all ticked by default) and three display switches. --> <xml name="oformat_with_opts">
155 <!-- Which tables to write -->
156 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
157 <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
158 <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
159 <option value="pfamtblout" selected="true">Table of hits and domains in Pfam format (--pfamtblout)</option>
160 </param>
161 <param name="acc" type="boolean" truevalue="--acc" falsevalue="" label="Prefer accessions over names in output" help="(--acc)"/>
162 <param name="noali" type="boolean" truevalue="--noali" falsevalue="" label="Don't output alignments, so output is smaller" help="(--noali)"/>
163 <param name="notextw" type="boolean" truevalue="--notextw" falsevalue="" label="Unlimited ASCII text output line width" help="(--notextw)"/>
164 </xml>
165 <!-- Test-side counterpart of the output controls: sets notextw to True. --> <xml name="oformat_test">
166 <param name="notextw" value="True" />
167 </xml>
168 <!-- Output flags for the nucleotide variant. The selection is split on commas and matched exactly, so 'tblout' no longer matches 'dfamtblout' (or 'pfamtblout') as a substring. -->
169 <token name="@OFORMAT_WITH_OPTS_N@">
170 #if 'tblout' in str($oformat).split(','):
171 --tblout $tblout
172 #end if
173
174 #if 'dfamtblout' in str($oformat).split(','):
175 --dfamtblout $dfamtblout
176 #end if
177
178 #if 'aliscoresout' in str($oformat).split(','):
179 --aliscoresout $aliscoresout
180 #end if
181
182 $acc $noali $notextw
183 </token>
184 <!-- Output controls for the nucleotide variant: a checkbox list of tables (the first two ticked by default) and three display switches. --> <xml name="oformat_with_opts_n">
185 <!-- Which tables to write -->
186 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
187 <option value="tblout" selected="true">Table of hits (--tblout)</option>
188 <option value="dfamtblout" selected="true">Table of hits in Dfam format (--dfamtblout)</option>
189 <option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
190 </param>
191 <param name="acc" type="boolean" truevalue="--acc" falsevalue="" label="Prefer accessions over names in output" help="(--acc)"/>
192 <param name="noali" type="boolean" truevalue="--noali" falsevalue="" label="Don't output alignments, so output is smaller" help="(--noali)"/>
193 <param name="notextw" type="boolean" truevalue="--notextw" falsevalue="" label="Unlimited ASCII text output line width" help="(--notextw)"/>
194 </xml>
195 <!-- Command fragment: the two gap probabilities are emitted only when the "singlemx" choice is selected. --> <token name="@HSSI@">
196 #if $hssi.hssi_select == "singlemx":
197 --popen $hssi.popen
198 --pextend $hssi.pextend
199 #end if
200 </token>
201 <xml name="hssi">
202 <!-- Single-sequence input handling: disabled by default; the gap probabilities appear only under "singlemx" -->
203 <conditional name="hssi">
204 <param name="hssi_select" type="select" label="Options for handling single sequence inputs">
205 <option value="false" selected="true">Disable</option>
206 <option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
207 </param>
208 <when value="singlemx">
209 <param name="popen" type="float" value="0.02" min="0.0" max="0.5" label="Gap open probability" help="(--popen)"/>
210 <param name="pextend" type="float" value="0.4" min="0.0" max="1.0" label="Gap extend probability" help="(--pextend)"/>
211 </when>
212 <when value="false">
213 </when>
214 <!-- Not exposed: -mx <s> : substitution score matrix (built-in matrices, with -singlemx)-->
215 <!-- Not exposed: -mxfile <f> : read substitution score matrix from file <f> (with -singlemx)-->
216 </conditional>
217 </xml>
218 <!-- Thread count: the GALAXY_SLOTS environment variable with a shell default of 2; the dollar sign is backslash-escaped in the template. --> <token name="@CPU@">
219 --cpu \${GALAXY_SLOTS:-2}
220 </token>
221 <!-- Command fragment passing the seed field through. --> <token name="@SEED@">
222 --seed $seed
223 </token>
224 <!-- Form field for the seed: a non-negative integer defaulting to 42. --> <xml name="seed">
225 <param name="seed" type="integer" value="42" min="0" label="RNG seed, 0 generates a random seed" help="(--seed)"/>
226 </xml>
227 <!-- Test-side counterpart: sets the seed to 4. --> <xml name="seed_test">
228 <param name="seed" value="4" />
229 </xml>
230 <!-- Command fragment: the nonull2 switch, then Z and domZ only when their fields evaluate as true in the template. --> <token name="@ADV_OPTS@">
231 $nonull2
232
233 #if $Z:
234 -Z $Z
235 #end if
236
237 #if $domZ:
238 --domZ $domZ
239 #end if
240 </token>
241 <xml name="adv_opts">
242 <!-- Remaining options: one switch and two optional integer counts -->
243 <param name="nonull2" type="boolean" truevalue="--nonull2" falsevalue="" label="Turn off biased composition score corrections" help="(--nonull2)"/>
244 <param name="Z" type="integer" optional="True" label="# of comparisons done for E-value calculation" help="(-Z)"/>
245 <param name="domZ" type="integer" optional="True" label="# of significant sequences, for domain E-value calculation" help="(--domZ)"/>
246 </xml>
247 <!-- Command fragment: the selected option value is itself the alphabet flag. --> <token name="@FORMAT_SELECTOR@">
248 $input_format_select
249 </token>
250 <!-- Alphabet chooser offering protein, DNA and RNA. --> <xml name="format_selector">
251 <param name="input_format_select" type="select" label="Format of sequence and model">
252 <option value="--amino">Protein</option>
253 <option value="--dna">DNA</option>
254 <option value="--rna">RNA</option>
255 </param>
256 </xml>
257 <!-- Same chooser without the protein option. --> <xml name="format_selector_noprot">
258 <param name="input_format_select" type="select" label="Format of sequence and model">
259 <option value="--dna">DNA</option>
260 <option value="--rna">RNA</option>
261 </param>
262 </xml>
263 <!-- Command fragment: the selected weighting option value is emitted as the flag; the wid cutoff is added only for the wblosum choice. --> <token name="@ARSWS@">
264 $arsws.arsws_select
265
266 #if $arsws.arsws_select == "--wblosum":
267 --wid $arsws.wid
268 #end if
269 </token>
270 <xml name="arsws">
271 <!-- Relative sequence weighting strategies; only the wblosum choice has an extra field (the identity cutoff) -->
272 <conditional name="arsws">
273 <param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
274 <option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
275 <option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
276 <option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
277 <option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
278 <option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
279 </param>
280 <when value="--wpb">
281 </when>
282 <when value="--wgsc">
283 </when>
284 <when value="--wblosum">
285 <param name="wid" type="float" value="0.62" label="Set identity cutoff" help="(--wid)"/>
286 </when>
287 <when value="--wnone">
288 </when>
289 <when value="--wgiven">
290 </when>
291 </conditional>
292 </xml>
293 <!-- Command fragment: nothing when the choice is empty; otherwise the choice becomes the flag name, followed by the extra fields of the eent or eclust branch. --> <token name="@AEEWS@">
294 #if $aeews.aeews_select != "":
295 --$aeews.aeews_select
296 #if $aeews.aeews_select == "eent":
297 --eset $aeews.eset
298 --ere $aeews.ere
299 --esigma $aeews.esigma
300 #elif $aeews.aeews_select == "eclust":
301 --eset $aeews.eset
302 --eid $aeews.eid
303 #end if
304 #end if
305 </token>
306 <xml name="aeews">
307 <!-- Effective sequence number strategies; the eent and eclust choices each carry their own fields -->
308 <conditional name="aeews">
309 <param name="aeews_select" type="select" label="Alternative effective sequence weighting strategies">
310 <option value="">Disabled</option>
311 <option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
312 <option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
313 <option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
314 </param>
315 <when value="">
316 </when>
317 <when value="eent">
318 <param name="eset" type="float" value="0" label="set eff seq # for all models" help="(--eset)"/>
319 <param name="ere" type="float" value="0" label="set minimum rel entropy/position" help="(--ere)"/>
320 <param name="esigma" type="float" value="45" label="set sigma param" help="(--esigma)"/>
321 </when>
322 <when value="eclust">
323 <param name="eset" type="float" value="0" label="set eff seq # for all models" help="(--eset)"/>
324 <param name="eid" type="float" value="0.62" min="0" max="1" label="set fractional identity cutoff" help="(--eid)"/>
325 </when>
326 <when value="enone">
327 </when>
328 </conditional>
329 </xml>
330 <!-- Command fragment: three switches, each expanding to its flag or to nothing. --> <token name="@CUT@">
331 $cut_ga
332 $cut_nc
333 $cut_tc
334 </token>
335 <!-- Model-specific cutoff switches: GA is the gathering cutoff, NC the noise cutoff and TC the trusted cutoff. --> <xml name="cut">
336 <param name="cut_ga" type="boolean" truevalue="--cut_ga" falsevalue="" label="use profile's GA gathering cutoffs to set all thresholding" help="(--cut_ga)"/>
337 <param name="cut_nc" type="boolean" truevalue="--cut_nc" falsevalue="" label="use profile's NC noise cutoffs to set all thresholding" help="(--cut_nc)"/>
338 <param name="cut_tc" type="boolean" truevalue="--cut_tc" falsevalue="" label="use profile's TC trusted cutoffs to set all thresholding" help="(--cut_tc)"/>
339 </xml>
340 <!-- Command fragment: the selected strategy becomes the flag name; symfrac is added only for the "fast" strategy. --> <token name="@MCSS@">
341 --$mcs.model_construction_strategy_select
342
343 #if $mcs.model_construction_strategy_select == "fast":
344 --symfrac $mcs.symfrac
345 #end if
346
347 </token>
348 <xml name="mcss">
349 <!-- Model construction strategy ("fast" or "hand"), followed by the fragment threshold field -->
350 <conditional name="mcs">
351 <param name="model_construction_strategy_select" type="select" label="Model Construction Strategy">
352 <option value="fast" selected="true">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
353 <option value="hand">Manual construction (requires reference annotation) (--hand)</option>
354 </param>
355 <when value="fast">
356 <param name="symfrac" type="float" value="0.5" label="Sets sym fraction controlling --fast construction" help="(--symfrac)"/>
357 </when>
358 <when value="hand"></when>
359 </conditional>
360 <param name="fragthresh" type="float" value="0.5" optional="True" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns (--fragthresh)" />
361
362 </xml>
363 <!-- Command fragment: the selected option value is the flag itself, or empty for "Unspecified". --> <token name="@PRIOR@">
364 $aps_select
365 </token>
366 <!-- Prior strategy chooser; the default option has an empty value. --> <xml name="prior">
367 <param name="aps_select" type="select" label="Alternative Prior Strategies">
368 <option value="" selected="true">Unspecified</option>
369 <option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
370 <option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
371 </param>
372 </xml>
373 <!-- Citation macro holding a single DOI entry. --> <xml name="citation">
374 <citations>
375 <citation type="doi">10.1093/nar/gkr367</citation>
376 </citations>
377 </xml>
378 <!-- Command fragment: each window setting is emitted only when its field evaluates as true in the template. --> <token name="@LENGTHS@">
379 #if $w_beta:
380 --w_beta $w_beta
381 #end if
382
383 #if $w_length:
384 --w_length $w_length
385 #end if
386
387 </token>
388 <!-- Form fields for the two optional window settings. --> <xml name="lengths">
389 <param name="w_beta" type="float" optional="True"
390 label="Tail mass at which window length is determined" help="(--w_beta)"/>
391 <param name="w_length" type="integer" optional="True"
392 label="Window Length" help="(--w_length)" />
393 </xml>
394 <!-- Command fragment: the quoted history dataset when the source is "history", otherwise the quoted value of the index selection. --> <token name="@INPUTHMMCHOICE@">
395 #if $inputHmmConditional.inputHmmSource == "history":
396 "$inputHmmConditional.hmmfile"
397 #else:
398 "$inputHmmConditional.index"
399 #end if
400 </token>
401 <!-- HMM source chooser: an entry from the hmm_database data table (default) or a dataset from the history. --> <xml name="input_hmm_choice">
402 <conditional name="inputHmmConditional">
403 <param name="inputHmmSource" type="select" label="Use a built-in HMM model database or one from your history" >
404 <option value="indexed" selected="True">Use a built-in HMM model database</option>
405 <option value="history">Use a HMM database from history</option>
406 </param>
407 <when value="indexed">
408 <param name="index" type="select" label="Select a HMM model database" help="If your database of interest is not listed, contact the Galaxy team">
409 <options from_data_table="hmm_database">
410 <filter type="sort_by" column="2"/>
411 <validator type="no_options" message="No indexes are available for the selected input dataset"/>
412 </options>
413 </param>
414 </when>
415 <when value="history">
416 <param name="hmmfile" type="data" format="hmm2,hmm3" label="HMM model"/>
417 </when> <!-- history -->
418 </conditional> <!-- inputHmmConditional -->
419 </xml>
420 <!-- Single HMM dataset input accepting the hmm2 and hmm3 formats. --> <xml name="input_hmm">
421 <param name="hmmfile" type="data" format="hmm2,hmm3" label="HMM model"/>
422 </xml>
423 <!-- Alignment dataset input accepting the stockholm, clustal and fasta formats. --> <xml name="input_msa">
424 <param name="msafile" type="data" format="stockholm,clustal,fasta" label="Multiple Sequence Alignment"
425 help="in Stockholm, Clustal, or Fasta format. While this tool accepts fasta, please ensure that the sequences are not unaligned"/>
426 </xml>
427
428
429 <token name="@ACCEL_HEUR_HELP@"><![CDATA[
430 Acceleration Heuristics (--F1, --F2, --F3)
431 -------------------------------------------
432
433 **MSV filter**
434
435 The sequence is aligned to the profile using a specialized model that
436 allows multiple high-scoring local ungapped segments to match. The
437 optimal alignment score (Viterbi score) is calculated under this
438 multi-segment model, hence the term MSV, for “multi-segment Viterbi”. This is
439 HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
440 score (optimal sum of ungapped alignment segments). Roughly speaking,
441 MSV is comparable to skipping the heuristic word hit and hit extension
442 steps of the BLAST acceleration algorithm.
443
444 The MSV filter is very, very fast. In addition to avoiding indel
445 calculations in the dynamic programming table, it uses reduced precision
446 scores scaled to 8-bit integers, enabling acceleration via 16-way
447 parallel SIMD vector instructions.
448
449 The MSV score is a true log-odds likelihood ratio, so it obeys
450 conjectures about the expected score distribution (Eddy, 2008) that
451 allow immediate and accurate calculation of the statistical significance
452 (P- value) of the MSV bit score.
453
454 By default, comparisons with a P-value of ≤ 0.02 pass this filter,
455 meaning that about 2% of nonhomol- ogous sequences are expected to pass.
456 You can use the --F1 option to change this threshold. For example, --F1
457 <0.05> would pass 5% of the comparisons, making a search more sensitive
458 but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
459 that all comparisons will pass. Shutting off the MSV filter may be
460 worthwhile if you want to make sure you don’t miss comparisons that have
461 a lot of scattered insertions and deletions. Alternatively, the --max
462 option causes the MSV filter step (and all other filter steps) to be
463 bypassed.
464
465 The MSV bit score is calculated as a log-odds score using the null model
466 for comparison. No correction for a biased composition or repetitive
467 sequence is done at this stage. For comparisons involving biased
468 sequences and/or profiles, more than 2% of comparisons will pass the MSV
469 filter. At the end of search output, there is a line like:
470
471 Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)
472
473 which tells you how many and what fraction of comparisons passed the MSV
474 filter, versus how many (and what fraction) were expected.
475
476 **Viterbi filter**
477
478 The sequence is now aligned to the profile using a fast Viterbi algorithm for
479 optimal gapped alignment.
480
481 This Viterbi implementation is specialized for speed. It is implemented in
482 8-way parallel SIMD vector instructions, using reduced precision scores that
483 have been scaled to 16-bit integers. Only one row of the dynamic programming
484 matrix is stored, so the routine only recovers the score, not the optimal
485 alignment itself. The reduced representation has limited range; local alignment
486 scores will not underflow, but high scoring comparisons can overflow and return
487 infinity, in which case they automatically pass the filter.
488
489 The final Viterbi filter bit score is then computed using the appropriate null
490 model log likelihood (by default the biased composition filter model score, or
491 if the biased filter is off, just the null model score). If the P-value of this
492 score passes the Viterbi filter threshold, the sequence passes on to the next
493 step of the pipeline.
494
495 The --F2 <x> option controls the P-value threshold for passing the Viterbi
496 filter score. The default is 0.001. The --max option bypasses all filters in
497 the pipeline. At the end of a search output, you will see a line like:
498
499 Passed Vit filter: 2207 (0.00443803); expected 497.3 (0.001)
500
501 which tells you how many and what fraction of comparisons passed the Viterbi
502 filter, versus how many were expected.
503
504 **Forward filter/parser**
505
506 The sequence is now aligned to the profile using the full Forward algorithm,
507 which calculates the likelihood of the target sequence given the profile,
508 summed over the ensemble of all possible alignments.
509
510 This is a specialized time- and memory-efficient Forward implementation called
511 the “Forward parser”. It is implemented in 4-way parallel SIMD vector
512 instructions, in full precision (32-bit floating point). It stores just enough
513 information that, in combination with the results of the Backward parser
514 (below), posterior probabilities of start and stop points of alignments
515 (domains) can be calculated in the domain definition step (below), although the
516 detailed alignments themselves cannot be.
517
518 The Forward filter bit score is calculated by correcting this score using the
519 appropriate null model log likelihood (by default the biased composition filter
520 model score, or if the biased filter is off, just the null model score). If the
521 P-value of this bit score passes the Forward filter threshold, the sequence
522 passes on to the next step of the pipeline.
523
524 The bias filter score has no further effect in the pipeline. It is only used in
525 filter stages. It has no effect on final reported bit scores or P-values.
526 Biased composition compensation for final bit scores is done by a more complex
527 domain-specific algorithm, described below.
528
529 The --F3 <x> option controls the P-value threshold for passing the Forward
530 filter score. The default is 1e-5. The --max option bypasses all filters in the
531 pipeline. At the end of a search output, you will see a line like:
532
533 Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)
534
535 which tells you how many and what fraction of comparisons passed the Forward
536 filter, versus how many were expected.
537
538 **Bias Filter Options**
539
540 The --max option bypasses all filters in the pipeline, including the bias
541 filter.
542
543 The --nobias option turns off (bypasses) the biased composition filter. The
544 simple null model is used as a null hypothesis for MSV and in subsequent filter
545 steps. The biased composition filter step compromises a small amount of
546 sensitivity. Though it is good to have it on by default, you may want to shut
547 it off if you know you will have no problem with biased composition hits.
548
549
550 **Advanced Documentation**
551
552 A more detailed look at the internals of the various filter pipelines was
553 posted on the `developer's blog <http://selab.janelia.org/people/eddys/blog/?p=508>`__.
554 The information posted there may be useful to those who are struggling with
555 poor-scoring sequences.
556
557 ]]></token>
558 <token name="@ADV_OPTS_HELP@"><![CDATA[
559 Advanced Options
560 ----------------
561
562 **nonull2**
563
564 The biased-composition score correction can be too aggressive sometimes,
565 causing you to miss homologs. You can turn it off with the --nonull2 option (and if
566 you’re doing that, you may also want to set --nobias, to turn off another
567 biased composition step called the bias filter, which affects which sequences
568 get scored at all).
569
570 **domZ**
571
572 Assert that the total number of targets in your searches is <x>, for the
573 purposes of per-domain conditional E-value calculations, rather than the number
574 of targets that passed the reporting thresholds.
575
576 **Z**
577
578 Assert that the total number of targets in your searches is <x>, for the
579 purposes of per-sequence E-value calculations, rather than the actual number of
580 targets seen.
581 ]]></token>
582 <token name="@AEEWS_HELP@"><![CDATA[
583 Effective Sequence Number
584 -------------------------
585
586 After relative weights are determined, they are normalized to sum to a total
587 effective sequence number, eff nseq. This number may be the actual number of
588 sequences in the alignment, but it is almost always smaller than that. The
589 default entropy weighting method (--eent) reduces the effective sequence num-
590 ber to reduce the information content (relative entropy, or average expected
591 score on true homologs) per consensus position. The target relative entropy is
592 controlled by a two-parameter function, where the two parameters are settable
593 with --ere and --esigma.
594
595 **--eent**
596
597 Adjust effective sequence number to achieve a specific relative entropy per
598 position (see --ere). This is the default.
599
600 **--eclust**
601
602 Set effective sequence number to the number of single-linkage clusters at a
603 specific identity threshold (see --eid). This option is not recommended; it’s
604 for experiments evaluating how much better --eent is.
605
606 **--enone**
607
608 Turn off effective sequence number determination and just use the actual number
609 of sequences. One reason you might want to do this is to try to maximize the
610 relative entropy/position of your model, which may be useful for short models.
611
612 **--eset**
613
614 Explicitly set the effective sequence number for all models to <x>.
615
616 **--ere**
617
618 Set the minimum relative entropy/position target to <x>. Requires --eent. Default
619 depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
620 for nucleotide sequences, it is 0.45 bits/position.
621
622 **--esigma**
623
624 Sets the minimum relative entropy contributed by an entire model alignment, over
625 its whole length. This has the effect of making short models have higher relative
626 entropy per position than --ere alone would give. The default is 45.0 bits.
627
628 **--eid**
629
630 Sets the fractional pairwise identity cutoff used by single linkage clustering
631 with the --eclust option. The default is 0.62.
632 ]]></token>
633 <token name="@ARSWS_HELP@"><![CDATA[
634 Options Controlling Relative Weights
635 ------------------------------------
636
637 HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
638 sequences and up-weight distantly related ones. This has the effect of making
639 models less biased by uneven phylogenetic representation. For example, two
640 identical sequences would typically each receive half the weight that one
641 sequence would. These options control which algorithm gets used.
642
643
644 **--wpb**
645
646 Use the Henikoff position-based sequence weighting scheme [Henikoff and
647 Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
648
649 **--wgsc**
650
651 Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
652 Mol. Biol. 235:1067, 1994].
653
654 **--wblosum**
655
656 Use the same clustering scheme that was used to weight data in calculating
657 BLOSUM substitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
658 89:10915, 1992]. Sequences are single-linkage clustered at an identity
659 threshold (default 0.62; see --wid) and within each cluster of c sequences,
660 each sequence gets relative weight 1/c.
661
662 **--wnone**
663
664 No relative weights. All sequences are assigned uniform weight.
665
666 **--wid**
667
668 Sets the identity threshold used by single-linkage clustering when using
669 --wblosum. Invalid with any other weighting scheme. Default is 0.62.
670 ]]></token>
671 <token name="@BIAS_COMP_HELP@"><![CDATA[
672 Bias Composition
673 ----------------
674
675 The next number, the bias, is a correction term for biased sequence composition
676 that has been applied to the sequence bit score. For instance, for the top hit
677 MYG_PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
678 originally scored 225.9 bits, which was adjusted by the slight 3.2 bit
679 biased-composition correction. The only time you really need to pay attention to the
680 bias value is when it’s large, on the same order of magnitude as the sequence
681 bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
682 allows a non-homolog to retain too much score. Conversely, the bias correction
683 can be too aggressive sometimes, causing you to miss homologs. You can turn the
684 biased-composition score correction off with the --nonull2 option (and if
685 you’re doing that, you may also want to set --nobias, to turn off another
686 biased composition step called the bias filter, which affects which sequences
687 get scored at all).
688
689 ]]></token>
690 <token name="@CUT_HELP@"><![CDATA[
691 Options for Model-specific Score Thresholding
692 ---------------------------------------------
693
694 Curated profile databases may define specific bit score thresholds for each
695 profile, superseding any thresholding based on statistical significance alone.
696 To use these options, the profile must contain the appropriate (GA, TC, and/or
697 NC) optional score threshold annotation; this is picked up by hmmbuild from
698 Stockholm format alignment files. Each thresholding option has two scores: the
699 per-sequence threshold <x1> and the per-domain threshold <x2>. These act as if
700 -T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
701 each model’s curated thresholds.
702
703 **--cut_ga**
704
705 Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
706 per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
707 generally considered to be the reliable curated thresholds defining family
708 membership; for example, in Pfam, these thresholds define what gets included in
709 Pfam Full alignments based on searches with Pfam Seed models.
710
711 **--cut_nc**
712
713 Use the NC (noise cutoff) bit score thresholds in the model to set
714 per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
715 thresholds are generally considered to be the score of the highest-scoring
716 known false positive.
717
718 **--cut_tc**
719
720 Use the TC (trusted cutoff) bit score thresholds in the model to set
721 per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
722 thresholds are generally considered to be the score of the lowest-scoring known
723 true positive that is above all known false positives.
724 ]]></token>
725 <token name="@EVAL_CALIB_HELP@"><![CDATA[
726 Options Controlling H3 Parameter Estimation Methods
727 ---------------------------------------------------
728
729 H3 uses three short random sequence simulations to estimate the location
730 parameters for the expected score distributions for MSV scores, Viterbi scores,
731 and Forward scores. These options allow these simulations to be modified.
732
733 **--EmL**
734
735 Sets the sequence length in simulation that estimates the location parameter mu
736 for MSV E-values. Default is 200.
737
738 **--EmN**
739
740 Sets the number of sequences in simulation that estimates the location parameter
741 mu for MSV E-values. Default is 200.
742
743 **--EvL**
744
745 Sets the sequence length in simulation that estimates the location parameter mu
746 for Viterbi E-values. Default is 200.
747
748 **--EvN**
749
750 Sets the number of sequences in simulation that estimates the location parameter
751 mu for Viterbi E-values. Default is 200.
752
753
754 **--EfL**
755
756 Sets the sequence length in simulation that estimates the location parameter tau
757 for Forward E-values. Default is 100.
758
759 **--EfN**
760
761 Sets the number of sequences in simulation that estimates the location parameter
762 tau for Forward E-values. Default is 200.
763
764 **--Eft**
765
766 Sets the tail mass fraction to fit in the simulation that estimates the location
767 parameter tau for Forward E-values. Default is 0.04.
768 ]]></token>
769 <token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
770 Options for Specifying the Alphabet
771 -----------------------------------
772
773 The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
774 at the composition of the msafile. Autodetection is normally quite reliable,
775 but occasionally alphabet type may be ambiguous and autodetection can fail (for
776 instance, on tiny toy alignments of just a few residues). To avoid this, or to
777 increase robustness in automated analysis pipelines, you may specify the
778 alphabet type of msafile with these options.
779 ]]></token>
780 <token name="@HSSI_HELP@"><![CDATA[
781 Options Controlling Single Sequence Scoring (first Iteration)
782 -------------------------------------------------------------
783
784 By default, the first iteration uses a search model constructed from a single
785 query sequence. This model is constructed using a standard 20x20 substitution
786 matrix for residue probabilities, and two additional parameters for
787 position-independent gap open and gap extend probabilities. These options allow
788 the default single-sequence scoring parameters to be changed.
789
790 **Gap Open (--popen)**
791
792 Set the gap open probability for a single sequence query model to <x>.
793
794 **Gap Extend (--pextend)**
795
796 Set the gap extend probability for a single sequence query model to <x>.
797
798
799 **--mx/--mxfile**
800
801 These options are not currently supported
802 ]]></token>
803 <token name="@LENGTHS_HELP@"><![CDATA[
804 Tail Mass Options
805 -----------------
806
807 **Window length tail mass (--w_beta)**
808
809 The upper bound, W, on the length at which nhmmer expects to find an instance
810 of the model is set such that the fraction of all sequences generated by the
811 model with length >= W is less than <x>. The default is 1e-7.
812
813
814 **Model instance length upper bound (--w_length)**
815
816 Override the model instance length upper bound, W, which is otherwise
817 controlled by --w_beta. It should be larger than the model length. The value of
818 W is used deep in the acceleration pipeline, and modest changes are not
819 expected to impact results (though larger values of W do lead to longer run
820 time).
821
822 ]]></token>
823 <token name="@MCSS_HELP@"><![CDATA[
824 **Options Controlling Profile Construction**
825
826 These options control how consensus columns are defined in an alignment.
827
828 **--fast**
829
830 Define consensus columns as those that have a fraction >= symfrac of residues
831 as opposed to gaps. (See below for the --symfrac option.) This is the default.
832
833 **--hand**
834
835 Define consensus columns in next profile using reference annotation to the multiple
836 alignment. This allows you to define any consensus columns you like.
837
838
839 **--symfrac**
840
841 Define the residue fraction threshold necessary to define a consensus column
842 when using the --fast option. The default is 0.5. The symbol fraction in each
843 column is calculated after taking relative sequence weighting into account, and
844 ignoring gap characters corresponding to ends of sequence fragments (as opposed
845 to internal insertions/deletions). Setting this to 0.0 means that every
846 alignment column will be assigned as consensus, which may be useful in some
847 cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
848 insertions/deletions) will be assigned as consensus.
849
850 **--fragthresh**
851
852 We only want to count terminal gaps as deletions if the aligned sequence is
853 known to be full-length, not if it is a fragment (for instance, because only
854 part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
855 sequence length L is less than or equal to a fraction <x> times the alignment
856 length in columns, then the sequence is handled as a fragment. The default is
857 0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
858 you might want to do this if you know you’ve got a carefully curated alignment
859 of full-length sequences. Setting --fragthresh 1 will define all sequences as
860 fragments; you might want to do this if you know your alignment is entirely
861 composed of fragments, such as translated short reads in metagenomic shotgun
862 data.
863
864 ]]></token>
865 <token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
866 Options for Controlling Output
867 ------------------------------
868
869 **Table of hits**
870
871 Save a simple tabular (space-delimited) file summarizing the per-target output, with
872 one data line per homologous target model found.
873
874 **Table of per-domain hits**
875
876 Save a simple tabular (space-delimited) file summarizing the per-domain output,
877 with one data line per homologous domain detected in a query sequence for each
878 homologous model.
879
880 **Table of hits and domains in Pfam Format**
881
882 Save an especially succinct tabular (space-delimited) file summarizing the
883 per-target output, with one data line per homologous target model found.
884 ]]></token>
885 <token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
886 Options for Controlling Output
887 ------------------------------
888
889 **Table of hits**
890
891 Save a simple tabular (space-delimited) file summarizing the per-target output, with
892 one data line per homologous target model found.
893
894 **Table of per-domain hits**
895
896 Save a simple tabular (space-delimited) file summarizing the per-domain output,
897 with one data line per homologous domain detected in a query sequence for each
898 homologous model.
899 ]]></token>
900 <token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
901 Options for Controlling Output
902 ------------------------------
903
904 **Table of hits**
905
906 Save a simple tabular (space-delimited) file summarizing the per-target output, with
907 one data line per homologous target model found.
908
909 **Table of hits (dfam)**
910
911 Save a tabular (space-delimited) file summarizing the per-hit output, similar
912 to --tblout but more succinct.
913
914
915 **List of per-position scores for each hit (--aliscoreout)**
916
917 Save to file a list of per-position scores for each hit. This is useful, for
918 example, in identifying regions of high score density for use in resolving
919 overlapping hits from different models.
920
921 ]]></token>
922 <token name="@PRIOR_HELP@"><![CDATA[
923 Options Controlling Priors
924 --------------------------
925
926 By default, weighted counts are converted to mean posterior probability
927 parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
928 prior parameters for protein models and for nucleic acid (RNA and DNA) models
929 are built in. The following options allow you to override the default priors.
930
931 **No priors (--pnone)**
932
933 Don’t use any priors. Probability parameters will simply be the observed
934 frequencies, after relative sequence weighting.
935
936 **Laplace +1 prior**
937
938 Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
939 ]]></token>
940 <token name="@SEED_HELP@"><![CDATA[
941 Random Seeding
942 --------------
943
944 Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
945 any stochastic simulations will be reproducible; the same command will give the
946 same results. If <n> is 0, the random number generator is seeded arbitrarily,
947 and stochastic simulations will vary from run to run of the same command.
948
949 ]]></token>
950 <token name="@THRESHOLDS_HELP@"><![CDATA[
951 Options for Reporting Thresholds
952 --------------------------------
953
954 Reporting thresholds control which hits are reported in output files (the main
955 output, --tblout, and --domtblout).
956
957 **E-value (-E)**
958
959 In the per-target output, report target profiles with an E-value of <= <x>. The
960 default is 10.0, meaning that on average, about 10 false positives will be
961 reported per query, so you can see the top of the noise and decide for yourself
962 if it’s really noise.
963
964 **Bit score (-T)**
965
966 Instead of thresholding per-profile output on E-value, instead report target profiles
967 with a bit score of >= <x>.
968
969 **domain E-value (--domE)**
970
971 In the per-domain output, for target profiles that have already satisfied the
972 per-profile reporting threshold, report individual domains with a conditional
973 E-value of <= <x>. The default is 10.0. A conditional E-value means the
974 expected number of additional false positive domains in the smaller search
975 space of those comparisons that already satisfied the per-profile reporting
976 threshold (and thus must have at least one homologous domain already).
977
978 **domain Bit scores (--domT)**
979
980 Instead of thresholding per-domain output on E-value, instead report domains
981 with a bit score of >= <x>.
982
983 Options for Inclusion Thresholds
984 --------------------------------
985
986 Inclusion thresholds are stricter than reporting thresholds. Inclusion
987 thresholds control which hits are considered to be reliable enough to be
988 included in an output alignment or a subsequent search round. In hmmscan, which
989 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
990 search steps (like jackhmmer), inclusion thresholds have little effect. They
991 only affect what domains get marked as significant (!) or questionable (?) in
992 domain output.
993
994 **E-value of per target inclusion threshold**
995
996 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
997 0.01, meaning that on average, about 1 false positive would be expected in
998 every 100 searches with different query sequences.
999
1000 **Bit score of per target inclusion threshold**
1001
1002 Instead of using E-values for setting the inclusion threshold, instead use a
1003 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1004 to use bit score thresholds with hmmscan, because you don’t expect a single
1005 score threshold to work for different profiles; different profiles have
1006 slightly different expected score distributions.
1007
1008 **domain E-value per target inclusion threshold**
1009
1010 Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
1011 targets that have already satisfied the overall per-target inclusion threshold.
1012
1013 **domain Bit score per target inclusion threshold**
1014
1015 Instead of using E-values, instead use a bit score of >= <x> as the per-domain
1016 inclusion threshold. As with --incT above, it would be unusual to use a single
1017 bit score threshold in hmmscan.
1018
1019 ]]></token>
1020 <token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
1021 Options for Reporting Thresholds
1022 --------------------------------
1023
1024 Reporting thresholds control which hits are reported in output files (the main
1025 output, --tblout, and --domtblout).
1026
1027 **E-value (-E)**
1028
1029 In the per-target output, report target profiles with an E-value of <= <x>. The
1030 default is 10.0, meaning that on average, about 10 false positives will be
1031 reported per query, so you can see the top of the noise and decide for yourself
1032 if it’s really noise.
1033
1034 **Bit score (-T)**
1035
1036 Instead of thresholding per-profile output on E-value, instead report target profiles
1037 with a bit score of >= <x>.
1038
1039 Options for Inclusion Thresholds
1040 --------------------------------
1041
1042 Inclusion thresholds are stricter than reporting thresholds. Inclusion
1043 thresholds control which hits are considered to be reliable enough to be
1044 included in an output alignment or a subsequent search round. In hmmscan, which
1045 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
1046 search steps (like jackhmmer), inclusion thresholds have little effect. They
1047 only affect what domains get marked as significant (!) or questionable (?) in
1048 domain output.
1049
1050 **E-value of per target inclusion threshold**
1051
1052 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
1053 0.01, meaning that on average, about 1 false positive would be expected in
1054 every 100 searches with different query sequences.
1055
1056 **Bit score of per target inclusion threshold**
1057
1058 Instead of using E-values for setting the inclusion threshold, instead use a
1059 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1060 to use bit score thresholds with hmmscan, because you don’t expect a single
1061 score threshold to work for different profiles; different profiles have
1062 slightly different expected score distributions.
1063
1064 ]]></token>
1065 <token name="@ATTRIBUTION@"><![CDATA[
1066
1067 Attribution
1068 -----------
1069
1070 This Galaxy tool relies on HMMER3_ from http://hmmer.janelia.org/
1071 Internally the software is cited as:
1072
1073 ::
1074
1075 # hmmscan :: search sequence(s) against a profile database
1076 # HMMER 3.1 (February 2013); http://hmmer.org/
1077 # Copyright (C) 2011 Howard Hughes Medical Institute.
1078 # Freely distributed under the GNU General Public License (GPLv3).
1079 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1080
1081 The wrappers were written by Eric Rasche and are licensed under Apache2_. The
1082 documentation is copied from the HMMER3 documentation.
1083
1084 .. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
1085 .. _HMMER3: http://hmmer.janelia.org/
1086
1087
1088 ]]></token>
1089 <token name="@HELP_PRE@"><![CDATA[
1090
1091 What it does
1092 ============
1093 ]]></token>
1094 <token name="@HELP_PRE_OTH@"><![CDATA[
1095 Options
1096 =======
1097 ]]></token>
1098 </macros>