comparison macros.xml.orig @ 11:405dd85a9408 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hmmer3 commit e0d4688a59e6eeba33adcfe803ac43d0bc2863e7"
author iuc
date Tue, 31 Aug 2021 08:43:59 +0000
parents
children
comparison
equal deleted inserted replaced
10:ffeedf9b8dce 11:405dd85a9408
1 <?xml version="1.0"?>
2 <macros>
3 <xml name="requirements"> <!-- Shared dependency list: the hmmer package at the version given by the TOOL_VERSION token; extra entries can be supplied through the yield slot. -->
4 <requirements>
5 <requirement version="@TOOL_VERSION@" type="package">hmmer</requirement>
6 <yield />
7 </requirements>
8 </xml>
16 <xml name="bio_tools"> <!-- bio.tools cross-reference; merge conflict resolved in favour of the hmmer3 identifier. -->
17 <xrefs>
18 <xref type="bio.tools">hmmer3</xref>
19 </xrefs>
20 </xml>
22 <!-- Version string substituted into the requirements macro. --> <token name="@TOOL_VERSION@">3.3.2</token>
23 <xml name="stdio"> <!-- Failure-detection rules: exit-code ranges plus two text patterns. -->
24 <stdio>
25 <!-- Anything other than zero is an error -->
26 <exit_code range="1:"/>
27 <exit_code range=":-1"/>
28 <!-- In case the return code has not been set properly, check stderr too -->
29 <regex match="Error:"/>
30 <regex match="Exception:"/>
31 </stdio>
32 </xml>
33 <!-- Cheetah fragment for reporting and inclusion thresholds: E and domE are always emitted; T, domT, incE, incdomE, incT and incdomT are emitted only when their value stringifies to a non-empty string. --> <token name="@THRESHOLDS@">
34 -E $E
35 --domE $domE
36
37 #if str($T):
38 -T $T
39 #end if
40
41 #if str($domT):
42 --domT $domT
43 #end if
44
45 #if str($incE):
46 --incE $incE
47 #end if
48
49 #if str($incdomE):
50 --incdomE $incdomE
51 #end if
52
53 #if str($incT):
54 --incT $incT
55 #end if
56
57 #if str($incdomT):
58 --incdomT $incdomT
59 #end if
60 </token>
61 <xml name="thresholds_xml">
62 <!-- Reporting thresholds: E-values carry defaults, bit scores are optional -->
63 <param type="float" argument="-E" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output"/>
64 <param type="float" argument="--domE" value="10.0" min="0" label="report domains &lt;= this E-Value threshold in output"/>
65 <param type="float" argument="-T" optional="true" label="report sequences &gt;= this score threshold in output"/>
66 <param type="float" argument="--domT" optional="true" label="report domains &gt;= this score threshold in output"/>
67 <!-- Inclusion (significance) thresholds: all optional -->
68 <param type="float" argument="--incE" optional="true" label="consider sequences &lt;= this E-Value threshold as significant"/>
69 <param type="float" argument="--incdomE" optional="true" label="consider domains &lt;= this E-Value threshold as significant"/>
70 <param type="float" argument="--incT" optional="true" label="consider sequences &gt;= this score threshold as significant"/>
71 <param type="float" argument="--incdomT" optional="true" label="consider domains &gt;= this score threshold as significant"/>
72 </xml>
73 <!-- Per-sequence-only variant of the thresholds fragment: E is always emitted; T, incE and incT only when their value stringifies to a non-empty string. --> <token name="@THRESHOLDS_NODOM@">
74 -E $E
75
76 #if str($T):
77 -T $T
78 #end if
79
80 #if str($incE):
81 --incE $incE
82 #end if
83
84 #if str($incT):
85 --incT $incT
86 #end if
87 </token>
88 <xml name="thresholds_nodom">
89 <!-- Reporting thresholds (per-sequence only) -->
90 <param type="float" argument="-E" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output"/>
91 <param type="float" argument="-T" optional="true" label="report sequences &gt;= this score threshold in output"/>
92 <!-- Inclusion (significance) thresholds (per-sequence only) -->
93 <param type="float" argument="--incE" optional="true" label="consider sequences &lt;= this E-Value threshold as significant"/>
94 <param type="float" argument="--incT" optional="true" label="consider sequences &gt;= this score threshold as significant"/>
95 </xml>
96 <!-- Cheetah fragment for the acceleration filters: the max and nobias switches are inserted as-is (their boolean params expand to the flag or to an empty string); F1, F2 and F3 are always emitted. --> <token name="@ACCEL_HEUR@">
97 $max
98 --F1 $F1
99 --F2 $F2
100 --F3 $F3
101 $nobias
102 </token>
103 <xml name="accel_heur_xml">
104 <!-- Acceleration heuristics: filter switches and per-stage P-value thresholds -->
105 <param type="boolean" argument="--max" falsevalue="" truevalue="--max" label="Turn all heuristic filters off (less speed, more power)"/>
106 <param type="float" argument="--F1" value="0.02" label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1"/>
107 <param type="float" argument="--F2" value="1e-3" label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2"/>
108 <param type="float" argument="--F3" value="1e-5" label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3"/>
109 <param type="boolean" argument="--nobias" falsevalue="" truevalue="--nobias" label="Turn off composition bias filter"/>
110 </xml>
111 <!-- Cheetah fragment that always emits the seven E-value calibration options (EmL, EmN, EvL, EvN, EfL, EfN, Eft) with their parameter values. --> <token name="@EVAL_CALIB@">
112 --EmL $EmL
113 --EmN $EmN
114 --EvL $EvL
115 --EvN $EvN
116 --EfL $EfL
117 --EfN $EfN
118 --Eft $Eft
119 </token>
120 <xml name="eval_calib_xml">
121 <!-- E-value calibration: simulation lengths, counts and tail mass -->
122 <param type="integer" argument="--EmL" value="200" min="1" label="Length of sequences for MSV Gumbel mu fit"/>
123 <param type="integer" argument="--EmN" value="200" min="1" label="Number of sequences for MSV Gumbel mu fit"/>
124 <param type="integer" argument="--EvL" value="200" min="1" label="Length of sequences for Viterbi Gumbel mu fit"/>
125 <param type="integer" argument="--EvN" value="200" min="1" label="Number of sequences for Viterbi Gumbel mu fit"/>
126 <param type="integer" argument="--EfL" value="100" min="1" label="Length of sequences for Forward exp tail tau fit"/>
127 <param type="integer" argument="--EfN" value="200" min="1" label="Number of sequences for Forward exp tail tau fit"/>
128 <param type="float" argument="--Eft" value="0.04" min="0" max="1" label="tail mass for Forward exponential tail tau fit"/>
129 </xml>
130 <!-- For each comma-separated entry o of the oformat selection, emits the flag named o followed by the quoted value of the variable named o (falling back to the literal MISSING_OUTPUT plus o when no such variable exists); then appends the acc, noali and notextw switches. --> <token name="@OFORMAT_WITH_OPTS@">
131 #if $oformat:
132 #for o in str($oformat).split(','):
133 --$o '$getVar($o, 'MISSING_OUTPUT'+$o)'
134 #end for
135 #end if
136 $acc $noali $notextw
137 </token>
138 <xml name="oformat_with_opts">
139 <!-- Output selection: tblout is always offered, further formats come in through the yield slot -->
140 <param type="select" name="oformat" display="checkboxes" multiple="true" label="Output Formats">
141 <option selected="true" value="tblout">Table of per-sequence hits (--tblout)</option>
142 <yield />
143 </param>
144 <param type="boolean" argument="--acc" falsevalue="" truevalue="--acc" label="Prefer accessions over names in output"/>
145 <param type="boolean" argument="--noali" falsevalue="" truevalue="--noali" label="Don't output alignments, so output is smaller"/>
146 <param type="boolean" argument="--notextw" falsevalue="" truevalue="--notextw" label="Unlimited ASCII text output line width"/>
147 </xml>
148
149 <xml name="oformat_with_opts_dom"> <!-- Adds the domtblout choice on top of oformat_with_opts and forwards any further options. -->
150 <expand macro="oformat_with_opts">
151 <option selected="true" value="domtblout">Table of per-domain hits (--domtblout)</option>
152 <yield />
153 </expand>
154 </xml>
155
156 <xml name="oformat_with_opts_dom_pfam"> <!-- Adds the pfamtblout choice on top of oformat_with_opts_dom. -->
157 <expand macro="oformat_with_opts_dom">
158 <option selected="true" value="pfamtblout">Table of hits and domains in Pfam format (--pfamtblout)</option>
159 </expand>
160 </xml>
161
162 <xml name="oformat_with_opts_dfam_alisc">
163 <!-- Output selection extended with the Dfam table and per-position alignment scores -->
164 <expand macro="oformat_with_opts">
165 <option selected="true" value="dfamtblout">Table of hits in Dfam format (--dfamtblout)</option>
166 <option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
167 </expand>
168 </xml>
169
170 <xml name="output" token_tool=""> <!-- Base outputs: the main text output plus tblout when selected in oformat; more datasets can be added through the yield slot. -->
171 <data format="txt" name="output" label="@TOOL@ on ${on_string}"/>
172 <data format="txt" name="tblout" label="@TOOL@ on ${on_string}: per-sequence hits from HMM matches">
173 <filter>oformat and 'tblout' in oformat</filter>
174 </data>
175 <yield />
176 </xml>
177 <xml name="output_dom" token_tool=""> <!-- Base outputs plus domtblout when selected in oformat. -->
178 <expand macro="output" tool="@TOOL@">
179 <data format="txt" name="domtblout" label="@TOOL@ on ${on_string}: per-domain hits from HMM matches">
180 <filter>oformat and 'domtblout' in oformat</filter>
181 </data>
182 </expand>
183 <yield />
184 </xml>
185 <xml name="output_dom_pfam" token_tool=""> <!-- output_dom plus pfamtblout when selected in oformat. -->
186 <expand macro="output_dom" tool="@TOOL@">
187 <data format="txt" name="pfamtblout" label="@TOOL@ on ${on_string}: per-sequence/per-domain hits from HMM matches">
188 <filter>oformat and 'pfamtblout' in oformat</filter>
189 </data>
190 </expand>
191 </xml>
192 <xml name="output_dfam_alisc" token_tool="" token_ofvar="seqfile" token_invar="seqdb">
193 <expand macro="output" tool="@TOOL@"> <!-- Base outputs plus dfamtblout and aliscoresout, each gated on the oformat selection. -->
194 <data format="txt" name="dfamtblout" label="@TOOL@ on ${on_string}: per-sequence/per-domain hits from HMM matches">
195 <filter>oformat and 'dfamtblout' in oformat</filter>
196 </data>
197 <data format="txt" name="aliscoresout" label="@TOOL@ on ${on_string}: scores for positional matches">
198 <filter>oformat and 'aliscoresout' in oformat</filter>
199 </data>
200 </expand>
201 </xml>
202
203 <xml name="assert_out" token_tool=""> <!-- Test assertions: the output must contain a line starting with a hash, a space and the tool name, and a line matching the literal [ok] marker. -->
204 <assert_contents>
205 <has_line_matching expression="# @TOOL@.*"/>
206 <has_line_matching expression="\[ok\]"/>
207 </assert_contents>
208 </xml>
209
210 <xml name="assert_tblout" token_tool=""> <!-- Test assertions for table outputs: a Program line naming the tool and a commented [ok] line must be present. -->
211 <assert_contents>
212 <has_line_matching expression="# Program: @TOOL@"/>
213 <has_line_matching expression="# \[ok\]"/>
214 </assert_contents>
215 </xml>
216
217 <xml name="oformat_test"> <!-- Test fixture: switches notextw on. -->
218 <param value="true" name="notextw"/>
219 </xml>
220 <!-- Emits the popen and pextend options only when the hssi conditional is set to singlemx; otherwise contributes nothing. --> <token name="@HSSI@">
221 #if $hssi.hssi_select == "singlemx":
222 --popen $hssi.popen
223 --pextend $hssi.pextend
224 #end if
225 </token>
226 <xml name="hssi">
227 <!-- Single-sequence input handling: gap probabilities are only asked for in singlemx mode -->
228 <conditional name="hssi">
229 <param type="select" name="hssi_select" label="Options for handling single sequence inputs">
230 <option selected="true" value="false">Disable</option>
231 <option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
232 </param>
233 <when value="false"/>
234 <when value="singlemx">
235 <param type="float" argument="--popen" value="0.02" min="0.0" max="0.5" label="Gap open probability"/>
236 <param type="float" argument="--pextend" value="0.4" min="0.0" max="1.0" label="Gap extend probability"/>
237 </when>
238 <!-- Not exposed: -mx <s> : substitution score matrix (built-in matrices, with -singlemx)-->
239 <!-- Not exposed: -mxfile <f> : read substitution score matrix from file <f> (with -singlemx)-->
240 </conditional>
241 </xml>
242 <!-- Shell prelude: sets addthreads to GALAXY_SLOTS (default 1) and post-decrements it; it ends with a trailing AND operator, so another command has to follow. --> <token name="@ADDTHREADS@"><![CDATA[
243 ##compute the number of ADDITIONAL threads to be used (--cpu)
244 addthreads=\${GALAXY_SLOTS:-1} && (( addthreads-- )) &&
245 ]]></token>
246 <!-- Passes the shell variable addthreads to the cpu option; the variable is set by the ADDTHREADS token. --> <token name="@CPU@">
247 --cpu \$addthreads
248 </token>
249 <!-- Always emits the seed option with the value of the seed parameter. --> <token name="@SEED@">
250 --seed $seed
251 </token>
252 <xml name="seed"> <!-- Non-negative integer RNG seed, defaulting to 42. -->
253 <param type="integer" argument="--seed" value="42" min="0" label="RNG seed, 0 generates a random seed"/>
254 </xml>
255 <xml name="seed_test"> <!-- Test fixture: fixes the seed at 4. -->
256 <param value="4" name="seed"/>
257 </xml>
258 <!-- Inserts the nonull2 switch as-is and emits Z and domZ only when their value stringifies to a non-empty string. --> <token name="@ADV_OPTS@">
259 $nonull2
260
261 #if str($Z):
262 -Z $Z
263 #end if
264
265 #if str($domZ):
266 --domZ $domZ
267 #end if
268 </token>
269 <xml name="adv_opts">
270 <!-- Miscellaneous advanced options -->
271 <param type="boolean" argument="--nonull2" falsevalue="" truevalue="--nonull2" label="Turn off biased composition score corrections"/>
272 <param type="integer" argument="-Z" optional="true" label="# of comparisons done for E-value calculation"/>
273 <param type="integer" argument="--domZ" optional="true" label="# of significant sequences, for domain E-value calculation"/>
274 </xml>
275 <!-- Inserts the selected alphabet flag verbatim; the select values are themselves the command-line flags. --> <token name="@FORMAT_SELECTOR@">
276 $input_format_select
277 </token>
278 <xml name="format_selector"> <!-- Alphabet choice: protein, DNA or RNA. -->
279 <param type="select" name="input_format_select" label="Format of sequence and model">
280 <option value="--amino">Protein</option>
281 <option value="--dna">DNA</option>
282 <option value="--rna">RNA</option>
283 </param>
284 </xml>
285 <xml name="format_selector_noprot"> <!-- Alphabet choice restricted to nucleotide alphabets. -->
286 <param type="select" name="input_format_select" label="Format of sequence and model">
287 <option value="--dna">DNA</option>
288 <option value="--rna">RNA</option>
289 </param>
290 </xml>
291 <!-- Inserts the selected relative-weighting flag verbatim and adds the wid option only for the wblosum scheme. --> <token name="@ARSWS@">
292 $arsws.arsws_select
293
294 #if $arsws.arsws_select == "--wblosum":
295 --wid $arsws.wid
296 #end if
297 </token>
298 <xml name="arsws">
299 <!-- Alternative relative sequence weighting strategies; the option values are the command-line flags themselves -->
300 <conditional name="arsws">
301 <param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
302 <option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
303 <option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
304 <option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
305 <option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
306 <option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
307 </param>
308 <when value="--wpb">
309 </when>
310 <when value="--wgsc">
311 </when>
312 <when value="--wblosum">
313 <param argument="--wid" type="float" value="0.62" label="Set identity cutoff" />
314 </when>
315 <when value="--wnone">
316 </when>
317 <when value="--wgiven">
318 </when>
319 </conditional>
320 </xml>
321 <!-- When a strategy other than the empty value is selected, emits it as a flag; eent additionally emits eset, ere and esigma, and eclust additionally emits eset and eid. --> <token name="@AEEWS@">
322 #if $aeews.aeews_select != "":
323 --$aeews.aeews_select
324 #if $aeews.aeews_select == "eent":
325 --eset $aeews.eset
326 --ere $aeews.ere
327 --esigma $aeews.esigma
328 #elif $aeews.aeews_select == "eclust":
329 --eset $aeews.eset
330 --eid $aeews.eid
331 #end if
332 #end if
333 </token>
334 <xml name="aeews">
335 <!-- Effective sequence number strategies; the empty value leaves the option unset -->
336 <conditional name="aeews">
337 <param type="select" name="aeews_select" label="Alternative effective sequence weighting strategies">
338 <option value="">Disabled</option>
339 <option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
340 <option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
341 <option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
342 </param>
343 <when value="">
344 </when>
345 <when value="eent">
346 <param type="float" argument="--eset" value="0" label="set eff seq # for all models"/>
347 <param type="float" argument="--ere" value="0" label="set minimum rel entropy/position"/>
348 <param type="float" argument="--esigma" value="45" label="set sigma param"/>
349 </when>
350 <when value="eclust">
351 <param type="float" argument="--eset" value="0" label="set eff seq # for all models"/>
352 <param type="float" argument="--eid" value="0.62" min="0" max="1" label="set fractional identity cutoff"/>
353 </when>
354 <when value="enone">
355 </when>
356 </conditional>
357 </xml>
358 <!-- Inserts the three model-specific cutoff switches as-is; each boolean param expands to its flag or to an empty string. --> <token name="@CUT@">
359 $cut_ga
360 $cut_nc
361 $cut_tc
362 </token>
363 <xml name="cut"> <!-- Model-specific score cutoffs: GA = gathering, NC = noise, TC = trusted. -->
364 <param argument="--cut_ga" type="boolean" truevalue="--cut_ga" falsevalue="" label="use profile's GA gathering cutoffs to set all thresholding" />
365 <param argument="--cut_nc" type="boolean" truevalue="--cut_nc" falsevalue="" label="use profile's NC noise cutoffs to set all thresholding" />
366 <param argument="--cut_tc" type="boolean" truevalue="--cut_tc" falsevalue="" label="use profile's TC trusted cutoffs to set all thresholding" />
367 </xml>
368 <!-- Emits the selected model construction strategy as a flag, symfrac only for the fast strategy, and fragthresh only when its value stringifies to a non-empty string. --> <token name="@MCSS@">
369 --$mcs.model_construction_strategy_select
370
371 #if $mcs.model_construction_strategy_select == "fast":
372 --symfrac $mcs.symfrac
373 #end if
374 #if str($fragthresh):
375 --fragthresh $fragthresh
376 #end if
377 </token>
378 <xml name="mcss">
379 <!-- Model construction strategy, plus the fragment threshold that applies to either strategy -->
380 <conditional name="mcs">
381 <param type="select" name="model_construction_strategy_select" label="Model Construction Strategy">
382 <option selected="true" value="fast">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
383 <option value="hand">Manual construction (requires reference annotation) (--hand)</option>
384 </param>
385 <when value="fast">
386 <param type="float" argument="--symfrac" value="0.5" label="Sets sym fraction controlling --fast construction"/>
387 </when>
388 <when value="hand"/>
389 </conditional>
390 <param type="float" argument="--fragthresh" optional="true" value="0.5" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns"/>
391 </xml>
392 <!-- Inserts the selected prior flag verbatim; the default select value is empty, so nothing is emitted unless a prior is chosen. --> <token name="@PRIOR@">
393 $aps_select
394 </token>
395 <xml name="prior"> <!-- Prior strategy choice; the option values are the command-line flags themselves. -->
396 <param type="select" name="aps_select" label="Alternative Prior Strategies">
397 <option selected="true" value="">Unspecified</option>
398 <option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
399 <option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
400 </param>
401 </xml>
402 <xml name="citation"> <!-- Single DOI citation block. -->
403 <citations>
404 <citation type="doi">10.1093/nar/gkr367</citation>
405 </citations>
406 </xml>
407 <!-- Emits w_beta and w_length only when their value stringifies to a non-empty string. --> <token name="@LENGTHS@">
408 #if str($w_beta):
409 --w_beta $w_beta
410 #end if
411
412 #if str($w_length):
413 --w_length $w_length
414 #end if
415 </token>
416 <xml name="lengths"> <!-- Optional window-length controls. -->
417 <param type="float" argument="--w_beta" optional="true" label="Tail mass at which window length is determined"/>
418 <param type="integer" argument="--w_length" optional="true" label="Window Length"/>
419 </xml>
420 <!-- Command prelude that sets the Cheetah variable input_hmm_filename: for a history dataset it symlinks the file to localref.hmm and runs hmmpress on the link; otherwise it takes db_path from the selected data-table entry. --> <token name="@INPUTHMMCHOICE@"><![CDATA[
421 #if $input_hmm_conditional.input_hmm_source == "history":
422 #set $input_hmm_filename = "localref.hmm"
423 ln -s '${input_hmm_conditional.hmmfile}' '${input_hmm_filename}' &&
424 ## "Press" database
425 hmmpress '${input_hmm_filename}' &&
426 #else:
427 #set $input_hmm_filename = str($input_hmm_conditional.index.fields.db_path)
428 #end if
429 ]]></token>
430 <xml name="input_hmm_choice"> <!-- HMM database source: an entry of the hmm_database data table or a dataset from the history. -->
431 <conditional name="input_hmm_conditional">
432 <param type="select" name="input_hmm_source" label="Use a built-in HMM model database or own from your history">
433 <option selected="true" value="indexed">Use a built-in HMM model database</option>
434 <option value="history">Use a HMM database from history</option>
435 </param>
436 <when value="indexed">
437 <param type="select" name="index" label="Select a HMM model database" help="If your database of interest is not listed, contact the Galaxy administrator">
438 <options from_data_table="hmm_database">
439 <filter column="2" type="sort_by"/>
440 <validator message="No indexes are available for the selected input dataset" type="no_options"/>
441 </options>
442 </param>
443 </when>
444 <when value="history">
445 <param type="data" name="hmmfile" format="hmm2,hmm3" label="HMM model"/>
446 </when> <!-- end of history branch -->
447 </conditional> <!-- end of input_hmm_conditional -->
448 </xml>
449 <xml name="input_hmm"> <!-- Single HMM model input accepting the hmm2 and hmm3 datatypes. -->
450 <param type="data" name="hmmfile" format="hmm2,hmm3" label="HMM model"/>
451 </xml>
452 <xml name="input_msa"> <!-- Multiple sequence alignment input accepting the stockholm, clustal and fasta datatypes. -->
453 <param type="data" name="msafile" format="stockholm,clustal,fasta" label="Multiple Sequence Alignment"
454 help="in Stockholm, Clustal, or Fasta format. While this tool accepts fasta, please ensure that the sequences are not unaligned"/>
455 </xml>
456
457
458 <token name="@ACCEL_HEUR_HELP@"><![CDATA[
459 Acceleration Heuristics (--F1, --F2, --F3)
460 -------------------------------------------
461
462 **MSV filter**
463
464 The sequence is aligned to the profile using a specialized model that
465 allows multiple high-scoring local ungapped segments to match. The
466 optimal alignment score (Viterbi score) is calculated under this
467 multi-segment model, hence the term MSV, for “multi-segment Viterbi”. This is
468 HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
469 score (optimal sum of ungapped alignment segments). Roughly speaking,
470 MSV is comparable to skipping the heuristic word hit and hit extension
471 steps of the BLAST acceleration algorithm.
472
473 The MSV filter is very, very fast. In addition to avoiding indel
474 calculations in the dynamic programming table, it uses reduced precision
475 scores scaled to 8-bit integers, enabling acceleration via 16-way
476 parallel SIMD vector instructions.
477
478 The MSV score is a true log-odds likelihood ratio, so it obeys
479 conjectures about the expected score distribution (Eddy, 2008) that
480 allow immediate and accurate calculation of the statistical significance
481 (P-value) of the MSV bit score.
482
483 By default, comparisons with a P-value of ≤ 0.02 pass this filter,
484 meaning that about 2% of nonhomologous sequences are expected to pass.
485 You can use the --F1 option to change this threshold. For example, --F1
486 <0.05> would pass 5% of the comparisons, making a search more sensitive
487 but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
488 that all comparisons will pass. Shutting off the MSV filter may be
489 worthwhile if you want to make sure you don’t miss comparisons that have
490 a lot of scattered insertions and deletions. Alternatively, the --max
491 option causes the MSV filter step (and all other filter steps) to be
492 bypassed.
493
494 The MSV bit score is calculated as a log-odds score using the null model
495 for comparison. No correction for a biased composition or repetitive
496 sequence is done at this stage. For comparisons involving biased
497 sequences and/or profiles, more than 2% of comparisons will pass the MSV
498 filter. At the end of search output, there is a line like:
499
500 Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)
501
502 which tells you how many and what fraction of comparisons passed the MSV
503 filter, versus how many (and what fraction) were expected.
504
505 **Viterbi filter**
506
507 The sequence is now aligned to the profile using a fast Viterbi algorithm for
508 optimal gapped alignment.
509
510 This Viterbi implementation is specialized for speed. It is implemented in
511 8-way parallel SIMD vector instructions, using reduced precision scores that
512 have been scaled to 16-bit integers. Only one row of the dynamic programming
513 matrix is stored, so the routine only recovers the score, not the optimal
514 alignment itself. The reduced representation has limited range; local alignment
515 scores will not underflow, but high scoring comparisons can overflow and return
516 infinity, in which case they automatically pass the filter.
517
518 The final Viterbi filter bit score is then computed using the appropriate null
519 model log likelihood (by default the biased composition filter model score, or
520 if the biased filter is off, just the null model score). If the P-value of this
521 score passes the Viterbi filter threshold, the sequence passes on to the next
522 step of the pipeline.
523
524 The --F2 <x> option controls the P-value threshold for passing the Viterbi
525 filter score. The default is 0.001. The --max option bypasses all filters in
526 the pipeline. At the end of a search output, you will see a line like:
527
528 Passed Vit filter: 2207 (0.00443803); expected 497.3 (0.001)
529
530 which tells you how many and what fraction of comparisons passed the Viterbi
531 filter, versus how many were expected.
532
533 **Forward filter/parser**
534
535 The sequence is now aligned to the profile using the full Forward algorithm,
536 which calculates the likelihood of the target sequence given the profile,
537 summed over the ensemble of all possible alignments.
538
539 This is a specialized time- and memory-efficient Forward implementation called
540 the “Forward parser”. It is implemented in 4-way parallel SIMD vector
541 instructions, in full precision (32-bit floating point). It stores just enough
542 information that, in combination with the results of the Backward parser
543 (below), posterior probabilities of start and stop points of alignments
544 (domains) can be calculated in the domain definition step (below), although the
545 detailed alignments themselves cannot be.
546
547 The Forward filter bit score is calculated by correcting this score using the
548 appropriate null model log likelihood (by default the biased composition filter
549 model score, or if the biased filter is off, just the null model score). If the
550 P-value of this bit score passes the Forward filter threshold, the sequence
551 passes on to the next step of the pipeline.
552
553 The bias filter score has no further effect in the pipeline. It is only used in
554 filter stages. It has no effect on final reported bit scores or P-values.
555 Biased composition compensation for final bit scores is done by a more complex
556 domain-specific algorithm, described below.
557
558 The --F3 <x> option controls the P-value threshold for passing the Forward
559 filter score. The default is 1e-5. The --max option bypasses all filters in the
560 pipeline. At the end of a search output, you will see a line like:
561
562 Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)
563
564 which tells you how many and what fraction of comparisons passed the Forward
565 filter, versus how many were expected.
566
567 **Bias Filter Options**
568
569 The --max option bypasses all filters in the pipeline, including the bias
570 filter.
571
572 The --nobias option turns off (bypasses) the biased composition filter. The
573 simple null model is used as a null hypothesis for MSV and in subsequent filter
574 steps. The biased composition filter step compromises a small amount of
575 sensitivity. Though it is good to have it on by default, you may want to shut
576 it off if you know you will have no problem with biased composition hits.
577
578
579 **Advanced Documentation**
580
581 A more detailed look at the internals of the various filter pipelines was
582 posted on the `developer's blog <http://cryptogenomicon.org/hmmer3-is-stubborn.html>`__.
583 The information posted there may be useful to those who are struggling with
584 poor-scoring sequences.
585 ]]></token>
586 <token name="@ADV_OPTS_HELP@"><![CDATA[
587 Advanced Options
588 ----------------
589
590 **nonull2**
591
592 The bias correction can be too aggressive sometimes, causing you to miss homologs. You can turn the
593 biased-composition score correction off with the --nonull2 option (and if
594 you’re doing that, you may also want to set --nobias, to turn off another
595 biased composition step called the bias filter, which affects which sequences
596 get scored at all).
597
598 **domZ**
599
600 Assert that the total number of targets in your searches is <x>, for the
601 purposes of per-domain conditional E-value calculations, rather than the number
602 of targets that passed the reporting thresholds.
603
604 **Z**
605
606 Assert that the total number of targets in your searches is <x>, for the
607 purposes of per-sequence E-value calculations, rather than the actual number of
608 targets seen.
609 ]]></token>
610 <token name="@AEEWS_HELP@"><![CDATA[
611 Effective Sequence Number
612 -------------------------
613
614 After relative weights are determined, they are normalized to sum to a total
615 effective sequence number, eff nseq. This number may be the actual number of
616 sequences in the alignment, but it is almost always smaller than that. The
617 default entropy weighting method (--eent) reduces the effective sequence
618 number to reduce the information content (relative entropy, or average expected
619 score on true homologs) per consensus position. The target relative entropy is
620 controlled by a two-parameter function, where the two parameters are settable
621 with --ere and --esigma.
622
623 **--eent**
624
625 Adjust effective sequence number to achieve a specific relative entropy per
626 position (see --ere). This is the default.
627
628 **--eclust**
629
630 Set effective sequence number to the number of single-linkage clusters at a
631 specific identity threshold (see --eid). This option is not recommended; it’s
632 for experiments evaluating how much better --eent is.
633
634 **--enone**
635
636 Turn off effective sequence number determination and just use the actual number
637 of sequences. One reason you might want to do this is to try to maximize the
638 relative entropy/position of your model, which may be useful for short models.
639
640 **--eset**
641
642 Explicitly set the effective sequence number for all models to <x>.
643
644 **--ere**
645
646 Set the minimum relative entropy/position target to <x>. Requires --eent. Default
647 depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
648 for nucleotide sequences, it is 0.45 bits/position.
649
650 **--esigma**
651
652 Sets the minimum relative entropy contributed by an entire model alignment, over
653 its whole length. This has the effect of making short models have higher relative
654 entropy per position than --ere alone would give. The default is 45.0 bits.
655
656 **--eid**
657
658 Sets the fractional pairwise identity cutoff used by single linkage clustering
659 with the --eclust option. The default is 0.62.
660 ]]></token>
661 <token name="@ARSWS_HELP@"><![CDATA[
662 Options Controlling Relative Weights
663 ------------------------------------
664
665 HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
666 sequences and up-weight distantly related ones. This has the effect of making
667 models less biased by uneven phylogenetic representation. For example, two
668 identical sequences would typically each receive half the weight that one
669 sequence would. These options control which algorithm gets used.
670
671
672 **--wpb**
673
674 Use the Henikoff position-based sequence weighting scheme [Henikoff and
675 Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
676
677 **--wgsc**
678
679 Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
680 Mol. Biol. 235:1067, 1994].
681
682 **--wblosum**
683
684 Use the same clustering scheme that was used to weight data in calculating
685 BLOSUM substitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
686 89:10915, 1992]. Sequences are single-linkage clustered at an identity
687 threshold (default 0.62; see --wid) and within each cluster of c sequences,
688 each sequence gets relative weight 1/c.
689
690 **--wnone**
691
692 No relative weights. All sequences are assigned uniform weight.
693
694 **--wid**
695
696 Sets the identity threshold used by single-linkage clustering when using
697 --wblosum. Invalid with any other weighting scheme. Default is 0.62.
698 ]]></token>
699 <token name="@BIAS_COMP_HELP@"><![CDATA[
700 Bias Composition
701 ----------------
702
703 The next number, the bias, is a correction term for biased sequence composition
704 that has been applied to the sequence bit score. For instance, for the top hit
705 MYG_PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
706 originally scored 225.9 bits, which was adjusted by the slight 3.2 bit
707 biased-composition correction. The only time you really need to pay attention to the
708 bias value is when it’s large, on the same order of magnitude as the sequence
709 bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
710 allows a non-homolog to retain too much score. Conversely, the bias correction
711 can be too aggressive sometimes, causing you to miss homologs. You can turn the
712 biased-composition score correction off with the --nonull2 option (and if
713 you’re doing that, you may also want to set --nobias, to turn off another
714 biased composition step called the bias filter, which affects which sequences
715 get scored at all).
716
717 ]]></token>
718 <token name="@CUT_HELP@"><![CDATA[
719 Options for Model-specific Score Thresholding
720 ---------------------------------------------
721
722 Curated profile databases may define specific bit score thresholds for each
723 profile, superseding any thresholding based on statistical significance alone.
724 To use these options, the profile must contain the appropriate (GA, TC, and/or
725 NC) optional score threshold annotation; this is picked up by hmmbuild from
726 Stockholm format alignment files. Each thresholding option has two scores: the
727 per-sequence threshold <x1> and the per-domain threshold <x2>. These act as if
728 -T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
729 each model’s curated thresholds.
730
731 **--cut_ga**
732
733 Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
734 per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
735 generally considered to be the reliable curated thresholds defining family
736 membership; for example, in Pfam, these thresholds define what gets included in
737 Pfam Full alignments based on searches with Pfam Seed models.
738
739 **--cut_nc**
740
741 Use the NC (noise cutoff) bit score thresholds in the model to set
742 per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
743 thresholds are generally considered to be the score of the highest-scoring
744 known false positive.
745
746 **--cut_tc**
747
748 Use the TC (trusted cutoff) bit score thresholds in the model to set
749 per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
750 thresholds are generally considered to be the score of the lowest-scoring known
751 true positive that is above all known false positives.
752 ]]></token>
753 <token name="@EVAL_CALIB_HELP@"><![CDATA[
754 Options Controlling H3 Parameter Estimation Methods
755 ---------------------------------------------------
756
757 H3 uses three short random sequence simulations to estimate the location
758 parameters for the expected score distributions for MSV scores, Viterbi scores,
759 and Forward scores. These options allow these simulations to be modified.
760
761 **--EmL**
762
763 Sets the sequence length in simulation that estimates the location parameter mu
764 for MSV E-values. Default is 200.
765
766 **--EmN**
767
768 Sets the number of sequences in simulation that estimates the location parameter
769 mu for MSV E-values. Default is 200.
770
771 **--EvL**
772
773 Sets the sequence length in simulation that estimates the location parameter mu
774 for Viterbi E-values. Default is 200.
775
776 **--EvN**
777
778 Sets the number of sequences in simulation that estimates the location parameter
779 mu for Viterbi E-values. Default is 200.
780
781
782 **--EfL**
783
784 Sets the sequence length in simulation that estimates the location parameter tau
785 for Forward E-values. Default is 100.
786
787 **--EfN**
788
789 Sets the number of sequences in simulation that estimates the location parameter
790 tau for Forward E-values. Default is 200.
791
792 **--Eft**
793
794 Sets the tail mass fraction to fit in the simulation that estimates the location
795 parameter tau for Forward E-values. Default is 0.04.
796 ]]></token>
797 <token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
798 Options for Specifying the Alphabet
799 -----------------------------------
800
801 The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
802 at the composition of the msafile. Autodetection is normally quite reliable,
803 but occasionally alphabet type may be ambiguous and autodetection can fail (for
804 instance, on tiny toy alignments of just a few residues). To avoid this, or to
805 increase robustness in automated analysis pipelines, you may specify the
806 alphabet type of msafile with these options.
807 ]]></token>
808 <token name="@HSSI_HELP@"><![CDATA[
809 Options Controlling Single Sequence Scoring (First Iteration)
810 -------------------------------------------------------------
811
812 By default, the first iteration uses a search model constructed from a single
813 query sequence. This model is constructed using a standard 20x20 substitution
814 matrix for residue probabilities, and two additional parameters for
815 position-independent gap open and gap extend probabilities. These options allow
816 the default single-sequence scoring parameters to be changed.
817
818 **Gap Open (--popen)**
819
820 Set the gap open probability for a single sequence query model to <x>.
821
822 **Gap Extend (--pextend)**
823
824 Set the gap extend probability for a single sequence query model to <x>.
825
826
827 **--mx/--mxfile**
828
829 These options are not currently supported
830 ]]></token>
831 <token name="@LENGTHS_HELP@"><![CDATA[
832 Tail Mass Options
833 -----------------
834
835 **Window length tail mass (--w_beta)**
836
837 The upper bound, W, on the length at which nhmmer expects to find an instance
838 of the model is set such that the fraction of all sequences generated by the
839 model with length >= W is less than <x>. The default is 1e-7.
840
841
842 **Model instance length upper bound (--w_length)**
843
844 Override the model instance length upper bound, W, which is otherwise
845 controlled by --w_beta. It should be larger than the model length. The value of
846 W is used deep in the acceleration pipeline, and modest changes are not
847 expected to impact results (though larger values of W do lead to longer run
848 time).
849
850 ]]></token>
851 <token name="@MCSS_HELP@"><![CDATA[
852 **Options Controlling Profile Construction**
853
854 These options control how consensus columns are defined in an alignment.
855
856 **--fast**
857
858 Define consensus columns as those that have a fraction >= symfrac of residues
859 as opposed to gaps. (See below for the --symfrac option.) This is the default.
860
861 **--hand**
862
863 Define consensus columns in next profile using reference annotation to the multiple
864 alignment. This allows you to define any consensus columns you like.
865
866
867 **--symfrac**
868
869 Define the residue fraction threshold necessary to define a consensus column
870 when using the --fast option. The default is 0.5. The symbol fraction in each
871 column is calculated after taking relative sequence weighting into account, and
872 ignoring gap characters corresponding to ends of sequence fragments (as opposed
873 to internal insertions/deletions). Setting this to 0.0 means that every
874 alignment column will be assigned as consensus, which may be useful in some
875 cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
876 insertions/deletions) will be assigned as consensus.
877
878 **--fragthresh**
879
880 We only want to count terminal gaps as deletions if the aligned sequence is
881 known to be full-length, not if it is a fragment (for instance, because only
882 part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
883 sequence length L is less than or equal to a fraction <x> times the alignment
884 length in columns, then the sequence is handled as a fragment. The default is
885 0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
886 you might want to do this if you know you’ve got a carefully curated alignment
887 of full-length sequences. Setting --fragthresh 1 will define all sequences as
888 fragments; you might want to do this if you know your alignment is entirely
889 composed of fragments, such as translated short reads in metagenomic shotgun
890 data.
891
892 ]]></token>
893 <token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
894 Options for Controlling Output
895 ------------------------------
896
897 **Table of hits**
898
899 Save a simple tabular (space-delimited) file summarizing the per-target output, with
900 one data line per homologous target model found.
901
902 **Table of per-domain hits**
903
904 Save a simple tabular (space-delimited) file summarizing the per-domain output,
905 with one data line per homologous domain detected in a query sequence for each
906 homologous model.
907
908 **Table of hits and domains in Pfam Format**
909
910 Save an especially succinct tabular (space-delimited) file summarizing the
911 per-target output, with one data line per homologous target model found.
912 ]]></token>
913 <token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
914 Options for Controlling Output
915 ------------------------------
916
917 **Table of hits**
918
919 Save a simple tabular (space-delimited) file summarizing the per-target output, with
920 one data line per homologous target model found.
921
922 **Table of per-domain hits**
923
924 Save a simple tabular (space-delimited) file summarizing the per-domain output,
925 with one data line per homologous domain detected in a query sequence for each
926 homologous model.
927 ]]></token>
928 <token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
929 Options for Controlling Output
930 ------------------------------
931
932 **Table of hits**
933
934 Save a simple tabular (space-delimited) file summarizing the per-target output, with
935 one data line per homologous target model found.
936
937 **Table of hits (dfam)**
938
939 Save a tabular (space-delimited) file summarizing the per-hit output, similar
940 to --tblout but more succinct.
941
942
943 **List of per-position scores for each hit (--aliscoreout)**
944
945 Save to file a list of per-position scores for each hit. This is useful, for
946 example, in identifying regions of high score density for use in resolving
947 overlapping hits from different models.
948
949 ]]></token>
950 <token name="@PRIOR_HELP@"><![CDATA[
951 Options Controlling Priors
952 --------------------------
953
954 By default, weighted counts are converted to mean posterior probability
955 parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
956 prior parameters for protein models and for nucleic acid (RNA and DNA) models
957 are built in. The following options allow you to override the default priors.
958
959 **No priors (--pnone)**
960
961 Don’t use any priors. Probability parameters will simply be the observed
962 frequencies, after relative sequence weighting.
963
964 **Laplace +1 prior**
965
966 Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
967 ]]></token>
968 <token name="@SEED_HELP@"><![CDATA[
969 Random Seeding
970 --------------
971
972 Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
973 any stochastic simulations will be reproducible; the same command will give the
974 same results. If <n> is 0, the random number generator is seeded arbitrarily,
975 and stochastic simulations will vary from run to run of the same command.
976
977 ]]></token>
978 <token name="@THRESHOLDS_HELP@"><![CDATA[
979 Options for Reporting Thresholds
980 --------------------------------
981
982 Reporting thresholds control which hits are reported in output files (the main
983 output, --tblout, and --domtblout).
984
985 **E-value (-E)**
986
987 In the per-target output, report target profiles with an E-value of <= <x>. The
988 default is 10.0, meaning that on average, about 10 false positives will be
989 reported per query, so you can see the top of the noise and decide for yourself
990 if it’s really noise.
991
992 **Bit score (-T)**
993
994 Instead of thresholding per-profile output on E-value, instead report target profiles
995 with a bit score of >= <x>.
996
997 **domain E-value (--domE)**
998
999 In the per-domain output, for target profiles that have already satisfied the
1000 per-profile reporting threshold, report individual domains with a conditional
1001 E-value of <= <x>. The default is 10.0. A conditional E-value means the
1002 expected number of additional false positive domains in the smaller search
1003 space of those comparisons that already satisfied the per-profile reporting
1004 threshold (and thus must have at least one homologous domain already).
1005
1006 **domain Bit scores (--domT)**
1007
1008 Instead of thresholding per-domain output on E-value, instead report domains
1009 with a bit score of >= <x>.
1010
1011 Options for Inclusion Thresholds
1012 --------------------------------
1013
1014 Inclusion thresholds are stricter than reporting thresholds. Inclusion
1015 thresholds control which hits are considered to be reliable enough to be
1016 included in an output alignment or a subsequent search round. In hmmscan, which
1017 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
1018 search steps (like jackhmmer), inclusion thresholds have little effect. They
1019 only affect what domains get marked as significant (!) or questionable (?) in
1020 domain output.
1021
1022 **E-value of per target inclusion threshold**
1023
1024 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
1025 0.01, meaning that on average, about 1 false positive would be expected in
1026 every 100 searches with different query sequences.
1027
1028 **Bit score of per target inclusion threshold**
1029
1030 Instead of using E-values for setting the inclusion threshold, instead use a
1031 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1032 to use bit score thresholds with hmmscan, because you don’t expect a single
1033 score threshold to work for different profiles; different profiles have
1034 slightly different expected score distributions.
1035
1036 **domain E-value per target inclusion threshold**
1037
1038 Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
1039 targets that have already satisfied the overall per-target inclusion threshold.
1040
1041 **domain Bit score per target inclusion threshold**
1042
1043 Instead of using E-values, instead use a bit score of >= <x> as the per-domain
1044 inclusion threshold. As with --incT above, it would be unusual to use a single
1045 bit score threshold in hmmscan.
1046
1047 ]]></token>
1048 <token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
1049 Options for Reporting Thresholds
1050 --------------------------------
1051
1052 Reporting thresholds control which hits are reported in output files (the main
1053 output, --tblout, and --domtblout).
1054
1055 **E-value (-E)**
1056
1057 In the per-target output, report target profiles with an E-value of <= <x>. The
1058 default is 10.0, meaning that on average, about 10 false positives will be
1059 reported per query, so you can see the top of the noise and decide for yourself
1060 if it’s really noise.
1061
1062 **Bit score (-T)**
1063
1064 Instead of thresholding per-profile output on E-value, instead report target profiles
1065 with a bit score of >= <x>.
1066
1067 Options for Inclusion Thresholds
1068 --------------------------------
1069
1070 Inclusion thresholds are stricter than reporting thresholds. Inclusion
1071 thresholds control which hits are considered to be reliable enough to be
1072 included in an output alignment or a subsequent search round. In hmmscan, which
1073 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
1074 search steps (like jackhmmer), inclusion thresholds have little effect. They
1075 only affect what domains get marked as significant (!) or questionable (?) in
1076 domain output.
1077
1078 **E-value of per target inclusion threshold**
1079
1080 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
1081 0.01, meaning that on average, about 1 false positive would be expected in
1082 every 100 searches with different query sequences.
1083
1084 **Bit score of per target inclusion threshold**
1085
1086 Instead of using E-values for setting the inclusion threshold, instead use a
1087 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1088 to use bit score thresholds with hmmscan, because you don’t expect a single
1089 score threshold to work for different profiles; different profiles have
1090 slightly different expected score distributions.
1091
1092 ]]></token>
1093 <token name="@ATTRIBUTION@"><![CDATA[
1094
1095 Attribution
1096 -----------
1097
1098 This Galaxy tool relies on HMMER3_.
1099 Internally the software is cited as:
1100
1101 ::
1102
1103 # hmmscan :: search sequence(s) against a profile database
1104 # HMMER 3.1 (February 2013); http://hmmer.org/
1105 # Copyright (C) 2011 Howard Hughes Medical Institute.
1106 # Freely distributed under the GNU General Public License (GPLv3).
1107 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1108
1109 The wrappers were written by the IUC and are licensed under Apache2_. The
1110 documentation is copied from the HMMER3 documentation.
1111
1112 .. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
1113 .. _HMMER3: http://hmmer.org/
1114
1115
1116 ]]></token>
1117 <token name="@HELP_PRE@"><![CDATA[
1118
1119 What it does
1120 ============
1121 ]]></token>
1122 <token name="@HELP_PRE_OTH@"><![CDATA[
1123 Options
1124 =======
1125 ]]></token>
1126 </macros>