hmmer_hmmsearch: macros.xml.orig comparison

comparison macros.xml.orig @ 11:405dd85a9408 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hmmer3 commit e0d4688a59e6eeba33adcfe803ac43d0bc2863e7"

author	iuc
date	Tue, 31 Aug 2021 08:43:59 +0000
parents
children

comparison

equal deleted inserted replaced

-:ffeedf9b8dce
+:405dd85a9408
+<?xml version="1.0"?>
+<macros>
+<!-- Macro "requirements": declares the hmmer package pinned to @TOOL_VERSION@. The yield placeholder receives any child elements supplied by the expanding tool. -->
+<xml name="requirements">
+<requirements>
+<requirement type="package" version="@TOOL_VERSION@">hmmer</requirement>
+<yield/>
+</requirements>
+</xml>
+<!-- Macro "bio_tools": cross-reference to the bio.tools registry entry for this tool suite. -->
+<!-- Resolved from an unresolved merge conflict: the incoming side ("fix id of hmmer3") is kept; the other side pointed at the unrelated id "gemini". -->
+<xml name="bio_tools">
+<xrefs>
+<xref type="bio.tools">hmmer3</xref>
+</xrefs>
+</xml>
+<!-- Version string substituted wherever @TOOL_VERSION@ appears, e.g. in the hmmer package requirement. -->
+<token name="@TOOL_VERSION@">3.3.2</token>
+<!-- Macro "stdio": error detection rules shared by the tools that expand it. -->
+<xml name="stdio">
+<stdio>
+<!-- Anything other than zero is an error -->
+<exit_code range="1:"/>
+<exit_code range=":-1"/>
+<!-- In case the return code has not been set properly, check stderr too -->
+<regex match="Error:"/>
+<regex match="Exception:"/>
+</stdio>
+</xml>
+<!-- Command-line fragment for reporting and inclusion thresholds. E and domE are always emitted; T, domT, incE, incdomE, incT and incdomT are emitted only when the parameter renders as a non-empty string. -->
+<token name="@THRESHOLDS@">
+-E $E
+--domE $domE
+#if str($T):
+-T $T
+#end if
+#if str($domT):
+--domT $domT
+#end if
+#if str($incE):
+--incE $incE
+#end if
+#if str($incdomE):
+--incdomE $incdomE
+#end if
+#if str($incT):
+--incT $incT
+#end if
+#if str($incdomT):
+--incdomT $incdomT
+#end if
+</token>
+<!-- Macro "thresholds_xml": input parameters read by the @THRESHOLDS@ token. E and domE default to 10.0 with a minimum of 0; the other six thresholds are optional floats without a default. -->
+<xml name="thresholds_xml">
+<!-- Options controlling reporting thresholds -->
+<param argument="-E" type="float" min="0" value="10.0" label="report sequences &lt;= this E-Value threshold in output" />
+<param argument="--domE" type="float" min="0" value="10.0" label="report domains &lt;= this E-Value threshold in output" />
+<param argument="-T" type="float" optional="true" label="report sequences &gt;= this score threshold in output" />
+<param argument="--domT" type="float" optional="true" label="report domains &gt;= this score threshold in output" />
+<!-- Options controlling inclusion (significance) thresholds -->
+<param argument="--incE" type="float" optional="true" label="consider sequences &lt;= this E-Value threshold as significant" />
+<param argument="--incdomE" type="float" optional="true" label="consider domains &lt;= this E-Value threshold as significant" />
+<param argument="--incT" type="float" optional="true" label="consider sequences &gt;= this score threshold as significant" />
+<param argument="--incdomT" type="float" optional="true" label="consider domains &gt;= this score threshold as significant" />
+</xml>
+<!-- Variant of @THRESHOLDS@ without the per-domain options: E is always emitted; T, incE and incT only when the parameter renders as a non-empty string. -->
+<token name="@THRESHOLDS_NODOM@">
+-E $E
+#if str($T):
+-T $T
+#end if
+#if str($incE):
+--incE $incE
+#end if
+#if str($incT):
+--incT $incT
+#end if
+</token>
+<!-- Macro "thresholds_nodom": per-sequence threshold parameters read by the @THRESHOLDS_NODOM@ token (no per-domain parameters). -->
+<xml name="thresholds_nodom">
+<!-- Options controlling reporting thresholds -->
+<param argument="-E" type="float" min="0" value="10.0" label="report sequences &lt;= this E-Value threshold in output" />
+<param argument="-T" type="float" optional="true" label="report sequences &gt;= this score threshold in output" />
+<!-- Options controlling inclusion (significance) thresholds -->
+<param argument="--incE" type="float" optional="true" label="consider sequences &lt;= this E-Value threshold as significant" />
+<param argument="--incT" type="float" optional="true" label="consider sequences &gt;= this score threshold as significant" />
+</xml>
+<!-- Command-line fragment for the acceleration heuristics. F1, F2 and F3 are always passed; $max and $nobias render as the flag itself or as an empty string (see truevalue/falsevalue in accel_heur_xml). -->
+<token name="@ACCEL_HEUR@">
+$max
+--F1 $F1
+--F2 $F2
+--F3 $F3
+$nobias
+</token>
+<!-- Macro "accel_heur_xml": input parameters read by the @ACCEL_HEUR@ token. The two booleans carry the literal flag as their true value so the token can print them directly. -->
+<xml name="accel_heur_xml">
+<!-- Options controlling acceleration heuristics -->
+<param argument="--max" type="boolean" truevalue="--max" falsevalue="" label="Turn all heuristic filters off (less speed, more power)" />
+<param argument="--F1" type="float" value="0.02" label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1" />
+<param argument="--F2" type="float" value="1e-3" label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2" />
+<param argument="--F3" type="float" value="1e-5" label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3" />
+<param argument="--nobias" type="boolean" truevalue="--nobias" falsevalue="" label="Turn off composition bias filter" />
+</xml>
+<!-- Command-line fragment for E-value calibration: all seven options are always emitted with the values of the parameters declared in eval_calib_xml. -->
+<token name="@EVAL_CALIB@">
+--EmL $EmL
+--EmN $EmN
+--EvL $EvL
+--EvN $EvN
+--EfL $EfL
+--EfN $EfN
+--Eft $Eft
+</token>
+<!-- Macro "eval_calib_xml": input parameters read by the @EVAL_CALIB@ token. The six length/count parameters are integers of at least 1; Eft is a float between 0 and 1. -->
+<xml name="eval_calib_xml">
+<!-- Control of E-value calibration -->
+<param argument="--EmL" type="integer" min="1" value="200" label="Length of sequences for MSV Gumbel mu fit" />
+<param argument="--EmN" type="integer" min="1" value="200" label="Number of sequences for MSV Gumbel mu fit" />
+<param argument="--EvL" type="integer" min="1" value="200" label="Length of sequences for Viterbi Gumbel mu fit" />
+<param argument="--EvN" type="integer" min="1" value="200" label="Number of sequences for Viterbi Gumbel mu fit" />
+<param argument="--EfL" type="integer" min="1" value="100" label="Length of sequences for Forward exp tail tau fit" />
+<param argument="--EfN" type="integer" min="1" value="200" label="Number of sequences for Forward exp tail tau fit" />
+<param argument="--Eft" type="float" min="0" max="1" value="0.04" label="tail mass for Forward exponential tail tau fit" />
+</xml>
+<!-- Command-line fragment for output options. For each value selected in $oformat it emits an option of that name whose argument is the template variable of the same name, looked up with getVar; the fallback text is MISSING_OUTPUT followed by the name. The acc, noali and notextw flags are appended afterwards. -->
+<!-- NOTE(review): the looked-up variable is presumably the output dataset named like the format (see the "output" macros); confirm. -->
+<token name="@OFORMAT_WITH_OPTS@">
+#if $oformat:
+#for o in str($oformat).split(','):
+--$o '$getVar($o, 'MISSING_OUTPUT'+$o)'
+#end for
+#end if
+$acc $noali $notextw
+</token>
+<!-- Macro "oformat_with_opts": multi-select of output formats (tblout preselected) plus the acc, noali and notextw flags. The yield inside the select receives extra option elements from expanding macros. -->
+<xml name="oformat_with_opts">
+<!-- Options directing output -->
+<param name="oformat" type="select" multiple="true" display="checkboxes" label="Output Formats">
+<option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
+<yield/>
+</param>
+<param argument="--acc" type="boolean" truevalue="--acc" falsevalue="" label="Prefer accessions over names in output" />
+<param argument="--noali" type="boolean" truevalue="--noali" falsevalue="" label="Don't output alignments, so output is smaller" />
+<param argument="--notextw" type="boolean" truevalue="--notextw" falsevalue="" label="Unlimited ASCII text output line width" />
+</xml>
+<!-- Macro "oformat_with_opts_dom": oformat_with_opts plus a preselected domtblout choice; its own yield passes further options through. -->
+<xml name="oformat_with_opts_dom">
+<expand macro="oformat_with_opts">
+<option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
+<yield/>
+</expand>
+</xml>
+<!-- Macro "oformat_with_opts_dom_pfam": oformat_with_opts_dom plus a preselected pfamtblout choice. -->
+<xml name="oformat_with_opts_dom_pfam">
+<expand macro="oformat_with_opts_dom">
+<option value="pfamtblout" selected="true">Table of hits and domains in Pfam format (--pfamtblout)</option>
+</expand>
+</xml>
+<!-- Macro "oformat_with_opts_dfam_alisc": oformat_with_opts plus dfamtblout (preselected) and aliscoresout (not preselected). -->
+<xml name="oformat_with_opts_dfam_alisc">
+<!-- Options directing output -->
+<expand macro="oformat_with_opts">
+<option value="dfamtblout" selected="true">Table of hits in Dfam format (--dfamtblout)</option>
+<option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
+</expand>
+</xml>
+<!-- Macro "output": the main text output plus the tblout dataset, which is only created when tblout is among the selected formats. @TOOL@ in the labels comes from the "tool" token of the expand; the yield receives additional data elements. -->
+<xml name="output" token_tool="">
+<data name="output" format="txt" label="@TOOL@ on ${on_string}"/>
+<data name="tblout" format="txt" label="@TOOL@ on ${on_string}: per-sequence hits from HMM matches">
+<filter>oformat and 'tblout' in oformat</filter>
+</data>
+<yield/>
+</xml>
+<!-- Macro "output_dom": the "output" macro plus a domtblout dataset filtered on the selected formats; the trailing yield receives further data elements from expanding macros. -->
+<xml name="output_dom" token_tool="">
+<expand macro="output" tool="@TOOL@">
+<data name="domtblout" format="txt" label="@TOOL@ on ${on_string}: per-domain hits from HMM matches">
+<filter>oformat and 'domtblout' in oformat</filter>
+</data>
+</expand>
+<yield/>
+</xml>
+<!-- Macro "output_dom_pfam": the "output_dom" macro plus a pfamtblout dataset filtered on the selected formats. -->
+<xml name="output_dom_pfam" token_tool="">
+<expand macro="output_dom" tool="@TOOL@">
+<data name="pfamtblout" format="txt" label="@TOOL@ on ${on_string}: per-sequence/per-domain hits from HMM matches">
+<filter>oformat and 'pfamtblout' in oformat</filter>
+</data>
+</expand>
+</xml>
+<!-- Macro "output_dfam_alisc": the "output" macro plus dfamtblout and aliscoresout datasets, each filtered on the selected formats. -->
+<!-- NOTE(review): the ofvar and invar tokens declared here are not referenced inside this macro body; confirm whether they are still needed. -->
+<xml name="output_dfam_alisc" token_tool="" token_ofvar="seqfile" token_invar="seqdb">
+<expand macro="output" tool="@TOOL@">
+<data name="dfamtblout" format="txt" label="@TOOL@ on ${on_string}: per-sequence/per-domain hits from HMM matches">
+<filter>oformat and 'dfamtblout' in oformat</filter>
+</data>
+<data name="aliscoresout" format="txt" label="@TOOL@ on ${on_string}: scores for positional matches">
+<filter>oformat and 'aliscoresout' in oformat</filter>
+</data>
+</expand>
+</xml>
+<!-- Macro "assert_out": test assertions requiring a line that starts with "# " followed by the tool name, and a line matching "[ok]". -->
+<xml name="assert_out" token_tool="">
+<assert_contents>
+<has_line_matching expression="# @TOOL@.*"/>
+<has_line_matching expression="\[ok\]"/>
+</assert_contents>
+</xml>
+<!-- Macro "assert_tblout": test assertions requiring the "# Program:" header line naming the tool and a "# [ok]" line. -->
+<xml name="assert_tblout" token_tool="">
+<assert_contents>
+<has_line_matching expression="# Program:         @TOOL@"/>
+<has_line_matching expression="# \[ok\]"/>
+</assert_contents>
+</xml>
+<!-- Macro "oformat_test": test parameter that switches the notextw flag on. -->
+<xml name="oformat_test">
+<param name="notextw" value="true" />
+</xml>
+<!-- Command-line fragment for single-sequence input handling: popen and pextend are emitted only when the hssi selector is "singlemx". -->
+<token name="@HSSI@">
+#if $hssi.hssi_select == "singlemx":
+--popen $hssi.popen
+--pextend $hssi.pextend
+#end if
+</token>
+<!-- Macro "hssi": conditional read by the @HSSI@ token. "Disable" (default) exposes nothing; "singlemx" exposes the gap open and gap extend probabilities. -->
+<xml name="hssi">
+<!-- Handling single sequence inputs -->
+<conditional name="hssi">
+<param name="hssi_select" type="select" label="Options for handling single sequence inputs">
+<option value="false" selected="true">Disable</option>
+<option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
+</param>
+<when value="false" />
+<when value="singlemx">
+<param argument="--popen" type="float" min="0.0" max="0.5" value="0.02" label="Gap open probability" />
+<param argument="--pextend" type="float" min="0.0" max="1.0" value="0.4" label="Gap extend probability" />
+</when>
+<!-- -mx <s>      : substitution score matrix (built-in matrices, with -singlemx)-->
+<!-- -mxfile <f>  : read substitution score matrix from file <f> (with -singlemx)-->
+</conditional>
+</xml>
+<!-- Shell prologue: sets addthreads to GALAXY_SLOTS (1 when unset) and then decrements it; the trailing && chains into the command that follows. The result is read by the @CPU@ token. -->
+<!-- NOTE(review): the post-decrement form looks deliberate. In bash, (( expr )) returns a failing status when the expression evaluates to 0, and a post-decrement evaluates to the old value, so the chain survives when only one slot is available; confirm before changing. -->
+<token name="@ADDTHREADS@"><![CDATA[
+##compute the number of ADDITIONAL threads to be used (--cpu)
+addthreads=\${GALAXY_SLOTS:-1} && (( addthreads-- )) &&
+]]></token>
+<!-- Command-line fragment passing the shell variable addthreads (set by @ADDTHREADS@) as the cpu option; the backslash keeps the dollar sign for the shell instead of the template engine. -->
+<token name="@CPU@">
+--cpu \$addthreads
+</token>
+<!-- Command-line fragment passing the seed parameter declared in the "seed" macro. -->
+<token name="@SEED@">
+--seed $seed
+</token>
+<!-- Macro "seed": non-negative integer RNG seed, default 42, read by the @SEED@ token. -->
+<xml name="seed">
+<param argument="--seed" type="integer" min="0" value="42" label="RNG seed, 0 generates a random seed" />
+</xml>
+<!-- Macro "seed_test": test parameter fixing the seed to 4. -->
+<xml name="seed_test">
+<param name="seed" value="4" />
+</xml>
+<!-- Command-line fragment for the advanced options: $nonull2 renders as the flag or an empty string; Z and domZ are emitted only when the parameter renders as a non-empty string. -->
+<token name="@ADV_OPTS@">
+$nonull2
+#if str($Z):
+-Z $Z
+#end if
+#if str($domZ):
+--domZ $domZ
+#end if
+</token>
+<!-- Macro "adv_opts": input parameters read by the @ADV_OPTS@ token; Z and domZ are optional integers. -->
+<xml name="adv_opts">
+<!-- Other options -->
+<param argument="--nonull2" type="boolean" truevalue="--nonull2" falsevalue="" label="Turn off biased composition score corrections" />
+<param argument="-Z" type="integer" optional="true" label="# of comparisons done for E-value calculation" />
+<param argument="--domZ" type="integer" optional="true" label="# of significant sequences, for domain E-value calculation" />
+</xml>
+<!-- Command-line fragment printing the selected alphabet; the select values are the literal flags. -->
+<token name="@FORMAT_SELECTOR@">
+$input_format_select
+</token>
+<!-- Macro "format_selector": alphabet choice (protein, DNA or RNA) read by the @FORMAT_SELECTOR@ token. -->
+<xml name="format_selector">
+<param name="input_format_select" type="select" label="Format of sequence and model">
+<option value="--amino">Protein</option>
+<option value="--dna">DNA</option>
+<option value="--rna">RNA</option>
+</param>
+</xml>
+<!-- Macro "format_selector_noprot": same parameter as format_selector but without the protein choice. -->
+<xml name="format_selector_noprot">
+<param name="input_format_select" type="select" label="Format of sequence and model">
+<option value="--dna">DNA</option>
+<option value="--rna">RNA</option>
+</param>
+</xml>
+<!-- Command-line fragment for relative sequence weighting: prints the selected flag and, for the wblosum choice only, the wid identity cutoff. -->
+<token name="@ARSWS@">
+$arsws.arsws_select
+#if $arsws.arsws_select == "--wblosum":
+--wid $arsws.wid
+#end if
+</token>
+<!-- Macro "arsws": conditional read by the @ARSWS@ token. Each option value is the literal command-line flag; only the wblosum choice exposes an extra parameter, the wid identity cutoff. -->
+<xml name="arsws">
+<!-- Alternative relative sequence weighting strategies -->
+<conditional name="arsws">
+<param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
+<option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
+<option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
+<option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
+<option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
+<option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
+</param>
+<when value="--wpb">
+</when>
+<when value="--wgsc">
+</when>
+<when value="--wblosum">
+<param argument="--wid" type="float" value="0.62" label="Set identity cutoff" />
+</when>
+<when value="--wnone">
+</when>
+<when value="--wgiven">
+</when>
+</conditional>
+</xml>
+<!-- Command-line fragment for effective sequence weighting. Nothing is emitted when the selector is empty; otherwise the selected name is printed as a flag, followed by eset/ere/esigma for "eent" or eset/eid for "eclust". -->
+<token name="@AEEWS@">
+#if $aeews.aeews_select != "":
+--$aeews.aeews_select
+#if $aeews.aeews_select == "eent":
+--eset $aeews.eset
+--ere $aeews.ere
+--esigma $aeews.esigma
+#elif $aeews.aeews_select == "eclust":
+--eset $aeews.eset
+--eid $aeews.eid
+#end if
+#end if
+</token>
+<!-- Macro "aeews": conditional read by the @AEEWS@ token. The option values are flag names without the leading dashes; the empty value means no flag is passed. -->
+<xml name="aeews">
+<!-- Alternative effective sequence weighting strategies -->
+<conditional name="aeews">
+<param name="aeews_select" type="select" label="Alternative effective sequence weighting strategies">
+<option value="">Disabled</option>
+<option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
+<option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
+<option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
+</param>
+<when value="">
+</when>
+<when value="eent">
+<param argument="--eset" type="float" value="0" label="set eff seq # for all models" />
+<param argument="--ere" type="float" value="0" label="set minimum rel entropy/position" />
+<param argument="--esigma" type="float" value="45" label="set sigma param" />
+</when>
+<when value="eclust">
+<param argument="--eset" type="float" value="0" label="set eff seq # for all models" />
+<param argument="--eid" type="float" min="0" max="1" value="0.62" label="set fractional identity cutoff" />
+</when>
+<when value="enone">
+</when>
+</conditional>
+</xml>
+<!-- Command-line fragment for model-specific cutoffs: each variable renders as its flag or as an empty string (see the "cut" macro). -->
+<token name="@CUT@">
+$cut_ga
+$cut_nc
+$cut_tc
+</token>
+<!-- Macro "cut": boolean flags read by the @CUT@ token. GA is the gathering cutoff, NC the noise cutoff and TC the trusted cutoff; the labels name each one accordingly. -->
+<xml name="cut">
+<param argument="--cut_ga" type="boolean" truevalue="--cut_ga" falsevalue="" label="use profile's GA gathering cutoffs to set all thresholding" />
+<param argument="--cut_nc" type="boolean" truevalue="--cut_nc" falsevalue="" label="use profile's NC noise cutoffs to set all thresholding" />
+<param argument="--cut_tc" type="boolean" truevalue="--cut_tc" falsevalue="" label="use profile's TC trusted cutoffs to set all thresholding" />
+</xml>
+<!-- Command-line fragment for model construction: prints the selected strategy as a flag, adds symfrac for the "fast" strategy, and adds fragthresh only when that parameter renders as a non-empty string. -->
+<token name="@MCSS@">
+--$mcs.model_construction_strategy_select
+#if $mcs.model_construction_strategy_select == "fast":
+--symfrac $mcs.symfrac
+#end if
+#if str($fragthresh):
+--fragthresh $fragthresh
+#end if
+</token>
+<!-- Macro "mcss": parameters read by the @MCSS@ token. The conditional is named "mcs"; fragthresh sits outside the conditional and is optional. -->
+<xml name="mcss">
+<!-- Alternative model construction strategies -->
+<conditional name="mcs">
+<param name="model_construction_strategy_select" type="select" label="Model Construction Strategy">
+<option value="fast" selected="true">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
+<option value="hand">Manual construction (requires reference annotation) (--hand)</option>
+</param>
+<when value="fast">
+<param argument="--symfrac" value="0.5" type="float" label="Sets sym fraction controlling --fast construction"/>
+</when>
+<when value="hand"></when>
+</conditional>
+<param argument="--fragthresh" type="float" value="0.5" optional="true" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns" />
+</xml>
+<!-- Command-line fragment printing the selected prior strategy flag, or nothing for the "Unspecified" choice. -->
+<token name="@PRIOR@">
+$aps_select
+</token>
+<!-- Macro "prior": select read by the @PRIOR@ token; option values are the literal flags, with an empty default. -->
+<xml name="prior">
+<param name="aps_select" type="select" label="Alternative Prior Strategies">
+<option value="" selected="true">Unspecified</option>
+<option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
+<option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
+</param>
+</xml>
+<!-- Macro "citation": the single DOI citation shared by the tools that expand it. -->
+<xml name="citation">
+<citations>
+<citation type="doi">10.1093/nar/gkr367</citation>
+</citations>
+</xml>
+<!-- Command-line fragment for window length options: w_beta and w_length are each emitted only when the parameter renders as a non-empty string. -->
+<token name="@LENGTHS@">
+#if str($w_beta):
+--w_beta $w_beta
+#end if
+#if str($w_length):
+--w_length $w_length
+#end if
+</token>
+<!-- Macro "lengths": optional parameters read by the @LENGTHS@ token. -->
+<xml name="lengths">
+<param argument="--w_beta" type="float" optional="true" label="Tail mass at which window length is determined" />
+<param argument="--w_length" type="integer" optional="true" label="Window Length" />
+</xml>
+<!-- Prologue that defines the template variable input_hmm_filename. For a history dataset it symlinks the file to localref.hmm and runs hmmpress on it; otherwise it takes the db_path field of the selected data table entry. -->
+<token name="@INPUTHMMCHOICE@"><![CDATA[
+#if $input_hmm_conditional.input_hmm_source == "history":
+#set $input_hmm_filename = "localref.hmm"
+ln -s '${input_hmm_conditional.hmmfile}' '${input_hmm_filename}' &&
+## "Press" database
+hmmpress '${input_hmm_filename}' &&
+#else:
+#set $input_hmm_filename = str($input_hmm_conditional.index.fields.db_path)
+#end if
+]]></token>
+<!-- Macro "input_hmm_choice": conditional read by the @INPUTHMMCHOICE@ token. "indexed" (default) picks an entry from the hmm_database data table, sorted by column 2 and rejected when the list is empty; "history" takes an hmm2 or hmm3 dataset. -->
+<xml name="input_hmm_choice">
+<conditional name="input_hmm_conditional">
+<param name="input_hmm_source" type="select" label="Use a built-in HMM model database or own from your history" >
+<option value="indexed" selected="true">Use a built-in HMM model database</option>
+<option value="history">Use a HMM database from history</option>
+</param>
+<when value="indexed">
+<param name="index" type="select" label="Select a HMM model database" help="If your database of interest is not listed, contact the Galaxy administrator">
+<options from_data_table="hmm_database">
+<filter type="sort_by" column="2"/>
+<validator type="no_options" message="No indexes are available for the selected input dataset"/>
+</options>
+</param>
+</when>
+<when value="history">
+<param name="hmmfile" type="data" format="hmm2,hmm3" label="HMM model" />
+</when>  <!-- history -->
+</conditional>  <!-- input_hmm_conditional -->
+</xml>
+<!-- Macro "input_hmm": a single HMM model dataset input in hmm2 or hmm3 format. -->
+<xml name="input_hmm">
+<param name="hmmfile" type="data" format="hmm2,hmm3" label="HMM model" />
+</xml>
+<!-- Macro "input_msa": a multiple sequence alignment dataset input accepting the stockholm, clustal and fasta datatypes. -->
+<xml name="input_msa">
+<param name="msafile" type="data" label="Multiple Sequence Alignment" format="stockholm,clustal,fasta"
+help="in Stockholm, Clustal, or Fasta format. While this tool accepts fasta, please ensure that the sequences are not unaligned"/>
+</xml>
+<token name="@ACCEL_HEUR_HELP@"><![CDATA[
+Acceleration Heuristics (--F1, --F2, --F3)
+-------------------------------------------
+**MSV filter**
+The sequence is aligned to the profile using a specialized model that
+allows multiple high-scoring local ungapped segments to match. The
+optimal alignment score (Viterbi score) is calculated under this multi-
+segment model, hence the term MSV, for “multi-segment Viterbi”. This is
+HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
+score (optimal sum of ungapped alignment segments). Roughly speaking,
+MSV is comparable to skipping the heuristic word hit and hit extension
+steps of the BLAST acceleration algorithm.
+The MSV filter is very, very fast. In addition to avoiding indel
+calculations in the dynamic programming table, it uses reduced precision
+scores scaled to 8-bit integers, enabling acceleration via 16-way
+parallel SIMD vector instructions.
+The MSV score is a true log-odds likelihood ratio, so it obeys
+conjectures about the expected score distribution (Eddy, 2008) that
+allow immediate and accurate calculation of the statistical significance
+(P-value) of the MSV bit score.
+By default, comparisons with a P-value of ≤ 0.02 pass this filter,
+meaning that about 2% of nonhomologous sequences are expected to pass.
+You can use the --F1 <x> option to change this threshold. For example,
+--F1 0.05 would pass 5% of the comparisons, making a search more sensitive
+but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
+that all comparisons will pass. Shutting off the MSV filter may be
+worthwhile if you want to make sure you don’t miss comparisons that have
+a lot of scattered insertions and deletions. Alternatively, the --max
+option causes the MSV filter step (and all other filter steps) to be
+bypassed.
+The MSV bit score is calculated as a log-odds score using the null model
+for comparison. No correction for a biased composition or repetitive
+sequence is done at this stage. For comparisons involving biased
+sequences and/or profiles, more than 2% of comparisons will pass the MSV
+filter. At the end of search output, there is a line like:
+Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)
+which tells you how many and what fraction of comparisons passed the MSV
+filter, versus how many (and what fraction) were expected.
+**Viterbi filter**
+The sequence is now aligned to the profile using a fast Viterbi algorithm for
+optimal gapped alignment.
+This Viterbi implementation is specialized for speed. It is implemented in
+8-way parallel SIMD vector instructions, using reduced precision scores that
+have been scaled to 16-bit integers. Only one row of the dynamic programming
+matrix is stored, so the routine only recovers the score, not the optimal
+alignment itself. The reduced representation has limited range; local alignment
+scores will not underflow, but high scoring comparisons can overflow and return
+infinity, in which case they automatically pass the filter.
+The final Viterbi filter bit score is then computed using the appropriate null
+model log likelihood (by default the biased composition filter model score, or
+if the biased filter is off, just the null model score). If the P-value of this
+score passes the Viterbi filter threshold, the sequence passes on to the next
+step of the pipeline.
+The --F2 <x> option controls the P-value threshold for passing the Viterbi
+filter score. The default is 0.001. The --max option bypasses all filters in
+the pipeline.  At the end of a search output, you will see a line like:
+Passed Vit filter: 2207  (0.00443803); expected 497.3 (0.001)
+which tells you how many and what fraction of comparisons passed the Viterbi
+filter, versus how many were expected.
+**Forward filter/parser**
+The sequence is now aligned to the profile using the full Forward algorithm,
+which calculates the likelihood of the target sequence given the profile,
+summed over the ensemble of all possible alignments.
+This is a specialized time- and memory-efficient Forward implementation called
+the “Forward parser”. It is implemented in 4-way parallel SIMD vector
+instructions, in full precision (32-bit floating point). It stores just enough
+information that, in combination with the results of the Backward parser
+(below), posterior probabilities of start and stop points of alignments
+(domains) can be calculated in the domain definition step (below), although the
+detailed alignments themselves cannot be.
+The Forward filter bit score is calculated by correcting this score using the
+appropriate null model log likelihood (by default the biased composition filter
+model score, or if the biased filter is off, just the null model score). If the
+P-value of this bit score passes the Forward filter threshold, the sequence
+passes on to the next step of the pipeline.
+The bias filter score has no further effect in the pipeline. It is only used in
+filter stages. It has no effect on final reported bit scores or P-values.
+Biased composition compensation for final bit scores is done by a more complex
+domain-specific algorithm, described below.
+The --F3 <x> option controls the P-value threshold for passing the Forward
+filter score. The default is 1e-5. The --max option bypasses all filters in the
+pipeline.  At the end of a search output, you will see a line like:
+Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)
+which tells you how many and what fraction of comparisons passed the Forward
+filter, versus how many were expected.
+**Bias Filter Options**
+The --max option bypasses all filters in the pipeline, including the bias
+filter.
+The --nobias option turns off (bypasses) the biased composition filter. The
+simple null model is used as a null hypothesis for MSV and in subsequent filter
+steps. The biased composition filter step compromises a small amount of
+sensitivity. Though it is good to have it on by default, you may want to shut
+it off if you know you will have no problem with biased composition hits.
+**Advanced Documentation**
+A more detailed look at the internals of the various filter pipelines was
+posted on the `developer's blog <http://cryptogenomicon.org/hmmer3-is-stubborn.html>`__.
+The information posted there may be useful to those who are struggling with
+poor-scoring sequences.
+]]></token>
+<token name="@ADV_OPTS_HELP@"><![CDATA[
+Advanced Options
+----------------
+**nonull2**
+The bias correction can be too aggressive sometimes, causing you to miss homologs. You can turn the
+biased-composition score correction off with the --nonull2 option (and if
+you’re doing that, you may also want to set --nobias, to turn off another
+biased composition step called the bias filter, which affects which sequences
+get scored at all).
+**domZ**
+Assert that the total number of targets in your searches is <x>, for the
+purposes of per-domain conditional E-value calculations, rather than the number
+of targets that passed the reporting thresholds.
+**Z**
+Assert that the total number of targets in your searches is <x>, for the
+purposes of per-sequence E-value calculations, rather than the actual number of
+targets seen.
+]]></token>
+<token name="@AEEWS_HELP@"><![CDATA[
+Effective Sequence Number
+-------------------------
+After relative weights are determined, they are normalized to sum to a total
+effective sequence number, eff nseq. This number may be the actual number of
+sequences in the alignment, but it is almost always smaller than that. The
+default entropy weighting method (--eent) reduces the effective sequence
+number to reduce the information content (relative entropy, or average expected
+score on true homologs) per consensus position. The target relative entropy is
+controlled by a two-parameter function, where the two parameters are settable
+with --ere and --esigma.
+**--eent**
+Adjust effective sequence number to achieve a specific relative entropy per
+position (see --ere). This is the default.
+**--eclust**
+Set effective sequence number to the number of single-linkage clusters at a
+specific identity threshold (see --eid). This option is not recommended; it’s
+for experiments evaluating how much better --eent is.
+**--enone**
+Turn off effective sequence number determination and just use the actual number
+of sequences. One reason you might want to do this is to try to maximize the
+relative entropy/position of your model, which may be useful for short models.
+**--eset**
+Explicitly set the effective sequence number for all models to <x>.
+**--ere**
+Set the minimum relative entropy/position target to <x>. Requires --eent. Default
+depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
+for nucleotide sequences, it is 0.45 bits/position.
+**--esigma**
+Sets the minimum relative entropy contributed by an entire model alignment, over
+its whole length. This has the effect of making short models have higher relative
+entropy per position than --ere alone would give. The default is 45.0 bits.
+**--eid**
+Sets the fractional pairwise identity cutoff used by single linkage clustering
+with the --eclust option. The default is 0.62.
+]]></token>
+<token name="@ARSWS_HELP@"><![CDATA[
+Options Controlling Relative Weights
+------------------------------------
+HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
+sequences and up-weight distantly related ones. This has the effect of making
+models less biased by uneven phylogenetic representation. For example, two
+identical sequences would typically each receive half the weight that one
+sequence would. These options control which algorithm gets used.
+**--wpb**
+Use the Henikoff position-based sequence weighting scheme [Henikoff and
+Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
+**--wgsc**
+Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
+Mol. Biol. 235:1067, 1994].
+**--wblosum**
+Use the same clustering scheme that was used to weight data in calculating
+BLOSUM substitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
+89:10915, 1992]. Sequences are single-linkage clustered at an identity
+threshold (default 0.62; see --wid) and within each cluster of c sequences,
+each sequence gets relative weight 1/c.
+**--wnone**
+No relative weights. All sequences are assigned uniform weight.
+**--wid**
+Sets the identity threshold used by single-linkage clustering when using
+--wblosum.  Invalid with any other weighting scheme. Default is 0.62.
+]]></token>
+<token name="@BIAS_COMP_HELP@"><![CDATA[
+Bias Composition
+----------------
+The next number, the bias, is a correction term for biased sequence composition
+that has been applied to the sequence bit score. For instance, for the top hit
+MYG_PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
+originally scored 225.9 bits, which was adjusted by the slight 3.2 bit biased-
+composition correction. The only time you really need to pay attention to the
+bias value is when it’s large, on the same order of magnitude as the sequence
+bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
+allows a non-homolog to retain too much score.  Conversely, the bias correction
+can be too aggressive sometimes, causing you to miss homologs. You can turn the
+biased-composition score correction off with the --nonull2 option (and if
+you’re doing that, you may also want to set --nobias, to turn off another
+biased composition step called the bias filter, which affects which sequences
+get scored at all).
+]]></token>
+<token name="@CUT_HELP@"><![CDATA[
+Options for Model-specific Score Thresholding
+---------------------------------------------
+Curated profile databases may define specific bit score thresholds for each
+profile, superseding any thresholding based on statistical significance alone.
+To use these options, the profile must contain the appropriate (GA, TC, and/or
+NC) optional score threshold annotation; this is picked up by hmmbuild from
+Stockholm format alignment files. Each thresholding option has two scores: the
+per-sequence threshold <x1> and the per-domain threshold <x2>. These act as if
+-T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
+each model’s curated thresholds.
+**--cut_ga**
+Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
+per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
+generally considered to be the reliable curated thresholds defining family
+membership; for example, in Pfam, these thresholds define what gets included in
+Pfam Full alignments based on searches with Pfam Seed models.
+**--cut_nc**
+Use the NC (noise cutoff) bit score thresholds in the model to set
+per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
+thresholds are generally considered to be the score of the highest-scoring
+known false positive.
+**--cut_tc**
+Use the TC (trusted cutoff) bit score thresholds in the model to set
+per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
+thresholds are generally considered to be the score of the lowest-scoring known
+true positive that is above all known false positives.
+]]></token>
+<token name="@EVAL_CALIB_HELP@"><![CDATA[
+Options Controlling H3 Parameter Estimation Methods
+---------------------------------------------------
+H3 uses three short random sequence simulations to estimate the location
+parameters for the expected score distributions for MSV scores, Viterbi scores,
+and Forward scores. These options allow these simulations to be modified.
+**--EmL**
+Sets the sequence length in simulation that estimates the location parameter mu
+for MSV E-values. Default is 200.
+**--EmN**
+Sets the number of sequences in simulation that estimates the location parameter
+mu for MSV E-values. Default is 200.
+**--EvL**
+Sets the sequence length in simulation that estimates the location parameter mu
+for Viterbi E-values. Default is 200.
+**--EvN**
+Sets the number of sequences in simulation that estimates the location parameter
+mu for Viterbi E-values. Default is 200.
+**--EfL**
+Sets the sequence length in simulation that estimates the location parameter tau
+for Forward E-values. Default is 100.
+**--EfN**
+Sets the number of sequences in simulation that estimates the location parameter
+tau for Forward E-values. Default is 200.
+**--Eft**
+Sets the tail mass fraction to fit in the simulation that estimates the location
+parameter tau for Forward E-values. Default is 0.04.
+]]></token>
+<token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
+Options for Specifying the Alphabet
+-----------------------------------
+The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
+at the composition of the msafile. Autodetection is normally quite reliable,
+but occasionally alphabet type may be ambiguous and autodetection can fail (for
+instance, on tiny toy alignments of just a few residues). To avoid this, or to
+increase robustness in automated analysis pipelines, you may specify the
+alphabet type of msafile with these options.
+]]></token>
+<token name="@HSSI_HELP@"><![CDATA[
+Options Controlling Single Sequence Scoring (first Iteration)
+-------------------------------------------------------------
+By default, the first iteration uses a search model constructed from a single
+query sequence. This model is constructed using a standard 20x20 substitution
+matrix for residue probabilities, and two additional parameters for
+position-independent gap open and gap extend probabilities. These options allow
+the default single-sequence scoring parameters to be changed.
+**Gap Open (--popen)**
+Set the gap open probability for a single sequence query model to <x>.
+**Gap Extend (--pextend)**
+Set the gap extend probability for a single sequence query model to <x>.
+**--mx/--mxfile**
+These options are not currently supported.
+]]></token>
+<token name="@LENGTHS_HELP@"><![CDATA[
+Tail Mass Options
+-----------------
+**Window length tail mass (--w_beta)**
+The upper bound, W, on the length at which nhmmer expects to find an instance
+of the model is set such that the fraction of all sequences generated by the
+model with length >= W is less than <x>. The default is 1e-7.
+**Model instance length upper bound (--w_length)**
+Override the model instance length upper bound, W, which is otherwise
+controlled by --w_beta. It should be larger than the model length. The value of
+W is used deep in the acceleration pipeline, and modest changes are not
+expected to impact results (though larger values of W do lead to longer run
+time).
+]]></token>
+<token name="@MCSS_HELP@"><![CDATA[
+**Options Controlling Profile Construction**
+These options control how consensus columns are defined in an alignment.
+**--fast**
+Define consensus columns as those that have a fraction >= symfrac of residues
+as opposed to gaps. (See below for the --symfrac option.) This is the default.
+**--hand**
+Define consensus columns in next profile using reference annotation to the multiple
+alignment. This allows you to define any consensus columns you like.
+**--symfrac**
+Define the residue fraction threshold necessary to define a consensus column
+when using the --fast option. The default is 0.5. The symbol fraction in each
+column is calculated after taking relative sequence weighting into account, and
+ignoring gap characters corresponding to ends of sequence fragments (as opposed
+to internal insertions/deletions). Setting this to 0.0 means that every
+alignment column will be assigned as consensus, which may be useful in some
+cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
+insertions/deletions) will be assigned as consensus.
+**--fragthresh**
+We only want to count terminal gaps as deletions if the aligned sequence is
+known to be full-length, not if it is a fragment (for instance, because only
+part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
+sequence length L is less than or equal to a fraction <x> times the alignment
+length in columns, then the sequence is handled as a fragment. The default is
+0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
+you might want to do this if you know you’ve got a carefully curated alignment
+of full-length sequences. Setting --fragthresh 1 will define all sequences as
+fragments; you might want to do this if you know your alignment is entirely
+composed of fragments, such as translated short reads in metagenomic shotgun
+data.
+]]></token>
+<token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
+Options for Controlling Output
+------------------------------
+**Table of hits**
+Save a simple tabular (space-delimited) file summarizing the per-target output, with
+one data line per homologous target model found.
+**Table of per-domain hits**
+Save a simple tabular (space-delimited) file summarizing the per-domain output,
+with one data line per homologous domain detected in a query sequence for each
+homologous model.
+**Table of hits and domains in Pfam Format**
+Save an especially succinct tabular (space-delimited) file summarizing the
+per-target output, with one data line per homologous target model found.
+]]></token>
+<token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
+Options for Controlling Output
+------------------------------
+**Table of hits**
+Save a simple tabular (space-delimited) file summarizing the per-target output, with
+one data line per homologous target model found.
+**Table of per-domain hits**
+Save a simple tabular (space-delimited) file summarizing the per-domain output,
+with one data line per homologous domain detected in a query sequence for each
+homologous model.
+]]></token>
+<token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
+Options for Controlling Output
+------------------------------
+**Table of hits**
+Save a simple tabular (space-delimited) file summarizing the per-target output, with
+one data line per homologous target model found.
+**Table of hits (dfam)**
+Save a tabular (space-delimited) file summarizing the per-hit output, similar
+to --tblout but more succinct.
+**List of per-position scores for each hit (--aliscoreout)**
+Save to file a list of per-position scores for each hit. This is useful, for
+example, in identifying regions of high score density for use in resolving
+overlapping hits from different models.
+]]></token>
+<token name="@PRIOR_HELP@"><![CDATA[
+Options Controlling Priors
+--------------------------
+By default, weighted counts are converted to mean posterior probability
+parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
+prior parameters for protein models and for nucleic acid (RNA and DNA) models
+are built in. The following options allow you to override the default priors.
+**No priors (--pnone)**
+Don’t use any priors. Probability parameters will simply be the observed
+frequencies, after relative sequence weighting.
+**Laplace +1 prior**
+Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
+]]></token>
+<token name="@SEED_HELP@"><![CDATA[
+Random Seeding
+--------------
+Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
+any stochastic simulations will be reproducible; the same command will give the
+same results. If <n> is 0, the random number generator is seeded arbitrarily,
+and stochastic simulations will vary from run to run of the same command.
+]]></token>
+<token name="@THRESHOLDS_HELP@"><![CDATA[
+Options for Reporting Thresholds
+--------------------------------
+Reporting thresholds control which hits are reported in output files (the main
+output, --tblout, and --domtblout).
+**E-value (-E)**
+In the per-target output, report target profiles with an E-value of <= <x>. The
+default is 10.0, meaning that on average, about 10 false positives will be
+reported per query, so you can see the top of the noise and decide for yourself
+if it’s really noise.
+**Bit score (-T)**
+Instead of thresholding per-profile output on E-value, instead report target profiles
+with a bit score of >= <x>.
+**domain E-value (--domE)**
+In the per-domain output, for target profiles that have already satisfied the
+per-profile reporting threshold, report individual domains with a conditional
+E-value of <= <x>. The default is 10.0. A conditional E-value means the
+expected number of additional false positive domains in the smaller search
+space of those comparisons that already satisfied the per-profile reporting
+threshold (and thus must have at least one homologous domain already).
+**domain Bit scores (--domT)**
+Instead of thresholding per-domain output on E-value, instead report domains
+with a bit score of >= <x>.
+Options for Inclusion Thresholds
+--------------------------------
+Inclusion thresholds are stricter than reporting thresholds. Inclusion
+thresholds control which hits are considered to be reliable enough to be
+included in an output alignment or a subsequent search round. In hmmscan, which
+does not have any alignment output (like hmmsearch or phmmer) nor any iterative
+search steps (like jackhmmer), inclusion thresholds have little effect. They
+only affect what domains get marked as significant (!) or questionable (?) in
+domain output.
+**E-value of per target inclusion threshold**
+Use an E-value of <= <x> as the per-target inclusion threshold. The default is
+0.01, meaning that on average, about 1 false positive would be expected in
+every 100 searches with different query sequences.
+**Bit score of per target inclusion threshold**
+Instead of using E-values for setting the inclusion threshold, instead use a
+bit score of >= <x> as the per-target inclusion threshold. It would be unusual
+to use bit score thresholds with hmmscan, because you don’t expect a single
+score threshold to work for different profiles; different profiles have
+slightly different expected score distributions.
+**domain E-value per target inclusion threshold**
+Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
+targets that have already satisfied the overall per-target inclusion threshold.
+**domain Bit score per target inclusion threshold**
+Instead of using E-values, instead use a bit score of >= <x> as the per-domain
+inclusion threshold. As with --incT above, it would be unusual to use a single
+bit score threshold in hmmscan.
+]]></token>
+<token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
+Options for Reporting Thresholds
+--------------------------------
+Reporting thresholds control which hits are reported in output files (the main
+output, --tblout, and --domtblout).
+**E-value (-E)**
+In the per-target output, report target profiles with an E-value of <= <x>. The
+default is 10.0, meaning that on average, about 10 false positives will be
+reported per query, so you can see the top of the noise and decide for yourself
+if it’s really noise.
+**Bit score (-T)**
+Instead of thresholding per-profile output on E-value, instead report target profiles
+with a bit score of >= <x>.
+Options for Inclusion Thresholds
+--------------------------------
+Inclusion thresholds are stricter than reporting thresholds. Inclusion
+thresholds control which hits are considered to be reliable enough to be
+included in an output alignment or a subsequent search round. In hmmscan, which
+does not have any alignment output (like hmmsearch or phmmer) nor any iterative
+search steps (like jackhmmer), inclusion thresholds have little effect. They
+only affect what domains get marked as significant (!) or questionable (?) in
+domain output.
+**E-value of per target inclusion threshold**
+Use an E-value of <= <x> as the per-target inclusion threshold. The default is
+0.01, meaning that on average, about 1 false positive would be expected in
+every 100 searches with different query sequences.
+**Bit score of per target inclusion threshold**
+Instead of using E-values for setting the inclusion threshold, instead use a
+bit score of >= <x> as the per-target inclusion threshold. It would be unusual
+to use bit score thresholds with hmmscan, because you don’t expect a single
+score threshold to work for different profiles; different profiles have
+slightly different expected score distributions.
+]]></token>
+<token name="@ATTRIBUTION@"><![CDATA[
+Attribution
+-----------
+This Galaxy tool relies on HMMER3_
+Internally the software is cited as:
+::
+# hmmscan :: search sequence(s) against a profile database
+# HMMER 3.1 (February 2013); http://hmmer.org/
+# Copyright (C) 2011 Howard Hughes Medical Institute.
+# Freely distributed under the GNU General Public License (GPLv3).
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+The wrappers were written by the IUC and are licensed under Apache2_. The
+documentation is copied from the HMMER3 documentation.
+.. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
+.. _HMMER3: http://hmmer.org/
+]]></token>
+<token name="@HELP_PRE@"><![CDATA[
+What it does
+============
+]]></token>
+<token name="@HELP_PRE_OTH@"><![CDATA[
+Options
+=======
+]]></token>
+</macros>

Mercurial > repos > iuc > hmmer_hmmsearch

comparison macros.xml.orig @ 11:405dd85a9408 draft default tip