changeset 5:2b3adbe83979 draft

Uploaded
author bgruening
date Sat, 31 Aug 2013 09:58:04 -0400
parents 351013907f11
children c650ee099c51
files cmsearch.xml
diffstat 1 files changed, 61 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/cmsearch.xml	Sat Aug 31 08:53:15 2013 -0400
+++ b/cmsearch.xml	Sat Aug 31 09:58:04 2013 -0400
@@ -1,13 +1,18 @@
 <tool id="infernal_cmsearch" name="Search covariance model(s)" version="1.1.0.1">
     <description>against a sequence database (cmsearch)</description>
+    <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="100" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"></parallelism>
     <requirements>
         <requirement type="package" version="1.1rc4">infernal</requirement>
     </requirements>
     <command>
-        cmsearch --tblout
+        ##TODO reformat the tabular-output: 
+        ## -"Each line consists of 18 space-delimited fields followed by a free text target sequence description, as follows:"
+        ## remove line that starts with #
+
+        cmsearch 
             --cpu 12
             -o /dev/null
-            --tformat $seqdb.ext #target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip 
+            --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip 
             $bottomonly
             $toponly
             $cyk
@@ -22,7 +27,6 @@
                 $A $multiple_alignment_output
             #end if
 
-
             #if $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incE":
                 --incE $inclusion_thresholds_opts.incE
             #else:
@@ -166,7 +170,7 @@
             </conditional>
 
             <param name="A" truevalue="-A" falsevalue="" checked="False" type="boolean" 
-                label="Save a multiple alignment of all significant hits ..." help="... those satisfying inclusion thresholds"/>
+                label="Save a multiple alignment of all significant hits" help="... those satisfying inclusion thresholds"/>
 
 
         </inputs>
@@ -205,6 +209,59 @@
 basepairs.
 
 
+(1) target name: The name of the target sequence or profile.
+(2) accession: The accession of the target sequence or profile, or ’-’ if none.
+(3) query name: The name of the query sequence or profile.
+(4) accession: The accession of the query sequence or profile, or ’-’ if none.
+(5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM
+is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option
+is used, in which case a HMM will be used.
+(6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or
+HMM), numbered 1..N for a profile of N consensus positions.
+(7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM),
+numbered 1..N for a profile of N consensus positions.
+(8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered
+1..L for a sequence of L residues.
+(9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered
+1..L for a sequence of L residues.
+(10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson)
+strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less
+than or equal to the “seq to” value, else it will be greater than or equal to it.
+(11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit
+that is not predicted to be truncated by the end of the sequence, “5’ ” or “3’ ” if the hit is predicted to
+have one or more 5’ or 3’ residues missing due to an artificial truncation of the sequence, or “5’&amp;3’” if
+the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If
+the hit is an HMM hit, this will always be ’-’.
+(12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for
+testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on
+successive passes.
+(13) gc: Fraction of G and C nucleotides in the hit.
+(14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for
+CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive.
+It is difficult to correct for all possible ways in which nonrandom but nonhomologous biological
+sequences can appear to be similar, such as short-period tandem repeats, so there are cases where
+the bias correction is not strong enough (creating false positives).
+Note: The tblout format is deliberately space-delimited (rather than tab-delimited) and justified into aligned columns, so these files
+are suitable both for automated parsing and for human examination. Tab-delimited data files are difficult for humans to examine and
+spot check. For this reason, we think tab-delimited files are a minor evil in the world. Although we occasionally receive shrieks of
+outrage about this, we stubbornly feel that space-delimited files are just as trivial to parse as tab-delimited files.
+(15) score: The score (in bits) for this target/query comparison. It includes the biased-composition
+correction (the “null3” model for CM hits, or the “null2” model for HMM hits).
+(16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value;
+i.e. calculated as the expected number of false positives achieving this comparison’s score for a single
+query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the
+target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of
+nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied
+by the number of models in the target database. If you search with multiple queries and if you want
+to control the overall false positive rate of that search rather than the false positive rate per query, you
+will want to multiply this per-query E-value by how many queries you’re doing.
+(17) inc: Indicates whether or not this hit achieves the inclusion threshold: ’!’ if it does, ’?’ if it does not
+(and rather only achieves the reporting threshold). By default, the inclusion threshold is an E-value of
+0.01 and the reporting threshold is an E-value of 10.0, but these can be changed with command line
+options as described in the manual pages.
+(18) description of target: The remainder of the line is the target’s description line, as free text.
+
+
 
 
     </help>