changeset 9:1aa4c036e41c draft

Uploaded
author bgruening
date Sat, 07 Sep 2013 16:14:02 -0400
parents b0f9aca4a34f
children 0bffd4183326
files cmbuild.xml cmsearch.xml cmstat.xml datatypes_conf.xml infernal.py readme.rst
diffstat 6 files changed, 618 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cmbuild.xml	Sat Sep 07 16:14:02 2013 -0400
@@ -0,0 +1,275 @@
+<tool id="infernal_cmbuild" name="Build covariance models" version="1.1.0.0">
+    <description>from sequence alignments (cmbuild)</description>
+    <parallelism method="multi" split_inputs="alignment_infile" split_mode="to_size" split_size="10" shared_inputs="" merge_outputs="cmfile_outfile"></parallelism>
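+    <!-- Splitting of the Stockholm input for parallel execution relies on the split() method of the Stockholm_1_0 datatype defined in infernal.py -->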
+    <requirements>
+        <requirement type="package">infernal</requirement>
+        <requirement type="package" version="1.1rc4">infernal</requirement>
+        <requirement type="package" version="8.21">gnu_coreutils</requirement>
+    </requirements>
+    <command>
+        cmbuild 
+            #if str($is_summary_output) == 'true':
+                -o $summary_outfile
+            #end if
+            
+            ## too many outputs; is that one really needed?
+            ##-O $annotated_source_alignment_outfile
+
+            $model_construction_opts.model_construction_opts_selector
+            #if $model_construction_opts.model_construction_opts_selector == '--fast':
+                --symfrac $model_construction_opts.symfrac
+            #end if
+
+            $noss
+
+            $relative_weights_opts.relative_weights_opts_selector
+            #if $relative_weights_opts.relative_weights_opts_selector == '--wblosum':
+                --wid $relative_weights_opts.wid
+            #end if
+
+            $effective_opts.effective_opts_selector
+            #if $effective_opts.effective_opts_selector == '--eent':
+                --ere $effective_opts.ere
+                #if str($effective_opts.eminseq) not in ['', 'None']:
+                    --eminseq $effective_opts.eminseq
+                #end if
+                #if str($effective_opts.ehmmre) not in ['', 'None']:
+                    --ehmmre $effective_opts.ehmmre
+                #end if
+                #if str($effective_opts.eset) not in ['', 'None']:
+                    --eset $effective_opts.eset
+                #end if
+            #end if
+
+            #if $refining_opts.refining_opts_selector == '--refine':
+                #if str($refining_opts.refine_output) == 'true':
+                    --refine $refined_multiple_alignment_output
+                #else:
+                    --refine /dev/null
+                #end if
+
+                $refining_opts.l
+                $refining_opts.gibbs_opts.gibbs_opts_selector
+
+                #if $refining_opts.gibbs_opts.gibbs_opts_selector == '--gibbs':
+                    --seed $refining_opts.gibbs_opts.random_seed
+                #end if
+
+                $refining_opts.notrunc
+                $refining_opts.cyk
+            #end if
+
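+            ## positional arguments: the output CM file, followed by the input alignment (cmbuild [options] cmfile_out msafile)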
+            $cmfile_outfile
+            $alignment_infile
+            
+    </command>
+        <inputs>
+            <!-- Stockholm or SELEX 
+            SELEX is defined in EMBOSS datatypes
+            -->
+            <param name="alignment_infile" type="data" format="stockholm,selex" label="Sequence database"/>
+
+            <conditional name="model_construction_opts">
+                <param name="model_construction_opts_selector" type="select" label="These options control how consensus columns are defined in an alignment" help="">
+                    <option value="--fast" selected="true">automatic (--fast)</option>
+                    <option value="--hand">user defined (--hand)</option>
+                </param>
+                <when value="--fast">
+                    <param name="symfrac" type="float" value="0.5" size="5" 
+                        label="Define the residue fraction threshold necessary to define a consensus (--symfrac)" help=""/>
+                </when>
+                <when value="--hand"/>
+            </conditional>
+
+            <param name="noss" truevalue="--noss" falsevalue="" checked="False" type="boolean" 
+                label="Ignore the secondary structure annotation, if any, in your multiple alignment file (--noss)" help=""/>
+
+            <conditional name="relative_weights_opts">
+                <param name="relative_weights_opts_selector" type="select" label="Options controlling relative weights" help="">
+                    <option value="--wpb" selected="true">Henikoff (--wgb)</option>
+                    <option value="--wgsc">Gerstein/Sonnhammer/Chothia (--wgsc)</option>
+                    <option value="--wnone">no sequence weighting (--wnone)</option>
+                    <option value="--wgiven">Sequence weight from given in input file (--wgiven)</option>
+                    <option value="--wblosum">BLOSUM filtering algorithm (--wblosum)</option>
+                    <option value="--wid">BLOSUM filtering algorithm (--wblosum)</option>
+                </param>
+                <when value="--wpb"/>
+                <when value="--wgsc"/>
+                <when value="--wnone"/>
+                <when value="--wgiven"/>
+                <when value="--wblosum">
+                    <param name="wid" type="float" value="0.5" size="5" 
+                        label="Percent identity for clustering the alignment (--wid)" help=""/>
+                </when>
+            </conditional>
+
+
+            <conditional name="effective_opts">
+                <param name="effective_opts_selector" type="select" label="Options controlling effective sequence number" help="">
+                    <option value="--eent" selected="true">entropy weighting strategy (--eent)</option>
+                    <option value="--enone">Turn off the entropy weighting strategy (--enone)</option>
+                </param>
+                <when value="--enone"/>
+                <when value="--eent">
+                    <param name="ere" type="float" value="0.59" size="5" 
+                        label="Set the target mean match state relative entropy (--ere)" help=""/>
+
+                    <param name="eminseq" type="integer" value="" size="5" 
+                        label="Define the minimum allowed effective sequence number (--eminseq)" help=""/>
+
+                    <param name="ehmmre" type="float" value="" size="5" 
+                        label="Set the target HMM mean match state relative entropy (--ehmmre)" help=""/>
+
+                    <param name="eset" type="integer" value="" size="5" 
+                        label="Set the effective sequence number for entropy weighting (--eset)" help=""/>
+                </when>
+            </conditional>
+
+
+            <conditional name="refining_opts">
+                <param name="refining_opts_selector" type="select" label="Options for refining the input alignment" help="">
+                    <option value="" selected="true">No refinement</option>
+                    <option value="--refine">refine the input alignment</option>
+                </param>
+                <when value=""/>
+                <when value="--refine">
+
+                    <conditional name="gibbs_opts">
+                        <param name="gibbs_opts_selector" type="select" label="refinement mode" help="">
+                            <option value="" selected="true">expectation-maximization (EM)</option>
+                            <option value="--gibbs">Gibbs sampling</option>
+                        </param>
+                        <when value=""/>
+                        <when value="--gibbs">
+                            <param name="random_seed" type="integer" value="0" label="Randam Seed" help="" />
+                        </when>
+                    </conditional>
+
+                    <param name="l" truevalue="-l" falsevalue="" checked="False" type="boolean" 
+                        label="Turn on the local alignment algorithm" help="... which allows the alignment to span two or more subsequences if necessary"/>
+
+                    <param name="notrunc" truevalue="--notrunc" falsevalue="" checked="False" type="boolean" 
+                        label="Turn off the truncated alignment algorithm" help=""/>
+
+                    <param name="cyk" type="select" label="Options for refining the input alignment" help="">
+                        <option value="" selected="true">optimal accuracy algorithm</option>
+                        <option value="--cyk">align with the CYK algorithm</option>
+                    </param>
+
+                    <param name="refine_output" truevalue="" falsevalue="" checked="False" type="boolean" 
+                        label="Output the refined alignment file as it is used to build the covariance model" help=""/>
+
+                </when>
+            </conditional>
+
+
+            <param name="is_summery_output" truevalue="" falsevalue="" checked="False" type="boolean" 
+                label="Output a summery file?" help=""/>
+
+        </inputs>
+    <outputs>
+
+        <data format="text" name="summary_outfile" label="cmbuild summary on ${on_string}">
+            <filter>is_summery_output is True</filter>
+        </data>
+        <!--<data format="stockholm" name="annotated_source_alignment_outfile" label="Annotated alignment from ${on_string}"/>-->
+        <data format="cm" name="cmfile_outfile" label="Covariance models from ${on_string}"/>
+
+        <data format="stockholm" name="refined_multiple_alignment_output" label="refined alignment file of ${on_string}">
+            <filter>
+                ((
+                refining_opts['refining_opts_selector'] == "--refine" and
+                refining_opts['refine_output'] is True
+                ))
+            </filter>
+        </data>
+
+    </outputs>
+    <help>
+
+
+**What it does**
+
+cmbuild builds a covariance model from each multiple sequence alignment in the input file.
+The alignment file must be in Stockholm or SELEX format, and must contain consensus secondary structure annotation.
+cmbuild uses the consensus structure to determine the architecture of the CM.
+
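+A minimal Stockholm record, shown here purely for illustration (toy sequences and a toy structure, not real data), looks like::
+
+  # STOCKHOLM 1.0
+  seq1           GGCAAAAAUGCC
+  seq2           GGCAAAAAUGCC
+  #=GC SS_cons   ((((....))))
+  //
+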
+In addition to writing CM(s) to CMFILE_OUT, cmbuild also outputs a single line for each model created to stdout. Each line has the following fields:
+
+  * "aln": the index of the alignment used to build the CM
+  * "idx": the index of the CM in the CMFILE_OUT
+  * "name": the name of the CM
+  * "nseq": the number of sequences in the alignment used to build the CM
+  * "eff nseq": the effective number of sequences used to build the model
+  * "alen": the length of the alignment used to build the CM
+  * "clen": the number of columns from the alignment defined as consensus (match) columns
+  * "bps": the number of basepairs in the CM
+  * "bifs": the number of bifurcations in the CM
+  * "rel entropy: CM": the total relative entropy of the model divided by the number of consensus columns
+  * "rel entropy: HMM": the total relative entropy of the model ignoring secondary structure, divided by the number of consensus columns
+  * "description": description of the model/alignment
+
+
+Options controlling model construction
+--------------------------------------
+
+These options control how consensus columns are defined in an alignment.
+
+  * --fast Define consensus columns automatically as those that have a fraction >= symfrac of residues as opposed to gaps. (See below for the --symfrac option.) This is the default.
+  * --hand Use reference coordinate annotation (#=GC RF line, in Stockholm) to determine which columns are consensus, and which are inserts. Any non-gap character indicates a consensus column. (For example, mark consensus columns with ”x”, and insert columns with ”.”.)
+  * --symfrac Define the residue fraction threshold necessary to define a consensus column when not using --hand. The default is 0.5. The symbol fraction in each column is calculated after taking relative sequence weighting into account. Setting this to 0.0 means that every alignment column will be assigned as consensus, which may be useful in some cases. Setting it to 1.0 means that only columns that include 0 gaps will be assigned as consensus.
+  * --noss Ignore the secondary structure annotation, if any, in MSA-Infile and build a CM with zero basepairs. This model will be similar to a profile HMM and the cmsearch and cmscan programs will use HMM algorithms which are faster than CM ones for this model. Additionally, a zero basepair model need not be calibrated with cmcalibrate prior to running cmsearch with it. The --noss option must be used if there is no secondary structure annotation in MSA-Infile.
+
+
+Options controlling relative weights
+------------------------------------
+
+cmbuild uses an ad hoc sequence weighting algorithm to downweight closely related sequences and upweight distantly
+related ones. This has the effect of making models less biased by uneven phylogenetic representation. For example,
+two identical sequences would typically each receive half the weight that one sequence would. These options control
+which algorithm gets used.
+
+  * --wpb Use the Henikoff position-based sequence weighting scheme [Henikoff and Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
+  * --wgsc Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J. Mol. Biol. 235:1067, 1994].
+  * --wnone Turn sequence weighting off; e.g. explicitly set all sequence weights to 1.0.
+  * --wgiven Use sequence weights as given in annotation in the input alignment file. If no weights were given, assume they are all 1.0. The default is to determine new sequence weights by the Gerstein/Sonnhammer/Chothia algorithm, ignoring any annotated weights.
+  * --wblosum Use the BLOSUM filtering algorithm to weight the sequences, instead of the default GSC weighting. Cluster the sequences at a given percentage identity (see --wid); assign each cluster a total weight of 1.0, distributed equally amongst the members of that cluster.
+  * --wid Controls the behavior of the --wblosum weighting option by setting the percent identity for clustering the alignment.
+
+
+Options controlling effective sequence number
+---------------------------------------------
+
+After relative weights are determined, they are normalized to sum to a total effective sequence number, eff nseq. This
+number may be the actual number of sequences in the alignment, but it is almost always smaller than that. The default
+entropy weighting method (--eent) reduces the effective sequence number to reduce the information content (relative
+entropy, or average expected score on true homologs) per consensus position. The target relative entropy is controlled
+by a two-parameter function, where the two parameters are settable with --ere and --esigma.
+
+  * --eent Use the entropy weighting strategy to determine the effective sequence number that gives a target mean match state relative entropy. This option is the default, and can be turned off with --enone. The default target mean match state relative entropy is 0.59 bits for models with at least 1 basepair and 0.38 bits for models with zero basepairs, but can be changed with --ere. The default of 0.59 or 0.38 bits is automatically changed if the total relative entropy of the model (summed match state relative entropy) is less than a cutoff, which is 6.0 bits by default, but can be changed with the expert, undocumented --eX option. If you really want to play with that option, consult the source code.
+  * --enone Turn off the entropy weighting strategy. The effective sequence number is just the number of sequences in the alignment.
+  * --ere Set the target mean match state relative entropy. By default the target relative entropy per match position is 0.59 bits for models with at least 1 basepair and 0.38 for models with zero basepairs.
+  * --eminseq Define the minimum allowed effective sequence number.
+  * --ehmmre Set the target HMM mean match state relative entropy. Entropy for basepairing match states is calculated using marginalized basepair emission probabilities.
+  * --eset Set the effective sequence number for entropy weighting.
+
+
+
+Options for refining the input alignment
+----------------------------------------
+
+  * --refine Attempt to refine the alignment before building the CM using expectation-maximization (EM). A CM is first built from the initial alignment as usual. Then, the sequences in the alignment are realigned optimally (with the HMM banded CYK algorithm, optimal means optimal given the bands) to the CM, and a new CM is built from the resulting alignment. The sequences are then realigned to the new CM, and a new CM is built from that alignment. This is continued until convergence, specifically when the alignments for two successive iterations are not significantly different (the summed bit scores of all the sequences in the alignment change by less than 1% between two successive iterations). A schematic sketch of this loop is shown after this list.
+  * -l Turn on the local alignment algorithm, which allows the alignment to span two or more subsequences if necessary (e.g. if the structures of the query model and target sequence are only partially shared), allowing certain large insertions and deletions in the structure to be penalized differently than normal indels. The default is to globally align the query model to the target sequences.
+  * --gibbs Modifies the behavior of --refine so Gibbs sampling is used instead of EM. The difference is that during the alignment stage the alignment is not necessarily optimal, instead an alignment (parsetree) for each sequence is sampled from the posterior distribution of alignments as determined by the Inside algorithm. Due to this sampling step --gibbs is non-deterministic, so different runs with the same alignment may yield different results. This is not true when --refine is used without the --gibbs option, in which case the final alignment and CM will always be the same. When --gibbs is enabled, the --seed "number" option can be used to seed the random number generator predictably, making the results reproducible. The goal of the --gibbs option is to help expert RNA alignment curators refine structural alignments by allowing them to observe alternative high scoring alignments.
+  * --seed Seed the random number generator with an integer >= 0. This option can only be used in combination with --gibbs. If the given number is nonzero, stochastic sampling of alignments will be reproducible; the same command will give the same results. If the given number is 0, the random number generator is seeded arbitrarily, and stochastic samplings may vary from run to run of the same command. The default seed is 0.
+  * --cyk With --refine, align with the CYK algorithm. By default the optimal accuracy algorithm is used. There is more information on this in the cmalign manual page.
+  * --notrunc With --refine, turn off the truncated alignment algorithm. There is more information on this in the cmalign manual page.
+
+
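+The refinement loop described for --refine can be sketched as follows. This is a schematic illustration only, not Infernal's actual implementation; build_cm, realign and summed_bit_scores are hypothetical placeholders for cmbuild's internal routines::
+
+    def refine(alignment):
+        # build_cm / realign / summed_bit_scores are placeholders, not real functions
+        cm = build_cm(alignment)                     # initial model from the input alignment
+        prev_score = None
+        while True:
+            alignment = realign(alignment, cm)       # HMM banded CYK (or sampling, with --gibbs)
+            score = summed_bit_scores(alignment, cm)
+            cm = build_cm(alignment)                 # rebuild the CM from the refined alignment
+            if prev_score is not None and abs(score - prev_score) &lt;= 0.01 * abs(prev_score):
+                return cm, alignment                 # converged: summed bit score changed by at most 1%
+            prev_score = score
+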
+For further questions please refer to the Infernal Userguide_.
+
+.. _Userguide: http://selab.janelia.org/software/infernal/Userguide.pdf
+
+
+How do I cite Infernal?
+-----------------------
+
+The Infernal 1.0 paper (Nawrocki et al., 2009) is the best paper to reference. 
+If you’re writing for an enlightened (url-friendly) journal, you may want to cite the webpage
+http://infernal.janelia.org/ because it is kept up-to-date. We hope to publish a paper related to
+Infernal version 1.1 soon.
+
+**Galaxy Wrapper Author**::
+
+    *  Bjoern Gruening, University of Freiburg
+
+    </help>
+</tool>
--- a/cmsearch.xml	Sat Aug 31 17:32:40 2013 -0400
+++ b/cmsearch.xml	Sat Sep 07 16:14:02 2013 -0400
@@ -2,13 +2,17 @@
     <description>against a sequence database (cmsearch)</description>
     <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="100" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"></parallelism>
     <requirements>
+        <requirement type="package">infernal</requirement>
         <requirement type="package" version="1.1rc4">infernal</requirement>
         <requirement type="package" version="8.21">gnu_coreutils</requirement>
     </requirements>
     <command>
+        ## a temp file is needed, because the standard tabular output from Infernal is not directly usable in Galaxy;
+        ## it is converted to a tab-delimited file and handed back to Galaxy
         temp_tabular_output=\$(mktemp);
 
         cmsearch 
+            ## Infernal Options
             --cpu 12
             -o /dev/null
             --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip 
@@ -38,12 +42,15 @@
                 -T $reporting_thresholds_opts.T
             #end if
 
+            ## CM file, either from the history or from a database installed on disk
+
             #if $cm_opts.cm_opts_selector == "db":
                 $cm_opts.database.fields.path
             #else:
                 $cm_opts.cmfile
             #end if
 
+            ## sequence file
             $seqdb
             2>&#38;1
             ;
@@ -71,7 +78,7 @@
                     </param>
                 </when>
                 <when value="histdb">
-                    <param name="cmfile" type="data" format="fasta" label="Covariance models file from the history."/>
+                    <param name="cmfile" type="data" format="cm" label="Covariance models file from the history."/>
                 </when>
             </conditional>
 
@@ -186,8 +193,6 @@
         </data>
 
     </outputs>
-    <requirements>
-    </requirements>
     <help>
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cmstat.xml	Sat Sep 07 16:14:02 2013 -0400
@@ -0,0 +1,104 @@
+<tool id="infernal_cmstat" name="Summary statistics" version="1.1.0.0">
+    <description>for covariance model (cmstat)</description>
+    <requirements>
+        <requirement type="package">infernal</requirement>
+        <requirement type="package" version="1.1rc4">infernal</requirement>
+        <requirement type="package" version="8.21">gnu_coreutils</requirement>
+    </requirements>
+    <command>
+        ## a temp file is needed, because the standard tabular output from Infernal is not directly usable in Galaxy;
+        ## it is converted to a tab-delimited file and handed back to Galaxy
+        temp_tabular_output=\$(mktemp);
+
+        cmstat
+
+            #if $cm_opts.cm_opts_selector == "db":
+                $cm_opts.database.fields.path
+            #else:
+                $cm_opts.cmfile
+            #end if
+
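+            ## cmstat writes its table to stdout; capture it in the temp file and reformat it below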
+            > \$temp_tabular_output
+            ;
+
+            ## 1. strip comments (everything from a '#' to the end of the line) and remove empty lines
+            ## 2. convert spaces to tabs, then turn every tab from the 18th onwards back into a space
+            ##    (the last field is free text and can contain spaces)
+            sed -e 's/#.*$//' -e '/^$/d' -e 's/ /\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile
+
+    </command>
+        <inputs>
+            <conditional name="cm_opts">
+                <param name="cm_opts_selector" type="select" label="Subject covariance models">
+                  <option value="db" selected="True">Locally installed covariance models</option>
+                  <option value="histdb">Covariance model from your history</option>
+                </param>
+                <when value="db">
+                    <param name="database" type="select" label="Covariance models">
+                        <options from_file="infernal.loc">
+                          <column name="value" index="0"/>
+                          <column name="name" index="1"/>
+                          <column name="path" index="2"/>
+                        </options>
+                    </param>
+                </when>
+                <when value="histdb">
+                    <param name="cmfile" type="data" format="cm" label="Covariance models file from the history."/>
+                </when>
+            </conditional>
+        </inputs>
+    <outputs>
+        <data format="tabular" name="outfile" label="cmsearch on ${on_string}"/>
+    </outputs>
+    <help>
+
+
+**What it does**
+
+The cmstat utility prints out a tabular file of summary statistics for each given covariance model.
+
+
+Output format
+-------------
+
+By default, cmstat prints general statistics of the model and the alignment it was built from, one line per model in a
+tabular format. 
+
+The columns are:
+
+(1) The index of this profile, numbering each one in the file starting from 1.
+(2) The name of the profile.
+(3) The optional accession of the profile, or ”-” if there is none.
+(4) The number of sequences that the profile was estimated from.
+(5) The effective number of sequences that the profile was estimated from, after Infernal applied an effective sequence number calculation such as the default entropy weighting.
+(6) The length of the model in consensus residues (match states).
+(7) The expected maximum length of a hit to the model.
+(8) The number of basepairs in the model.
+(9) The number of bifurcations in the model.
+(10) What type of model will be used by default in cmsearch and cmscan for this profile, either ”cm” or ”hmm”. For profiles with 0 basepairs, this will be ”hmm” (unless the --nohmmonly option is used). For all other profiles, this will be ”cm”.
+(11) Mean relative entropy per match state, in bits. This is the expected (mean) score per consensus position. This is what the default entropy-weighting method for effective sequence number estimation focuses on, so for default Infernal, this value will often reflect the default target for entropy-weighting. If the "model" field for this profile is "hmm", this field will be "-".
+(12) Mean relative entropy per match state, in bits, if the CM were transformed into an HMM (information from structure is ignored). The larger the difference between the CM and HMM relative entropy, the more the model will rely on structural conservation relative to sequence conservation when identifying homologs.
+
+
+For further questions please refer to the Infernal Userguide_.
+
+.. _Userguide: http://selab.janelia.org/software/infernal/Userguide.pdf
+
+
+How do I cite Infernal?
+-----------------------
+
+The Infernal 1.0 paper (Nawrocki et al., 2009) is the best paper to reference. 
+If you’re writing for an enlightened (url-friendly) journal, you may want to cite the webpage
+http://infernal.janelia.org/ because it is kept up-to-date. We hope to publish a paper related to
+Infernal version 1.1 soon.
+
+**Galaxy Wrapper Author**::
+
+    *  Bjoern Gruening, University of Freiburg
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Sat Sep 07 16:14:02 2013 -0400
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+  <datatypes>
+   <datatype_files>
+     <datatype_file name="infernal.py"/>
+   </datatype_files>
+   <registration>
+        <!--INFERNAL1.1 Datatypes -->
+        <datatype extension="cm" type="galaxy.datatypes.infernal:Infernal_CM_1_1" display_in_upload="False" />
+        <datatype extension="stockholm" type="galaxy.datatypes.infernal:Stockholm_1_0" display_in_upload="False" />
+   </registration>
+   <sniffers>
+        <sniffer type="galaxy.datatypes.infernal:Infernal_CM_1_1"/>
+        <sniffer type="galaxy.datatypes.infernal:Stockholm_1_0"/>
+   </sniffers>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/infernal.py	Sat Sep 07 16:14:02 2013 -0400
@@ -0,0 +1,207 @@
+# -*- coding: utf-8 -*-
+
+from galaxy.datatypes import data
+from galaxy.datatypes.sniff import get_headers, get_test_fname
+from galaxy.datatypes.data import get_file_peek
+import subprocess
+import os
+import logging
+
+from galaxy.datatypes.metadata import MetadataElement
+from galaxy.datatypes import metadata
+
+log = logging.getLogger( __name__ )
+
+def count_special_lines( word, filename, invert = False ):
+    """
+        Count the lines in 'filename' that match the pattern 'word'.
+        grep is used to speed up the searching and counting; the number of hits is returned.
+    """
+    try:
+        cmd = ["grep", "-c"]
+        if invert:
+            cmd.append('-v')
+        cmd.extend([word, filename])
+        out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        return int(out.communicate()[0].split()[0])
+    except:
+        pass
+    return 0
+
+def count_lines( filename, non_empty = False):
+    """
+        Count the number of lines in 'filename'; with non_empty=True, blank lines are ignored.
+    """
+    try:
+        if non_empty:
+            out = subprocess.Popen(['grep', '-cve', '^\s*$', filename], stdout=subprocess.PIPE)
+        else:
+            out = subprocess.Popen(['wc', '-l', filename], stdout=subprocess.PIPE)
+        return int(out.communicate()[0].split()[0])
+    except:
+        pass
+    return 0
+
+
+class Stockholm_1_0( data.Text ):
+    file_ext = "stockholm"
+
+    MetadataElement( name="number_of_alignments", default=0, desc="Number of multiple alignments", readonly=True, visible=True, optional=True, no_value=0 )
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            if dataset.metadata.number_of_alignments == 1:
+                dataset.blurb = "1 alignment"
+            else:
+                dataset.blurb = "%s alignments" % dataset.metadata.number_of_alignments
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disc'
+
+    def sniff( self, filename ):
+        if count_special_lines('^#[[:space:]+]STOCKHOLM[[:space:]+]1.0', filename) > 0:
+            return True
+        else:
+            return False
+
+    def set_meta( self, dataset, **kwd ):
+        """
+        Set the number of alignments in dataset.
+        """
+        dataset.metadata.number_of_alignments = count_special_lines('^#[[:space:]+]STOCKHOLM[[:space:]+]1.0', dataset.file_name)
+
+    def split( cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by alignment records.
+        """
+        if split_params is None:
+            return None
+
+        if len(input_datasets) > 1:
+            raise Exception("STOCKHOLM-file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            raise Exception('Split mode "%s" is currently not implemented for STOCKHOLM-files.' % split_params['split_mode'])
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
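+        # A Stockholm record ends with a line containing only '//'; records are grouped on that terminator.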
+        def _read_stockholm_records( filename ):
+            lines = []
+            with open(filename) as handle:
+                for line in handle:
+                    lines.append( line )
+                    if line.strip() == '//':
+                        yield lines
+                        lines = []
+
+        def _write_part_stockholm_file( accumulated_lines ):
+            part_dir = subdir_generator_function()
+            part_path = os.path.join( part_dir, os.path.basename( input_files[0] ) )
+            part_file = open( part_path, 'w' )
+            part_file.writelines( accumulated_lines )
+            part_file.close()
+
+        try:
+
+            stockholm_records = _read_stockholm_records( input_files[0] )
+            stockholm_lines_accumulated = []
+            for counter, stockholm_record in enumerate( stockholm_records, start = 1):
+                stockholm_lines_accumulated.extend( stockholm_record )
+                if counter % chunk_size == 0:
+                    _write_part_stockholm_file( stockholm_lines_accumulated )
+                    stockholm_lines_accumulated = []
+            if stockholm_lines_accumulated:
+                _write_part_stockholm_file( stockholm_lines_accumulated )
+        except Exception,  e:
+            log.error('Unable to split files: %s' % str(e))
+            raise
+    split = classmethod(split)
+
+
+class Infernal_CM_1_1( data.Text ):
+    file_ext = "cm"
+
+    MetadataElement( name="number_of_models", default=0, desc="Number of covariance models", readonly=True, visible=True, optional=True, no_value=0 )
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            if (dataset.metadata.number_of_models == 1):
+                dataset.blurb = "1 model"
+            else:
+                dataset.blurb = "%s models" % dataset.metadata.number_of_models
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disc'
+
+    def sniff( self, filename ):
+        if count_special_lines("^INFERNAL1/a", filename) > 0:
+            return True
+        else:
+            return False
+
+    def set_meta( self, dataset, **kwd ):
+        """
+        Set the number of models in dataset.
+        """
+        dataset.metadata.number_of_models = count_special_lines("^INFERNAL1/a", dataset.file_name)
+
+    def split( cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by model records.
+        """
+        if split_params is None:
+            return None
+
+        if len(input_datasets) > 1:
+            raise Exception("CM-file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            raise Exception('Split mode "%s" is currently not implemented for CM-files.' % split_params['split_mode'])
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
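+        # Each covariance model in a CM file starts with an 'INFERNAL1/a' header line; records are grouped on that header.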
+        def _read_cm_records( filename ):
+            lines = []
+            with open(filename) as handle:
+                for line in handle:
+                    if line.startswith("INFERNAL1/a") and lines:
+                        yield lines
+                        lines = [line]
+                    else:
+                        lines.append( line )
+            yield lines
+
+        def _write_part_cm_file( accumulated_lines ):
+            part_dir = subdir_generator_function()
+            part_path = os.path.join( part_dir, os.path.basename( input_files[0] ) )
+            part_file = open( part_path, 'w' )
+            part_file.writelines( accumulated_lines )
+            part_file.close()
+
+        try:
+            cm_records = _read_cm_records( input_files[0] )
+            cm_lines_accumulated = []
+            for counter, cm_record in enumerate( cm_records, start = 1):
+                cm_lines_accumulated.extend( cm_record )
+                if counter % chunk_size == 0:
+                    _write_part_cm_file( cm_lines_accumulated )
+                    cm_lines_accumulated = []
+            if cm_lines_accumulated:
+                _write_part_cm_file( cm_lines_accumulated )
+        except Exception,  e:
+            log.error('Unable to split files: %s' % str(e))
+            raise
+    split = classmethod(split)
+
--- a/readme.rst	Sat Aug 31 17:32:40 2013 -0400
+++ b/readme.rst	Sat Sep 07 16:14:02 2013 -0400
@@ -28,6 +28,13 @@
 
 Please download install Infernal and the tool wrappers with the Galaxy Tool Shed:
 
+=============
+Miscellaneous
+=============
+
+Included in this repository is a CM datatype for INFERNAL 1.1. If you need that datatype in an additional package,
+I can split it out into a separate package. Please contact me in that case.
+
 
 =======
 History
@@ -38,6 +45,8 @@
  - v1.1.0: Initial public release
 
 
+
+
 ===============================
 Wrapper Licence (MIT/BSD style)
 ===============================