view bamtools-split-ref.xml @ 0:8c17ddca0eee draft

Uploaded
author jjohnson
date Mon, 15 May 2017 16:27:18 -0400
parents
children
line wrap: on
line source

<tool id="bamSplitRef" name="Split BAM by Reference" version="2.4.0">
    <description>into dataset list collection</description>
    <requirements>
        <requirement type="package" version="2.4.0">bamtools</requirement>
    </requirements>
    <command>
        <![CDATA[
            ## Link the BAM and its index to fixed local names so the index sits
            ## next to the BAM as localbam.bam.bai.
            ln -s '${input_bam}' 'localbam.bam' &&
            ln -s '${input_bam.metadata.bam_index}' 'localbam.bam.bai' &&
            bamtools split -reference
            -in localbam.bam
            -stub split_bam
            ## Preserve order from metadata in the output collection
            #import re
            ## Dataset name without a trailing ".bam"; every non-word character becomes "_".
            #set $name = $re.sub('\W','_',$re.sub('\.bam$','',$input_bam.name))
            #if str($refs) != 'None':
                ## The user restricted the output to the selected references.
                #set $ref_names = str($refs).split(',')
            #else
                ## No selection: use every reference from the BAM metadata. Entries wrapped
                ## in sanitized quotes (__sq__) are unwrapped; others are used as they are.
                #set $ref_names = [$re.sub('^.*__sq__(.+)__sq__.*$','\\1',n) if n.find('__sq__') >= 0 else n for n in str($input_bam.metadata.reference_names).split(',')]
            #end if
            ## Single-quote each name so characters such as "*", "|" or ";" reach the
            ## loop literally instead of being interpreted by the shell.
            #set $ref_list = ' '.join(["'%s'" % n.strip() for n in $ref_names if n.strip()])
            && mkdir -p outputs
            ## Move each split file into outputs/ as split_bam<index>.<name>.<ref>.bam.
            ## The zero-padded index keeps the reference order; the "." after it
            ## separates it from a dataset name that may itself start with digits.
            && (I=0;
              for i in $ref_list;
                do I=\$((I+1)); SN="split_bam.REF_\${i}.bam";
                  if [ -e "\$SN" ]; then FN=`printf "outputs/split_bam%05d.%s.%s.bam" "\$I" '$name' "\$i"`; mv "\$SN" "\$FN"; fi;
                done)
        ]]>
    </command>
    <inputs>
        <param name="input_bam" type="data" format="bam" label="BAM dataset to split by reference"/>
        <!-- Optional restriction; the choices are read from the reference_names metadata of input_bam. -->
        <param name="refs"
               type="select"
               optional="True"
               multiple="True"
               label="Select references (chromosomes and contigs) you would like to restrict bam to">
            <help><![CDATA[Click and type in the box above to see options. You can select multiple entries. 
                  If "No options available" is displayed, you need to re-detect metadata on the input dataset.
            ]]></help>
            <options>
                <filter type="data_meta" ref="input_bam" key="reference_names" />
            </options>
        </param>
    </inputs>
    <outputs>
        <collection name="output_bams" type="list" label="${input_bam.name} Split List">
            <!-- Files are named split_bam<index>.<designation>.bam by the command above;
                 the index and the dot that follows it are not part of the element identifier. -->
            <discover_datasets pattern="split_bam\d+\.(?P&lt;designation&gt;.+)\.bam" ext="bam" directory="outputs" visible="false"/>
        </collection>
    </outputs>
    <tests>
        <!-- Runs the tool without selecting any references and checks the chr1
             element of the resulting list by approximate file size. -->
        <test>
            <param name="input_bam" value="bamtools-input2.bam" ftype="bam"/>
            <output_collection name="output_bams" type="list">
                <element name="bamtools_input2.chr1"
                         file="bamtools_input2.chr1"
                         compare="sim_size"
                         delta="500"/>
            </output_collection>
        </test>
    </tests>
    <help>
**What it does**

BAMTools split is a utility for splitting BAM files. It is based on the BAMtools suite of tools by Derek Barnett (https://github.com/pezmaster31/bamtools).

-----

.. class:: warningmark

**DANGER: Multiple Outputs**

As described below, splitting BAM datasets on reference name or a tag value can produce very large numbers of outputs. Read below and know what you are doing.

-----

**How it works**

Split alignments by reference name into a dataset list collection.  The collection will be in the same order as the input BAM references.

In cases of unfinished genomes with a very large number of reference sequences (scaffolds),
this tool can generate thousands (if not millions) of output datasets.


-----

.. class:: infomark

**More information**

Additional information about BAMtools can be found at https://github.com/pezmaster31/bamtools/wiki

    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btr174</citation>
    </citations>
</tool>