view downsample.xml @ 1:03aeb837e398 draft default tip

Uploaded
author dave
date Tue, 01 Oct 2019 16:25:02 -0400
parents 20823bce09e7
children
line wrap: on
line source

<?xml version="1.0"?>
<tool id="dynamic_downsample" name="Downsample" version="1.0.0">
    <description>reads to desired coverage</description>
    <requirements>
        <!-- samtools supplies the `samtools depth` and `samtools view` calls made in <command>. -->
        <requirement type="package" version="1.9">samtools</requirement>
        <!-- The awk program in <command> calls asort(), which is a GNU awk (gawk) extension. -->
        <requirement type="package" version="5.0.1">gawk</requirement>
    </requirements>
    <!--
        Downsample the input alignments to roughly the requested median coverage.

        The gawk program collects column 3 (depth) of the `samtools depth`
        output, sorts the values with asort() (a gawk extension, hence the
        explicit `gawk` call), takes the median and derives
        factor = 1 / (median / target).

        * factor below 1: the factor is printed, captured in FACTOR and handed
          to `samtools view -s` as the fraction of reads to keep.
        * otherwise gawk exits with status 1, the shell `if` fails and the
          input is written to the output as BAM without subsampling.

        Review notes:
        * `-d 0` lifts the per-position depth cap of `samtools depth` (8000 by
          default in samtools 1.9). With the cap in place the median is
          understated for ultra-deep data and the computed fraction is too
          large.
        * No depth records, a non-positive median or a non-positive target
          exit with status 1 up front instead of triggering a gawk
          division-by-zero error on stderr.
        * FACTOR is quoted, and both branches honour GALAXY_SLOTS.
        * NOTE(review): every statement keeps its `;` terminator, presumably
          because the command lines may be joined into one line before
          execution. Confirm before reformatting the script.
    -->
    <command><![CDATA[
        if FACTOR=\$(samtools depth -d 0 '$reads' | gawk -v target='$target_coverage' '{ readcovs[x++]=\$3; } END { if (x == 0 || target <= 0) exit 1 ; n = asort(readcovs) ; idx=int((x+1)/2) ; coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ; if (coverage <= 0) exit 1 ; factor = 1/(coverage/target) ; if (factor >= 1) exit 1 ; else print factor }') ;
            then samtools view '$reads' -s "\$FACTOR" -O BAM -o '$output' -@ \${GALAXY_SLOTS:-1} ;
        else samtools view -O BAM '$reads' -o '$output' -@ \${GALAXY_SLOTS:-1} ;
        fi
        ]]>
    </command>
    <inputs>
        <!-- Alignments to downsample; SAM and BAM are accepted. -->
        <param name="reads" type="data" format="sam,bam" label="Reads to downsample" />
        <!--
            Target median coverage used by <command> to compute
            1 / (median coverage / target coverage). min="1" rejects zero and
            negative values, which would otherwise cause a division by zero or
            a negative subsampling fraction.
        -->
        <param name="target_coverage" type="integer" value="1000" min="1" label="Target coverage" help="Desired median depth of coverage. Inputs whose median coverage is already at or below this value are not subsampled." />
    </inputs>
    <outputs>
        <!--
            Single BAM dataset. Both branches of <command> write it with
            `samtools view -O BAM -o '$output'`. The history label is built
            from the input name(s) and the requested target coverage.
        -->
        <data format="bam" name="output" label="Downsample ${on_string} to ${target_coverage}x coverage" />
    </outputs>
    <tests>
        <!--
            Runs the tool on downsample-in1.bam with a target coverage of 100
            and compares the result with downsample-out1.bam.
            NOTE(review): presumably the input's median coverage is above 100
            so that the `samtools view -s` branch is exercised. Confirm against
            the test data.
        -->
        <test>
            <param name="reads" ftype="bam" value="downsample-in1.bam" />
            <param name="target_coverage" value="100" />
            <output name="output" file="downsample-out1.bam" />
        </test>
    </tests>
    <help><![CDATA[
.. role:: bash(code)
   :language: bash


Dynamic Downsampling
~~~~~~~~~~~~~~~~~~~~

A known issue with variant analysis is that when small genomes are sequenced,
e.g. HIV at 9.7 kb or the human mitochondrial genome at 16.6 kb, the resulting
coverage can easily exceed 10,000x. This can cause performance issues for some
variant callers, especially those that employ a haplotyping approach to variant
detection.

This tool attempts to ameliorate that issue by downsampling its input files to
the target coverage using :bash:`samtools depth` to determine the median
coverage for a given BAM file, then running :bash:`samtools view -s` on the file
if 1 / (median coverage / desired coverage) is less than 1.

.. code-block:: bash

    -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the fraction of templates/read pairs to keep; INT part sets seed)

The median coverage is determined by piping the output of :bash:`samtools depth`
through an :bash:`awk` script along the lines of the following, where
:bash:`$target_coverage` is the value specified in the tool form:

.. code-block:: awk

    '{ readcovs[x++]=$3; }
    END {
        n = asort(readcovs) ;
        idx=int((x+1)/2) ;
        coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ;
        factor = 1/(coverage/$target_coverage) ;
        if (factor >= 1) exit 1 ;
        else print factor
    }'

On an exit code of 1, the tool simply writes the input to the output as BAM
without subsampling it. If the :bash:`awk` step prints a value instead, the tool
runs :bash:`samtools view -s` with that value, i.e. 1 / (median coverage / desired coverage).

]]>
    </help>
    <citations>
        <!-- samtools performs both the depth calculation and the subsampling done by this tool. -->
        <citation type="doi">10.1093/bioinformatics/btp352</citation>
    </citations>
</tool>