changeset 1:03aeb837e398 draft default tip

Uploaded
author dave
date Tue, 01 Oct 2019 16:25:02 -0400
parents 20823bce09e7
children
files downsample.xml test-data/downsample-in1.bam test-data/downsample-out1.bam
diffstat 3 files changed, 55 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/downsample.xml	Tue Sep 24 16:20:27 2019 -0400
+++ b/downsample.xml	Tue Oct 01 16:25:02 2019 -0400
@@ -1,32 +1,75 @@
 <?xml version="1.0"?>
-<tool id="dynamic_downsample" name="Dynamically downsample" version="1.0.0">
+<tool id="dynamic_downsample" name="Downsample" version="1.0.0">
     <description>reads to desired coverage</description>
     <requirements>
         <requirement type="package" version="1.9">samtools</requirement>
         <requirement type="package" version="5.0.1">gawk</requirement>
     </requirements>
     <command><![CDATA[
-        if FACTOR=\$(samtools depth '$reads' | awk '{ a[i++]=\$3; } END { x=int((i+1)/2); if (x < (i+1)/2) y=(a[x-1]+a[x])/2; else y=a[x-1]; f = 1/(y/$coverage) ; if (f >= 1) exit 1 ; else print f }') ;
-            then samtools view '$reads' -s \$FACTOR -O $reads.datatype -o '$output' ;
-        else ;
-            cp '$reads' '$output'
+        if FACTOR=\$(samtools depth '$reads' | awk '{ readcovs[x++]=\$3; } END { n = asort(readcovs) ; idx=int((x+1)/2) ; coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ; factor = 1/(coverage/$target_coverage) ; if (factor >= 1) exit 1 ; else print factor }') ;
+            then samtools view '$reads' -s \$FACTOR -O BAM -o '$output' -@ \${GALAXY_SLOTS:-1} ;
+        else samtools view -O BAM '$reads' -o '$output' ;
         fi
         ]]>
     </command>
     <inputs>
         <param name="reads" type="data" format="sam,bam" label="Reads to downsample" />
-        <param name="coverage" type="integer" value="1000" label="Target coverage" />
+        <param name="target_coverage" type="integer" value="1000" label="Target coverage" />
     </inputs>
     <outputs>
-        <data format="bam" name="output" label="${tool.name} on ${on_string} (Downsampled to ${coverage}x coverage)">
-            <change_format>
-                <when input="reads" value="sam" format="sam" />
-            </change_format>
-        </data>
+        <data format="bam" name="output" label="Downsample ${on_string} to ${target_coverage}x coverage" />
     </outputs>
     <tests>
+        <test>
+            <param name="reads" ftype="bam" value="downsample-in1.bam" />
+            <param name="target_coverage" value="100" />
+            <output name="output" file="downsample-out1.bam" />
+        </test>
     </tests>
-    <help>
+    <help><![CDATA[
+.. role:: bash(code)
+   :language: bash
+
+
+Dynamic Downsampling
+~~~~~~~~~~~~~~~~~~~~
+
+A known issue with variant analysis is that when small genomes are sequenced,
+e.g. HIV at 9.7 kilobases or the human mitochondria at 16.6kb, the resulting
+coverage can easily exceed 10,000x. This can cause performance issues for some
+variant callers, especially those that employ a haplotyping approach to variant
+detection.
+
+This tool attempts to ameliorate that issue by downsampling its input files to
+the target coverage using :bash:`samtools depth` to determine the median
+coverage for a given BAM file, then running :bash:`samtools view -s` on the file
+if 1 / (median coverage / desired coverage) is less than 1.
+
+.. code-block:: bash
+
+    -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the fraction of templates/read pairs to keep; INT part sets seed)
+
+The median coverage is determined by passing the :bash:`samtools depth` command
+through the following :bash:`awk` script, where :bash:`$target_coverage` is the
+value specified in the tool form:
+
+.. code-block:: awk
+
+    '{ readcovs[x++]=$3; } END
+    {
+        n = asort(readcovs) ;
+        idx=int((x+1)/2) ;
+        coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ;
+        factor = 1/(coverage/$target_coverage) ;
+        if (factor >= 1) exit 1 ;
+        else print factor
+    }'
+
+On an exit code of 1, the tool will simply copy the input to the output without
+altering it. If the :bash:`awk` step returns a value instead, the tool then runs
+:bash:`samtools view -s 1 / (median coverage / desired coverage)`
+
+]]>
     </help>
     <citations>
     </citations>
Binary file test-data/downsample-in1.bam has changed
Binary file test-data/downsample-out1.bam has changed