Mercurial > repos > dave > dynamic_downsample
diff downsample.xml @ 1:03aeb837e398 draft default tip
Uploaded
author | dave |
---|---|
date | Tue, 01 Oct 2019 16:25:02 -0400 |
parents | 20823bce09e7 |
children |
line wrap: on
line diff
--- a/downsample.xml Tue Sep 24 16:20:27 2019 -0400 +++ b/downsample.xml Tue Oct 01 16:25:02 2019 -0400 @@ -1,32 +1,75 @@ <?xml version="1.0"?> -<tool id="dynamic_downsample" name="Dynamically downsample" version="1.0.0"> +<tool id="dynamic_downsample" name="Downsample" version="1.0.0"> <description>reads to desired coverage</description> <requirements> <requirement type="package" version="1.9">samtools</requirement> <requirement type="package" version="5.0.1">gawk</requirement> </requirements> <command><![CDATA[ - if FACTOR=\$(samtools depth '$reads' | awk '{ a[i++]=\$3; } END { x=int((i+1)/2); if (x < (i+1)/2) y=(a[x-1]+a[x])/2; else y=a[x-1]; f = 1/(y/$coverage) ; if (f >= 1) exit 1 ; else print f }') ; - then samtools view '$reads' -s \$FACTOR -O $reads.datatype -o '$output' ; - else ; - cp '$reads' '$output' + if FACTOR=\$(samtools depth '$reads' | awk '{ readcovs[x++]=\$3; } END { n = asort(readcovs) ; idx=int((x+1)/2) ; coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ; factor = 1/(coverage/$target_coverage) ; if (factor >= 1) exit 1 ; else print factor }') ; + then samtools view '$reads' -s \$FACTOR -O BAM -o '$output' -@ \${GALAXY_SLOTS:-1} ; + else samtools view -O BAM '$reads' -o '$output' ; fi ]]> </command> <inputs> <param name="reads" type="data" format="sam,bam" label="Reads to downsample" /> - <param name="coverage" type="integer" value="1000" label="Target coverage" /> + <param name="target_coverage" type="integer" value="1000" label="Target coverage" /> </inputs> <outputs> - <data format="bam" name="output" label="${tool.name} on ${on_string} (Downsampled to ${coverage}x coverage)"> - <change_format> - <when input="reads" value="sam" format="sam" /> - </change_format> - </data> + <data format="bam" name="output" label="Downsample ${on_string} to ${target_coverage}x coverage" /> </outputs> <tests> + <test> + <param name="reads" ftype="bam" value="downsample-in1.bam" /> + <param name="target_coverage" value="100" /> + <output name="output" file="downsample-out1.bam" /> + </test> </tests> - <help> + <help><![CDATA[ +.. role:: bash(code) + :language: bash + + +Dynamic Downsampling +~~~~~~~~~~~~~~~~~~~~ + +A known issue with variant analysis is that when small genomes are sequenced, +e.g. HIV at 9.7 kilobases or the human mitochondria at 16.6kb, the resulting +coverage can easily exceed 10,000x. This can cause performance issues for some +variant callers, especially those that employ a haplotyping approach to variant +detection. + +This tool attempts to ameliorate that issue by downsampling its input files to +the target coverage using :bash:`samtools depth` to determine the median +coverage for a given BAM file, then running :bash:`samtools view -s` on the file +if 1 / (median coverage / desired coverage) is less than 1. + +.. code-block:: bash + + -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the fraction of templates/read pairs to keep; INT part sets seed) + +The median coverage is determined by passing the :bash:`samtools depth` command +through the following :bash:`awk` script, where :bash:`$target_coverage` is the +value specified in the tool form: + +.. code-block:: awk + + '{ readcovs[x++]=$3; } END + { + n = asort(readcovs) ; + idx=int((x+1)/2) ; + coverage = ((idx==(x+1)/2) ? readcovs[idx] : (readcovs[idx]+readcovs[idx+1])/2) ; + factor = 1/(coverage/$target_coverage) ; + if (factor >= 1) exit 1 ; + else print factor + }' + +On an exit code of 1, the tool will simply copy the input to the output without +altering it. If the :bash:`awk` step returns a value instead, the tool then runs +:bash:`samtools view -s 1 / (median coverage / desired coverage)` + +]]> </help> <citations> </citations>