view bbduk.xml @ 3:008b8c6f6b26 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bbtools commit aca07f4e7d683d1b7d06abb63e05d4ff1b28771f
author iuc
date Mon, 06 Feb 2023 18:04:39 +0000
parents ff3b87a0d205
children
line wrap: on
line source

<tool id="bbtools_bbduk" name="BBTools: BBduk" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>decontamination using kmers</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
#import os
#import re

## Step 1: symlink the input reads into the working directory under names
## that end in .fastq or .fastq.gz.
#if str($input_type_cond.input_type) in ['single', 'pair']:
    #set read1 = $input_type_cond.read1
    ## bbduk uses the file extension to determine the input format.
    #set ext = '.fastq'
    #if $read1.ext.endswith('.gz'):
        #set ext = $ext + '.gz'
    #end if
    #set read1_file = 'forward'  + $ext
    ln -s '${read1}' '${read1_file}' &&
    #if str($input_type_cond.input_type) == 'pair':
        #set read2 = $input_type_cond.read2
        #set read2_file = 'reverse' + $ext
        ln -s '${read2}' '${read2_file}' &&
    #end if
#else:
    ## Paired collection: link names are built from the dataset names, with
    ## every character that is not whitespace, a word character or '-'
    ## replaced by '_'.
    #set read1 = $input_type_cond.reads_collection['forward']
    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
    ## bbduk uses the file extension to determine the input format.
    ## The extension variable holds only the extension; the identifier is
    ## prepended once when the link name is assembled below.
    #set ext = '.fastq'
    #if $read1.ext.endswith('.gz'):
        #set ext = $ext + '.gz'
    #end if
    #set read1_file = $read1_identifier + $ext
    ln -s '${read1}' '${read1_file}' &&
    #set read2 = $input_type_cond.reads_collection['reverse']
    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
    ## The reverse read reuses the extension derived from the forward read.
    #set read2_file = $read2_identifier + $ext
    ln -s '${read2}' '${read2_file}' &&
#end if

## Step 2: build the comma-separated value passed to ref=.
#if str($reference_type_cond.reference_type) == 'files':
    #set refs = list()
    #for ref in $reference_type_cond.reference:
        ## bbduk looks at the file extension.
        #set ref_name = str($os.path.basename($ref.file_name)) + '.fa'
        #if $ref.ext.endswith('.gz'):
            ## Compressed references are decompressed into the .fa name.
            gunzip -c '$ref' > '$ref_name' &&
        #else:
            ln -s '$ref' '$ref_name' &&
        #end if
        ## silent: run the append for its side effect only, emit nothing.
        #silent $refs.append(str($ref_name))
    #end for
    #set refs = ','.join($refs)
#else if str($reference_type_cond.reference_type) == 'keywords':
    #set refs = str($reference_type_cond.reference)
#end if

## Step 3: the bbduk.sh invocation.
bbduk.sh
in='${read1_file}'

#if str($input_type_cond.input_type) in ['pair', 'paired']:
    in2='${read2_file}'
#end if
## Read outputs: each one is passed only when selected in outputs_select;
## the second-mate file is added only for paired input.
#if str($outputs_select).find('outu') >= 0:
    out='${outputu}'
    #if str($input_type_cond.input_type) in ['pair', 'paired']:
        out2='${outputu2}'
    #end if
#end if
#if str($outputs_select).find('outm') >= 0:
    outm='${outputm}'
    #if str($input_type_cond.input_type) in ['pair', 'paired']:
        outm2='${outputm2}'
    #end if
#end if
#if str($outputs_select).find('outs') >= 0:
    outs='${outputs}'
#end if

## Reference and optional kmer trimming; skipped entirely for 'no_reference'.
#if str($reference_type_cond.reference_type) != 'no_reference':
    ref='$refs'
    #if str($reference_type_cond.ktrim_cond.ktrim_select) == 'yes':
        ktrim='${reference_type_cond.ktrim_cond.ktrim}'
        minlength=$reference_type_cond.ktrim_cond.minlength
    #end if
#end if

## Advanced options are always passed, using the values of the section params.
k=$advanced_options.k
rcomp='${advanced_options.rcomp}'
maskmiddle='${advanced_options.maskmiddle}'
minkmerhits='${advanced_options.minkmerhits}'
minkmerfraction=$advanced_options.minkmerfraction
mincovfraction=$advanced_options.mincovfraction
hammingdistance=$advanced_options.hammingdistance
qhdist=$advanced_options.qhdist
editdistance=$advanced_options.editdistance
forbidn='${advanced_options.forbidn}'
trimfailures='${advanced_options.trimfailures}'
findbestmatch='${advanced_options.findbestmatch}'
skipr1='${advanced_options.skipr1}'
skipr2='${advanced_options.skipr2}'

## Optional statistics outputs, one argument per selected option.
#if str($output_stats_cond.output_stats) == 'yes':
    #if str($output_stats_cond.output_stats_select).find('stats') >= 0:
        stats='${output_stats}'
    #end if
    #if str($output_stats_cond.output_stats_select).find('ref') >= 0:
        refstats='${output_ref}'
    #end if
    #if str($output_stats_cond.output_stats_select).find('rpkm') >= 0:
        rpkm='${output_rpkm}'
    #end if
    #if str($output_stats_cond.output_stats_select).find('dump') >= 0:
        dump='${output_dump}'
    #end if
#end if
## Optional histogram outputs. The select values 'quhist' and 'quchist' map
## to the bbduk arguments qhist and qchist.
#if str($output_hists_cond.output_hists) == 'yes':
    #if str($output_hists_cond.output_hists_select).find('bhist') >= 0:
        bhist='${output_bhist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('quhist') >= 0:
        qhist='${output_quhist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('quchist') >= 0:
        qchist='${output_quchist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('aqhist') >= 0:
        aqhist='${output_aqhist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('bqhist') >= 0:
        bqhist='${output_bqhist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('lhist') >= 0:
        lhist='${output_lhist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('phist') >= 0:
        phist='${output_phist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('gchist') >= 0:
        gchist='${output_gchist}'
    #end if
    #if str($output_hists_cond.output_hists_select).find('enthist') >= 0:
        enthist='${output_enthist}'
    #end if
#end if
## Thread count: GALAXY_SLOTS when set, otherwise 4.
t=\${GALAXY_SLOTS:-4}
]]></command>
    <inputs>
        <!-- Read input selection (single / pair / paired collection) comes from the imported macros. -->
        <expand macro="input_type_cond"/>
        <!-- Reference source: none, user-supplied fasta datasets, or named keywords.
             Both reference branches also expand the ktrim_cond macro. -->
        <conditional name="reference_type_cond">
            <param name="reference_type" type="select" label="Choose the reference type" help="Optional, no reference is the default">
                <option value="no_reference" selected="true">No reference</option>
                <option value="files">files</option>
                <option value="keywords">keywords</option>
            </param>
            <when value="no_reference"/>
            <when value="files">
                <param name="reference" type="data" format="fasta,fasta.gz" multiple="true" optional="false" label="Select one or more fasta files"/>
                <expand macro="ktrim_cond"/>
            </when>
            <when value="keywords">
                <param name="reference" type="select" multiple="true" optional="false" label="Select one or more keywords">
                    <option value="adapters">adapters</option>
                    <option value="artifacts">artifacts</option>
                    <option value="phix">phix</option>
                    <option value="lambda">lambda</option>
                    <option value="pjet">pjet</option>
                    <option value="mtst">mtst</option>
                    <option value="kapa">kapa</option>
                </param>
                <expand macro="ktrim_cond"/>
            </when>
        </conditional>
        <!-- Kmer matching parameters; every one of these is written to the command line.
             Boolean params render as 't' or 'f'. -->
        <section name="advanced_options" title="Advanced options" expanded="false">
            <param argument="k" type="integer" value="27" min="1" label="Kmer length used for finding contaminants"/>
            <param argument="rcomp" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Look for reverse-complements of kmers in addition to forward kmers?"/>
            <param argument="maskmiddle" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Treat the middle base of a kmer as a wildcard to increase sensitivity in the presence of errors?"/>
            <param argument="minkmerhits" type="integer" value="1" min="1" label="Reads need at least this many matching kmers to be considered as matching the reference"/>
            <param argument="minkmerfraction" type="float" value="0" min="0" label="A read needs at least this fraction of its total kmers to hit a ref in order to be considered a match"/>
            <param argument="mincovfraction" type="float" value="0" min="0" label="A read needs at least this fraction of its total bases to be covered by ref kmers to be considered a match"/>
            <param argument="hammingdistance" type="integer" value="0" min="0" label="Maximum Hamming distance for ref kmers (subs only)"/>
            <param argument="qhdist" type="integer" value="0" min="0" label="Hamming distance for query kmers"/>
            <param argument="editdistance" type="integer" value="0" min="0" label="Maximum edit distance from ref kmers (subs and indels)"/>
            <param argument="forbidn" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Do not match kmers containing N?"/>
            <param argument="trimfailures" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Trim failed reads to 1bp instead of discarding them?"/>
            <param argument="findbestmatch" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Associate read with sequence sharing most kmers if multiple matches?"/>
            <param argument="skipr1" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Don't do kmer-based operations on read 1?"/>
            <param argument="skipr2" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Don't do kmer-based operations on read 2?"/>
        </section>
        <!-- Which read outputs to produce; at least one must be selected. -->
        <param name="outputs_select" type="select" multiple="true" optional="false" label="Specify outputs">
            <option value="outu">Unmatched</option>
            <option value="outm">Matched</option>
            <option value="outs">Single</option>
        </param>
        <!-- Optional statistics files. -->
        <conditional name="output_stats_cond">
            <param name="output_stats" type="select" label="Output statistics?">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="output_stats_select" type="select" multiple="true" optional="false" label="Specify statistics outputs">
                    <option value="stats">Statistics about which contaminants were detected</option>
                    <option value="ref">Statistics on a per-reference-file basis</option>
                    <option value="rpkm">RPKM for each reference sequence (for RNA-seq)</option>
                    <option value="dump">kmer tables in fasta format</option>
                </param>
            </when>
        </conditional>
        <!-- Optional histogram files. -->
        <conditional name="output_hists_cond">
            <param name="output_hists" type="select" label="Output histograms?">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="output_hists_select" type="select" multiple="true" optional="false" label="Specify histogram outputs">
                    <option value="bhist">Base composition histogram by position</option>
                    <option value="quhist">Quality histogram by position</option>
                    <option value="quchist">Count of bases with each quality value</option>
                    <option value="aqhist">Histogram of average read quality</option>
                    <option value="bqhist">Quality histogram designed for box plots</option>
                    <option value="lhist">Read length histogram</option>
                    <option value="phist">Polymer length histogram</option>
                    <option value="gchist">Read GC content histogram</option>
                    <option value="enthist">Read entropy histogram</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Read outputs: created only when the matching option is selected in outputs_select.
             The second-mate datasets are additionally limited to non-single input types. -->
        <data name="outputu" format="fastqsanger" label="${tool.name} on ${on_string} (Forward Unmatched)">
            <filter>'outu' in outputs_select</filter>
        </data>
        <data name="outputu2" format="fastqsanger" label="${tool.name} on ${on_string} (Reverse Unmatched)">
            <filter>'outu' in outputs_select and input_type_cond['input_type'] != 'single'</filter>
        </data>
        <data name="outputm" format="fastqsanger" label="${tool.name} on ${on_string} (Forward Matched)">
            <filter>'outm' in outputs_select</filter>
        </data>
        <data name="outputm2" format="fastqsanger" label="${tool.name} on ${on_string} (Reverse Matched)">
            <filter>'outm' in outputs_select and input_type_cond['input_type'] != 'single'</filter>
        </data>
        <data name="outputs" format="fastqsanger" label="${tool.name} on ${on_string} (Single)">
            <filter>'outs' in outputs_select</filter>
        </data>
        <!-- Statistics outputs: require output_stats == 'yes' and the matching selection. -->
        <data name="output_stats" format="tabular" label="${tool.name} on ${on_string} (Detected contaminants stats)">
            <filter>output_stats_cond['output_stats'] == 'yes' and 'stats' in output_stats_cond['output_stats_select']</filter>
        </data>
        <data name="output_ref" format="tabular" label="${tool.name} on ${on_string} (Per reference file stats)">
            <filter>output_stats_cond['output_stats'] == 'yes' and 'ref' in output_stats_cond['output_stats_select']</filter>
        </data>
        <data name="output_rpkm" format="tabular" label="${tool.name} on ${on_string} (RPKM for each ref seq)">
            <filter>output_stats_cond['output_stats'] == 'yes' and 'rpkm' in output_stats_cond['output_stats_select']</filter>
        </data>
        <data name="output_dump" format="fasta" label="${tool.name} on ${on_string} (kmer tables)">
            <filter>output_stats_cond['output_stats'] == 'yes' and 'dump' in output_stats_cond['output_stats_select']</filter>
        </data>
        <!-- Histogram outputs: require output_hists == 'yes' and the matching selection. -->
        <data name="output_bhist" format="tabular" label="${tool.name} on ${on_string} (Base composition by position)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'bhist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_quhist" format="tabular" label="${tool.name} on ${on_string} (Quality by position)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'quhist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_quchist" format="tabular" label="${tool.name} on ${on_string} (Bases w/ each quality value)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'quchist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_aqhist" format="tabular" label="${tool.name} on ${on_string} (average read quality)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'aqhist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_bqhist" format="tabular" label="${tool.name} on ${on_string} (Quality for box plots)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'bqhist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_lhist" format="tabular" label="${tool.name} on ${on_string} (Read length)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'lhist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_phist" format="tabular" label="${tool.name} on ${on_string} (Polymer length)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'phist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_gchist" format="tabular" label="${tool.name} on ${on_string} (Read GC content)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'gchist' in output_hists_cond['output_hists_select']</filter>
        </data>
        <data name="output_enthist" format="tabular" label="${tool.name} on ${on_string} (Read entropy)">
            <filter>output_hists_cond['output_hists'] == 'yes' and 'enthist' in output_hists_cond['output_hists_select']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Case 1: one read dataset, gzipped fasta reference, unmatched reads only -->
        <test expect_num_outputs="1">
            <param name="read1" ftype="fastqsanger.gz" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
            <param name="reference_type" value="files"/>
            <param name="reference" ftype="fasta.gz" value="adapters.fa.gz"/>
            <param name="outputs_select" value="outu"/>
            <output name="outputu" ftype="fastqsanger" file="bduk_outputu1.fastqsanger" compare="contains"/>
        </test>
        <!-- Case 2: forward and reverse reads as two datasets, plus one statistics
             output (kmer dump) and one histogram output (quality by position) -->
        <test expect_num_outputs="4">
            <param name="input_type" value="pair"/>
            <param name="read1" ftype="fastqsanger.gz" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
            <param name="read2" ftype="fastqsanger.gz" value="13-1941-6_S4_L001_R2_600000.fastq.gz"/>
            <param name="reference_type" value="files"/>
            <param name="reference" ftype="fasta.gz" value="adapters.fa.gz"/>
            <param name="outputs_select" value="outu"/>
            <param name="output_stats" value="yes"/>
            <param name="output_stats_select" value="dump"/>
            <param name="output_hists" value="yes"/>
            <param name="output_hists_select" value="quhist"/>
            <output name="outputu" ftype="fastqsanger" file="bduk_outputu1.fastqsanger" compare="contains"/>
            <output name="outputu2" ftype="fastqsanger" file="bduk_outputu2.fastqsanger" compare="contains"/>
            <output name="output_dump" ftype="fasta" file="bduk_output_dump1.fasta" compare="contains"/>
            <output name="output_quhist" ftype="tabular" file="bduk_output_quhist1.tabular"/>
        </test>
        <!-- Case 3: paired collection input with every keyword reference selected -->
        <test expect_num_outputs="2">
            <param name="input_type" value="paired"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
                    <element name="reverse" value="13-1941-6_S4_L001_R2_600000.fastq.gz"/>
                </collection>
            </param>
            <param name="reference_type" value="keywords"/>
            <param name="reference" value="adapters,artifacts,phix,lambda,pjet,mtst,kapa"/>
            <param name="outputs_select" value="outu"/>
            <output name="outputu" ftype="fastqsanger" file="bduk_outputu1.fastqsanger" compare="contains"/>
            <output name="outputu2" ftype="fastqsanger" file="bduk_outputu2.fastqsanger" compare="contains"/>
        </test>
    </tests>
    <help>
**What it does**

BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single
high-performance tool.  It is capable of quality-trimming and filtering, adapter-trimming, contaminant-filtering via kmer
matching, sequence masking, GC-filtering, length filtering, entropy-filtering, format conversion, histogram generation,
subsampling, quality-score recalibration, kmer cardinality estimation, and various other operations in a single pass.
Specifically, any combination of operations is possible in a single pass with the exception of kmer-based operations (kmer
trimming, kmer masking, or kmer filtering). At most 1 kmer-based operation can be done in a single pass.

**Options**

 * **Reference** - if a reference is specified, BBDuk will operate on kmers in one of 4 modes: right-trimming, left-trimming, masking, or filtering.  The default is filtering - any read matching a reference kmer will be discarded.

 * **Trim reads to remove bases matching reference kmers** - When trimming to the right, once a reference kmer is matched in a read, that kmer and all the bases to the right will be trimmed, leaving only the bases to the left.  When trimming to the left, trimming will be done to the left instead.

**Outputs**

 * **Unmatched** - All the reads that pass all filtering criteria. Reads will be at least as long as **Minimum read length** after any trimming operations and reads will not match any reference kmer if kmer-filtering is being performed.  A read’s average quality will be at least as high as the specified **Minimum average quality**.
 * **Matched** - Reads failing any filter criteria (such as matching a reference kmer). By default, if either read in a pair fails, both will be included in *Matched*.
 * **Single** - Singleton reads whose mate was trimmed shorter than the value of **Minimum read length**.
    </help>
    <expand macro="citations"/>
</tool>