view stacks_procrad.xml @ 0:a4e62d5c5101 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stacks2 commit 98327d2948ae1ccb5aef5db9ab88605fd74a0de7-dirty
author matthias
date Thu, 29 Nov 2018 11:56:45 -0500
parents
children c9fffbd29afc
line wrap: on
line source

<tool id="stacks2_procrad" name="Stacks2: process radtags" version="@WRAPPER_VERSION@">
<description>the Stacks demultiplexing script</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <expand macro="version_cmd"/>
    <command><![CDATA[
## Overview: link the inputs into stacks_inputs/ under names process_radtags
## accepts, run process_radtags on that directory, then move/rename the results
## in stacks_outputs/ so the output collections of this tool can discover them.
@CLEAN_EXT@

mkdir stacks_inputs stacks_outputs &&

#for $input in $input_type.fqinputs:
    #if $input_type.options_type_selector == "single"
        #set $isfq=$input.is_of_type('fastqsanger')
        #set $name=$clean_ext($input.element_identifier)
    #else:
        #set $isfq=$input.forward.is_of_type('fastqsanger')
        ## TODO if https://github.com/galaxyproject/galaxy/pull/7031 is backported use element_identifier consistently and fix release in <tool>
        #set $name=$clean_ext($input.name)
    #end if

    ## link extension and the value given to -i are chosen together;
    ## note that inputype is overwritten on every iteration, so the last input decides
    #if $isfq:
        #set $ext = "fastq"
        #set $inputype = "fastq"
    #else
        #set $ext = "fastq.gz"
        #set $inputype = "gzfastq"
    #end if

    #if $input_type.options_type_selector == "single"
        ln -s '$input' 'stacks_inputs/${name}.${ext}' &&
    #else:
        ## procrad needs _R[12]_ in the file name, so we append _R1_0 / _R2_0
        ## (the suffix is removed again from the discard files after the run)
        ln -s '$input.forward' 'stacks_inputs/${name}_R1_0.${ext}' &&
        ln -s '$input.reverse' 'stacks_inputs/${name}_R2_0.${ext}' &&
    #end if

#end for

process_radtags

-p stacks_inputs/
#if $input_type.options_type_selector == "paired"
    --paired
#end if
-i $inputype
-b '$barcode'

#if $filter_cond.filter_select == 'yes':
    -w $filter_cond.sliding
    -s $filter_cond.score
    $filter_cond.remove
    $filter_cond.discard
    $filter_cond.filter_illumina
#else
    #if str($filter_cond.len_limit) != "":
        --len_limit $filter_cond.len_limit
    #end if
#end if

#if str($options_advanced.truncate) != "":
    -t $options_advanced.truncate
#end if
$options_advanced.rescue
$capture
## -E not implemented in Galaxy defaults to phred33
#if str( $outype ) != "auto"
    -y $outype
#end if

## Barcode options
$input_type.barcode_encoding

## Restriction enzyme options
#if str($options_enzyme.enzyme) != '':
    -e $options_enzyme.enzyme
#end if
#if str( $options_enzyme.options_enzyme_selector ) == "2" and str($options_enzyme.enzyme2)!='':
    --renz_2 $options_enzyme.enzyme2
#end if

## Protocol-specific options
$options_advanced.bestrad

## Adapter options
## the adapter sequences are free text, so they are single-quoted to keep
## them a single shell word
#if str($options_advanced.adapter_1) != "":
    --adapter_1 '$options_advanced.adapter_1'
#end if
#if str($options_advanced.adapter_2) != "":
    --adapter_2 '$options_advanced.adapter_2'
#end if
#if str($options_advanced.adapter_mm) != "":
    --adapter_mm $options_advanced.adapter_mm
#end if

## Output options
$options_advanced.retain_header
## --merge not implemented in Galaxy

## Advanced options
$options_advanced.disable_rad_check
#if str($options_advanced.barcode_dist_1) != "":
    --barcode_dist_1 $options_advanced.barcode_dist_1
#end if
#if str($options_advanced.barcode_dist_2) != "":
    --barcode_dist_2 $options_advanced.barcode_dist_2
#end if

-o stacks_outputs

## the log is moved to the (quoted) dataset path
&& mv stacks_outputs/process_radtags.stacks_inputs.log '$output_log'

#if $capture:
    ## collect the discard files in their own directory for collection discovery
    && mkdir stacks_outputs/discarded/
    && mv stacks_outputs/*discards stacks_outputs/discarded/

    ## fix the _R[12]_0 that was added for preparing the input
    #if $input_type.options_type_selector == 'paired':
    && find stacks_outputs/discarded/ -type f | while read file; do mv "\$file" "\$(echo \$file | sed 's/_R1_0/.1/; s/_R2_0/.2/;')"; done
    #end if
    ## also remove the gz which is added by procrad (but its uncompressed)
    && find stacks_outputs/discarded/ -type f -iname "*.gz.discards" | while read file; do mv "\$file" "\$(echo \$file | sed 's/.gz.discards$/.discards/;')"; done

    ## the discard files are named fastq even if the output is fasta
    #if str($outype).endswith("fasta"):
        && find stacks_outputs/discarded/ -type f | while read file; do mv "\$file" "\$(echo \$file | sed 's/\.fastq.discards/.fa/;')"; done
    #else
        && find stacks_outputs/discarded/ -type f | while read file; do mv "\$file" "\$(echo \$file | sed 's/\.fastq.discards/.fq/;')"; done
    #end if
#end if
## prepare paired read output for processing in galaxy:
## orphan reads (*.rem.1.* / *.rem.2.*) go to their own directory and every
## .1. / .2. in a fa/fq(.gz) file name below stacks_outputs/ becomes .forward. / .reverse.
#if $input_type.options_type_selector == 'paired':
    && mkdir stacks_outputs/remaining
    && mv stacks_outputs/*.rem.[12].* stacks_outputs/remaining/
    && find stacks_outputs/ -iregex ".*.f[aq]\(\.gz\)?" | while read file; do mv "\$file" "\$(echo \$file | sed 's/\.1\./.forward./; s/\.2\./.reverse./')"; done
#end if
    ]]></command>

    <inputs>
        <!-- Read input. The selector switches between a multi-select of single-end
             datasets and a list:paired collection. Both branches name the data
             parameter "fqinputs" and the barcode location parameter
             "barcode_encoding", so the command template can reference them
             without caring which branch is active. -->
        <conditional name="input_type">
            <param name="options_type_selector" type="select" label="Single-end or paired-end reads files">
                <option value="single" selected="True">Single-end files</option>
                <option value="paired">Paired-end files</option>
            </param>
            <when value="single">
                <param name="fqinputs" argument="-f" format="fastqsanger,fastqsanger.gz" multiple="true" type="data" label="singles-end reads infile(s)" help="input files" />

                <!-- option list comes from a macro in macros.xml (not visible here) -->
                <param name="barcode_encoding" type="select" label="Barcode location">
                    <expand macro="barcode_encoding_single" />
                </param>
            </when>
            <when value="paired">
                <param name="fqinputs" type="data_collection" collection_type="list:paired" label="paired-end reads infile(s)" format="fastqsanger,fastqsanger.gz"/>
<!--                <param name="inputs_paired1" argument="-1" format="fastqsanger,fastqsanger.gz" type="data" label="paired-end reads infile(s) 1" help="Files must have this syntax : name_R1_001.fastq" />-->
<!--                <param name="inputs_paired2" argument="-2" format="fastqsanger,fastqsanger.gz" type="data" label="paired-end reads infile(s) 2" help="Files must have this syntax : name_R2_001.fastq" />-->
                <!-- option list comes from a macro in macros.xml (not visible here) -->
                <param name="barcode_encoding" type="select" label="Barcode location">
                    <expand macro="barcode_encoding_pair" />
                </param>
            </when>
        </conditional>

        <!-- Barcode file; its path is passed to process_radtags with -b. -->
        <param name="barcode" argument="-b" type="data"  format="tabular,txt" label="Barcode file" />

        <!-- Restriction enzymes. Both branches define "enzyme"; only the
             two-enzyme branch adds "enzyme2", which the command template emits
             as the renz_2 option when the selector is "2". -->
        <conditional name="options_enzyme">
            <param name="options_enzyme_selector" type="select" label="Number of enzymes">
                <option value="1">One</option>
                <option value="2">Two</option>
            </param>
            <when value="1">
                <param name="enzyme" type="select" label="Enzyme" argument="-e" help="provide the restriction enzyme used" >
                    <expand macro="enzymes"/>
                </param>
            </when>
            <when value="2">
                <param name="enzyme" type="select" label="Enzyme" argument="-e" help="provide the restriction enzyme used" >
                    <expand macro="enzymes"/>
                </param>
                <param name="enzyme2" type="select" label="Second enzyme" argument="--renz_2" help="provide the second restriction enzyme used" >
                    <expand macro="enzymes"/>
                </param>
            </when>
        </conditional>

        <!-- Optional settings. Boolean parameters expand to their flag (truevalue)
             or to nothing (falsevalue); integer and text parameters left empty
             are skipped by the command template. -->
        <section name="options_advanced" title="advanced options" expanded="False">
            <param name="truncate" type="integer" value="" optional="True" argument="-t" label="Truncate final read length to this value" />
            <param name="rescue" type="boolean" checked="false" truevalue="-r" falsevalue="" argument="-r" label="Rescue barcodes and RAD-Tags?"/>
            <param argument="--bestrad" type="boolean" checked="false" truevalue="--bestrad" falsevalue="" label="library was generated using BestRAD, check for restriction enzyme on either read and potentially tranpose reads" />
            <param argument="--retain_header" type="boolean" checked="false" truevalue="--retain_header" falsevalue="" label="Retain unmodified FASTQ headers in the output" />
            <param argument="--disable_rad_check" type="boolean" checked="false" truevalue="--disable_rad_check" falsevalue="" label="disable checking if the RAD site is intact" />
            <param argument="--barcode_dist_1" type="integer" value="" optional="true" label="number of allowed mismatches when rescuing first read barcodes" help="(default 1)"/>
            <param argument="--barcode_dist_2" type="integer" value="" optional="true" label="number of allowed mismatches when rescuing paired read barcodes" help="(default value for single end barcodes)"/>
            <param argument="--adapter_1" type="text" value="" optional="true" label="adaptor sequence that may occur on the first read" />
            <param argument="--adapter_2" type="text" value="" optional="true" label="adaptor sequence that may occur on the paired-read" />
            <param argument="--adapter_mm" type="integer" value="" optional="true" label="number of mismatches allowed in the adapter sequence"/>
        </section>

        <!-- Quality filtering. "yes" exposes the sliding window, the score limit
             and three filter flags; "no" (the default) only offers an optional
             length limit. -->
        <conditional name="filter_cond" >
            <param name="filter_select" type="select" label="do quality filtering">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param name="sliding" type="float" value="0.15" min="0" max="1" argument="-w" label="Set the size of the sliding window as a fraction of the read length, between 0 and 1" />
                <param name="score" type="integer" value="10" min="0" max="40" argument="-s" label="Set the score limit. If the average score within the sliding window drops below this value, the read is discarded" />
                <param name="remove" type="boolean" checked="false" truevalue="-c" falsevalue="" argument="-c" label="Clean data, remove any read with an uncalled base" />
                <param name="discard" type="boolean" checked="false" truevalue="-q" falsevalue="" argument="-q" label="Discard reads with low quality scores"/>
                <param argument="--filter_illumina" type="boolean" checked="false" truevalue="--filter_illumina" falsevalue="" label="discard reads that have been marked by Illumina's chastity/purity filter as failing" />
            </when>
            <when value="no">
                <param argument="--len_limit" type="integer" value="" optional="true" label="minimum sequence length" help="useful if your data has already been trimmed"/>
            </when>
        </conditional>
        <!-- Emits -D when checked; the output filters of the "discarded"
             collections and the post-processing in the command template also
             depend on this parameter. -->
        <param name="capture" type="boolean" checked="false" truevalue="-D" falsevalue="" argument="-D" label="Capture discarded reads to a file" />

        <!-- Output format; with "auto" the command template passes no -y at all. -->
        <param name="outype" argument="-y" type="select" label="Output format" >
            <option value="auto" selected="True">Same as input</option>
            <option value="fastq">fastq</option>
            <option value="fasta">fasta</option>
            <option value="gzfastq">gzipped fastq</option>
            <option value="gzfasta">gzipped fasta</option>
        </param>
        <!-- Macro from macros.xml; presumably defines the add_log switch that the
             tests set (TODO confirm against macros.xml). -->
        <expand macro="in_log"/>
    </inputs>

    <outputs>
        <!-- Log output, expanded from a macro in macros.xml (definition not
             visible here); the command template moves the process_radtags log
             to output_log. -->
        <expand macro="out_log"/>
        <!-- Single-end runs: flat list discovered in stacks_outputs; the whole
             matched text becomes the element name. The discovery macros are
             defined in macros.xml, so any suffix handling happens there. -->
        <collection name="demultiplexed" type="list" label="${tool.name} on ${on_string} Demultiplexed reads">
            <filter>input_type['options_type_selector'] == "single"</filter>
            <expand macro="discover_faqgz_output_macro" pattern="(?P&lt;name&gt;.+)" dir="stacks_outputs"/>
        </collection>
        <!-- Paired runs: identifier_0 is everything before the last dot of the
             matched text, identifier_1 the dot-free token after it. The command
             template renames .1. and .2. to .forward. and .reverse. beforehand. -->
        <collection name="demultiplexed_paired" type="list:paired" label="${tool.name} on ${on_string} Demultiplexed reads">
            <filter>input_type['options_type_selector'] == "paired"</filter>
            <expand macro="discover_faqgz_output_macro" pattern="(?P&lt;identifier_0&gt;.+)\.(?P&lt;identifier_1&gt;[^.]+)" dir="stacks_outputs"/>
        </collection>

        <!-- Paired runs only: the *.rem.* files that the command template moves
             to stacks_outputs/remaining. -->
        <collection name="remaining" type="list:paired" label="${tool.name} on ${on_string} Remaining orphan reads">
            <filter>input_type['options_type_selector'] == "paired"</filter>
            <expand macro="discover_faqgz_output_macro" pattern="(?P&lt;identifier_0&gt;.+)\.rem\.(?P&lt;identifier_1&gt;[^.]+)" dir="stacks_outputs/remaining"/>
        </collection>

        <!-- note irrespective of -y output is always named fastq and are never zipped -->
        <!-- Discarded reads are only collected when "capture" is checked; one
             collection per read layout, both discovered in
             stacks_outputs/discarded. -->
        <collection name="discarded" type="list" label="${tool.name} on ${on_string} Discarded reads">
            <filter>capture is True and input_type['options_type_selector'] == "single"</filter>
            <expand macro="discover_faq_output_macro" pattern="(?P&lt;name&gt;.*)" dir="stacks_outputs/discarded"/>
        </collection>
        <collection name="discarded_paired" type="list:paired" label="${tool.name} on ${on_string} Discarded reads">
            <filter>capture is True and input_type['options_type_selector'] == "paired"</filter>
            <expand macro="discover_faq_output_macro" pattern="(?P&lt;identifier_0&gt;.+)\.(?P&lt;identifier_1&gt;[^.]+)" dir="stacks_outputs/discarded"/>
        </collection>
    </outputs>
    <tests>
        <!-- One uncompressed single-end input, no quality filtering and no
             capturing of discarded reads. The log is requested with add_log and
             compared with a tolerance of 4 differing lines; 40 demultiplexed
             elements are expected, of which PopA_01 is compared to a file. -->
        <test>
            <param name="input_type|options_type_selector" value="single"/>
            <param name="input_type|fqinputs" ftype="fastqsanger" value="procrad/R1.fq"/>
            <param name="input_type|barcode_encoding" value="--inline_null"/>
            <param name="barcode" value="procrad/barcodes"/>
            <param name="options_enzyme|options_enzyme_selector" value="1"/>
            <param name="options_enzyme|enzyme" value="ecoRI"/>
            <param name="add_log" value="yes" />
            <output name="output_log" file="procrad/process_radtags.out" lines_diff="4"/>
            <output_collection name="demultiplexed" count="40">
                <element name="PopA_01" file="demultiplexed/PopA_01.fq" ftype="fastqsanger" />
            </output_collection>
        </test>
        <!-- Two gzipped single-end inputs (R2 is misused as a second single-end
             read file). Quality filtering is switched on with non-default window
             and score values and all three filter flags, discarded reads are
             captured, and gzipped FASTQ output is requested. The command line is
             checked for the filter options; the discarded collection must hold
             one uncompressed element per input. -->
        <test>
            <param name="input_type|options_type_selector" value="single"/>
            <param name="input_type|fqinputs" ftype="fastqsanger.gz" value="procrad/R1.fq.gzip,procrad/R2.fq.gzip"/>
            <param name="input_type|barcode_encoding" value="--inline_null"/>
            <param name="barcode" value="procrad/barcodes"/>
            <param name="options_enzyme|options_enzyme_selector" value="1"/>
            <param name="options_enzyme|enzyme" value="ecoRI"/>
            <param name="filter_cond|filter_select" value="yes"/>
            <param name="filter_cond|discard" value="true"/>
            <param name="filter_cond|sliding" value="0.1" />
            <param name="filter_cond|score" value="11" />
            <param name="filter_cond|remove" value="-c" />
            <param name="filter_cond|filter_illumina" value="--filter_illumina" />
            <param name="capture" value="true"/>
            <param name="outype" value="gzfastq"/>
            <assert_command>
                <has_text text="-q" />
                <has_text text="-w 0.1" />
                <has_text text="-s 11" />
                <has_text text="-c" />
                <has_text text="--filter_illumina" />
            </assert_command>
            <output_collection name="demultiplexed" count="40">
                <element name="PopA_01" ftype="fastqsanger.gz"  md5="c7250f50138cbca747b85223aaae9565"/>
            </output_collection>
            <output_collection name="discarded" count="2">
                <element name="R1" file="procrad/R1.fq.discards" ftype="fastqsanger"/>
                <element name="R2" file="procrad/R2.fq.discards" ftype="fastqsanger"/>
            </output_collection>
        </test>
        <!-- One uncompressed read pair in a list:paired collection. Quality
             filtering is off but a length limit of 50 is set, discarded reads
             are captured, and gzipped FASTA output is requested. Checks the log,
             the length limit on the command line, and the three paired output
             collections (demultiplexed, remaining, discarded). -->
        <test>
            <param name="input_type|options_type_selector" value="paired"/>
            <param name="input_type|fqinputs">
                <collection type="list:paired">
                    <element name="reads">
                        <collection type="paired">
                            <element name="forward" value="procrad/R1.fq" ftype="fastqsanger" />
                            <element name="reverse" value="procrad/R2.fq" ftype="fastqsanger"/>
                        </collection>
                    </element>
                </collection>
            </param>
            <param name="barcode" value="procrad/barcodes"/>
            <param name="options_enzyme|options_enzyme_selector" value="1"/>
            <param name="options_enzyme|enzyme" value="ecoRI"/>
            <param name="filter_cond|filter_select" value="no"/>
            <param name="filter_cond|len_limit" value="50"/>
            <param name="capture" value="true"/>
            <param name="outype" value="gzfasta"/>
            <param name="add_log" value="yes" />
            <output name="output_log" file="procrad/process_radtags_paired.out" lines_diff="4"/>
            <assert_command>
                <has_text text="--len_limit 50" />
            </assert_command>
            <output_collection name="demultiplexed_paired" type="list:paired" count="40">
                <element name="PopA_01">
                    <element name="forward" value="demultiplexed/PopA_01.1.fa.gz" ftype="fasta.gz" />
                    <element name="reverse" value="demultiplexed/PopA_01.2.fa.gz" ftype="fasta.gz" />
                </element>
            </output_collection>
            <output_collection name="remaining" type="list:paired" count="40">
                <element name="PopA_01">
                    <element name="forward" file="demultiplexed/PopA_01.rem.1.fa.gz" ftype="fasta.gz"/>
                    <element name="reverse" file="demultiplexed/PopA_01.rem.2.fa.gz" ftype="fasta.gz"/>
                </element>
            </output_collection>
            <output_collection name="discarded_paired" type="list:paired" count="1">
                <element name="reads">
                    <element name="forward" file="procrad/R1.fa.discards" ftype="fasta"/>
                    <element name="reverse" file="procrad/R2.fa.discards" ftype="fasta"/>
                </element>
            </output_collection>
        </test>
        <!-- One gzipped read pair, two enzymes, every advanced option that has a
             command-line assertion below is set, uncompressed FASTA output.
             adapter_mm is set to 2 so that the matching assertion in
             assert_command can be satisfied; the adapter sequences themselves
             stay empty and are therefore not expected on the command line. -->
        <test>
            <param name="input_type|options_type_selector" value="paired"/>
            <param name="input_type|fqinputs">
                <collection type="list:paired">
                    <element name="reads">
                        <collection type="paired">
                            <element name="forward" value="procrad/R1.fq.gzip" ftype="fastqsanger.gz" />
                            <element name="reverse" value="procrad/R2.fq.gzip" ftype="fastqsanger.gz"/>
                        </collection>
                    </element>
                </collection>
            </param>
            <param name="barcode" value="procrad/barcodes"/>
            <param name="options_enzyme|options_enzyme_selector" value="2"/>
            <param name="options_enzyme|enzyme" value="ecoRI"/>
            <param name="options_enzyme|enzyme2" value="ecoRI"/>
            <param name="options_advanced|truncate" value="70" />
            <param name="options_advanced|rescue" value="-r"/>
            <param name="options_advanced|bestrad" value="--bestrad" />
            <param name="options_advanced|retain_header" value="true"/>
            <param name="options_advanced|disable_rad_check" value="--disable_rad_check" />
            <param name="options_advanced|barcode_dist_1" value="2" />
            <param name="options_advanced|barcode_dist_2" value="2" />
            <param name="options_advanced|adapter_1" value="" />
            <param name="options_advanced|adapter_2" value="" />
            <param name="options_advanced|adapter_mm" value="2" />
            <param name="outype" value="fasta"/>
            <assert_command>
                <has_text text="-e ecoRI" />
                <has_text text="--renz_2 ecoRI" />
                <has_text text="-t 70" />
                <has_text text="-r" />
                <has_text text="--bestrad" />
                <has_text text="--retain_header" />
                <has_text text="--disable_rad_check" />
                <has_text text="--barcode_dist_1 2" />
                <has_text text="--barcode_dist_2 2" />
                <has_text text="--adapter_mm 2" />
            </assert_command>
            <output_collection name="demultiplexed_paired" type="list:paired" count="40">
                <element name="PopA_01">
                    <element name="forward" file="demultiplexed/PopA_01.1.fa" ftype="fasta"/>
                    <element name="reverse" file="demultiplexed/PopA_01.2.fa" ftype="fasta"/>
                </element>
            </output_collection>
            <output_collection name="remaining" type="list:paired" count="40">
                <element name="PopA_01">
                    <element name="forward" file="demultiplexed/PopA_01.rem.1.fa" ftype="fasta" />
                    <element name="reverse" file="demultiplexed/PopA_01.rem.2.fa" ftype="fasta" />
                </element>
            </output_collection>
        </test>
    </tests>

    <help>
<![CDATA[
.. class:: infomark

**What it does**

This program examines raw reads from an Illumina sequencing run. First, it checks that the barcode and the RAD cutsite are intact and demultiplexes the data. If there are errors in the barcode or the RAD site within a certain allowance, process_radtags can correct them. Second, it slides a window down the length of the read and checks the average quality score within the window. If the score drops below 90% probability of being correct (a raw phred score of 10), the read is discarded. This allows for some sequencing errors while eliminating reads where the sequence is degrading as it is being sequenced. By default the sliding window is 15% of the length of the read, but the threshold and window size can be adjusted.

The process_radtags program can:

- handle data that is barcoded, either inline or using an index, or unbarcoded.
- use combinatorial barcodes.
- check and correct for a restriction enzyme cutsite for single or double-digested data.
- filter adapter sequence while allowing for sequencing error in the adapter pattern.
- process individual files or whole directories of files.
- directly read gzipped data
- filter reads based on Illumina's Chastity filter

**Help**

Input files:

- A set of one or more FASTQ files (either selected manually, a data set list, or a paired data set list)

- Barcode File

The barcode file is a very simple format:

======= ===========
Barcode Sample name
======= ===========
ATGGGG  PopA_01
GGGTAA  PopA_02
AGGAAA  PopA_03
TTTAAG  PopA_04
GGTGTG  PopA_05
TGATGT  PopA_06
======= ===========

Combinatorial barcodes are specified, one per column, separated by a tab:

======== ======== ===========
Barcode1 Barcode2 Sample name
======== ======== ===========
CGATA    ACGTA    PopA_01
CGGCG    CGTA     PopA_02
GAAGC    CGTA     PopA_03
GAGAT    CGTA     PopA_04
CGATA    AGCA     PopA_05
CGGCG    AGCA     PopA_06
======== ======== ===========

The sample name column can be omitted; in that case the barcodes are used to name the output files.

@STACKS_INFOS@
]]>
    </help>
    <expand macro="citation" />
</tool>