Mercurial > repos > iuc > dada2_plotcomplexity

<tool id="dada2_plotComplexity" name="dada2: plotComplexity" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@" profile="19.09">
    <description>Plot sequence complexity profile</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
## Stage the inputs: every read file is symlinked into forward/ (and reverse/
## when the data is not single-end) under its element identifier, with each
## character outside [A-Za-z0-9_.-] replaced by an underscore.
#import re
#set $layout = str($batch_cond.paired_cond.paired_select)
mkdir forward &&
#if $layout != "single"
    mkdir reverse &&
#end if

#if $batch_cond.batch_select == "batch":
    ## batch mode: exactly one sample per job
    #set $link_name = re.sub('[^\w\-\.]', '_', str($batch_cond.paired_cond.reads.element_identifier))
    #if $layout == "paired"
        ln -s '$batch_cond.paired_cond.reads.forward' forward/'$link_name' &&
        ln -s '$batch_cond.paired_cond.reads.reverse' reverse/'$link_name' &&
    #else
        ln -s '$batch_cond.paired_cond.reads' forward/'$link_name' &&
        #if $layout == "separate"
            ## the reverse file is linked under the name of its forward mate
            ln -s '$batch_cond.paired_cond.sdaer' reverse/'$link_name' &&
        #end if
    #end if
#else
    ## joint mode: all samples are handled by this one job
    #for $entry in $batch_cond.paired_cond.reads:
        #set $link_name = re.sub('[^\w\-\.]', '_', str($entry.element_identifier))
        #if $layout == "paired"
            ln -s '$entry.forward' forward/'$link_name' &&
            ln -s '$entry.reverse' reverse/'$link_name' &&
        #else
            ln -s '$entry' forward/'$link_name' &&
        #end if
    #end for
    #if $layout == "separate"
        ## reverse files come from their own input and keep their own identifiers
        #for $entry in $batch_cond.paired_cond.sdaer:
            #set $link_name = re.sub('[^\w\-\.]', '_', str($entry.element_identifier))
            ln -s '$entry' reverse/'$link_name' &&
        #end for
    #end if
#end if

    Rscript --slave '$dada2_script'
    ]]></command>
    <configfiles>
        <configfile name="dada2_script"><![CDATA[
## R driver script for plotComplexity.
## Lines starting with two hash signs are Cheetah comments and do not reach R.
## The command section has already linked the inputs into forward/ and,
## unless the data is single-end, reverse/.
library(ggplot2, quietly=TRUE)
library(dada2, quietly=TRUE)

## the aggregate switch only exists in joint mode; batch jobs get a single
## sample, so aggregation is switched off there
#if $batch_cond.batch_select != "batch"
agg <- $batch_cond.aggregate
#else
agg <- FALSE
#end if

## an unset (empty) window is passed on as NULL
#if str($window) == ""
wndw <- NULL
#else
wndw <- $window
#end if

## Plot the complexity histogram of all files found in read_dir and save it
## as a 20 x 15 cm PDF at pdf_path.
save_complexity_plot <- function(read_dir, pdf_path) {
    read_files <- list.files(read_dir, full.names=TRUE)
    qp <- plotComplexity(read_files, kmerSize=$kmerSize, window=wndw, by=$by, n=$n, bins=$bins, aggregate=agg)
    ggsave(pdf_path, qp, width=20, height=15, units="cm")
}

save_complexity_plot("forward", "output.pdf")

#if $batch_cond.paired_cond.paired_select != "single"
save_complexity_plot("reverse", "output_rev.pdf")
#end if
    ]]></configfile>
    </configfiles>
    <inputs>
        <!-- Processing mode: "joint" runs one job over all samples, "batch" runs one job per sample. -->
        <conditional name="batch_cond">
            <param name="batch_select" type="select" label="Processing mode" help="Joint processing processes all reads at once in a single job creating a single output (two in the case of paired data). Batch processes the samples in separate jobs and creates separate output for each">
                <option value="joint">Joint</option>
                <option value="batch">Batch</option>
            </param>
            <when value="joint">
                <expand macro="fastq_input" multiple="True" collection_type="list:paired" argument_fwd="fl" argument_rev="fl"/>
                <!-- only meaningful with several samples, hence only offered in joint mode -->
                <param argument="aggregate" type="boolean" label="Aggregate data" checked="True" truevalue="TRUE" falsevalue="FALSE" help="Create a single plot for all data sets (default) or a separate plot for each data set"/>
            </when>
            <when value="batch">
                <expand macro="fastq_input" multiple="False" collection_type="paired" argument_fwd="fl" argument_rev="fl"/>
            </when>
        </conditional>
        <!-- Arguments passed to plotComplexity. min="1" makes the form reject zero and negative
             values instead of letting them reach the R script. -->
        <param argument="kmerSize" type="integer" value="2" min="1" label="kmer size" help="kmer: also known as oligonucleotides words"/>
        <!-- left empty, the R script passes window=NULL -->
        <param argument="window" type="integer" value="" min="1" optional="true" label="width (nucleotides) of the moving window" help="If not specified (default) the whole sequence is used"/>
        <param argument="by" type="integer" value="5" min="1" label="step size (nucleotides)" help="between each moving window tested"/>
        <param argument="n" type="integer" value="100000" min="1" label="sample number" help="number of records to sample from the fastq file"/>
        <param argument="bins" type="integer" value="100" min="1" label="number of bins to use for the histogram"/>
    </inputs>
    <outputs>
        <!-- Single-end data: the only plot, written by the R script as output.pdf. -->
        <data name="output"
              format="pdf"
              from_work_dir="output.pdf">
            <filter>batch_cond['paired_cond']['paired_select'] == "single"</filter>
        </data>
        <!-- Any non-single layout: output.pdf holds the forward plot and output_rev.pdf the reverse plot. -->
        <data name="output_fwd"
              format="pdf"
              from_work_dir="output.pdf"
              label="${tool.name} on ${on_string}: forward reads">
            <filter>batch_cond['paired_cond']['paired_select'] != "single"</filter>
        </data>
        <data name="output_rev"
              format="pdf"
              from_work_dir="output_rev.pdf"
              label="${tool.name} on ${on_string}: reverse reads">
            <filter>batch_cond['paired_cond']['paired_select'] != "single"</filter>
        </data>
    </outputs>
    <tests>
        <!-- All tests use the same sample files and compare by size (sim_size); the delta is
             chosen so that a passing test should imply the PDF actually contains a plot. -->

        <!-- joint mode, list:paired collection, aggregate off -->
        <test expect_num_outputs="2">
            <conditional name="batch_cond">
                <param name="batch_select" value="joint"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="paired"/>
                    <param name="reads">
                        <collection type="list:paired">
                            <element name="F3D0_S188_L001">
                                <collection type="paired">
                                    <element name="forward" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                                    <element name="reverse" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                                </collection>
                            </element>
                            <element name="F3D141_S207_L001">
                                <collection type="paired">
                                    <element name="forward" value="F3D141_S207_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                                    <element name="reverse" value="F3D141_S207_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                                </collection>
                            </element>
                        </collection>
                    </param>
                </conditional>
                <param name="aggregate" value="FALSE"/>
            </conditional>
            <output name="output_fwd" value="complexity_fwd.pdf" ftype="pdf" compare="sim_size" delta="200"/>
            <output name="output_rev" value="complexity_rev.pdf" ftype="pdf" compare="sim_size" delta="200"/>
        </test>

        <!-- joint mode, forward and reverse given as separate datasets, aggregate off
             (sim_size because element ids differ) -->
        <test expect_num_outputs="2">
            <conditional name="batch_cond">
                <param name="batch_select" value="joint"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="separate"/>
                    <param name="reads" value="F3D0_S188_L001_R1_001.fastq.gz,F3D141_S207_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                    <param name="sdaer" value="F3D0_S188_L001_R2_001.fastq.gz,F3D141_S207_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                </conditional>
                <param name="aggregate" value="FALSE"/>
            </conditional>
            <output name="output_fwd" value="complexity_fwd.pdf" ftype="pdf" compare="sim_size" delta="200"/>
            <output name="output_rev" value="complexity_rev.pdf" ftype="pdf" compare="sim_size" delta="200"/>
        </test>

        <!-- joint mode, single-end, aggregate on, reduced sample number -->
        <test expect_num_outputs="1">
            <conditional name="batch_cond">
                <param name="batch_select" value="joint"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="single"/>
                    <param name="reads" value="F3D0_S188_L001_R1_001.fastq.gz,F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                </conditional>
                <param name="aggregate" value="TRUE"/>
            </conditional>
            <param name="n" value="10000"/>
            <output name="output" value="complexity_fwd.pdf" ftype="pdf" compare="sim_size" delta="900"/>
        </test>

        <!-- batch mode, paired collection -->
        <test expect_num_outputs="2">
            <conditional name="batch_cond">
                <param name="batch_select" value="batch"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="paired"/>
                    <param name="reads">
                        <collection type="paired">
                            <element name="forward" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                            <element name="reverse" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                        </collection>
                    </param>
                </conditional>
            </conditional>
            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
        </test>

        <!-- batch mode, forward and reverse given as separate datasets
             (sim_size because element ids differ) -->
        <test expect_num_outputs="2">
            <conditional name="batch_cond">
                <param name="batch_select" value="batch"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="separate"/>
                    <param name="reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                    <param name="sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                </conditional>
            </conditional>
            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
        </test>

        <!-- batch mode, single-end, reduced sample number -->
        <test expect_num_outputs="1">
            <conditional name="batch_cond">
                <param name="batch_select" value="batch"/>
                <conditional name="paired_cond">
                    <param name="paired_select" value="single"/>
                    <param name="reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                </conditional>
            </conditional>
            <param name="n" value="10000"/>
            <output name="output" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
        </test>
    </tests>
    <help><![CDATA[
Summary
.......

This function plots a histogram of the distribution of sequence complexities in the form of effective numbers of kmers as determined by seqComplexity. By default, kmers of size 2 are used, in which case a perfectly random sequence will approach an effective kmer number of 16 = 4 (nucleotides) ^ 2 (kmer size).

Details
.......

This function calculates the kmer complexity of input sequences. Complexity is quantified as the Shannon richness of kmers, which can be thought of as the effective number of kmers if they were all at equal frequencies. If a window size is provided, the minimum Shannon richness observed over a sliding window along the sequence is returned.


@HELP_OVERVIEW@
    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Mon, 07 Aug 2023 01:29:22 +0000
parents	9a88f458440d
children