<!-- Mercurial > repos > bgruening > canu -->

<tool id="canu" name="Canu assembler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>Assembler optimized for long error-prone reads such as PacBio, Oxford Nanopore</description>
    <!-- Version tokens come first: the requirement below and the tool's version attribute both expand them. -->
    <macros>
        <token name="@TOOL_VERSION@">2.2</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- Cross-reference to the bio.tools registry entry for this tool. -->
    <xrefs>
        <xref type="bio.tools">canu</xref>
    </xrefs>
    <!-- The canu package is pinned to the same version as the wrapper. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">canu</requirement>
    </requirements>
    <version_command>canu --version</version_command>
    <command detect_errors="exit_code">
    <![CDATA[

    ## Gzipped inputs are copied into the working directory under a name that
    ## ends in .gz; all other inputs are passed to canu by their original path.
    #for $counter, $input in enumerate($inputs):
        #if $input.ext in ['fastq.gz', 'fasta.gz']
            ## linking does not work
            cp '$input' ./input_${counter}.gz &&
        #end if
    #end for

    canu
        ## The stage select carries the literal canu flag; 'all' means no flag.
        #if $stage != 'all':
            $stage
        #end if
        -p canu
        -d ./out_dir
        #if $s:
            -s '$s'
        #end if
        ## Compare against the empty string so that an explicit 0 is still
        ## forwarded instead of being treated as "not set".
        #if str($rawErrorRate) != '':
            rawErrorRate=$rawErrorRate
        #end if
        #if str($correctedErrorRate) != '':
            correctedErrorRate=$correctedErrorRate
        #end if
        minReadLength=$minReadLength
        minOverlapLength=$minOverlapLength
        corOutCoverage=$corOutCoverage
        #if $stopOnLowCoverage
            stopOnLowCoverage=$stopOnLowCoverage
        #end if
        #if $minInputCoverage ##and $stage in ["all", "trim-assemble", "assemble"]
            minInputCoverage=$minInputCoverage
        #end if
        contigFilter='
            ${contigFilter.minReads}
            ${contigFilter.minLength}
            ${contigFilter.singleReadSpan}
            ${contigFilter.lowCovSpan}
            ${contigFilter.lowCovDepth}
        '

        genomeSize='$genomeSize'
        ## Every per-stage thread and memory setting is tied to the resources
        ## Galaxy allocated to the job, with fallbacks of 4 slots and 6144 MB.
        minThreads=\${GALAXY_SLOTS:-4}
        maxThreads=\${GALAXY_SLOTS:-4}
        redMemory=\${GALAXY_MEMORY_MB:-6144}M
        redThreads=\${GALAXY_SLOTS:-4}
        obtovlThreads=\${GALAXY_SLOTS:-4}
        utgovlThreads=\${GALAXY_SLOTS:-4}
        batThreads=\${GALAXY_SLOTS:-4}
        batMemory=\${GALAXY_MEMORY_MB:-6144}M
        cormhapMemory=\${GALAXY_MEMORY_MB:-6144}M
        obtovlMemory=\${GALAXY_MEMORY_MB:-6144}M
        utgovlMemory=\${GALAXY_MEMORY_MB:-6144}M
        corThreads=\${GALAXY_SLOTS:-4}
        corMemory=\${GALAXY_MEMORY_MB:-6144}M
        cnsThreads=\${GALAXY_SLOTS:-4}
        cnsMemory=\${GALAXY_MEMORY_MB:-6144}M
        oeaMemory=\${GALAXY_MEMORY_MB:-6144}M
        oeaThreads=\${GALAXY_SLOTS:-4}
        merylThreads=\${GALAXY_SLOTS:-4}
        merylMemory=\${GALAXY_MEMORY_MB:-6144}M
        corovlThreads=\${GALAXY_SLOTS:-4}
        corovlMemory=\${GALAXY_MEMORY_MB:-6144}M
        useGrid=false

        ## A haplotype may hold several datasets. Emit each one as its own quoted
        ## argument; rendering the whole multi-dataset parameter at once would
        ## yield a single comma-joined string.
        #for $haplotype in $haplotypes:
            -haplotype${haplotype.haplotype_name}
            #for $haplotype_file in $haplotype.haplotype_input:
                '${haplotype_file}'
            #end for
        #end for

        $technology
        #if $processing:
            $processing
        #end if

        ## Read files: use the local .gz copy made above for compressed inputs.
        #for $counter, $input in enumerate($inputs):
            #if $input.ext in ['fastq.gz', 'fasta.gz']
                ./input_${counter}.gz
            #else:
                '$input'
            #end if
        #end for
        2>&1

    ]]>
    </command>
    <inputs>
        <!-- Reads to assemble; plain and gzipped FASTA/FASTQ are accepted. -->
        <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz" multiple="true" label="Input reads"/>
        <!-- Optional parental read sets; each entry becomes a -haplotype<name> option on the command line. -->
        <repeat name="haplotypes" min="0" max="2" title="Haplotypes for Trio Binning Assembly" help="Canu has support for using parental short-read sequencing to classify and bin the long reads of the offspring into haplotype-specific sets before assembly.">
            <param name="haplotype_input" type="data" format="fasta,fastq" multiple="true" label="Haplotype input reads"/>
            <param name="haplotype_name" type="text" label="Short name to identify your haplotype">
                <!-- The name is appended unquoted to the -haplotype flag, so it must be a non-empty, shell-safe token. -->
                <validator type="regex" message="Use only letters, digits, '_', '.' or '-' (at least one character, no spaces).">^[A-Za-z0-9_.-]+$</validator>
            </param>
        </repeat>
        <!-- The option values of the next three selects are the literal canu flags. -->
        <param name="technology" type="select" label="Technology">
            <option value="-nanopore" selected="true">Nanopore</option>
            <option value="-pacbio">PacBio</option>
            <option value="-pacbio-hifi">PacBio HiFi</option>
        </param>
        <param name="processing" type="select" optional="true" label="Processing">
            <option value="-corrected">Corrected</option>
            <option value="-trimmed">Trimmed</option>
        </param>
        <param name="stage" type="select" label="To restrict canu to only a specific stage, use">
            <option value="all" selected="true">all</option>
            <option value="-haplotype">generate haplotype-specific reads</option>
            <option value="-correct">generate corrected reads</option>
            <option value="-trim">generate trimmed reads</option>
            <option value="-assemble">generate an assembly</option>
            <option value="-trim-assemble">generate trimmed reads and then assemble them</option>
        </param>
        <param argument="genomeSize" type="text" label="Estimated genome size (e.g. 8.0m, 15k, 2g)">
            <validator type="empty_field"/>
            <!-- A non-zero decimal number followed by exactly one of k, m or g; rejects inputs such as "infg", "nang" or "1e3k". -->
            <validator type="regex" message="Only values similar to 8.0m, 15k or 2g are allowed.">^(?=.*[1-9])[0-9]+(\.[0-9]*)?[kmg]$</validator>
        </param>
        <param argument="rawErrorRate" type="float" value="" optional="true" min="0" max="1" label="Maximum raw overlap mismatch" help="The defaults are 0.300 for PacBio reads and 0.500 for Nanopore reads."/>
        <param argument="correctedErrorRate" type="float" value="" optional="true" min="0" max="1" label="Maximum corrected overlap mismatch" help="The allowed difference in an overlap between two corrected reads. Assemblies of low coverage or data with biological differences will benefit from a slight increase in this. Defaults are 0.045 for PacBio reads and 0.144 for Nanopore reads."/>
        <param argument="minReadLength" type="integer" value="1000" min="1" label="Minimum read length"/>
        <param argument="minOverlapLength" type="integer" value="500" min="1" label="Minimum overlap"/>
        <param argument="minInputCoverage" type="integer" value="" min="1" optional="true" label="Minimum Input Coverage"/>
        <param argument="corOutCoverage" type="integer" value="40" min="1" label="Target coverage for corrected reads"/>
        <param argument="-s" type="data" format="txt" optional="true" label="Additional options" help="Additional specifications provided in a canu spec file."/>
        <param argument="stopOnLowCoverage" type="integer" value="10" min="1" label="Stop the assembly if read coverage is too low to be useful" help="Coverage is checked when input sequences are initially loaded into the sequence store, when corrected reads are generated, and when read ends are trimmed off."/>
        <!-- The five values below are joined, in this order, into canu's contigFilter option. -->
        <section name="contigFilter" title="Contig Filters">
            <param argument="minReads" type="integer" value="2" min="0" label="Minimum reads"/>
            <param argument="minLength" type="integer" value="0" min="0" label="Minimum length"/>
            <param argument="singleReadSpan" type="float" value="1.0" min="0.0" max="1.0" label="Maximum single read span (fraction)"/>
            <param argument="lowCovSpan" type="float" value="0.5" min="0.0" max="1.0" label="Low coverage span (fraction)"/>
            <param argument="lowCovDepth" type="integer" value="5" min="0" label="Low coverage depth"/>
        </section>
    </inputs>
    <outputs>
        <!-- The report is always collected; the remaining outputs depend on the selected stage. -->
        <data name="report" format="txt" from_work_dir="out_dir/canu.report" label="${tool.name} on ${on_string} (report)"/>
        <!-- Assembly results are collected for every stage that runs the assembly step, not only for a full run. -->
        <data name="contigs" format="fasta" from_work_dir="out_dir/canu.contigs.fasta" label="${tool.name} on ${on_string} (contigs)">
            <filter>stage in ('all', '-assemble', '-trim-assemble')</filter>
        </data>
        <data name="unassembled" format="fasta" from_work_dir="out_dir/canu.unassembled.fasta" label="${tool.name} on ${on_string} (unassembled)">
            <filter>stage in ('all', '-assemble', '-trim-assemble')</filter>
        </data>
        <data name="corrected_reads" format="fasta.gz" from_work_dir="out_dir/canu.correctedReads.fasta.gz" label="${tool.name} on ${on_string} (corrected reads)">
            <filter>'-correct' in stage or stage == 'all'</filter>
        </data>
        <!-- The substring test also matches '-trim-assemble'. -->
        <data name="trimmed_reads" format="fasta.gz" from_work_dir="out_dir/canu.trimmedReads.fasta.gz" label="${tool.name} on ${on_string} (trimmed reads)">
            <filter>'-trim' in stage or stage == 'all'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Full pipeline (stage "all") on Nanopore reads with default error rates: all five outputs are expected. -->
        <test expect_num_outputs="5">
            <param name="inputs" ftype="fasta" value="ecoli-reads.fasta"/>
            <param name="technology" value="-nanopore"/>
            <param name="genomeSize" value="20k"/>
            <param name="stopOnLowCoverage" value="1"/>
            <param name="minInputCoverage" value="1"/>
            <param name="minReadLength" value="2000"/>
            <output name="contigs" ftype="fasta" file="ecoli_canu_contigs_result1.fa"/>
            <output name="unassembled" ftype="fasta" file="ecoli_canu_unassembled_result1.fa"/>
            <output name="corrected_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_corrected_reads_result1.fa.gz"/>
            <output name="trimmed_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_trimmed_reads_result1.fa.gz"/>
            <output name="report">
                <assert_contents>
                    <has_n_lines n="486"/>
                    <!-- Literal match: as a regular expression the square brackets would form a character class. -->
                    <has_text text="[UNITIGGING/CONTIGS]"/>
                    <has_text_matching expression="-- Contig sizes based on genome size 20kbp:"/>
                </assert_contents>
            </output>
        </test>
        <!-- Full pipeline with explicit overlap length, error rates and corrected-read coverage. -->
        <test expect_num_outputs="5">
            <param name="inputs" ftype="fasta" value="ecoli-reads.fasta"/>
            <param name="technology" value="-nanopore"/>
            <param name="genomeSize" value="20k"/>
            <param name="stopOnLowCoverage" value="1"/>
            <param name="minInputCoverage" value="1"/>
            <param name="minReadLength" value="2000"/>
            <param name="minOverlapLength" value="800"/>
            <param name="rawErrorRate" value="0.2"/>
            <param name="correctedErrorRate" value="0.05"/>
            <param name="corOutCoverage" value="2"/>
            <output name="contigs" ftype="fasta" file="ecoli_canu_contigs_result2.fa"/>
            <output name="unassembled" ftype="fasta" file="ecoli_canu_unassembled_result2.fa"/>
            <output name="corrected_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_corrected_reads_result2.fa.gz"/>
            <output name="trimmed_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_trimmed_reads_result2.fa.gz"/>
            <output name="report">
                <assert_contents>
                    <has_n_lines n="456"/>
                    <!-- Literal match: as a regular expression the square brackets would form a character class. -->
                    <has_text text="[UNITIGGING/CONTIGS]"/>
                    <has_text_matching expression="-- Contig sizes based on genome size 20kbp:"/>
                </assert_contents>
            </output>
        </test>
        <!-- Correction stage only: the report and the corrected reads are the two expected outputs. -->
        <test expect_num_outputs="2">
            <param name="inputs" ftype="fasta" value="ecoli-reads.fasta"/>
            <param name="technology" value="-nanopore"/>
            <param name="genomeSize" value="20k"/>
            <param name="stage" value="-correct"/>
            <param name="stopOnLowCoverage" value="1"/>
            <param name="minReadLength" value="2500"/>
            <output name="corrected_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_corrected_reads_result3.fa.gz"/>
            <output name="report">
                <assert_contents>
                    <has_n_lines n="189"/>
                    <!-- Literal match: as a regular expression the square brackets would form a character class. -->
                    <!-- NOTE(review): confirm that the correction-only report really contains this section header. -->
                    <has_text text="[TRIMMING/READS]"/>
                    <has_text_matching expression="--   Found 93 reads."/>
                </assert_contents>
            </output>
        </test>
        <!--trimming test - it does currently not trim anything due to the input data -->
        <!-- Trimming stage only, with the input declared as already corrected reads. -->
        <test expect_num_outputs="2">
            <param name="inputs" ftype="fasta" value="ecoli-reads.fasta"/>
            <param name="technology" value="-nanopore"/>
            <param name="processing" value="-corrected"/>
            <!-- <param name="minInputCoverage" value="1"/> -->
            <param name="genomeSize" value="0.01m"/>
            <param name="stage" value="-trim"/>
            <param name="minReadLength" value="500"/>
            <output name="report">
                <assert_contents>
                    <!-- Literal match: as a regular expression the square brackets would form a character class. -->
                    <has_text text="[TRIMMING/READS]"/>
                    <has_n_lines n="209"/>
                    <has_text_matching expression="Found 112 reads."/><!-- before trimming -->
                    <has_text_matching expression="Found 61 reads."/><!-- after trimming -->
                </assert_contents>
            </output>
        </test>
        <!--test expect_num_outputs="5">
            <param name="inputs" ftype="fasta" value="ecoli-reads.fasta"/>
            <param name="technology" value="-pacbio"/>
            <repeat name="haplotypes">
                <param name="haplotype_name" value="K12"/>
                <param name="haplotype_input" ftype="fasta" value="ecoli-reads.fasta"/>
            </repeat>
            <repeat name="haplotypes">
                <param name="haplotype_name" value="K13"/>
                <param name="haplotype_input" ftype="fasta" value="ecoli-reads.fasta"/>
            </repeat>
            <param name="genomeSize" value="20k"/>
            <param name="stopOnLowCoverage" value="1"/>
            <param name="minInputCoverage" value="1"/>
            <param name="minReadLength" value="2000"/>
            <output name="contigs" ftype="fasta" file="ecoli_canu_contigs_result5.fa"/>
            <output name="unassembled" ftype="fasta" file="ecoli_canu_unassembled_result5.fa"/>
            <output name="corrected_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_corrected_reads_result5.fa.gz"/>
            <output name="trimmed_reads" ftype="fasta.gz" decompress="True" file="ecoli_canu_trimmed_reads_result5.fa.gz"/>
        </test-->
    </tests>
    <help>
    <![CDATA[

        Canu specializes in assembling PacBio or Oxford Nanopore sequences. Canu operates in three phases: correction, trimming and assembly.
        The correction phase will improve the accuracy of bases in reads. The trimming phase will trim reads to the portion that appears to
        be high-quality sequence, removing suspicious regions such as remaining SMRTbell adapter. The assembly phase will order the reads
        into contigs, generate consensus sequences and create graphs of alternate paths.

        For eukaryotic genomes, coverage above 20x is enough to outperform current hybrid methods; however, between 30x and 60x
        coverage is the recommended minimum. More coverage lets Canu use longer reads for assembly, which results in better assemblies.

        http://canu.readthedocs.io

    ]]>
    </help>
    <!-- Publications to cite for this tool, each referenced by its DOI. -->
    <citations>
        <citation type="doi">10.1101/gr.215087.116</citation>
        <citation type="doi">10.1093/bioinformatics/btw753</citation>
        <citation type="doi">10.1038/nbt.3238</citation>
        <citation type="doi">10.1126/science.287.5461.2196</citation>
        <citation type="doi">10.1038/nmeth.4035</citation>
        <citation type="doi">10.1038/nmeth.2474</citation>
    </citations>
</tool>
<!--
author	bgruening
date	Wed, 03 Nov 2021 16:10:06 +0000
parents	eb87bd7a5168
children
-->