Mercurial > repos > iuc > concoct

<tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>for metagenome binning</description>
    <!-- Shared definitions are imported from macros.xml, which is not visible here. Presumably it
         supplies the @TOOL_VERSION@, @VERSION_SUFFIX@, @PROFILE@ and @HELP_OVERVIEW@ tokens and the
         "requirements" and "citations" macros referenced in this file; confirm against macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
## Stage the composition FASTA under a fixed name in the job working directory, then run
## concoct with outdir/ as the basename for everything it writes.
## CONCOCT doesn't handle gzipped files, so a dataset whose extension ends in .gz is
## decompressed into the staged file; any other input is symlinked to it.
#if $composition_file.ext.endswith(".gz")
    gunzip -c '$composition_file' > 'composition_file.fa' &&
#else:
    ln -s '$composition_file' 'composition_file.fa' &&
#end if

## The outputs section of this tool collects its datasets from outdir/.
## The thread count is taken from GALAXY_SLOTS and falls back to 4 when that is unset or empty.
mkdir outdir &&
concoct
    --coverage_file '$coverage_file'
    --composition_file 'composition_file.fa'
    --clusters $advanced.clusters
    --kmer_length $advanced.kmer_length
    --threads \${GALAXY_SLOTS:-4}
    --length_threshold $advanced.length_threshold
    --read_length $advanced.read_length
    --total_percentage_pca $advanced.total_percentage_pca
    --basename 'outdir/'
    --seed $advanced.seed
    --iterations $advanced.iterations
    $advanced.no_cov_normalization
    $output.no_total_coverage
    --no_original_data
    $output.converge_out
    ]]></command>
    <inputs>
        <!-- Two required datasets plus two option sections. Params declared with an "argument"
             attribute are referenced in the command template by that argument's name without the
             leading dashes, prefixed with the section name ("advanced" or "output") where one
             applies. The boolean params expand to their flag text when checked and to an empty
             string otherwise. -->
        <param argument="--coverage_file" type="data" format="tabular" label="Coverage file" help="Table where each row corresponds to a contig, and each column corresponds to a sample. The values are the average coverage for this contig in that sample"/>
        <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Composition file with sequences" help="It is named the composition file since it is used to calculate the kmer composition (the genomic signature) of each contig."/>
        <section name="advanced" title="Advanced options">
            <!-- At least one cluster and a k-mer length of at least 1 are required; zero is not a
                 meaningful value for either. -->
            <param argument="--clusters" type="integer" min="1" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model (VGMM) algorithm"/>
            <param argument="--kmer_length" type="integer" min="1" value="4" label="Kmer length"/>
            <param argument="--length_threshold" type="integer" min="0" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/>
            <param argument="--read_length" type="integer" min="0" value="100" label="Read length for coverage"/>
            <!-- A percentage, so the value is capped at 100. -->
            <param argument="--total_percentage_pca" type="integer" min="0" max="100" value="90" label="Percentage of variance explained by the principal components for the combined data"/>
            <!-- Zero stays allowed here: it selects a random seed (see the help text). -->
            <param argument="--seed" type="integer" min="0" value="1" label="Seed for clustering" help="Zero value will use random seed"/>
            <param argument="--iterations" type="integer" min="0" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models (VBGMM)"/>
            <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed. By setting this flag you skip the normalization and only do log transform of the coverage."/>
        </section>
        <section name="output" title="Output">
            <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation. Use this tag to escape this behaviour."/>
            <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Write convergence information to files?"/>
            <!-- Not passed to concoct: only consulted by the filter on the process_log output. -->
            <param name="log" type="boolean" checked="false" label="Output process log file?"/>
        </section>
    </inputs>
    <outputs>
        <!-- Every dataset is picked up from outdir/, the basename handed to concoct in the command
             template. The gt* wildcards presumably match the length-threshold suffix in the file
             names concoct writes; confirm against the concoct output naming. -->
        <data name="output_clustering"
              format="csv"
              from_work_dir="outdir/clustering_gt*"
              label="${tool.name} on ${on_string}: Clusters"/>
        <!-- Only created when the "log" option of the output section is enabled. -->
        <data name="process_log"
              format="txt"
              from_work_dir="outdir/log.txt"
              label="${tool.name} on ${on_string}: Log">
            <filter>output['log']</filter>
        </data>
        <data name="output_pca_components"
              format="csv"
              from_work_dir="outdir/PCA_components_data_gt*"
              label="${tool.name} on ${on_string}: PCA components"/>
        <data name="output_pca_transformed"
              format="csv"
              from_work_dir="outdir/PCA_transformed_data_gt*"
              label="${tool.name} on ${on_string}: PCA transformed clusters"/>
    </outputs>
    <tests>
        <!-- Uncompressed FASTA input with the process log requested: four outputs are expected. -->
        <test expect_num_outputs="4">
            <param name="coverage_file" value="coverage" ftype="tabular"/>
            <param name="composition_file" value="composition.fa" ftype="fasta"/>
            <section name="advanced">
                <param name="clusters" value="400"/>
                <param name="kmer_length" value="4"/>
                <param name="length_threshold" value="1000"/>
                <param name="read_length" value="100"/>
                <param name="total_percentage_pca" value="100"/>
                <param name="seed" value="1"/>
                <param name="iterations" value="500"/>
                <param name="no_cov_normalization" value="false"/>
            </section>
            <section name="output">
                <param name="no_total_coverage" value="false"/>
                <param name="converge_out" value="false"/>
                <param name="log" value="true"/>
            </section>
            <output name="process_log" ftype="txt">
                <assert_contents>
                    <has_size value="786" delta="4"/>
                    <has_text text="CONCOCT Finished"/>
                </assert_contents>
            </output>
            <output name="output_pca_components" ftype="csv">
                <assert_contents>
                    <has_size value="362924" delta="10"/>
                    <has_text text="-5.90697200e-02"/>
                </assert_contents>
            </output>
            <output name="output_pca_transformed" ftype="csv">
                <assert_contents>
                    <has_size value="834200" delta="50"/>
                    <has_text text="contig-21000001"/>
                </assert_contents>
            </output>
            <output name="output_clustering" ftype="csv">
                <assert_contents>
                    <has_size value="6923" delta="10"/>
                    <has_text text="contig-21000001,"/>
                </assert_contents>
            </output>
        </test>
        <!-- Gzipped FASTA input (exercises the gunzip branch of the command) with the log
             disabled: three outputs are expected, checked with the same assertions as above. -->
        <test expect_num_outputs="3">
            <param name="coverage_file" value="coverage" ftype="tabular"/>
            <param name="composition_file" value="composition.fa.gz" ftype="fasta.gz"/>
            <section name="advanced">
                <param name="clusters" value="400"/>
                <param name="kmer_length" value="4"/>
                <param name="length_threshold" value="1000"/>
                <param name="read_length" value="100"/>
                <param name="total_percentage_pca" value="100"/>
                <param name="seed" value="1"/>
                <param name="iterations" value="500"/>
                <param name="no_cov_normalization" value="false"/>
            </section>
            <section name="output">
                <param name="no_total_coverage" value="false"/>
                <param name="converge_out" value="false"/>
                <param name="log" value="false"/>
            </section>
            <output name="output_pca_components" ftype="csv">
                <assert_contents>
                    <has_size value="362924" delta="10"/>
                    <has_text text="-5.90697200e-02"/>
                </assert_contents>
            </output>
            <output name="output_pca_transformed" ftype="csv">
                <assert_contents>
                    <has_size value="834200" delta="50"/>
                    <has_text text="contig-21000001"/>
                </assert_contents>
            </output>
            <output name="output_clustering" ftype="csv">
                <assert_contents>
                    <has_size value="6923" delta="10"/>
                    <has_text text="contig-21000001,"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by
using nucleotide composition - kmer frequencies - and coverage data for multiple samples.  CONCOCT can accurately
(up to species level) bin metagenomic contigs.

The tool accepts two inputs: a tabular coverage file in which each row corresponds to a contig and each column
corresponds to a sample (the values are the average coverage for this contig in that sample), and a file containing
the contig sequences in fasta format.

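Three outputs are produced: the clustering of the contigs that pass the sequence length threshold (1000 by default),
the PCA transformed matrix and the PCA components. A process log file can optionally be produced as well.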

@HELP_OVERVIEW@
    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Sat, 26 Aug 2023 02:41:24 +0000
parents	d9600883e676
children