diff sigmut.xml @ 0:2062de974f72 draft default tip

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/sigmut commit bba3eb3950b8772758cc6f19747172be7413ddd9"
author artbio
date Mon, 15 Jun 2020 00:28:49 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sigmut.xml	Mon Jun 15 00:28:49 2020 +0000
@@ -0,0 +1,377 @@
+<tool id="SigProfiler" name="SigProfiler" version="@VERSION@">
+    <description>performs mutational signature characterization from variant files</description>
+
+    <macros>
+        <import>sigmut_macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Do not emit the tool version token here: the shell would try to execute it.
+        @pipefail@
+        BIN=\$(dirname "\$(which sigprofiler)") &&
+        echo "\$BIN" &&
+        ## NOTE(review): world-writable on purpose? Presumably sigprofiler writes genome data beside its executable - confirm.
+        chmod -R 777 "\$BIN" &&
+        mkdir run_dir &&
+        #if str( $set_analysis.choices ) == "get_sigmut":
+            #if str( $set_analysis.vcfile_input.vcfile ) == "maf":
+                #set $infile = 'run_dir/snps.maf'
+                ln -s -f '$set_analysis.vcfile_input.maf_file' '$infile' &&
+            #else if str( $set_analysis.vcfile_input.vcfile ) == "icgc":
+                #set $infile = 'run_dir/snps.txt'
+                ln -s -f '$set_analysis.vcfile_input.icgc_file' '$infile' &&
+            #else if str( $set_analysis.vcfile_input.vcfile ) == "vcf":
+                #set $infile = 'run_dir/snps.vcf'
+                ln -s -f '$set_analysis.vcfile_input.vcf_file' '$infile' &&
+            #end if
+        #end if
+        sigprofiler
+        ## Either install a reference genome (-ig) or build matrices and plots (-p) from the files in run_dir.
+        #if str( $set_analysis.choices ) == "install_genome":
+            -ig $set_analysis.refgendwn > install.log
+        #else if str( $set_analysis.choices ) == "get_sigmut":
+            -g $set_analysis.refgendat
+            -f 'run_dir'
+            -n "project"
+            -p
+## ! implement exome functionality when good test available
+##            #if str( $set_analysis.exome ) == "true":
+##                -e
+##            #end if
+## ! implement per chromosome functionality when good test available
+##            #if str( $set_analysis.chrom_based ) == "true":
+##                -c
+##            #end if
+            #if str( $set_analysis.tsb_stat ) == "true":
+                -t
+            #end if
+            #if str( $set_analysis.gs ) == "true":
+                -s
+            #end if
+            ##-b $set_analysis.bed ### to be done
+            && pdfcombine -f -s -o blinder.pdf run_dir/output/plots/*.pdf
+            && ls run_dir/logs/
+            #if str( $set_analysis.tsb_stat ) == "true":
+                && tail -n +1 run_dir/output/TSB/*.txt > transcriptional_strand_biases.txt
+            #end if
+            #if str( $set_analysis.seqInfo ) == "true":
+                && tail -n +1 run_dir/output/*/*.all > information.txt
+            #end if
+        #end if
+        ]]></command>
+
+    <inputs>
+        <conditional name="set_analysis">
+            <param name="choices" type="select" label="Which of the following jobs do you want to perform?">
+                <option value="install_genome">Install 'de novo' a reference genome</option>
+                <option value="get_sigmut">Obtain the mutational signatures from variant files</option>
+            </param>
+            <when value="install_genome">
+                <param name="refgendwn" type="select" label="Reference genome" help="Get data from any of the following reference genomes:">
+                    <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14]</option>
+                    <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27]</option>
+                    <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
+                    <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
+                    <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
+                    <option value="c_elegans">Caenorhabditis elegans</option>
+                    <option value="dog">Dog</option>
+                </param>
+            </when>
+
+            <when value="get_sigmut">
+                <conditional name="vcfile_input">
+                    <param name="vcfile" type="select" label="Variant file format" help="Select the format of your input data">
+                        <option value="maf">Mutation Annotation Format</option>
+                        <option value="icgc">Tab-separated file</option>
+                        <option value="vcf">Variant Call Format</option>
+                    </param>
+                    <when value="maf"> <!-- NOTE(review): Galaxy's "maf" datatype is presumably the multiple-alignment format; confirm it is the intended type -->
+                        <param name="maf_file" type="data" format="maf" label="Select variant file" help="Select the input file in MAF format." />
+                    </when>
+                    <when value="icgc">
+                        <param name="icgc_file" type="data" format="txt" label="Select variant file" help="Select the input file in ICGC format." />
+                    </when>
+                    <when value="vcf">
+                        <param name="vcf_file" type="data" format="vcf" label="Select variant file" help="Select the input file in VCF format." />
+                    </when>
+                </conditional>
+
+                <param name="refgendat" type="select" label="Reference genome to be analyzed" help="Use the following reference genome:">
+                    <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14]</option>
+                    <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27]</option>
+                    <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
+                    <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
+                    <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
+                    <option value="c_elegans">Caenorhabditis elegans</option>
+                    <option value="dog">Dog</option>
+                </param>
+
+<!-- implement bed when test available -->
+<!--                <conditional name="bed_input">
+                    <param name="bedfile" type="select" label="BED file" help="Input a BED file">
+                        <option value="yes">Yes</option>
+                        <option value="no" selected="true">No</option>
+                    </param>
+                    <when value='yes'>
+                        <param name="bed_file" format="bed" type="data" label="Use a BED file containing the set of regions" help="Provide a BED file"/>
+                    </when>
+                    <when value='no'>
+                    </when>
+                </conditional> -->
+                <!-- implement exome functionality when test available -->
+                <!-- <param name="exome" type="boolean" label="Use only the exome?" checked="False" help="Use exome"/> -->
+                <!-- implement chrom_based functionality when test available -->
+                <!--<param name="chrom_based" type="boolean" label="Create the matrices on a per chromosome basis?" checked="False" help="Show snvs"/> -->
+                <param name="tsb_stat" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Perform a transcriptional strand bias test?" help="Runs the transcriptional strand bias test for the 24, 384, and 6144 contexts (-t)"/>
+                <param name="seqInfo" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Export sequence information?" help="Gathers the *.all files produced by the run into a single text dataset"/>
+                <param name="gs" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Perform a gene strand bias test?" help="Runs the gene strand bias test (-s)"/>
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <!-- Installation mode: log captured from sigprofiler's standard output. -->
+        <data format="txt" name="logref" label="Log file: Install a Reference Genome" from_work_dir="./install.log">
+            <filter>set_analysis['choices'] == 'install_genome'</filter>
+        </data>
+        <!-- NOTE(review): from_work_dir contains a wildcard; confirm Galaxy resolves it as intended. -->
+        <data format="txt" name="logsmt" label="Log file: Calculate Mutational Signatures" from_work_dir="run_dir/logs/SigProfilerMatrixGenerator*.out">
+            <filter>set_analysis['choices'] == 'get_sigmut'</filter>
+        </data>
+
+        <!-- Plots are always requested (-p) in signature mode, so this output depends on the mode only. -->
+        <data format="pdf" name="blinder" label="SBS Mutational Signatures plots (pdf)" from_work_dir="./blinder.pdf">
+            <filter>set_analysis['choices'] == 'get_sigmut'</filter>
+        </data>
+
+        <!-- implement exome outputs when test available -->
+        <!--
+        <data format="txt" name="dbs_exome" label="DBS_exome.vcf">
+            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
+        </data>
+        <data format="txt" name="snv_exome" label="SNV_exome.vcf">
+            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
+        </data>
+        
+        <data format="txt" name="sig_exome" label="DBS 78 and so on Sig. Mut. EXOME">
+            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
+        </data>
+        -->
+        <!-- Optional outputs, kept only when the corresponding checkbox is ticked. -->
+        <data format="txt" name="tsb" label="Transcriptional Strand Biases" from_work_dir="./transcriptional_strand_biases.txt">
+            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['tsb_stat'] is True</filter>
+        </data>
+
+        <!-- Concatenation of the *.all files gathered by the command. -->
+        <data format="txt" name="seqinfo" label="Mutational Signature detailed infos" from_work_dir="./information.txt">
+            <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['seqInfo'] is True</filter>
+        </data>
+
+    </outputs>
+    <tests>
+        <test> <!-- Genome installation mode: the install log is compared with the reference log. -->
+            <param name="choices" value="install_genome"/>
+            <param name="refgendwn" value="GRCh38"/>
+            <output name="logref" file="hg38_install.log" lines_diff="5"/>
+        </test>
+        <test> <!-- Signature mode on a VCF input: the run log and the combined plot PDF are compared. -->
+            <param name="choices" value="get_sigmut"/>
+            <param name="refgendat" value="GRCh38"/>
+            <param name="vcfile" value="vcf"/>
+            <param name="vcf_file" ftype="vcf" value="hg38.vcf"/>
+            <!-- The tool defines no "plot" parameter: plots are always produced in this mode. -->
+            <output name="logsmt" ftype="txt" file="sigmut.log" lines_diff="5" />
+            <output name="blinder" file="hg38_blinder.pdf"  lines_diff="5" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+
+        **SigProfiler**
+
+        Background:
+
+        Cancer genomes evince somatic mutations, which are imprinted by
+        different mutational processes, that give rise to diverse
+        mutational signatures. Their analysis from single base
+        substitutions and their immediate sequencing context, allows the
+        classification of small mutational events (including
+        substitutions, insertions, deletions, and doublet substitutions)
+        for better understanding the mutational processes that have
+        shaped a cancer genome.
+
+        In this sense, SigProfiler constitutes a Galaxy-based wrapper of
+        a computational method developed by Ludmil B. Alexandrov, that
+        allows the exploration and visualization of mutational patterns
+        for all types of small mutational events. Specifically, the
+        following actions can be performed using the SigProfiler wrapper:
+
+        1. Identify and categorize the mutations based on possible
+        single nucleotide variants (SNVs), double base substitutions
+        (DBS), and insertions/deletions, and provide further
+        transcriptional strand bias categorization. Afterwards, the
+        classifications of these mutations are integrated into distinct
+        matrices.
+        SigProfiler provides matrix generation support for SBS-6,
+        SBS-96, SBS-1536, DBS-78 and DBS-1248. In addition, the
+        generation of mutational matrices of indels, including
+        ID-28 and ID-83, is supported. Besides, an ID-8628 matrix that
+        extends the ID-83 classification is generated.
+        SigProfiler examines transcriptional strand bias for single base
+        substitutions, doublet base substitutions, and small indels. It
+        is evaluated whether a mutation occurs on the transcribed or the
+        non-transcribed strand of well-annotated protein coding genes of
+        a reference genome. Mutations found in the transcribed regions
+        of the genome are further subclassified as: (i) transcribed,
+        (ii) un-transcribed, (iii) bi-directional, or (iv) unknown.
+           
+        2. Generation of plots of all types of mutational signatures as
+        well as all types of mutational patterns in cancer genomes.  
+
+        Additional Information:
+
+        Classification of Single Base substitutions (SBSs):
+        Single base substitutions (SBSs) are single DNA base-pairs
+        substituted with another single DNA base-pair. The most
+        basic classification catalogues SBSs into six distinct
+        categories, including: C:G > A:T, C:G > G:C, C:G > T:A,
+        T:A > A:T, T:A > C:G, and T:A > G:C. In practice, a C:G > A:T
+        substitution is denoted as either a C > A mutation using the
+        pyrimidine base or as a G > T mutation using the purine base.
+        In consequence, the most commonly used SBS-6 classification of
+        single base substitutions can be written as: C > A, C > G,
+        C > T, T > A, T > C, and T > G.
+        Additionally, the SBS-6 classification can be further
+        expanded by considering the base-pairs immediately
+        adjacent 5′ and 3′ to the somatic mutation. Therefore, an
+        extended classification for analysis of mutational signatures is
+        SBS-96, where each of the classes in SBS-6 is further elaborated
+        using one base adjacent at the 5′ of the mutation and one base
+        adjacent at the 3′ of the mutation.
+        Logically, SBS-96 can be further elaborated by including
+        additional 5′ and 3′ adjacent context. Each of the six single
+        base substitutions in SBS-6 has 256 possible pentanucleotides
+        resulting in a classification with 1536 possible channels.
+           
+        Classification of Doublet Base substitutions (DBSs):
+        Doublet base substitutions (DBSs) are somatic mutations in which
+        a set of two adjacent DNA base-pairs is simultaneously
+        substituted with another set of two adjacent DNA base-pairs. An
+        example of a DBS is a set of CT:GA base-pairs mutating to a set
+        of AA:TT base-pairs, which is usually denoted as CT:GA > AA:TT.
+        It should be noted that a CT:GA > AA:TT mutation can be
+        equivalently written as either a CT > AA mutation or an
+        AG > TT mutation. Overall, the basic classification catalogues
+        DBSs into 78 distinct categories denoted as the DBS-78 matrix.
+        Similarly, we can expand the characterization of DBS mutations
+        by considering the 5′ and 3′ adjacent contexts. With
+        seventy-eight possible DBS mutations having sixteen possible
+        tetranucleotides each, this context expansion results in 1248
+        possible channels denoted as the DBS-1248 context.
+        
+        Classification of small insertions and deletions (IDs):
+        A somatic insertion is the incorporation of a set of base-pairs
+        that lengthens a chromosome, while a somatic deletion is the
+        removal of a set of existing base-pairs from a given location
+        of a chromosome.
+        Unfortunately, indel classification cannot be performed
+        analogously to SBS or DBS classifications, where the immediate
+        sequencing context flanking each mutation was
+        utilized to subclassify these mutational events.
+        Consequently, indels (IDs) are classified as single base-pair
+        or longer events. Single base-pair indels can be further
+        subclassified as either a C:G or a T:A indel, while longer
+        indels can also be subclassified based on their lengths:
+        2 bp, 3 bp, 4 bp, and 5+ bp.
+
+        Incorporation of transcription Strand Bias (TSB):
+        The mutational classifications described above allow the
+        characterization of mutational patterns of single base
+        substitutions, doublet base substitutions, and small insertions
+        and deletions. Nevertheless, these classifications can be
+        further elaborated by incorporating strand bias. Mutations
+        of the same type are expected to be equally distributed across the two
+        DNA strands. However, in many cases an asymmetric number of mutations is
+        observed due to either one of the strands being preferentially
+        repaired or one of the strands having a higher propensity for
+        being damaged. To sub-classify mutations based on their
+        transcriptional strand bias, the pyrimidine orientation with
+        respect to the locations of well-annotated protein coding genes
+        on a genome is considered.
+
+        Running SigProfiler:
+        
+        1. Reference Genomes:
+        Before using SigProfiler, the installation of a reference genome
+        is required. By default, the tool supports the following
+        reference genomes:
+
+                Human: GRCh37 & GRCh38
+
+                Mouse: mm9 & mm10
+
+                Rat: rn6
+
+                Nematode: c_elegans
+
+                A correct command line should look like:
+
+                sigprofiler -ig GRCh37
+
+        2. Mutational signatures calculation:
+        
+        After successful installation of a reference genome, SigProfiler
+        can be applied to files containing somatic mutations in multiple
+        formats, for transforming these mutational catalogues into mutational
+        matrices. Specifically, the tool can read data formats such as
+        Variant Call Format (VCF) and Mutation Annotation Format
+        (MAF) and the following parameters should be provided for
+        generating the diverse matrices and plots:
+        
+        --name | -n = Project name
+        --genome | -g = Reference Genome
+        --files | -f = Absolute path where the input mutation files are located
+
+        A correct command line should look like:
+
+        sigprofiler -n MYPROJECT -g GRCh37 -f /path_to_folder_with_VCF_files/ -p
+
+        **Options**
+        --version               show program's version number and exit
+
+        -h, --help              show this help message and exit
+
+        --install_genome    Install de novo any of the following reference
+                    genomes: 'GRCh37', 'GRCh38', 'mm9' or 'mm10'.
+
+        --name=APPENDIX     Provide a project name
+
+        --genome=NAME       Provide a reference genome (ex: GRCh37, GRCh38,
+                    mm9 or mm10).
+
+        --files=Abs_path    Path where the input vcf files are located
+
+        --exome         Use only the exome or not
+
+        --bed=FILE      BED file containing the set of regions to be used
+                    in generating the matrices
+
+        --chrom         Create the matrices on a per chromosome basis
+
+        --plot          Generate the plots for each context
+
+        --tsb           Performs a transcriptional strand bias test for the
+                    24, 384, and 6144 contexts
+
+        --gs            Performs a gene strand bias test
+
+        For further info see: https://github.com/AlexandrovLab/SigProfilerMatrixGenerator
+
+        ]]></help>
+
+    <citations>
+        <citation type="doi">10.1186/s12864-019-6041-2</citation>
+    </citations>
+
+</tool>