Mercurial > repos > artbio > sigmut
diff sigmut.xml @ 0:2062de974f72 draft default tip
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/sigmut commit bba3eb3950b8772758cc6f19747172be7413ddd9"
author | artbio |
---|---|
date | Mon, 15 Jun 2020 00:28:49 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sigmut.xml Mon Jun 15 00:28:49 2020 +0000 @@ -0,0 +1,377 @@ +<tool id="SigProfiler" name="SigProfiler" version="@VERSION@"> + <description>performs mutational signature characterization from variant files</description> + + <macros> + <import>sigmut_macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="stdio"/> + <command detect_errors="exit_code"><![CDATA[ + @VERSION@ + @pipefail@ + BIN=`which sigprofiler | sed 's,/sigprofiler,,g'` && + echo \$BIN && + chmod -R 777 \$BIN && + mkdir run_dir && + #if str( $set_analysis.choices ) == "get_sigmut": + #if str( $set_analysis.vcfile_input.vcfile ) == "maf": + #set $infile = 'run_dir/snps.maf' + ln -s -f '$set_analysis.vcfile_input.maf_file' '$infile' && + #else if str( $set_analysis.vcfile_input.vcfile ) == "icgc": + #set $infile = 'run_dir/snps.txt' + ln -s -f '$set_analysis.vcfile_input.icgc_file' '$infile' && + #else if str( $set_analysis.vcfile_input.vcfile ) == "vcf": + #set $infile = 'run_dir/snps.vcf' + ln -s -f '$set_analysis.vcfile_input.vcf_file' '$infile' && + #end if + #end if + + sigprofiler + + #if str( $set_analysis.choices ) == "install_genome": + -ig $set_analysis.refgendwn > install.log + #else if str( $set_analysis.choices ) == "get_sigmut": + -g $set_analysis.refgendat + -f 'run_dir' + -n "project" + -p +## ! implement exome functionality when good test available +## #if str( $set_analysis.exome ) == "true": +## -e +## #end if +## ! implement per chromosome functionality when good test available +## #if str( $set_analysis.chrom_based ) == "true": +## -c +## #end if + #if str( $set_analysis.tsb_stat ) == "true": + -t + #end if + #if str( $set_analysis.gs ) == "true": + -s + #end if + ##-b $set_analysis.bed ### to be done + && pdfcombine -f -s -o blinder.pdf run_dir/output/plots/*.pdf + && ls run_dir/logs/ + #if str( $set_analysis.tsb_stat ) == "true": + && tail -n +1 run_dir/output/TSB/*.txt > transcriptional_strand_biases.txt + #end if + #if $set_analysis.seqInfo: + && tail -n +1 run_dir/output/*/*.all > information.txt + #end if + #end if + ]]></command> + + <inputs> + <conditional name="set_analysis"> + <param name="choices" type="select" label="Which of the following jobs do you want perform?"> + <option value="install_genome">Install 'de novo' a reference genome </option> + <option value="get_sigmut">Obtain the mutational signatures from VCF files</option> + </param> + <when value="install_genome"> + <param name="refgendwn" type="select" label="Reference genome" help="Get data from any of the following reference genomes:"> + <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option> + <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option> + <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option> + <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option> + <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option> + <option value="c_elegans">Caenorhabditis elegans</option> + <option value="dog">Dog</option> + </param> + </when> + + <when value="get_sigmut"> + <conditional name="vcfile_input"> + <param name="vcfile" type="select" label="VC file" help="Select the format of your input data"> + <option value="maf">Mutation Annotation Format</option> + <option value="icgc">Tab-separated file</option> + <option value="vcf">Variant Call Format</option> + </param> + <when value='maf'> + <param name="maf_file" type="data" format="maf" label="select VC file" help="Select the input file in MAF format." /> + </when> + <when value='icgc'> + <param name="icgc_file" type="data" format="txt" label="select VC file" help="Select the input file in ICGC format." /> + </when> + <when value='vcf'> + <param name="vcf_file" type="data" format="vcf" label="select VC file" help="Select the input file in VCF format." /> + </when> + </conditional> + + <param name="refgendat" type="select" label="Reference genome to be analyzed" help="Use the following reference genome:"> + <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option> + <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option> + <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option> + <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option> + <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option> + <option value="c_elegans">Caenorhabditis elegans</option> + <option value="dog">Dog</option> + </param> + +<!-- implement bed when test available --> +<!-- <conditional name="bed_input"> + <param name="bedfile" type="select" label="BED file" help="Input a BED file"> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + <when value='yes'> + <param name="bed_file" format="bed" type="data" label="Use a BED file containing the set of regions" help="Provide a BED file"/> + </when> + <when value='no'> + </when> + </conditional> --> + <!-- implement exome functionality when test available --> + <!-- <param name="exome" type="boolean" label="Use only the exome?" checked="False" help="Use exome"/> --> + <!-- implement chrom_based functionality when test available --> + <!--<param name="chrom_based" type="boolean" label="Create the matrices on a per chromosome basis?" checked="False" help="Show snvs"/> --> + <param name="tsb_stat" type="boolean" truevalue="true" label="Performs a transcriptional strand bias test?" checked="False" help="Show snvs"/> + <param name="seqInfo" type="boolean" truevalue="true" label="Export sequence information?" checked="False" help="Show sequence information"/> + <param name="gs" type="boolean" label="Performs gene strand bias test?" checked="False" help="Show snvs"/> + </when> + </conditional> + </inputs> + + <outputs> + <data format="txt" name="logref" label="Log file: Install a Reference Genome" + from_work_dir="./install.log"> + <filter>set_analysis['choices'] == 'install_genome'</filter> + </data> + <data format="txt" name="logsmt" label="Log file: Calculate Mutational Signatures" + from_work_dir="run_dir/logs/SigProfilerMatrixGenerator*.out"> + <filter>set_analysis['choices'] == 'get_sigmut'</filter> + </data> + + <data format="pdf" name="blinder" label="SBS Mutational Signatures plots (pdf)" + from_work_dir="./blinder.pdf" > + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['plot'] is True</filter> + </data> + + <!-- implement exome outputs when test available --> + <!-- + <data format="txt" name="dbs_exome" label="DBS_exome.vcf"> + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> + </data> + <data format="txt" name="snv_exome" label="SNV_exome.vcf"> + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> + </data> + + <data format="txt" name="sig_exome" label="DBS 78 and so on Sig. Mut. EXOME"> + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> + </data> + --> + <data format="txt" name="tsb" label="Transcriptional Strand Biases" + from_work_dir="./transcriptional_strand_biases.txt" > + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['tsb_stat'] is True</filter> + </data> + + <data format="txt" name="seqinfo" label="Mutational Signature detailed infos" + from_work_dir="./information.txt" > + <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['seqInfo'] is True</filter> + </data> + + </outputs> + <tests> + <test> + <param name="choices" value="install_genome"/> + <param name="refgendwn" value="GRCh38"/> + <output name="logref" file="hg38_install.log" lines_diff="5"/> + </test> + <test> + <param name="choices" value="get_sigmut"/> + <param name="refgendat" value="GRCh38"/> + <param name="vcfile" value="vcf"/> + <param name="vcf_file" ftype="vcf" value="hg38.vcf"/> + <param name="plot" value="True"/> + <output name="logsmt" ftype="txt" file="sigmut.log" lines_diff="5" /> + <output name="blinder" file="hg38_blinder.pdf" lines_diff="5" /> + </test> + </tests> + + <help><![CDATA[ + + **SigProfiler** + + Background: + + Cancer genomes evince somatic mutations, which are imprinted by + different mutational processes, that give rise to diverse + mutational signatures. Their analysis from single base + substitutions and their immediate sequencing context, allows the + classification of small mutational events (including + substitutions, insertions, deletions, and doublet substitutions) + for better understanding the mutational processes that have + shaped a cancer genome. + + In this sense, SigProfiler constitutes a Galaxy-based wrapper of + a computational method developed by Ludmil B. Alexandrov, that + allow the exploration and visualization of mutational patterns + for all types of small mutational events. Specifically, the + following actions can be performed using SigProfiler wrapper: + + 1. Identify and categorize the mutations based on possible + single nucleotide variants (SNVs), double base substitutions + (DBS), and insertions/deletions and provides further + transcriptional strand bias categorization. Afterwards, the + classification of these mutations are integrated into distinct + matrices. + SigProfiler provides matrix generation support for SBS-6, + SBS-96, SBS-1536, DBS-78 and DBS-1248. In addition, the + generation of mutational matrices of indels including + ID-28 and ID-83 are procured. Besides, an ID-8628 matrix that + extends the ID-83 classification is generated. + SigProfiler examines transcriptional strand bias for single base + substitutions, doublet base substitutions, and small indels. It + is evaluated whether a mutation occurs on the transcribed or the + non-transcribed strand of well-annotated protein coding genes of + a reference genome. Mutations found in the transcribed regions + of the genome are further subclassified as: (i) transcribed, + (ii) un-transcribed, (iii) bi-directional, or (iv) unknown. + + 2. Generation of plots of all types of mutational signatures as + well as all types of mutational patterns in cancer genomes. + + Additional Information: + + Classification of Single Base substitutions (SBSs): + Single base substitutions (SBSs) are single DNA base-pairs + substituted with another single DNA base-pairs. The most + basic classification catalogues SBSs into six distinct + categories, including: C:G > A:T, C:G > G:C, C:G > T:A, + T:A > A:T, T:A > C:G, and T:A > G:C. In practice, a C:G > A:T + substitution is denoted as either a C > A mutation using the + pyrimidine base or as a G > T mutation using the purine base. + In consequence, the most commonly used SBS-6 classification of + single base substitutions can be written as: C > A, C > G, + C > T, T > A, T > C, and T > G. + Additionally, the SBS-6 classification can be further + expanded by considering the base-pairs immediately + adjacent 5′ and 3′ to the somatic mutation. Therefore, an + extended classification for analysis of mutational signatures is + SBS-96, where each of the classes in SBS-6 is further elaborated + using one base adjacent at the 5′ of the mutation and one base + adjacent at the 3′ of the mutation. + Logically, SBS-96 can be further elaborated by including + additional 5′ and 3′ adjacent context. Each of the six single + base substitutions in SBS-6 has 256 possible pentanucleotides + resulting in a classification with 1536 possible channels. + + Classification of Doublet Base substitutions (DBSs): + Doublet base substitutions (DBSs) are somatic mutations in which + a set of two adjacent DNA base-pairs is simultaneously + substituted with another set of two adjacent DNA base-pairs. An + example of a DBS is a set of CT:GA base-pairs mutating to a set + of AA:TT base-pairs, which is usually denoted as CT:GA > AA:TT. + It should be noted that a CT:GA > AA:TT mutation can be + equivalently written as either a CT > AA mutation. Overall, the + basic classification catalogues DBSs into 78 distinct categories + denoted as the DBS-78 matrix. + Similarly, we can expand the characterization of DBS mutations + by considering the 5′ and 3′ adjacent contexts. With + seventy-eight possible DBS mutations having sixteen possible + tetranucleotides each, this context expansion results in 1248 + possible channels denoted as the DBS-1248 context. + + Classification of small insertions and deletions (IDs): + A somatic insertion is the incorporation of a set of base-pairs + that lengthens a chromosome, while a somatic deletion is the + removing of a set of existing base-pairs from a given location + of a chromosome. + Unfortunately, indel classification cannot be performed + analogously to SBS or DBS classifications, where the immediate + sequencing context flanking each mutation was + utilized to subclassify these mutational events. + Consequently, indels (IDs) are classified as single base-pair + or longer events. They can be further subclassified as either a + C:G or a T:A indel, while longer indels can also be + subclassified based on their lengths: 2 bp, 3 bp, 4 bp, and + 5 + bp. + + Incorporation of transcription Strand Bias (TSB): + The mutational classifications described above allow the + characterization of mutational patterns of single base + substitutions, doublet base substitutions, and small insertions + and deletions. Nevertheless, these classifications can be + further elaborated by incorporating strand bias. Mutations + from the same type are expected to be equally distributed across the two + DNA strands. However, in many cases an asymmetric number of mutations are + observed due to either one of the strands being preferentially + repaired or one of the strands having a higher propensity for + being damaged. To sub-classify mutations based on their + transcriptional strand bias, the pyrimidine orientation with + respect to the locations of well-annotated protein coding genes + on a genome is considered. + + Running SigProfiler: + + 1. Reference Genomes: + Before using SigProfiler, the installation of a reference genome + is demanded. By default, the tool supports the following + reference genomes: + + Human: GRCh37 & GRCh38 + + Mouse: mm9 & mm10 + + Rat: rn6 + + Nematode: c_elegans + + A right command line should look like: + + sigprofiler -ig GRCh37 + + 2. Mutational signatures calculation: + + After successful installation of a reference genome, SigProfiler + can be applied to files containing somatic mutations in multiple + formats, for transforming these mutational catalogues into mutational + matrices. Specifically, the tool can read data formats such as + Variant Calling Format (VCF) and Mutation Annotation Format + (MAF) and the following parameters should be provided for + generating the diverse matrices and plots: + + --name | -n = Project name + --genome | -g = Reference Genome + -files | -f = Absolute path where the input mutation files are located + + A right command line should look like: + + sigprofiler -n MYPROJECT -g GRCh37 -f /path_to_folder_with_VCF_files/ -p + + **Options** + --version show program's version number and exit + + -h, --help show this help message and exit + + --install_genome Install de novo any of the following reference + genomes: 'GRCh37', 'GRCh38', 'mm9' or 'mm10'. + + --name=APPENDIX Provide a project name + + --genome=NAME Provide a reference genome (ex: GRCh37, GRCh38, + mm9 or mm10). + + --files=Abs_path Path where the input vcf files are located + + --exome Use only the exome or not + + --bed=FILE BED file containing the set of regions to be used + in generating the matrices + + --chrom Create the matrices on a per chromosome basis + + --plot Generate the plots for each context + + --tsb Performs a transcriptional strand bias test for the + 24, 384, and 6144 contexts + + --gs Performs a gene strand bias test + + For further info see: https://github.com/AlexandrovLab/SigProfilerMatrixGenerator + + ]]></help> + + <citations> + <citation type="doi">10.1186/s12864-019-6041-2</citation> + </citations> + +</tool>