Mercurial > repos > artbio > sigmut
comparison sigmut.xml @ 0:2062de974f72 draft default tip
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/sigmut commit bba3eb3950b8772758cc6f19747172be7413ddd9"
| author | artbio |
|---|---|
| date | Mon, 15 Jun 2020 00:28:49 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2062de974f72 |
|---|---|
| 1 <tool id="SigProfiler" name="SigProfiler" version="@VERSION@"> | |
| 2 <description>performs mutational signature characterization from variant files</description> | |
| 3 | |
| 4 <macros> | |
| 5 <import>sigmut_macros.xml</import> | |
| 6 </macros> | |
| 7 <expand macro="requirements"/> | |
| 8 <expand macro="stdio"/> | |
| 9 <command detect_errors="exit_code"><![CDATA[ | |
| 10 @VERSION@ | |
| 11 @pipefail@ | |
| 12 BIN=`which sigprofiler | sed 's,/sigprofiler,,g'` && | |
| 13 echo \$BIN && | |
| 14 chmod -R 777 \$BIN && | |
| 15 mkdir run_dir && | |
| 16 #if str( $set_analysis.choices ) == "get_sigmut": | |
| 17 #if str( $set_analysis.vcfile_input.vcfile ) == "maf": | |
| 18 #set $infile = 'run_dir/snps.maf' | |
| 19 ln -s -f '$set_analysis.vcfile_input.maf_file' '$infile' && | |
| 20 #else if str( $set_analysis.vcfile_input.vcfile ) == "icgc": | |
| 21 #set $infile = 'run_dir/snps.txt' | |
| 22 ln -s -f '$set_analysis.vcfile_input.icgc_file' '$infile' && | |
| 23 #else if str( $set_analysis.vcfile_input.vcfile ) == "vcf": | |
| 24 #set $infile = 'run_dir/snps.vcf' | |
| 25 ln -s -f '$set_analysis.vcfile_input.vcf_file' '$infile' && | |
| 26 #end if | |
| 27 #end if | |
| 28 ## In get_sigmut mode the input is now linked into run_dir as snps.maf, snps.txt or snps.vcf depending on the selected format. NOTE(review): the bare VERSION token on the second line of this command is expanded into the script; confirm against sigmut_macros.xml that this is intended. | |
| 29 sigprofiler | |
| 30 ## install_genome: pass -ig with the chosen genome and capture stdout in install.log; get_sigmut: build matrices and plots from run_dir, then collect the requested reports. | |
| 31 #if str( $set_analysis.choices ) == "install_genome": | |
| 32 -ig $set_analysis.refgendwn > install.log | |
| 33 #else if str( $set_analysis.choices ) == "get_sigmut": | |
| 34 -g $set_analysis.refgendat | |
| 35 -f 'run_dir' | |
| 36 -n "project" | |
| 37 -p | |
| 38 ## ! implement exome functionality when good test available | |
| 39 ## #if str( $set_analysis.exome ) == "true": | |
| 40 ## -e | |
| 41 ## #end if | |
| 42 ## ! implement per chromosome functionality when good test available | |
| 43 ## #if str( $set_analysis.chrom_based ) == "true": | |
| 44 ## -c | |
| 45 ## #end if | |
| 46 #if str( $set_analysis.tsb_stat ) == "true": | |
| 47 -t | |
| 48 #end if | |
| 49 #if str( $set_analysis.gs ) == "true": | |
| 50 -s | |
| 51 #end if | |
| 52 ##-b $set_analysis.bed ### to be done | |
| 53 && pdfcombine -f -s -o blinder.pdf run_dir/output/plots/*.pdf | |
| 54 && ls run_dir/logs/ | |
| 55 #if str( $set_analysis.tsb_stat ) == "true": | |
| 56 && tail -n +1 run_dir/output/TSB/*.txt > transcriptional_strand_biases.txt | |
| 57 #end if | |
| 58 #if str( $set_analysis.seqInfo ) == "true": | |
| 59 && tail -n +1 run_dir/output/*/*.all > information.txt | |
| 60 #end if | |
| 61 #end if | |
| 62 ]]></command> | |
| 63 | |
| 64 <inputs> | |
| 65 <conditional name="set_analysis"> | |
| 66 <param name="choices" type="select" label="Which of the following jobs do you want perform?"> | |
| 67 <option value="install_genome">Install 'de novo' a reference genome </option> | |
| 68 <option value="get_sigmut">Obtain the mutational signatures from VCF files</option> | |
| 69 </param> | |
| 70 <when value="install_genome"> | |
| 71 <param name="refgendwn" type="select" label="Reference genome" help="Get data from any of the following reference genomes:"> | |
| 72 <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option> | |
| 73 <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option> | |
| 74 <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option> | |
| 75 <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option> | |
| 76 <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option> | |
| 77 <option value="c_elegans">Caenorhabditis elegans</option> | |
| 78 <option value="dog">Dog</option> | |
| 79 </param> | |
| 80 </when> | |
| 81 | |
| 82 <when value="get_sigmut"> | |
| 83 <conditional name="vcfile_input"> | |
| 84 <param name="vcfile" type="select" label="VC file" help="Select the format of your input data"> | |
| 85 <option value="maf">Mutation Annotation Format</option> | |
| 86 <option value="icgc">Tab-separated file</option> | |
| 87 <option value="vcf">Variant Call Format</option> | |
| 88 </param> | |
| 89 <when value='maf'> | |
| 90 <param name="maf_file" type="data" format="maf" label="select VC file" help="Select the input file in MAF format." /> | |
| 91 </when> | |
| 92 <when value='icgc'> | |
| 93 <param name="icgc_file" type="data" format="txt" label="select VC file" help="Select the input file in ICGC format." /> | |
| 94 </when> | |
| 95 <when value='vcf'> | |
| 96 <param name="vcf_file" type="data" format="vcf" label="select VC file" help="Select the input file in VCF format." /> | |
| 97 </when> | |
| 98 </conditional> | |
| 99 | |
| 100 <param name="refgendat" type="select" label="Reference genome to be analyzed" help="Use the following reference genome:"> | |
| 101 <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option> | |
| 102 <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option> | |
| 103 <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option> | |
| 104 <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option> | |
| 105 <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option> | |
| 106 <option value="c_elegans">Caenorhabditis elegans</option> | |
| 107 <option value="dog">Dog</option> | |
| 108 </param> | |
| 109 | |
| 110 <!-- implement bed when test available --> | |
| 111 <!-- <conditional name="bed_input"> | |
| 112 <param name="bedfile" type="select" label="BED file" help="Input a BED file"> | |
| 113 <option value="yes">Yes</option> | |
| 114 <option value="no" selected="true">No</option> | |
| 115 </param> | |
| 116 <when value='yes'> | |
| 117 <param name="bed_file" format="bed" type="data" label="Use a BED file containing the set of regions" help="Provide a BED file"/> | |
| 118 </when> | |
| 119 <when value='no'> | |
| 120 </when> | |
| 121 </conditional> --> | |
| 122 <!-- implement exome functionality when test available --> | |
| 123 <!-- <param name="exome" type="boolean" label="Use only the exome?" checked="False" help="Use exome"/> --> | |
| 124 <!-- implement chrom_based functionality when test available --> | |
| 125 <!--<param name="chrom_based" type="boolean" label="Create the matrices on a per chromosome basis?" checked="False" help="Show snvs"/> --> | |
| 126 <param name="tsb_stat" type="boolean" truevalue="true" label="Performs a transcriptional strand bias test?" checked="False" help="Adds the -t option (transcriptional strand bias test) and exports the concatenated TSB result tables"/> | |
| 127 <param name="seqInfo" type="boolean" truevalue="true" label="Export sequence information?" checked="False" help="Show sequence information"/> | |
| 128 <param name="gs" type="boolean" truevalue="true" label="Performs gene strand bias test?" checked="False" help="Adds the -s option (gene strand bias test)"/> | |
| 129 </when> | |
| 130 </conditional> | |
| 131 </inputs> | |
| 132 | |
| 133 <outputs> | |
| 134 <data format="txt" name="logref" label="Log file: Install a Reference Genome" | |
| 135 from_work_dir="./install.log"> | |
| 136 <filter>set_analysis['choices'] == 'install_genome'</filter> | |
| 137 </data> | |
| 138 <data format="txt" name="logsmt" label="Log file: Calculate Mutational Signatures" | |
| 139 from_work_dir="run_dir/logs/SigProfilerMatrixGenerator*.out"> | |
| 140 <filter>set_analysis['choices'] == 'get_sigmut'</filter> | |
| 141 </data> | |
| 142 <!-- NOTE(review): the from_work_dir of logsmt contains a * wildcard; confirm that Galaxy resolves it, otherwise copy the log to a fixed file name in the command --> | |
| 143 <data format="pdf" name="blinder" label="SBS Mutational Signatures plots (pdf)" | |
| 144 from_work_dir="./blinder.pdf" > | |
| 145 <filter>set_analysis['choices'] == 'get_sigmut'</filter> | |
| 146 </data> | |
| 147 | |
| 148 <!-- implement exome outputs when test available --> | |
| 149 <!-- | |
| 150 <data format="txt" name="dbs_exome" label="DBS_exome.vcf"> | |
| 151 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> | |
| 152 </data> | |
| 153 <data format="txt" name="snv_exome" label="SNV_exome.vcf"> | |
| 154 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> | |
| 155 </data> | |
| 156 | |
| 157 <data format="txt" name="sig_exome" label="DBS 78 and so on Sig. Mut. EXOME"> | |
| 158 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter> | |
| 159 </data> | |
| 160 --> | |
| 161 <data format="txt" name="tsb" label="Transcriptional Strand Biases" | |
| 162 from_work_dir="./transcriptional_strand_biases.txt" > | |
| 163 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['tsb_stat'] is True</filter> | |
| 164 </data> | |
| 165 | |
| 166 <data format="txt" name="seqinfo" label="Mutational Signature detailed infos" | |
| 167 from_work_dir="./information.txt" > | |
| 168 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['seqInfo'] is True</filter> | |
| 169 </data> | |
| 170 | |
| 171 </outputs> | |
| 172 <tests> | |
| 173 <test> | |
| 174 <param name="choices" value="install_genome"/> | |
| 175 <param name="refgendwn" value="GRCh38"/> | |
| 176 <output name="logref" file="hg38_install.log" lines_diff="5"/> | |
| 177 </test> | |
| 178 <test> | |
| 179 <param name="choices" value="get_sigmut"/> | |
| 180 <param name="refgendat" value="GRCh38"/> | |
| 181 <param name="vcfile" value="vcf"/> | |
| 182 <param name="vcf_file" ftype="vcf" value="hg38.vcf"/> | |
| 183 <!-- no "plot" parameter is declared in inputs: the command always passes the plot flag, so nothing is set here --> | |
| 184 <output name="logsmt" ftype="txt" file="sigmut.log" lines_diff="5" /> | |
| 185 <output name="blinder" file="hg38_blinder.pdf" lines_diff="5" /> | |
| 186 </test> | |
| 187 </tests> | |
| 188 | |
| 189 <help><![CDATA[ | |
| 190 | |
| 191 **SigProfiler** | |
| 192 | |
| 193 Background: | |
| 194 | |
| 195 Cancer genomes evince somatic mutations, which are imprinted by | |
| 196 different mutational processes, that give rise to diverse | |
| 197 mutational signatures. Their analysis from single base | |
| 198 substitutions and their immediate sequencing context, allows the | |
| 199 classification of small mutational events (including | |
| 200 substitutions, insertions, deletions, and doublet substitutions) | |
| 201 for better understanding the mutational processes that have | |
| 202 shaped a cancer genome. | |
| 203 | |
| 204 In this sense, SigProfiler constitutes a Galaxy-based wrapper of | |
| 205 a computational method developed by Ludmil B. Alexandrov, that | |
| 206 allows the exploration and visualization of mutational patterns | |
| 207 for all types of small mutational events. Specifically, the | |
| 208 following actions can be performed using SigProfiler wrapper: | |
| 209 | |
| 210 1. Identify and categorize the mutations based on possible | |
| 211 single nucleotide variants (SNVs), double base substitutions | |
| 212 (DBS), and insertions/deletions and provides further | |
| 213 transcriptional strand bias categorization. Afterwards, the | |
| 214 classifications of these mutations are integrated into distinct | |
| 215 matrices. | |
| 216 SigProfiler provides matrix generation support for SBS-6, | |
| 217 SBS-96, SBS-1536, DBS-78 and DBS-1248. In addition, the | |
| 218 generation of mutational matrices of indels including | |
| 219 ID-28 and ID-83 is supported. Besides, an ID-8628 matrix that | |
| 220 extends the ID-83 classification is generated. | |
| 221 SigProfiler examines transcriptional strand bias for single base | |
| 222 substitutions, doublet base substitutions, and small indels. It | |
| 223 is evaluated whether a mutation occurs on the transcribed or the | |
| 224 non-transcribed strand of well-annotated protein coding genes of | |
| 225 a reference genome. Mutations found in the transcribed regions | |
| 226 of the genome are further subclassified as: (i) transcribed, | |
| 227 (ii) un-transcribed, (iii) bi-directional, or (iv) unknown. | |
| 228 | |
| 229 2. Generation of plots of all types of mutational signatures as | |
| 230 well as all types of mutational patterns in cancer genomes. | |
| 231 | |
| 232 Additional Information: | |
| 233 | |
| 234 Classification of Single Base substitutions (SBSs): | |
| 235 A single base substitution (SBS) is a single DNA base-pair | |
| 236 substituted with another single DNA base-pair. The most | |
| 237 basic classification catalogues SBSs into six distinct | |
| 238 categories, including: C:G > A:T, C:G > G:C, C:G > T:A, | |
| 239 T:A > A:T, T:A > C:G, and T:A > G:C. In practice, a C:G > A:T | |
| 240 substitution is denoted as either a C > A mutation using the | |
| 241 pyrimidine base or as a G > T mutation using the purine base. | |
| 242 In consequence, the most commonly used SBS-6 classification of | |
| 243 single base substitutions can be written as: C > A, C > G, | |
| 244 C > T, T > A, T > C, and T > G. | |
| 245 Additionally, the SBS-6 classification can be further | |
| 246 expanded by considering the base-pairs immediately | |
| 247 adjacent 5′ and 3′ to the somatic mutation. Therefore, an | |
| 248 extended classification for analysis of mutational signatures is | |
| 249 SBS-96, where each of the classes in SBS-6 is further elaborated | |
| 250 using one base adjacent at the 5′ of the mutation and one base | |
| 251 adjacent at the 3′ of the mutation. | |
| 252 Logically, SBS-96 can be further elaborated by including | |
| 253 additional 5′ and 3′ adjacent context. Each of the six single | |
| 254 base substitutions in SBS-6 has 256 possible pentanucleotides | |
| 255 resulting in a classification with 1536 possible channels. | |
| 256 | |
| 257 Classification of Doublet Base substitutions (DBSs): | |
| 258 Doublet base substitutions (DBSs) are somatic mutations in which | |
| 259 a set of two adjacent DNA base-pairs is simultaneously | |
| 260 substituted with another set of two adjacent DNA base-pairs. An | |
| 261 example of a DBS is a set of CT:GA base-pairs mutating to a set | |
| 262 of AA:TT base-pairs, which is usually denoted as CT:GA > AA:TT. | |
| 263 It should be noted that a CT:GA > AA:TT mutation can be | |
| 264 equivalently written as either a CT > AA or an AG > TT mutation. Overall, the | |
| 265 basic classification catalogues DBSs into 78 distinct categories | |
| 266 denoted as the DBS-78 matrix. | |
| 267 Similarly, we can expand the characterization of DBS mutations | |
| 268 by considering the 5′ and 3′ adjacent contexts. With | |
| 269 seventy-eight possible DBS mutations having sixteen possible | |
| 270 tetranucleotides each, this context expansion results in 1248 | |
| 271 possible channels denoted as the DBS-1248 context. | |
| 272 | |
| 273 Classification of small insertions and deletions (IDs): | |
| 274 A somatic insertion is the incorporation of a set of base-pairs | |
| 275 that lengthens a chromosome, while a somatic deletion is the | |
| 276 removal of a set of existing base-pairs from a given location | |
| 277 of a chromosome. | |
| 278 Unfortunately, indel classification cannot be performed | |
| 279 analogously to SBS or DBS classifications, where the immediate | |
| 280 sequencing context flanking each mutation was | |
| 281 utilized to subclassify these mutational events. | |
| 282 Consequently, indels (IDs) are classified as single base-pair | |
| 283 or longer events. They can be further subclassified as either a | |
| 284 C:G or a T:A indel, while longer indels can also be | |
| 285 subclassified based on their lengths: 2 bp, 3 bp, 4 bp, and | |
| 286 5 + bp. | |
| 287 | |
| 288 Incorporation of transcription Strand Bias (TSB): | |
| 289 The mutational classifications described above allow the | |
| 290 characterization of mutational patterns of single base | |
| 291 substitutions, doublet base substitutions, and small insertions | |
| 292 and deletions. Nevertheless, these classifications can be | |
| 293 further elaborated by incorporating strand bias. Mutations | |
| 294 of the same type are expected to be equally distributed across the two | |
| 295 DNA strands. However, in many cases an asymmetric number of mutations are | |
| 296 observed due to either one of the strands being preferentially | |
| 297 repaired or one of the strands having a higher propensity for | |
| 298 being damaged. To sub-classify mutations based on their | |
| 299 transcriptional strand bias, the pyrimidine orientation with | |
| 300 respect to the locations of well-annotated protein coding genes | |
| 301 on a genome is considered. | |
| 302 | |
| 303 Running SigProfiler: | |
| 304 | |
| 305 1. Reference Genomes: | |
| 306 Before using SigProfiler, the installation of a reference genome | |
| 307 is required. By default, the tool supports the following | |
| 308 reference genomes: | |
| 309 | |
| 310 Human: GRCh37 & GRCh38 | |
| 311 | |
| 312 Mouse: mm9 & mm10 | |
| 313 | |
| 314 Rat: rn6 | |
| 315 | |
| 316 Nematode: c_elegans | |
| 317 | |
| 318 A valid command line looks like: | |
| 319 | |
| 320 sigprofiler -ig GRCh37 | |
| 321 | |
| 322 2. Mutational signatures calculation: | |
| 323 | |
| 324 After successful installation of a reference genome, SigProfiler | |
| 325 can be applied to files containing somatic mutations in multiple | |
| 326 formats, for transforming these mutational catalogues into mutational | |
| 327 matrices. Specifically, the tool can read data formats such as | |
| 328 Variant Calling Format (VCF) and Mutation Annotation Format | |
| 329 (MAF) and the following parameters should be provided for | |
| 330 generating the diverse matrices and plots: | |
| 331 | |
| 332 --name | -n = Project name | |
| 333 --genome | -g = Reference Genome | |
| 334 --files | -f = Absolute path where the input mutation files are located | |
| 335 | |
| 336 A valid command line looks like: | |
| 337 | |
| 338 sigprofiler -n MYPROJECT -g GRCh37 -f /path_to_folder_with_VCF_files/ -p | |
| 339 | |
| 340 **Options** | |
| 341 --version show program's version number and exit | |
| 342 | |
| 343 -h, --help show this help message and exit | |
| 344 | |
| 345 --install_genome Install de novo any of the following reference | |
| 346 genomes: 'GRCh37', 'GRCh38', 'mm9' or 'mm10'. | |
| 347 | |
| 348 --name=APPENDIX Provide a project name | |
| 349 | |
| 350 --genome=NAME Provide a reference genome (ex: GRCh37, GRCh38, | |
| 351 mm9 or mm10). | |
| 352 | |
| 353 --files=Abs_path Path where the input vcf files are located | |
| 354 | |
| 355 --exome Use only the exome or not | |
| 356 | |
| 357 --bed=FILE BED file containing the set of regions to be used | |
| 358 in generating the matrices | |
| 359 | |
| 360 --chrom Create the matrices on a per chromosome basis | |
| 361 | |
| 362 --plot Generate the plots for each context | |
| 363 | |
| 364 --tsb Performs a transcriptional strand bias test for the | |
| 365 24, 384, and 6144 contexts | |
| 366 | |
| 367 --gs Performs a gene strand bias test | |
| 368 | |
| 369 For further info see: https://github.com/AlexandrovLab/SigProfilerMatrixGenerator | |
| 370 | |
| 371 ]]></help> | |
| 372 | |
| 373 <citations> | |
| 374 <citation type="doi">10.1186/s12864-019-6041-2</citation> | |
| 375 </citations> | |
| 376 | |
| 377 </tool> |
