comparison sigmut.xml @ 0:2062de974f72 draft default tip

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/sigmut commit bba3eb3950b8772758cc6f19747172be7413ddd9"
author artbio
date Mon, 15 Jun 2020 00:28:49 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2062de974f72
1 <tool id="SigProfiler" name="SigProfiler" version="@VERSION@">
2 <description>performs mutational signature characterization from variant files</description>
3
4 <macros>
5 <import>sigmut_macros.xml</import>
6 </macros>
7 <expand macro="requirements"/>
8 <expand macro="stdio"/>
9 <command detect_errors="exit_code"><![CDATA[
10 @VERSION@
11 @pipefail@
12 BIN=`which sigprofiler | sed 's,/sigprofiler,,g'` &&
13 echo \$BIN &&
14 chmod -R 777 \$BIN &&
15 mkdir run_dir &&
16 #if str( $set_analysis.choices ) == "get_sigmut":
17 #if str( $set_analysis.vcfile_input.vcfile ) == "maf":
18 #set $infile = 'run_dir/snps.maf'
19 ln -s -f '$set_analysis.vcfile_input.maf_file' '$infile' &&
20 #else if str( $set_analysis.vcfile_input.vcfile ) == "icgc":
21 #set $infile = 'run_dir/snps.txt'
22 ln -s -f '$set_analysis.vcfile_input.icgc_file' '$infile' &&
23 #else if str( $set_analysis.vcfile_input.vcfile ) == "vcf":
24 #set $infile = 'run_dir/snps.vcf'
25 ln -s -f '$set_analysis.vcfile_input.vcf_file' '$infile' &&
26 #end if
27 #end if
28 ## sigprofiler either installs a reference genome (-ig) or processes the input linked into run_dir above
29 sigprofiler
30 ## the arguments below depend on the analysis selected in set_analysis.choices
31 #if str( $set_analysis.choices ) == "install_genome":
32 -ig '$set_analysis.refgendwn' > install.log
33 #else if str( $set_analysis.choices ) == "get_sigmut":
34 -g '$set_analysis.refgendat'
35 -f 'run_dir'
36 -n "project"
37 -p
38 ## ! implement exome functionality when good test available
39 ## #if str( $set_analysis.exome ) == "true":
40 ## -e
41 ## #end if
42 ## ! implement per chromosome functionality when good test available
43 ## #if str( $set_analysis.chrom_based ) == "true":
44 ## -c
45 ## #end if
46 #if str( $set_analysis.tsb_stat ) == "true":
47 -t
48 #end if
49 #if str( $set_analysis.gs ) == "true":
50 -s
51 #end if
52 ##-b $set_analysis.bed ### to be done
53 && pdfcombine -f -s -o blinder.pdf run_dir/output/plots/*.pdf
54 && ls run_dir/logs/
55 #if str( $set_analysis.tsb_stat ) == "true":
56 && tail -n +1 run_dir/output/TSB/*.txt > transcriptional_strand_biases.txt
57 #end if
58 #if str( $set_analysis.seqInfo ) == "true":
59 && tail -n +1 run_dir/output/*/*.all > information.txt
60 #end if
61 #end if
62 ]]></command>
63
64 <inputs>
65 <conditional name="set_analysis">
66 <param name="choices" type="select" label="Which of the following jobs do you want to perform?">
67 <option value="install_genome">Install 'de novo' a reference genome </option>
68 <option value="get_sigmut">Obtain the mutational signatures from VCF files</option>
69 </param>
70 <when value="install_genome">
71 <param name="refgendwn" type="select" label="Reference genome" help="Get data from any of the following reference genomes:">
72 <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option>
73 <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option>
74 <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
75 <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
76 <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
77 <option value="c_elegans">Caenorhabditis elegans</option>
78 <option value="dog">Dog</option>
79 </param>
80 </when>
81 <!-- Parameters used when building mutational matrices and plots from one variant file -->
82 <when value="get_sigmut">
83 <conditional name="vcfile_input">
84 <param name="vcfile" type="select" label="VC file" help="Select the format of your input data">
85 <option value="maf">Mutation Annotation Format</option>
86 <option value="icgc">Tab-separated file</option>
87 <option value="vcf">Variant Call Format</option>
88 </param>
89 <when value='maf'>
90 <param name="maf_file" type="data" format="maf" label="select VC file" help="Select the input file in MAF format." />
91 </when>
92 <when value='icgc'>
93 <param name="icgc_file" type="data" format="txt" label="select VC file" help="Select the input file in ICGC format." />
94 </when>
95 <when value='vcf'>
96 <param name="vcf_file" type="data" format="vcf" label="select VC file" help="Select the input file in VCF format." />
97 </when>
98 </conditional>
99 <!-- Genome name handed to sigprofiler with the -g option -->
100 <param name="refgendat" type="select" label="Reference genome to be analyzed" help="Use the following reference genome:">
101 <option value="GRCh37">Homo sapiens, GRCh37.p13 [GCA_000001405.14] </option>
102 <option value="GRCh38">Homo sapiens, GRCh38.p12 [GCA_000001405.27] </option>
103 <option value="mm9">Mus musculus, GRCm37 [GCA_000001635.18]</option>
104 <option value="mm10">Mus musculus, GRCm38.p6 [GCA_000001635.8]</option>
105 <option value="rn6">Rattus norvegicus, Rnor_6.0 [GCA_000001895.4]</option>
106 <option value="c_elegans">Caenorhabditis elegans</option>
107 <option value="dog">Dog</option>
108 </param>
109
110 <!-- implement bed when test available -->
111 <!-- <conditional name="bed_input">
112 <param name="bedfile" type="select" label="BED file" help="Input a BED file">
113 <option value="yes">Yes</option>
114 <option value="no" selected="true">No</option>
115 </param>
116 <when value='yes'>
117 <param name="bed_file" format="bed" type="data" label="Use a BED file containing the set of regions" help="Provide a BED file"/>
118 </when>
119 <when value='no'>
120 </when>
121 </conditional> -->
122 <!-- implement exome functionality when test available -->
123 <!-- <param name="exome" type="boolean" label="Use only the exome?" checked="False" help="Use exome"/> -->
124 <!-- implement chrom_based functionality when test available -->
125 <!--<param name="chrom_based" type="boolean" label="Create the matrices on a per chromosome basis?" checked="False" help="Show snvs"/> -->
126 <param name="tsb_stat" type="boolean" truevalue="true" label="Perform a transcriptional strand bias test?" checked="false" help="Runs SigProfiler with the -t option and collects the TSB result files into one text dataset."/>
127 <param name="seqInfo" type="boolean" truevalue="true" label="Export sequence information?" checked="false" help="Collects the '.all' files produced by SigProfiler into one text dataset."/>
128 <param name="gs" type="boolean" truevalue="true" label="Perform a gene strand bias test?" checked="false" help="Runs SigProfiler with the -s option (gene strand bias test)."/>
129 </when>
130 </conditional>
131 </inputs>
132
133 <outputs>
134 <data format="txt" name="logref" label="Log file: Install a Reference Genome"
135 from_work_dir="./install.log">
136 <filter>set_analysis['choices'] == 'install_genome'</filter>
137 </data>
138 <data format="txt" name="logsmt" label="Log file: Calculate Mutational Signatures"
139 from_work_dir="run_dir/logs/SigProfilerMatrixGenerator*.out">
140 <filter>set_analysis['choices'] == 'get_sigmut'</filter>
141 </data>
142 <!-- The command always passes -p and merges the plots with pdfcombine; there is no 'plot' input, so the filter only checks the analysis choice -->
143 <data format="pdf" name="blinder" label="SBS Mutational Signatures plots (pdf)"
144 from_work_dir="./blinder.pdf" >
145 <filter>set_analysis['choices'] == 'get_sigmut'</filter>
146 </data>
147
148 <!-- implement exome outputs when test available -->
149 <!--
150 <data format="txt" name="dbs_exome" label="DBS_exome.vcf">
151 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
152 </data>
153 <data format="txt" name="snv_exome" label="SNV_exome.vcf">
154 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
155 </data>
156
157 <data format="txt" name="sig_exome" label="DBS 78 and so on Sig. Mut. EXOME">
158 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['exome'] is True</filter>
159 </data>
160 -->
161 <data format="txt" name="tsb" label="Transcriptional Strand Biases"
162 from_work_dir="./transcriptional_strand_biases.txt" >
163 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['tsb_stat'] is True</filter>
164 </data>
165
166 <data format="txt" name="seqinfo" label="Mutational Signature detailed infos"
167 from_work_dir="./information.txt" >
168 <filter>set_analysis['choices'] == 'get_sigmut' and set_analysis['seqInfo'] is True</filter>
169 </data>
170
171 </outputs>
172 <tests>
173 <test>
174 <param name="choices" value="install_genome"/>
175 <param name="refgendwn" value="GRCh38"/>
176 <output name="logref" file="hg38_install.log" lines_diff="5"/>
177 </test>
178 <test>
179 <param name="choices" value="get_sigmut"/>
180 <param name="refgendat" value="GRCh38"/>
181 <param name="vcfile" value="vcf"/>
182 <param name="vcf_file" ftype="vcf" value="hg38.vcf"/>
183 <!-- no 'plot' parameter is set: the tool defines none and the command always passes -p -->
184 <output name="logsmt" ftype="txt" file="sigmut.log" lines_diff="5" />
185 <output name="blinder" file="hg38_blinder.pdf" lines_diff="5" />
186 </test>
187 </tests>
188
189 <help><![CDATA[
190
191 **SigProfiler**
192
193 Background:
194
195 Cancer genomes evince somatic mutations, which are imprinted by
196 different mutational processes, that give rise to diverse
197 mutational signatures. Their analysis from single base
198 substitutions and their immediate sequencing context, allows the
199 classification of small mutational events (including
200 substitutions, insertions, deletions, and doublet substitutions)
201 for better understanding the mutational processes that have
202 shaped a cancer genome.
203
204 In this sense, SigProfiler constitutes a Galaxy-based wrapper of
205 a computational method developed by Ludmil B. Alexandrov that
206 allows the exploration and visualization of mutational patterns
207 for all types of small mutational events. Specifically, the
208 following actions can be performed using the SigProfiler wrapper:
209
210 1. Identify and categorize the mutations based on possible
211 single nucleotide variants (SNVs), doublet base substitutions
212 (DBSs), and insertions/deletions, and provide further
213 transcriptional strand bias categorization. Afterwards, the
214 classifications of these mutations are integrated into distinct
215 matrices.
216 SigProfiler provides matrix generation support for SBS-6,
217 SBS-96, SBS-1536, DBS-78 and DBS-1248. In addition, mutational
218 matrices of indels, including ID-28 and ID-83, are
219 generated, as well as an ID-8628 matrix that
220 extends the ID-83 classification.
221 SigProfiler examines transcriptional strand bias for single base
222 substitutions, doublet base substitutions, and small indels. It
223 is evaluated whether a mutation occurs on the transcribed or the
224 non-transcribed strand of well-annotated protein coding genes of
225 a reference genome. Mutations found in the transcribed regions
226 of the genome are further subclassified as: (i) transcribed,
227 (ii) un-transcribed, (iii) bi-directional, or (iv) unknown.
228
229 2. Generation of plots of all types of mutational signatures as
230 well as all types of mutational patterns in cancer genomes.
231
232 Additional Information:
233
234 Classification of Single Base substitutions (SBSs):
235 Single base substitutions (SBSs) are single DNA base-pairs
236 substituted with other single DNA base-pairs. The most
237 basic classification catalogues SBSs into six distinct
238 categories, including: C:G > A:T, C:G > G:C, C:G > T:A,
239 T:A > A:T, T:A > C:G, and T:A > G:C. In practice, a C:G > A:T
240 substitution is denoted as either a C > A mutation using the
241 pyrimidine base or as a G > T mutation using the purine base.
242 In consequence, the most commonly used SBS-6 classification of
243 single base substitutions can be written as: C > A, C > G,
244 C > T, T > A, T > C, and T > G.
245 Additionally, the SBS-6 classification can be further
246 expanded by considering the base-pairs immediately
247 adjacent 5′ and 3′ to the somatic mutation. Therefore, an
248 extended classification for analysis of mutational signatures is
249 SBS-96, where each of the classes in SBS-6 is further elaborated
250 using one base adjacent at the 5′ of the mutation and one base
251 adjacent at the 3′ of the mutation.
252 Logically, SBS-96 can be further elaborated by including
253 additional 5′ and 3′ adjacent context. Each of the six single
254 base substitutions in SBS-6 has 256 possible pentanucleotides
255 resulting in a classification with 1536 possible channels.
256
257 Classification of Doublet Base substitutions (DBSs):
258 Doublet base substitutions (DBSs) are somatic mutations in which
259 a set of two adjacent DNA base-pairs is simultaneously
260 substituted with another set of two adjacent DNA base-pairs. An
261 example of a DBS is a set of CT:GA base-pairs mutating to a set
262 of AA:TT base-pairs, which is usually denoted as CT:GA > AA:TT.
263 It should be noted that a CT:GA > AA:TT mutation can be
264 equivalently written as either a CT > AA mutation or an AG > TT
265 mutation. Overall, the basic classification catalogues DBSs into
266 78 distinct categories denoted as the DBS-78 matrix.
267 Similarly, we can expand the characterization of DBS mutations
268 by considering the 5′ and 3′ adjacent contexts. With
269 seventy-eight possible DBS mutations having sixteen possible
270 tetranucleotides each, this context expansion results in 1248
271 possible channels denoted as the DBS-1248 context.
272
273 Classification of small insertions and deletions (IDs):
274 A somatic insertion is the incorporation of a set of base-pairs
275 that lengthens a chromosome, while a somatic deletion is the
276 removal of a set of existing base-pairs from a given location
277 of a chromosome.
278 Unfortunately, indel classification cannot be performed
279 analogously to SBS or DBS classifications, where the immediate
280 sequencing context flanking each mutation was
281 utilized to subclassify these mutational events.
282 Consequently, indels (IDs) are classified as single base-pair
283 or longer events. They can be further subclassified as either a
284 C:G or a T:A indel, while longer indels can also be
285 subclassified based on their lengths: 2 bp, 3 bp, 4 bp, and
286 5 + bp.
287
288 Incorporation of transcription Strand Bias (TSB):
289 The mutational classifications described above allow the
290 characterization of mutational patterns of single base
291 substitutions, doublet base substitutions, and small insertions
292 and deletions. Nevertheless, these classifications can be
293 further elaborated by incorporating strand bias. Mutations
294 of the same type are expected to be equally distributed across the two
295 DNA strands. However, in many cases an asymmetric number of mutations is
296 observed due to either one of the strands being preferentially
297 repaired or one of the strands having a higher propensity for
298 being damaged. To sub-classify mutations based on their
299 transcriptional strand bias, the pyrimidine orientation with
300 respect to the locations of well-annotated protein coding genes
301 on a genome is considered.
302
303 Running SigProfiler:
304
305 1. Reference Genomes:
306 Before using SigProfiler, a reference genome must be
307 installed. By default, the tool supports the following
308 reference genomes:
309
310 Human: GRCh37 & GRCh38
311
312 Mouse: mm9 & mm10
313
314 Rat: rn6
315
316 Nematode: c_elegans
317
318 A correct command line should look like:
319
320 sigprofiler -ig GRCh37
321
322 2. Mutational signatures calculation:
323
324 After successful installation of a reference genome, SigProfiler
325 can be applied to files containing somatic mutations in multiple
326 formats, for transforming these mutational catalogues into mutational
327 matrices. Specifically, the tool can read data formats such as
328 Variant Calling Format (VCF) and Mutation Annotation Format
329 (MAF) and the following parameters should be provided for
330 generating the diverse matrices and plots:
331
332 --name | -n = Project name
333 --genome | -g = Reference Genome
334 --files | -f = Absolute path where the input mutation files are located
335
336 A correct command line should look like:
337
338 sigprofiler -n MYPROJECT -g GRCh37 -f /path_to_folder_with_VCF_files/ -p
339
340 **Options**
341 --version show program's version number and exit
342
343 -h, --help show this help message and exit
344
345 --install_genome Install de novo any of the following reference
346 genomes: 'GRCh37', 'GRCh38', 'mm9' or 'mm10'.
347
348 --name=APPENDIX Provide a project name
349
350 --genome=NAME Provide a reference genome (ex: GRCh37, GRCh38,
351 mm9 or mm10).
352
353 --files=Abs_path Path where the input vcf files are located
354
355 --exome Use only the exome or not
356
357 --bed=FILE BED file containing the set of regions to be used
358 in generating the matrices
359
360 --chrom Create the matrices on a per chromosome basis
361
362 --plot Generate the plots for each context
363
364 --tsb Performs a transcriptional strand bias test for the
365 24, 384, and 6144 contexts
366
367 --gs Performs a gene strand bias test
368
369 For further info see: https://github.com/AlexandrovLab/SigProfilerMatrixGenerator
370
371 ]]></help>
372
373 <citations>
374 <citation type="doi">10.1186/s12864-019-6041-2</citation>
375 </citations>
376
377 </tool>