7
|
1 <tool id="cga_calldiff" name="calldiff(beta)" version="0.0.1">
|
|
2
|
|
<!-- One-line summary of the tool, shown alongside its name in the Galaxy tool panel. -->
<description>compares two Complete Genomics variant files.</description>

<!-- External dependency: declares the cgatools binary that this wrapper relies on. -->
<requirements>
  <requirement type="binary">cgatools</requirement>
</requirements>
|
|
8
|
|
<!--
  Command template for `cgatools calldiff`.

  Galaxy renders this text with Cheetah (the #set/#for/#if directives and the
  $placeholders) and runs the result in the job working directory. Reports are
  written there with the fixed prefix "cg_" (for example cg_LocusStats.tsv).

  Parameters read from the form:
    crr                    data-table entry; its "path" field is the .crr reference
    data_sources.inputA/B  the two variant files (datasets or typed paths)
    validation, diploid    either empty or the literal command-line flag
    column, hypothesis     integer option values
    report1 .. report5     either empty or the report name
    somatic.report6        either empty or "SomaticOutput"
    somatic.genomeA/genomeB/calibration  directories, only used for SomaticOutput
-->
<command><![CDATA[
cgatools calldiff --beta
  ## Path arguments are single-quoted so that whitespace or shell
  ## metacharacters in a path cannot split or alter the command line.
  --reference '${crr.fields.path}'
  --variantsA '${data_sources.inputA}'
  --variantsB '${data_sources.inputB}'
  $validation
  $diploid
  --locus-stats-column-count $column
  --max-hypothesis-count $hypothesis
  --output-prefix cg_
  ## Collect the selected report names here in the template instead of piping
  ## `echo` through `sed` in a subshell: unselected reports are empty strings
  ## and are skipped, the rest are joined with commas.
  #set $selected_reports = []
  #for $report in [$report1, $report2, $report3, $report4, $report5, $somatic.report6]
    #if str($report)
      #silent $selected_reports.append(str($report))
    #end if
  #end for
  #set $report_list = ','.join($selected_reports)
  ## NOTE(review): when nothing is selected this passes an empty list, so the
  ## run does not silently fall back to cgatools' default report set.
  --reports '$report_list'
  ## The somatic report needs both genome directories and the calibration data.
  #if str($somatic.report6) == "SomaticOutput"
    --genome-rootA '${somatic.genomeA}'
    --genome-rootB '${somatic.genomeB}'
    --calibration-root '${somatic.calibration}'
  #end if
]]></command>
|
|
26
|
|
<!--
  Output datasets. Each one is picked up from the job working directory via
  from_work_dir, and is only created when its filter expression is true,
  i.e. when the matching report was requested on the form.
-->
<outputs>
  <!-- Superlocus classification report -->
  <data name="output1" format="tabular"
        label="${tool.name} on ${on_string}: SuperlocusOutput"
        from_work_dir="cg_SuperlocusOutput.tsv">
    <filter>(report1 == 'SuperlocusOutput')</filter>
  </data>

  <!-- Superlocus classification statistics -->
  <data name="output2" format="tabular"
        label="${tool.name} on ${on_string}: SuperlocusStats"
        from_work_dir="cg_SuperlocusStats.tsv">
    <filter>(report2 == 'SuperlocusStats')</filter>
  </data>

  <!-- Locus classification report -->
  <data name="output3" format="tabular"
        label="${tool.name} on ${on_string}: LocusOutput"
        from_work_dir="cg_LocusOutput.tsv">
    <filter>(report3 == 'LocusOutput')</filter>
  </data>

  <!-- Locus statistics -->
  <data name="output4" format="tabular"
        label="${tool.name} on ${on_string}: LocusStats"
        from_work_dir="cg_LocusStats.tsv">
    <filter>(report4 == 'LocusStats')</filter>
  </data>

  <!-- VariantOutput yields two datasets (one per input file), both gated on report5 -->
  <data name="output5a" format="tabular"
        label="${tool.name} on ${on_string}: VariantsA"
        from_work_dir="cg_VariantsA.tsv">
    <filter>(report5 == 'VariantOutput')</filter>
  </data>
  <data name="output5b" format="tabular"
        label="${tool.name} on ${on_string}: VariantsB"
        from_work_dir="cg_VariantsB.tsv">
    <filter>(report5 == 'VariantOutput')</filter>
  </data>

  <!-- Somatic report; report6 lives inside the "somatic" conditional, hence the dictionary lookup -->
  <data name="output6" format="tabular"
        label="${tool.name} on ${on_string}: SomaticOutput"
        from_work_dir="cg_SomaticOutput.tsv">
    <filter>(somatic['report6'] == 'SomaticOutput')</filter>
  </data>
</outputs>
|
|
50
|
|
<!--
  Form definition. Parameter names are referenced by the command template and
  by the output filters, so they must stay in sync with both.
-->
<inputs>
  <!-- Reference genome (.crr file), chosen from the cg_crr_files data table -->
  <param name="crr" type="select" label="Genome build">
    <options from_data_table="cg_crr_files" />
  </param>

  <!-- Source of the two variant files: Galaxy datasets, or paths on the server's filesystem -->
  <conditional name="data_sources">
    <param name="data_source" type="select" label="Where are the input varfiles?">
      <option value="in" selected="true">imported into Galaxy</option>
      <option value="out">located outside Galaxy (available only for local Galaxy instances)</option>
    </param>
    <when value="in">
      <!-- Variant files picked from the history; each must have a build listed in cg_crr_files.loc -->
      <param name="inputA" type="data" format="cg_var" label="Dataset A">
        <validator type="unspecified_build" />
        <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
                   metadata_name="dbkey" metadata_column="1"
                   message="cgatools is not currently available for this build."/>
      </param>
      <param name="inputB" type="data" format="cg_var" label="Dataset B">
        <validator type="unspecified_build" />
        <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
                   metadata_name="dbkey" metadata_column="1"
                   message="cgatools is not currently available for this build."/>
      </param>
    </when>
    <when value="out">
      <!-- Variant files given as typed filesystem paths (not validated against the build table) -->
      <param name="inputA" type="text" label="Variant file A (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/>
      <param name="inputB" type="text" label="Variant file B (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/>
    </when>
  </conditional>

  <!-- Flag-style selects: the option value is either empty or the literal command-line flag -->
  <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model.">
    <option value="">no</option>
    <option value="--diploid">yes</option>
  </param>

  <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/>

  <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/>

  <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file.">
    <option value="">on</option>
    <option value="--no-reference-cover-validation">off</option>
  </param>

  <!-- Report toggles: the option value is either empty or the report name handed to the reports option -->
  <param name="report1" type="select" label="Report SuperlocusOutput">
    <option value="">no</option>
    <option value="SuperlocusOutput">yes</option>
  </param>
  <param name="report2" type="select" label="Report SuperlocusStats">
    <option value="">no</option>
    <option value="SuperlocusStats">yes</option>
  </param>
  <param name="report3" type="select" label="Report LocusOutput">
    <option value="">no</option>
    <option value="LocusOutput">yes</option>
  </param>
  <param name="report4" type="select" label="Report LocusStats">
    <option value="">no</option>
    <option value="LocusStats">yes</option>
  </param>
  <param name="report5" type="select" label="Report VariantOutput" help="Both variant files annotated by comparison results. If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report.">
    <option value="">no</option>
    <option value="VariantOutput">yes</option>
  </param>

  <!-- Somatic report: enabling it reveals the three directory fields it needs -->
  <conditional name="somatic">
    <param name="report6" type="select" label="Report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x.">
      <option value="">no</option>
      <option value="SomaticOutput">yes</option>
    </param>
    <!-- Explicit empty branch so every select option has a matching when -->
    <when value="" />
    <when value="SomaticOutput">
      <param name="genomeA" type="text" size="300" label="Directory for genome A (path/dir)" help="The 'A' genome directory, for example /data/GS00118-DNA_A01; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
      <param name="genomeB" type="text" size="300" label="Directory for genome B (path/dir)" help="The 'B' genome directory"/>
      <param name="calibration" type="text" size="300" label="Directory calibration data (path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/>
    </when>
  </conditional>

</inputs>
|
|
133
|
|
134 <help>
|
|
135
|
|
136 **What it does**
|
|
137
|
|
138 This tool compares two Complete Genomics variant files.
|
|
139
|
|
140 cgatools: http://sourceforge.net/projects/cgatools/files/
|
|
141
|
|
142 -----
|
|
143
|
|
144 **cgatools Manual**::
|
|
145
|
|
146 COMMAND NAME
|
|
147 calldiff - Compares two Complete Genomics variant files.
|
|
148
|
|
149 DESCRIPTION
|
|
150 Compares two Complete Genomics variant files. Divides the genome up into
|
|
151 superloci of nearby variants, then compares the superloci. Also refines the
|
|
152 comparison to determine per-call or per-locus comparison results.
|
|
153
|
|
154 Comparison results are usually described by a semi-colon separated string,
|
|
155 one per allele. Each allele's comparison result is one of the following
|
|
156 classifications:
|
|
157
|
|
158 ref-identical The alleles of the two variant files are identical, and
|
|
159 they are consistent with the reference.
|
|
160 alt-identical The alleles of the two variant files are identical, and
|
|
161 they are inconsistent with the reference.
|
|
162 ref-consistent The alleles of the two variant files are consistent,
|
|
163 and they are consistent with the reference.
|
|
164 alt-consistent The alleles of the two variant files are consistent,
|
|
165 and they are inconsistent with the reference.
|
|
166 onlyA The alleles of the two variant files are inconsistent,
|
|
167 and only file A is inconsistent with the reference.
|
|
168 onlyB The alleles of the two variant files are inconsistent,
|
|
169 and only file B is inconsistent with the reference.
|
|
170 mismatch The alleles of the two variant files are inconsistent,
|
|
171 and they are both inconsistent with the reference.
|
|
172 phase-mismatch The two variant files would be consistent if the
|
|
173 hapLink field had been empty, but they are
|
|
174 inconsistent.
|
|
175 ploidy-mismatch The superlocus did not have uniform ploidy.
|
|
176
|
|
177 In some contexts, this classification is rolled up into a simplified
|
|
178 classification, which is one of "identical", "consistent", "onlyA",
|
|
179 "onlyB", or "mismatch".
|
|
180
|
|
181 A good place to start looking at the results is the superlocus-output file.
|
|
182 It has columns defined as follows:
|
|
183
|
|
184 SuperlocusId An identifier given to the superlocus.
|
|
185 Chromosome The name of the chromosome.
|
|
186 Begin The 0-based offset of the start of the superlocus.
|
|
187 End The 0-based offset of the base one past the end of the
|
|
188 superlocus.
|
|
189 Classification The match classification of the superlocus.
|
|
190 Reference The reference sequence.
|
|
191 AllelesA A semicolon-separated list of the alleles (one per
|
|
192 haplotype) for variant file A, for the phasing with the
|
|
193 best comparison result.
|
|
194 AllelesB A semicolon-separated list of the alleles (one per
|
|
195 haplotype) for variant file B, for the phasing with the
|
|
196 best comparison result.
|
|
197
|
|
198 The locus-output file contains, for each locus in file A and file B that is
|
|
199 not consistent with the reference, an annotated set of calls for the locus.
|
|
200 The calls are annotated with the following columns:
|
|
201
|
|
202 SuperlocusId The id of the superlocus containing the locus.
|
|
203 File The variant file (A or B).
|
|
204 LocusClassification The locus classification is determined by the
|
|
205 varType column of the call that is inconsistent
|
|
206 with the reference, concatenated with a
|
|
207 modifier that describes whether the locus is
|
|
208 heterozygous, homozygous, or contains no-calls.
|
|
209 If there is no one variant in the locus (i.e.,
|
|
210 it is heterozygous alt-alt), the locus
|
|
211 classification begins with "other".
|
|
212 LocusDiffClassification The match classification for the locus. This is
|
|
213 defined to be the best of the comparison of the
|
|
214 locus to the same region in the other file, or
|
|
215 the comparison of the superlocus.
|
|
216
|
|
217 The somatic output file contains a list of putative somatic variations of
|
|
218 genome A. The output includes only those loci that can be classified as
|
|
219 snp, del, ins or sub in file A, and are called reference in the file B.
|
|
220 Every locus is annotated with the following columns:
|
|
221
|
|
222 VarCvgA The totalReadCount from file A for this locus
|
|
223 (computed on the fly if file A is not a
|
|
224 masterVar file).
|
|
225 VarScoreA The varScoreVAF from file A, or varScoreEAF if
|
|
226 the "--diploid" option is used.
|
|
227 RefCvgB The maximum of the uniqueSequenceCoverage
|
|
228 values for the locus in genome B.
|
|
229 RefScoreB Minimum of the reference scores of the locus in
|
|
230 genome B.
|
|
231 SomaticCategory The category used for determining the
|
|
232 calibrated scores and the SomaticRank.
|
|
233 VarScoreACalib The calibrated variant score of file A, under
|
|
234 the model selected by using or not using the
|
|
235 "--diploid" option, and corrected for the count
|
|
236 of heterozygous variants observed in this
|
|
237 genome. See user guide for more information.
|
|
238 VarScoreBCalib The calibrated reference score of file B, under
|
|
239 the model selected by using or not using the
|
|
240 "--diploid" option, and corrected for the count
|
|
241 of heterozygous variants observed in this
|
|
242 genome. See user guide for more information.
|
|
243 SomaticRank The estimated rank of this somatic mutation,
|
|
244 amongst all true somatic mutations within this
|
|
245 SomaticCategory. The value is a number between
|
|
246 0 and 1; a value of 0.012 means, for example,
|
|
247 that an estimated 1.2% of the true somatic
|
|
248 mutations in this somaticCategory have a
|
|
249 somaticScore less than the somaticScore for
|
|
250 this mutation. See user guide for more
|
|
251 information.
|
|
252 SomaticScore An integer that provides a total order on
|
|
253 quality for all somatic mutations. It is equal
|
|
254 to -10*log10( P(false)/P(true) ), under the
|
|
255 assumption that this genome has a rate of
|
|
256 somatic mutation equal to 1/Mb for
|
|
257 SomaticCategory snp, 1/10Mb for SomaticCategory
|
|
258 ins, 1/10Mb for SomaticCategory del, and 1/20Mb
|
|
259 for SomaticCategory sub. The computation is
|
|
260 based on the assumptions described in the user
|
|
261 guide, and is affected by choice of variant
|
|
262 model selected by using or not using the
|
|
263 "--diploid" option.
|
|
264 SomaticQuality Equal to VQHIGH for all somatic mutations where
|
|
265 SomaticScore >= -10. Otherwise, this column is
|
|
266 empty.
|
|
267
|
|
268 OPTIONS
|
|
269 -h [ --help ]
|
|
270 Print this help message.
|
|
271
|
|
272 --reference arg
|
|
273 The input crr file.
|
|
274
|
|
275 --variantsA arg
|
|
276 The "A" input variant file.
|
|
277
|
|
278 --variantsB arg
|
|
279 The "B" input variant file.
|
|
280
|
|
281 --output-prefix arg
|
|
282 The path prefix for all output reports.
|
|
283
|
|
284 --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
|
|
285 Comma-separated list of reports to generate. (Beware any reports whose
|
|
286 name begins with "Debug".) A report is one of:
|
|
287 SuperlocusOutput Report for superlocus classification.
|
|
288 SuperlocusStats Report for superlocus classification stats.
|
|
289 LocusOutput Report for locus classification.
|
|
290 LocusStats Report for locus stats.
|
|
291 VariantOutput Both variant files annotated by comparison
|
|
292 results. If the somatic output report is
|
|
293 requested, file A is also annotated with the
|
|
294 same score ranks as produced in that report.
|
|
295 SomaticOutput Report for the list of simple variations that
|
|
296 are present only in file "A", annotated with
|
|
297 the score that indicates the probability of
|
|
298 the variation being truly somatic. Requires
|
|
299 beta, genome-rootA, and genome-rootB options
|
|
300 to be provided as well. Note: generating this
|
|
301 report slows calldiff by 10x-20x.
|
|
302 DebugCallOutput Report for call classification.
|
|
303 DebugSuperlocusOutput Report for debug superlocus information.
|
|
304 DebugSomaticOutput Report for distribution estimates used for
|
|
305 somatic rescoring. Only produced if
|
|
306 SomaticOutput is also turned on.
|
|
307
|
|
308 --diploid
|
|
309 Uses varScoreEAF instead of varScoreVAF in somatic score computations.
|
|
310 Also, uses diploid variant model instead of variable allele mixture
|
|
311 model.
|
|
312
|
|
313 --locus-stats-column-count arg (=15)
|
|
314 The number of columns for locus compare classification in the locus
|
|
315 stats file.
|
|
316
|
|
317 --max-hypothesis-count arg (=32)
|
|
318 The maximum number of possible phasings to consider for a superlocus.
|
|
319
|
|
320 --no-reference-cover-validation
|
|
321 Turns off validation that all bases of a chromosome are covered by
|
|
322 calls of the variant file.
|
|
323
|
|
324 --genome-rootA arg
|
|
325 The "A" genome directory, for example /data/GS00118-DNA_A01; this
|
|
326 directory is expected to contain ASM/REF and ASM/EVIDENCE
|
|
327 subdirectories.
|
|
328
|
|
329 --genome-rootB arg
|
|
330 The "B" genome directory.
|
|
331
|
|
332 --calibration-root arg
|
|
333 The directory containing calibration data. For example, there should
|
|
334 exist a file calibration-root/0.0.0/metrics.tsv.
|
|
335
|
|
336 --beta
|
|
337 This flag enables the SomaticOutput report, which is beta
|
|
338 functionality.
|
|
339
|
|
340 SUPPORTED FORMAT_VERSION
|
|
341 0.3 or later
|
|
342 </help>
|
|
343 </tool>
|