comparison cgatools_suite/tools/cgatools/calldiff.xml @ 7:96829b1b73ea draft

Uploaded
author bcrain-completegenomics
date Wed, 06 Jun 2012 16:58:26 -0400
parents
children
comparison
equal deleted inserted replaced
6:e4eff539a999 7:96829b1b73ea
1 <tool id="cga_calldiff" name="calldiff(beta)" version="0.0.1">
2
3 <description>compares two Complete Genomics variant files.</description> <!-- one-line summary of the tool; adds description in toolbar -->
4
5 <requirements> <!-- external software this tool depends on -->
6 <requirement type="binary">cgatools</requirement> <!-- the cgatools executable that the command section invokes -->
7 </requirements>
8
9 <command> <!--run executable: template expanded into the cgatools calldiff command line; path values are single-quoted so a path containing spaces stays one shell argument; the selected report names (unselected ones expand to nothing) are joined with commas by tr, which unlike the earlier sed expression cannot match the empty string and so never inserts commas between characters-->
10 cgatools calldiff --beta
11 --reference '${crr.fields.path}'
12 --variantsA '$data_sources.inputA'
13 --variantsB '$data_sources.inputB'
14 $validation
15 $diploid
16 --locus-stats-column-count $column
17 --max-hypothesis-count $hypothesis
18 --output-prefix cg_
19 --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | tr -s ' ' ','`
20 #if $somatic.report6 == "SomaticOutput"
21 --genome-rootA '$somatic.genomeA'
22 --genome-rootB '$somatic.genomeB'
23 --calibration-root '$somatic.calibration'
24 #end if
25 </command>
26
27 <outputs> <!-- one tabular dataset per report; each file is collected from the job working directory, where calldiff writes it under the cg_ output prefix -->
28 <data name="output1" format="tabular" label="${tool.name} on ${on_string}: SuperlocusOutput" from_work_dir="cg_SuperlocusOutput.tsv">
29 <filter>(report1 == 'SuperlocusOutput')</filter> <!-- only created when the SuperlocusOutput report was requested -->
30 </data>
31 <data name="output2" format="tabular" label="${tool.name} on ${on_string}: SuperlocusStats" from_work_dir="cg_SuperlocusStats.tsv">
32 <filter>(report2 == 'SuperlocusStats')</filter> <!-- only created when the SuperlocusStats report was requested -->
33 </data>
34 <data name="output3" format="tabular" label="${tool.name} on ${on_string}: LocusOutput" from_work_dir="cg_LocusOutput.tsv">
35 <filter>(report3 == 'LocusOutput')</filter> <!-- only created when the LocusOutput report was requested -->
36 </data>
37 <data name="output4" format="tabular" label="${tool.name} on ${on_string}: LocusStats" from_work_dir="cg_LocusStats.tsv">
38 <filter>(report4 == 'LocusStats')</filter> <!-- only created when the LocusStats report was requested -->
39 </data>
40 <data name="output5a" format="tabular" label="${tool.name} on ${on_string}: VariantsA" from_work_dir="cg_VariantsA.tsv"> <!-- the VariantOutput report yields two files, one per input -->
41 <filter>(report5 == 'VariantOutput')</filter>
42 </data>
43 <data name="output5b" format="tabular" label="${tool.name} on ${on_string}: VariantsB" from_work_dir="cg_VariantsB.tsv">
44 <filter>(report5 == 'VariantOutput')</filter>
45 </data>
46 <data name="output6" format="tabular" label="${tool.name} on ${on_string}: SomaticOutput" from_work_dir="cg_SomaticOutput.tsv">
47 <filter>(somatic['report6'] == 'SomaticOutput')</filter> <!-- report6 lives inside the somatic conditional, hence the keyed lookup -->
48 </data>
49 </outputs>
50
51 <inputs> <!-- form fields; their values are substituted into the command section by name -->
52 <!--form field to select crr file-->
53 <param name="crr" type="select" label="Genome build">
54 <options from_data_table="cg_crr_files" /> <!-- choices come from the cg_crr_files data table; the command reads the path field of the selected row -->
55 </param>
56
57 <!--conditional to select variant file input-->
58 <conditional name="data_sources">
59 <param name="data_source" type="select" label="Where are the input varfiles?">
60 <option value="in" selected="true">imported into Galaxy</option>
61 <option value="out">located outside Galaxy (available only for local Galaxy instances)</option>
62 </param>
63 <when value="in">
64 <!--form field to select variant files-->
65 <param name="inputA" type="data" format="cg_var" label="Dataset A">
66 <validator type="unspecified_build" />
67 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
68 metadata_name="dbkey" metadata_column="1"
69 message="cgatools is not currently available for this build."/>
70 </param>
71 <param name="inputB" type="data" format="cg_var" label="Dataset B">
72 <validator type="unspecified_build" />
73 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
74 metadata_name="dbkey" metadata_column="1"
75 message="cgatools is not currently available for this build."/>
76 </param>
77 </when>
78 <when value="out">
79 <!--form fields to type in the paths of the two variant files; same parameter names as the "in" case so the command section works for both-->
80 <param name="inputA" type="text" label="Variant file A (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/>
81 <param name="inputB" type="text" label="Variant file B (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/>
82 </when>
83 </conditional>
84
85 <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model."> <!-- the option value is placed on the command line as is: empty, or the diploid flag -->
86 <option value="">no</option>
87 <option value="--diploid">yes</option>
88 </param>
89
90 <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/>
91
92 <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/>
93
94 <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file."> <!-- the option value is placed on the command line as is: empty, or the flag that switches validation off -->
95 <option value="">on</option>
96 <option value="--no-reference-cover-validation">off</option>
97 </param>
98
99 <param name="report1" type="select" label="Report SuperlocusOutput"> <!-- report1 to report5: each value is either empty or the report name, which is both passed to the reports option and tested by the output filters -->
100 <option value="">no</option>
101 <option value="SuperlocusOutput">yes</option>
102 </param>
103 <param name="report2" type="select" label="Report SuperlocusStats">
104 <option value="">no</option>
105 <option value="SuperlocusStats">yes</option>
106 </param>
107 <param name="report3" type="select" label="Report LocusOutput">
108 <option value="">no</option>
109 <option value="LocusOutput">yes</option>
110 </param>
111 <param name="report4" type="select" label="Report LocusStats">
112 <option value="">no</option>
113 <option value="LocusStats">yes</option>
114 </param>
115 <param name="report5" type="select" label="Report VariantOutput" help="Both variant files annotated by comparison results. If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report.">
116 <option value="">no</option>
117 <option value="VariantOutput">yes</option>
118 </param>
119
120 <conditional name="somatic"> <!-- the somatic report needs three extra directories, asked for only when the report is switched on -->
121 <param name="report6" type="select" label="Report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x.">
122 <option value="">no</option>
123 <option value="SomaticOutput">yes</option>
124 </param> <when value="" /> <!-- "no" selected: no further fields; declared explicitly so every option has a matching when -->
125 <when value="SomaticOutput">
126 <param name="genomeA" type="text" size="300" label="Directory for genome A (path/dir)" help="The 'A' genome directory, for example /data/GS00118-DNA_A01; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
127 <param name="genomeB" type="text" size="300" label="Directory for genome B (path/dir)" help="The 'B' genome directory"/>
128 <param name="calibration" type="text" size="300" label="Directory calibration data (path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/>
129 </when>
130 </conditional>
131
132 </inputs>
133
134 <help>
135
136 **What it does**
137
138 This tool compares two Complete Genomics variant files.
139
140 cgatools: http://sourceforge.net/projects/cgatools/files/
141
142 -----
143
144 **cgatools Manual**::
145
146 COMMAND NAME
147 calldiff - Compares two Complete Genomics variant files.
148
149 DESCRIPTION
150 Compares two Complete Genomics variant files. Divides the genome up into
151 superloci of nearby variants, then compares the superloci. Also refines the
152 comparison to determine per-call or per-locus comparison results.
153
154 Comparison results are usually described by a semi-colon separated string,
155 one per allele. Each allele's comparison result is one of the following
156 classifications:
157
158 ref-identical The alleles of the two variant files are identical, and
159 they are consistent with the reference.
160 alt-identical The alleles of the two variant files are identical, and
161 they are inconsistent with the reference.
162 ref-consistent The alleles of the two variant files are consistent,
163 and they are consistent with the reference.
164 alt-consistent The alleles of the two variant files are consistent,
165 and they are inconsistent with the reference.
166 onlyA The alleles of the two variant files are inconsistent,
167 and only file A is inconsistent with the reference.
168 onlyB The alleles of the two variant files are inconsistent,
169 and only file B is inconsistent with the reference.
170 mismatch The alleles of the two variant files are inconsistent,
171 and they are both inconsistent with the reference.
172 phase-mismatch The two variant files would be consistent if the
173 hapLink field had been empty, but they are
174 inconsistent.
175 ploidy-mismatch The superlocus did not have uniform ploidy.
176
177 In some contexts, this classification is rolled up into a simplified
178 classification, which is one of "identical", "consistent", "onlyA",
179 "onlyB", or "mismatch".
180
181 A good place to start looking at the results is the superlocus-output file.
182 It has columns defined as follows:
183
184 SuperlocusId An identifier given to the superlocus.
185 Chromosome The name of the chromosome.
186 Begin The 0-based offset of the start of the superlocus.
187 End The 0-based offset of the base one past the end of the
188 superlocus.
189 Classification The match classification of the superlocus.
190 Reference The reference sequence.
191 AllelesA A semicolon-separated list of the alleles (one per
192 haplotype) for variant file A, for the phasing with the
193 best comparison result.
194 AllelesB A semicolon-separated list of the alleles (one per
195 haplotype) for variant file B, for the phasing with the
196 best comparison result.
197
198 The locus-output file contains, for each locus in file A and file B that is
199 not consistent with the reference, an annotated set of calls for the locus.
200 The calls are annotated with the following columns:
201
202 SuperlocusId The id of the superlocus containing the locus.
203 File The variant file (A or B).
204 LocusClassification The locus classification is determined by the
205 varType column of the call that is inconsistent
206 with the reference, concatenated with a
207 modifier that describes whether the locus is
208 heterozygous, homozygous, or contains no-calls.
209 If there is no one variant in the locus (i.e.,
210 it is heterozygous alt-alt), the locus
211 classification begins with "other".
212 LocusDiffClassification The match classification for the locus. This is
213 defined to be the best of the comparison of the
214 locus to the same region in the other file, or
215 the comparison of the superlocus.
216
217 The somatic output file contains a list of putative somatic variations of
218 genome A. The output includes only those loci that can be classified as
219 snp, del, ins or sub in file A, and are called reference in the file B.
220 Every locus is annotated with the following columns:
221
222 VarCvgA The totalReadCount from file A for this locus
223 (computed on the fly if file A is not a
224 masterVar file).
225 VarScoreA The varScoreVAF from file A, or varScoreEAF if
226 the "--diploid" option is used.
227 RefCvgB The maximum of the uniqueSequenceCoverage
228 values for the locus in genome B.
229 RefScoreB Minimum of the reference scores of the locus in
230 genome B.
231 SomaticCategory The category used for determining the
232 calibrated scores and the SomaticRank.
233 VarScoreACalib The calibrated variant score of file A, under
234 the model selected by using or not using the
235 "--diploid" option, and corrected for the count
236 of heterozygous variants observed in this
237 genome. See user guide for more information.
238 VarScoreBCalib The calibrated reference score of file B, under
239 the model selected by using or not using the
240 "--diploid" option, and corrected for the count
241 of heterozygous variants observed in this
242 genome. See user guide for more information.
243 SomaticRank The estimated rank of this somatic mutation,
244 amongst all true somatic mutations within this
245 SomaticCategory. The value is a number between
246 0 and 1; a value of 0.012 means, for example,
247 that an estimated 1.2% of the true somatic
248 mutations in this SomaticCategory have a
249 SomaticScore less than the SomaticScore for
250 this mutation. See user guide for more
251 information.
252 SomaticScore An integer that provides a total order on
253 quality for all somatic mutations. It is equal
254 to -10*log10( P(false)/P(true) ), under the
255 assumption that this genome has a rate of
256 somatic mutation equal to 1/Mb for
257 SomaticCategory snp, 1/10Mb for SomaticCategory
258 ins, 1/10Mb for SomaticCategory del, and 1/20Mb
259 for SomaticCategory sub. The computation is
260 based on the assumptions described in the user
261 guide, and is affected by choice of variant
262 model selected by using or not using the
263 "--diploid" option.
264 SomaticQuality Equal to VQHIGH for all somatic mutations where
265 SomaticScore &gt;= -10. Otherwise, this column is
266 empty.
267
268 OPTIONS
269 -h [ --help ]
270 Print this help message.
271
272 --reference arg
273 The input crr file.
274
275 --variantsA arg
276 The "A" input variant file.
277
278 --variantsB arg
279 The "B" input variant file.
280
281 --output-prefix arg
282 The path prefix for all output reports.
283
284 --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
285 Comma-separated list of reports to generate. (Beware any reports whose
286 name begins with "Debug".) A report is one of:
287 SuperlocusOutput Report for superlocus classification.
288 SuperlocusStats Report for superlocus classification stats.
289 LocusOutput Report for locus classification.
290 LocusStats Report for locus stats.
291 VariantOutput Both variant files annotated by comparison
292 results. If the somatic output report is
293 requested, file A is also annotated with the
294 same score ranks as produced in that report.
295 SomaticOutput Report for the list of simple variations that
296 are present only in file "A", annotated with
297 the score that indicates the probability of
298 the variation being truly somatic. Requires
299 beta, genome-rootA, and genome-rootB options
300 to be provided as well. Note: generating this
301 report slows calldiff by 10x-20x.
302 DebugCallOutput Report for call classification.
303 DebugSuperlocusOutput Report for debug superlocus information.
304 DebugSomaticOutput Report for distribution estimates used for
305 somatic rescoring. Only produced if
306 SomaticOutput is also turned on.
307
308 --diploid
309 Uses varScoreEAF instead of varScoreVAF in somatic score computations.
310 Also, uses diploid variant model instead of variable allele mixture
311 model.
312
313 --locus-stats-column-count arg (=15)
314 The number of columns for locus compare classification in the locus
315 stats file.
316
317 --max-hypothesis-count arg (=32)
318 The maximum number of possible phasings to consider for a superlocus.
319
320 --no-reference-cover-validation
321 Turns off validation that all bases of a chromosome are covered by
322 calls of the variant file.
323
324 --genome-rootA arg
325 The "A" genome directory, for example /data/GS00118-DNA_A01; this
326 directory is expected to contain ASM/REF and ASM/EVIDENCE
327 subdirectories.
328
329 --genome-rootB arg
330 The "B" genome directory.
331
332 --calibration-root arg
333 The directory containing calibration data. For example, there should
334 exist a file calibration-root/0.0.0/metrics.tsv.
335
336 --beta
337 This flag enables the SomaticOutput report, which is beta
338 functionality.
339
340 SUPPORTED FORMAT_VERSION
341 0.3 or later
342 </help>
343 </tool>