comparison qa.xml @ 0:159422a38a42 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 2a3b068a98bf0e913dc03e0d5c2182cfd102cf27
author iuc
date Fri, 29 Jul 2022 20:24:07 +0000
parents
children cb4a5b624518
comparison
equal deleted inserted replaced
-1:000000000000 0:159422a38a42
1 <tool id="checkm_qa" name="CheckM qa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>
3 Assess bins for contamination and completeness
4 </description>
5 <macros>
6 <import>macros.xml</import>
7 </macros>
8 <expand macro="biotools"/>
9 <expand macro="requirements"/>
10 <expand macro="version"/>
11 <command detect_errors="exit_code"><![CDATA[
12 #import re
13 mkdir -p 'output/storage/' &&
14 cp '$checkm_hmm_info' 'output/storage/checkm_hmm_info.pkl.gz' &&
15 cp '$bin_stats_analyze' 'output/storage/bin_stats.analyze.tsv' &&
16 #for $i in $hmmer_analyze
17 #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
18 mkdir -p 'output/bins/${identifier}' &&
19 cp '$i' 'output/bins/${identifier}/hmmer.analyze.txt' &&
20 #end for
21 #if $output.out_format == '9'
22 #for $i in $output.genes_faa
23 #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
24 mkdir -p 'output/bins/${identifier}' &&
25 cp '$i' 'output/bins/${identifier}/genes.faa' &&
26 #end for
27 #end if
28
29 checkm qa
30 '$marker_file'
31 'output'
32 --out_format $output.out_format
33 --tab_table
34 --file 'output_file'
35 #if $exclude_markers
36 --exclude_markers '$exclude_markers'
37 #end if
38 $individual_markers
39 $skip_adj_correction
40 $skip_pseudogene_correction
41 --aai_strain $aai_strain
42 $ignore_thresholds
43 --e_value $e_value
44 --length $length
45 #if $coverage
46 --coverage_file '$coverage'
47 #end if
48 --threads \${GALAXY_SLOTS:-1}
49 ]]></command>
50 <inputs>
51 <expand macro="marker_file" />
52 <param name="checkm_hmm_info" type="data" format="zip" label="Marker gene HMM info for each bin" help="Output of the CheckM analyze tool" />
53 <param name="bin_stats_analyze" type="data" format="tabular" label="Marker gene bin stats" help="Output of the CheckM analyze tool" />
54 <param name="hmmer_analyze" type="data_collection" collection_type="list" format="txt" label="Marker gene HMM hits to each bin" help="Output of the CheckM analyze tool" />
55 <conditional name="output">
56 <param argument="--out_format" type="select" label="Desired output">
57 <option value="1">Summary of bin completeness and contamination</option>
58 <option value="2">Extended summary of bin statistics (includes GC, genome size, ...)</option>
59 <option value="3">Summary of bin quality for increasingly basal lineage-specific marker sets</option>
60 <option value="4">List of marker genes and their counts</option>
61 <option value="5">List of bin id, marker gene id, gene id</option>
62 <option value="6">List of marker genes present multiple times in a bin</option>
63 <option value="7">List of marker genes present multiple times on the same scaffold</option>
64 <option value="8">List indicating position of each marker gene within a bin</option>
65 <option value="9">Marker genes identified in each bin and their sequence</option>
66 </param>
67 <when value="1"/>
68 <when value="2"/>
69 <when value="3"/>
70 <when value="4"/>
71 <when value="5"/>
72 <when value="6"/>
73 <when value="7"/>
74 <when value="8"/>
75 <when value="9"> <!-- only format 9 needs the per-bin sequences; the command copies each element to output/bins/<identifier>/genes.faa -->
76 <param name="genes_faa" type="data_collection" collection_type="list" label="Protein gene sequences for each bin" help="Amino acid FASTA (genes.faa) for each bin. Element identifiers must match those of the HMM hits collection so that both files are placed in the same bin directory"/>
77 </when>
78 </conditional>
79 <param argument="--exclude_markers" type="data" format="txt" optional="true" label="Markers to exclude from marker sets" />
80 <expand macro="qa_params" />
81 <param name="coverage" argument="--coverage_file" type="data" format="txt" optional="true" label="Coverage of each sequence" help="Generated by the coverage command" /> <!-- name stays "coverage" because the command template reads $coverage; argument documents the real flag -->
82 <param name="extra_outputs" type="select" multiple="true" optional="true" label="Extra outputs">
83 <expand macro="qa_extra_output_options" />
84 </param>
85 </inputs>
86 <outputs>
87 <data name="output_f1" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin completeness and contamination">
88 <filter>output['out_format']=="1"</filter>
89 </data>
90 <data name="output_f2" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Extended summary of bin statistics">
91 <filter>output['out_format']=="2"</filter>
92 </data>
93 <data name="output_f3" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Summary of bin quality for increasingly basal lineage-specific marker sets">
94 <filter>output['out_format']=="3"</filter>
95 </data>
96 <data name="output_f4" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes and their counts">
97 <filter>output['out_format']=="4"</filter>
98 </data>
99 <data name="output_f5" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Bin id, marker gene id, gene id">
100 <filter>output['out_format']=="5"</filter>
101 </data>
102 <data name="output_f6" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times in a bin">
103 <filter>output['out_format']=="6"</filter>
104 </data>
105 <data name="output_f7" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes present multiple times on the same scaffold">
106 <filter>output['out_format']=="7"</filter>
107 </data>
108 <data name="output_f8" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Indicating position of each marker gene within a bin">
109 <filter>output['out_format']=="8"</filter>
110 </data>
111 <data name="output_f9" format="tabular" from_work_dir="output_file" label="${tool.name} on ${on_string}: Marker genes identified in each bin and their sequence">
112 <filter>output['out_format']=="9"</filter>
113 </data>
114 <data name="bin_stats_ext" format="tabular" from_work_dir="output/storage/bin_stats_ext.tsv" label="${tool.name} on ${on_string}: Marker gene bin extensive stats"/>
115 <expand macro="qa_extra_outputs" />
116 </outputs>
117 <tests>
118 <test expect_num_outputs="3">
119 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
120 <param name="hmmer_analyze">
121 <collection type="list">
122 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
123 </collection>
124 </param>
125 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
126 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
127 <conditional name="output">
128 <param name="out_format" value="1"/>
129 </conditional>
130 <param name="individual_markers" value="false"/>
131 <param name="skip_adj_correction" value="false"/>
132 <param name="skip_pseudogene_correction" value="false"/>
133 <param name="aai_strain" value="0.9"/>
134 <param name="ignore_thresholds" value="false"/>
135 <param name="e_value" value="1e-10"/>
136 <param name="length" value="0.7"/>
137 <param name="extra_outputs" value="marker_gene_stats"/>
138 <output name="output_f1" ftype="tabular">
139 <assert_contents>
140 <has_text text="Marker lineage"/>
141 <has_text text="637000110"/>
142 <has_text text="f__Enterobacteriaceae"/>
143 </assert_contents>
144 </output>
145 <output name="bin_stats_ext" ftype="tabular">
146 <assert_contents>
147 <has_text text="637000110"/>
148 <has_text text="marker lineage"/>
149 <has_text text="GCN0"/>
150 <has_text text="Longest contig"/>
151 </assert_contents>
152 </output>
153 <!--<output name="alignment_file" ftype="tabular">
154 <assert_contents>
155 <has_text text="637000110"/>
156 <has_text text="Lineage Marker File"/>
157 <has_text text="UID5139"/>
158 </assert_contents>
159 </output>-->
160 <output name="marker_gene_stats" ftype="tabular">
161 <assert_contents>
162 <has_text text="637000110"/>
163 <has_text text="AC_000091_751"/>
164 <has_text text="TIGR02432"/>
165 </assert_contents>
166 </output>
167 </test>
168 <test expect_num_outputs="2">
169 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
170 <param name="hmmer_analyze">
171 <collection type="list">
172 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
173 </collection>
174 </param>
175 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
176 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
177 <conditional name="output">
178 <param name="out_format" value="2"/>
179 </conditional>
180 <param name="individual_markers" value="false"/>
181 <param name="skip_adj_correction" value="false"/>
182 <param name="skip_pseudogene_correction" value="false"/>
183 <param name="aai_strain" value="0.9"/>
184 <param name="ignore_thresholds" value="false"/>
185 <param name="e_value" value="1e-10"/>
186 <param name="length" value="0.7"/>
187 <param name="extra_outputs" value=""/>
188 <output name="output_f2" ftype="tabular">
189 <assert_contents>
190 <has_text text="Marker lineage"/>
191 <has_text text="Mean scaffold length"/>
192 <has_text text="f__Enterobacteriaceae"/>
193 </assert_contents>
194 </output>
195 <output name="bin_stats_ext" ftype="tabular">
196 <assert_contents>
197 <has_text text="637000110"/>
198 <has_text text="marker lineage"/>
199 <has_text text="GCN0"/>
200 <has_text text="Longest contig"/>
201 </assert_contents>
202 </output>
203 </test>
204 <test expect_num_outputs="2">
205 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
206 <param name="hmmer_analyze">
207 <collection type="list">
208 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
209 </collection>
210 </param>
211 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
212 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
213 <conditional name="output">
214 <param name="out_format" value="3"/>
215 </conditional>
216 <param name="individual_markers" value="false"/>
217 <param name="skip_adj_correction" value="false"/>
218 <param name="skip_pseudogene_correction" value="false"/>
219 <param name="aai_strain" value="0.9"/>
220 <param name="ignore_thresholds" value="false"/>
221 <param name="e_value" value="1e-10"/>
222 <param name="length" value="0.7"/>
223 <param name="extra_outputs" value=""/>
224 <output name="output_f3" ftype="tabular">
225 <assert_contents>
226 <has_text text="637000110"/>
227 <has_text text="Strain heterogeneity"/>
228 <has_text text="UID5139"/>
229 <has_text text="p__Proteobacteria"/>
230 </assert_contents>
231 </output>
232 <output name="bin_stats_ext" ftype="tabular">
233 <assert_contents>
234 <has_text text="637000110"/>
235 <has_text text="marker lineage"/>
236 <has_text text="GCN0"/>
237 <has_text text="Longest contig"/>
238 </assert_contents>
239 </output>
240 </test>
241 <test expect_num_outputs="2">
242 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
243 <param name="hmmer_analyze">
244 <collection type="list">
245 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
246 </collection>
247 </param>
248 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
249 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
250 <conditional name="output">
251 <param name="out_format" value="4"/>
252 </conditional>
253 <param name="individual_markers" value="false"/>
254 <param name="skip_adj_correction" value="false"/>
255 <param name="skip_pseudogene_correction" value="false"/>
256 <param name="aai_strain" value="0.9"/>
257 <param name="ignore_thresholds" value="false"/>
258 <param name="e_value" value="1e-10"/>
259 <param name="length" value="0.7"/>
260 <param name="extra_outputs" value=""/>
261 <output name="output_f4" ftype="tabular">
262 <assert_contents>
263 <has_text text="637000110"/>
264 <has_text text="Node Id: UID5103; Marker lineage: f__Enterobacteriaceae"/>
265 <has_text text="PF02542.1"/>
266 </assert_contents>
267 </output>
268 <output name="bin_stats_ext" ftype="tabular">
269 <assert_contents>
270 <has_text text="637000110"/>
271 <has_text text="marker lineage"/>
272 <has_text text="GCN0"/>
273 <has_text text="Longest contig"/>
274 </assert_contents>
275 </output>
276 </test>
277 <test expect_num_outputs="2">
278 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
279 <param name="hmmer_analyze">
280 <collection type="list">
281 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
282 </collection>
283 </param>
284 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
285 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
286 <conditional name="output">
287 <param name="out_format" value="5"/>
288 </conditional>
289 <param name="individual_markers" value="false"/>
290 <param name="skip_adj_correction" value="false"/>
291 <param name="skip_pseudogene_correction" value="false"/>
292 <param name="aai_strain" value="0.9"/>
293 <param name="ignore_thresholds" value="false"/>
294 <param name="e_value" value="1e-10"/>
295 <param name="length" value="0.7"/>
296 <param name="extra_outputs" value=""/>
297 <output name="output_f5" ftype="tabular">
298 <assert_contents>
299 <has_text text="637000110"/>
300 <has_text text="TIGR02432"/>
301 <has_text text="AC_000091_165"/>
302 </assert_contents>
303 </output>
304 <output name="bin_stats_ext" ftype="tabular">
305 <assert_contents>
306 <has_text text="637000110"/>
307 <has_text text="marker lineage"/>
308 <has_text text="GCN0"/>
309 <has_text text="Longest contig"/>
310 </assert_contents>
311 </output>
312 </test>
313 <test expect_num_outputs="2">
314 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
315 <param name="hmmer_analyze">
316 <collection type="list">
317 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
318 </collection>
319 </param>
320 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
321 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
322 <conditional name="output">
323 <param name="out_format" value="6"/>
324 </conditional>
325 <param name="individual_markers" value="false"/>
326 <param name="skip_adj_correction" value="false"/>
327 <param name="skip_pseudogene_correction" value="false"/>
328 <param name="aai_strain" value="0.9"/>
329 <param name="ignore_thresholds" value="false"/>
330 <param name="e_value" value="1e-10"/>
331 <param name="length" value="0.7"/>
332 <param name="extra_outputs" value=""/>
333 <output name="output_f6" ftype="tabular">
334 <assert_contents>
335 <has_text text="Marker Id"/>
336 <has_text text="No marker genes satisfied"/>
337 </assert_contents>
338 </output>
339 <output name="bin_stats_ext" ftype="tabular">
340 <assert_contents>
341 <has_text text="637000110"/>
342 <has_text text="marker lineage"/>
343 <has_text text="GCN0"/>
344 <has_text text="Longest contig"/>
345 </assert_contents>
346 </output>
347 </test>
348 <test expect_num_outputs="2">
349 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
350 <param name="hmmer_analyze">
351 <collection type="list">
352 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
353 </collection>
354 </param>
355 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
356 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
357 <conditional name="output">
358 <param name="out_format" value="7"/>
359 </conditional>
360 <param name="individual_markers" value="false"/>
361 <param name="skip_adj_correction" value="false"/>
362 <param name="skip_pseudogene_correction" value="false"/>
363 <param name="aai_strain" value="0.9"/>
364 <param name="ignore_thresholds" value="false"/>
365 <param name="e_value" value="1e-10"/>
366 <param name="length" value="0.7"/>
367 <param name="extra_outputs" value=""/>
368 <output name="output_f7" ftype="tabular">
369 <assert_contents>
370 <has_text text="Marker Id"/>
371 <has_text text="No marker genes satisfied"/>
372 </assert_contents>
373 </output>
374 <output name="bin_stats_ext" ftype="tabular">
375 <assert_contents>
376 <has_text text="637000110"/>
377 <has_text text="marker lineage"/>
378 <has_text text="GCN0"/>
379 <has_text text="Longest contig"/>
380 </assert_contents>
381 </output>
382 </test>
383 <test expect_num_outputs="2">
384 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
385 <param name="hmmer_analyze">
386 <collection type="list">
387 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
388 </collection>
389 </param>
390 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
391 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
392 <conditional name="output">
393 <param name="out_format" value="8"/>
394 </conditional>
395 <param name="individual_markers" value="false"/>
396 <param name="skip_adj_correction" value="false"/>
397 <param name="skip_pseudogene_correction" value="false"/>
398 <param name="aai_strain" value="0.9"/>
399 <param name="ignore_thresholds" value="false"/>
400 <param name="e_value" value="1e-10"/>
401 <param name="length" value="0.7"/>
402 <param name="extra_outputs" value=""/>
403 <output name="output_f8" ftype="tabular">
404 <assert_contents>
405 <has_text text="637000110"/>
406 <has_text text="AC_000091_183"/>
407 <has_text text="TIGR02075,9,240"/>
408 </assert_contents>
409 </output>
410 <output name="bin_stats_ext" ftype="tabular">
411 <assert_contents>
412 <has_text text="637000110"/>
413 <has_text text="marker lineage"/>
414 <has_text text="GCN0"/>
415 <has_text text="Longest contig"/>
416 </assert_contents>
417 </output>
418 </test>
419 <test expect_num_outputs="2">
420 <param name="marker_file" ftype="tabular" value="lineage_marker_set"/>
421 <param name="hmmer_analyze">
422 <collection type="list">
423 <element name="637000110" ftype="txt" value="hmmer.analyze.txt"/>
424 </collection>
425 </param>
426 <param name="bin_stats_analyze" ftype="tabular" value="bin_stats.analyze.tsv"/>
427 <param name="checkm_hmm_info" ftype="zip" value="checkm_hmm_info.pkl.gz"/>
428 <conditional name="output">
429 <param name="out_format" value="9"/>
430 <param name="genes_faa">
431 <collection type="list">
432 <element name="637000110" ftype="fasta" value="637000110.faa"/>
433 </collection>
434 </param>
435 </conditional>
436 <param name="exclude_markers" ftype="txt" value="markers_to_exclude" />
437 <param name="individual_markers" value="false"/>
438 <param name="skip_adj_correction" value="false"/>
439 <param name="skip_pseudogene_correction" value="false"/>
440 <param name="aai_strain" value="0.9"/>
441 <param name="ignore_thresholds" value="false"/>
442 <param name="e_value" value="1e-10"/>
443 <param name="length" value="0.7"/>
444 <param name="extra_outputs" value=""/>
445 <output name="output_f9" ftype="tabular">
446 <assert_contents>
447 <has_text text="637000110"/>
448 <has_text text="Sequence"/>
449 <has_text text="PF06574.7"/>
450 <has_text text="MKLIRGI"/>
451 </assert_contents>
452 </output>
453 <output name="bin_stats_ext" ftype="tabular">
454 <assert_contents>
455 <has_text text="637000110"/>
456 <has_text text="marker lineage"/>
457 <has_text text="GCN0"/>
458 <has_text text="Longest contig"/>
459 </assert_contents>
460 </output>
461 </test>
462 </tests>
463 <help><![CDATA[
464 @HELP_HEADER@
465
466 This command identifies marker genes in bins and calculates genome statistics
467
468 Adjacent called genes matching the same marker gene may indicate a true duplication event, a gene calling error, or an assembly error. If adjacent genes hit distinct regions of the same marker gene HMM, CheckM assumes a gene calling error has occurred and concatenates the two genes. When this occurs, CheckM concatenates the gene ids of the two genes with a pair of ampersands (&&).
469
470 Outputs
471 =======
472 The content of the main output depends on the selected output format
473
474 1. Summary of bin completeness, contamination, and strain heterogeneity
475 Bin Id: bin identifier derived from input FASTA file
476 Marker lineage: indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command)
477 No. genomes: number of reference genomes used to infer marker set
478 No. markers: number of inferred marker genes
479 No. marker sets: number of inferred co-located marker sets
480 0-5+: number of times each marker gene is identified
481 Completeness: estimated completeness
482 Contamination: estimated contamination
483 Strain heterogeneity: estimated strain heterogeneity
484 2. Extended summary of bin quality (includes GC, genome size, coding density, ...)
485 3. Summary of bin quality for increasingly basal lineage-specific marker sets
486 Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
487 4. List of marker genes for each bin along with the number of times each marker was identified
488 Node Id: unique id of internal node in genome tree from which lineage-specific markers were inferred
489 Marker lineage: indicates lineage used for inferring marker set
490 Useful for identifying lineage-specific gene loss or duplication
491 5. List of bin id, marker gene id, and called gene id for each identified marker gene
492 6. List of marker genes present multiple times in a bin
493 7. List of marker genes present multiple times on the same scaffold
494 Useful for identifying true gene duplication events, gene calling errors, or assembly errors. See the note on adjacent called genes above.
495 8. List indicating the position of each marker gene within a bin
496 9. Marker genes identified in each bin and their sequence
497
498 ]]></help>
499 <expand macro="citations"/>
500 </tool>