comparison craq.xml @ 0:7895d95c01fa draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/craq commit fe19727db664bcad91b66546149cbf34a6a012e7
author iuc
date Wed, 18 Mar 2026 13:17:03 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7895d95c01fa
1 <tool id="craq" name="CRAQ" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>assess the accuracy of assembled genomic sequences</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements"/>
7 <command detect_errors="exit_code"><![CDATA[
8 ## Stage the inputs under fixed names in the job working directory, then build the craq command line
9 ## Craq expects sorted and indexed bam files; BAM inputs are only indexed below, never sorted
10 ## Fail fast with a clear message: at least one of SMS or NGS input must be provided
11 #if not $sms_input and not $ngs_input:
12 echo 'At least one of SMS or NGS input must be provided' >&2 && exit 1 &&
13 #end if
14 cp '$genome' genome.fa &&
15 ## Prepare SMS input: if the first dataset is a BAM only that one is used, otherwise every dataset is staged as sms_INDEX.EXT
16 #if $sms_input:
17 #if $sms_input[0].is_of_type('bam'):
18 cp '${sms_input[0]}' 'sms_sorted.bam' &&
19 samtools index 'sms_sorted.bam' &&
20 #else:
21 #for $i, $f in enumerate($sms_input):
22 cp '$f' 'sms_${i}.${f.ext}' &&
23 #end for
24 #end if
25 #end if
26
27 ## Prepare NGS input: same staging rules as for SMS, using the ngs_ prefix
28 #if $ngs_input:
29 #if $ngs_input[0].is_of_type('bam'):
30 cp '${ngs_input[0]}' 'ngs_sorted.bam' &&
31 samtools index 'ngs_sorted.bam' &&
32 #else:
33 #for $i, $f in enumerate($ngs_input):
34 cp '$f' 'ngs_${i}.${f.ext}' &&
35 #end for
36 #end if
37 #end if
38
39 ## Build tool command line
40 craq
41 -g genome.fa
42 ## SMS input: the staged BAM, or the staged read files joined into one comma-separated list
43 #if $sms_input:
44 -sms
45 #if $sms_input[0].is_of_type('bam'):
46 'sms_sorted.bam'
47 #else:
48 #set $sms_files = ','.join(['sms_%d.%s' % ($i, $f.ext) for $i, $f in enumerate($sms_input)])
49 $sms_files
50 #end if
51 #end if
52 ## NGS input
53 #if $ngs_input:
54 -ngs
55 #if $ngs_input[0].is_of_type('bam'):
56 'ngs_sorted.bam'
57 #else:
58 #set $ngs_files = ','.join(['ngs_%d.%s' % ($i, $f.ext) for $i, $f in enumerate($ngs_input)])
59 $ngs_files
60 #end if
61 #end if
62
63 ## Filter parameters
64 -sn $filter_params.sn
65 -sf $filter_params.sf
66 -ln $filter_params.ln
67 -lf $filter_params.lf
68 -hmin $filter_params.hmin
69 -hmax $filter_params.hmax
70 -mgs $filter_params.mgs
71 --sms_coverage $filter_params.sms_coverage
72 --ngs_coverage $filter_params.ngs_coverage
73
74 ## Other parameters
75 $other_params.ser
76 $other_params.snv
77 --gapmodel $other_params.gapmodel
78 $other_params.break
79 --map $other_params.map
80 --mapq $other_params.mapq
81 --norm_window $other_params.norm_window
82 --regional_window $other_params.regional_window
83 $other_params.plot
84 #if $other_params.plot_ids:
85 --plot_ids '$other_params.plot_ids'
86 #end if
87 --thread "\${GALAXY_SLOTS:-8}"
88 -D outputs
89 ]]></command>
90 <inputs> <!-- genome is mandatory; both read inputs are optional and each accepts a BAM or FASTQ files -->
91 <param name="genome" type="data" label="Assembly sequence file" format="fasta" help="The genome assembly to be evaluated in FASTA format"/>
92 <param name="sms_input" type="data" optional="true" multiple="true" label="SMS long-read alignment or sequences" format="bam,fastq,fastq.gz" help="Provide either a single BAM file OR multiple FASTQ files (uncompressed or gzipped). At least one of SMS or NGS input must be provided"/>
93 <param name="ngs_input" type="data" optional="true" multiple="true" label="NGS short-read alignment or sequences" format="bam,fastq,fastq.gz" help="Provide either a single BAM file OR multiple FASTQ files (uncompressed or gzipped). At least one of SMS or NGS input must be provided"/>
94 <section name="filter_params" title="Filter Parameters" expanded="false">
95 <param argument="-sn" type="integer" min="0" value="2" label="Minimum number of NGS clipped-reads" help="Minimum number of NGS reads that must show clipping to flag potential errors"/>
96 <param argument="-sf" type="float" min="0.0" max="1.0" value="0.75" label="Minimum proportion of NGS clipped-reads" help="Minimum proportion of NGS reads that must show clipping relative to total coverage"/>
97 <param argument="-ln" type="integer" min="0" value="2" label="Minimum number of SMS clipped-reads" help="Minimum number of SMS long reads that must show clipping to flag potential errors"/>
98 <param argument="-lf" type="float" min="0.0" max="1.0" value="0.75" label="Minimum proportion of SMS clipped-reads" help="Minimum proportion of SMS reads that must show clipping relative to total coverage"/>
99 <param argument="-hmin" type="float" min="0.0" max="1.0" value="0.4" label="Lower clipping rate for heterozygous allele" help="Lower clipping rate threshold to identify heterozygous variants (CRHs)"/>
100 <param argument="-hmax" type="float" min="0.0" max="1.0" value="0.6" label="Upper clipping rate for heterozygous allele" help="Upper clipping rate threshold to identify heterozygous variants (CRHs)"/>
101 <param argument="-mgs" type="integer" min="1" value="10" label="Minimum gap size (bp)" help="Gap[N] sequences longer than this threshold will be treated as breakage"/>
102 <param argument="--sms_coverage" type="integer" min="0" value="100" label="Average SMS coverage" help="Expected average SMS long-read coverage depth for normalization"/>
103 <param argument="--ngs_coverage" type="integer" min="0" value="100" label="Average NGS coverage" help="Expected average NGS short-read coverage depth for normalization"/>
104 </section>
105 <section name="other_params" title="Other Parameters" expanded="false">
106 <param argument="-ser" type="boolean" checked="true" truevalue="-ser T" falsevalue="-ser F" label="Search error regions near breakpoints" help="Search noisy error regions near CRE/CSE breakpoints"/> <!-- checked by default, so the unchecked state sends an explicit F instead of leaving the choice to the tool default -->
107 <param argument="-snv" type="boolean" checked="false" truevalue="-snv T" falsevalue="" label="Report SNV/heterozygous variants" help="Report tiny indel errors or heterozygous variants under 40bp.(Resource intensive)"/>
108 <param argument="--gapmodel" type="select" label="Gap model" help="Gap[N] treatment">
109 <option value="1" selected="true">CRE (regional error)</option>
110 <option value="2">CSE (structural error)</option>
111 </param>
112 <param argument="--break" type="boolean" checked="false" truevalue="--break T" falsevalue="" label="Break chimeric fragments" help="Detect and break chimeric contigs at conflict breakpoints"/>
113 <param argument="--map" type="select" label="Mapping preset" help="Ignored if .bam provided">
114 <option value="map-hifi" selected="true">PacBio HiFi</option>
115 <option value="map-pb">PacBio CLR</option>
116 <option value="map-ont">Nanopore</option>
117 </param>
118 <param argument="--mapq" type="integer" min="0" max="60" value="20" label="Minimum mapping quality" help="Minimum read mapping quality threshold"/>
119 <param argument="--norm_window" type="float" min="0.0" max="1.0" value="0.0001" label="Normalization window fraction" help="Fraction of the total assembly length used as the window size for normalizing error counts"/>
120 <param argument="--regional_window" type="integer" min="1" value="500000" label="Regional quality window size (bp)" help="Window size in base pairs for regional quality benchmarking across the assembly"/>
121 <param argument="--plot" type="boolean" checked="false" truevalue="--plot T" falsevalue="" label="Generate plots" help="Create CRAQ visualization plots"/>
122 <param argument="--plot_ids" type="data" format="tabular,txt" optional="true" label="Selected assembly IDs for plotting" help="File listing specific assembly IDs to plot (default: all IDs)"/>
123 <param name="advanced_output" type="boolean" checked="false" label="Output advanced error region files" help="Output detailed CRE/CRH and CSE/CSH BED files for regional and structural error regions"/> <!-- never placed on the craq command line; it only gates the regional_errors and structural_errors output collections -->
124 </section>
125 </inputs>
126 <outputs> <!-- every collection is discovered from a sub-directory of the directory passed to craq via -D outputs -->
127 <collection type="list" name="runAQI_out" label="${tool.name} on ${on_string}: AQI Results"> <!-- unfiltered: present for every run -->
128 <discover_datasets directory="outputs/runAQI_out" pattern="__designation_and_ext__"/>
129 </collection>
130 <collection type="list" name="sr_out" label="${tool.name} on ${on_string}: Short Reads outputs"> <!-- only when NGS input was supplied -->
131 <discover_datasets directory="outputs/SRout" pattern="__designation_and_ext__"/>
132 <filter>ngs_input</filter>
133 </collection>
134 <collection type="list" name="lr_out" label="${tool.name} on ${on_string}: Long Reads outputs"> <!-- only when SMS input was supplied -->
135 <discover_datasets directory="outputs/LRout" pattern="__designation_and_ext__"/>
136 <filter>sms_input</filter>
137 </collection>
138 <collection type="list" name="regional_errors" label="${tool.name} on ${on_string}: Regional Error Regions"> <!-- only when advanced_output is enabled -->
139 <discover_datasets directory="outputs/runAQI_out/locER_out" pattern="__designation_and_ext__"/>
140 <filter>other_params['advanced_output']</filter>
141 </collection>
142 <collection type="list" name="structural_errors" label="${tool.name} on ${on_string}: Structural Error Regions"> <!-- only when advanced_output is enabled -->
143 <discover_datasets directory="outputs/runAQI_out/strER_out" pattern="__designation_and_ext__"/>
144 <filter>other_params['advanced_output']</filter>
145 </collection>
146 </outputs>
147 <tests>
148 <!-- Test 1: genome + SMS BAM, no NGS input; only the AQI and long-read collections are expected -->
149 <test expect_num_outputs="2">
150 <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
151 <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
152 <output_collection name="runAQI_out" type="list" count="3"/>
153 <output_collection name="lr_out" type="list" count="10"/>
154 </test>
155 <!-- Test 2: genome + NGS BAM, no SMS input, plotting enabled; only the AQI and short-read collections are expected -->
156 <test expect_num_outputs="2">
157 <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
158 <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_sort.bam"/>
159 <section name="other_params">
160 <param name="plot" value="true"/>
161 </section>
162 <output_collection name="runAQI_out" type="list" count="4"/>
163 <output_collection name="sr_out" type="list" count="11"/>
164 </test>
165 <!-- Test 3: NGS FASTQ pair + SMS BAM with break, snv and mapq=30. NOTE(review): three outputs are expected but lr_out is not asserted on -->
166 <test expect_num_outputs="3">
167 <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
168 <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_R1.fq.gz,https://zenodo.org/records/19091739/files/NGS_R2.fq.gz"/>
169 <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
170 <section name="other_params">
171 <param name="break" value="true"/>
172 <param name="snv" value="true"/>
173 <param name="mapq" value="30"/>
174 </section>
175 <output_collection name="runAQI_out" type="list" count="4"/>
176 <output_collection name="sr_out" type="list" count="9"/>
177 </test>
178 <!-- Test 4: genome + NGS paired FASTQ + SMS BAM with advanced outputs; all five collections are asserted on -->
179 <test expect_num_outputs="5">
180 <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
181 <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_R1.fq.gz,https://zenodo.org/records/19091739/files/NGS_R2.fq.gz"/>
182 <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
183 <section name="other_params">
184 <param name="advanced_output" value="true"/>
185 </section>
186 <output_collection name="runAQI_out" type="list" count="3"/>
187 <output_collection name="lr_out" type="list" count="10"/>
188 <output_collection name="sr_out" type="list" count="9"/>
189 <output_collection name="regional_errors" type="list" count="5"/>
190 <output_collection name="structural_errors" type="list" count="6"/>
191 </test>
192 <!-- Test 5: plotting restricted to the IDs listed in ids.txt. NOTE(review): ngs_input points at SMS_sort.bam, which by its name is the long-read BAM; confirm this is intended -->
193 <test expect_num_outputs="2">
194 <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
195 <param name="ngs_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
196 <section name="other_params">
197 <param name="plot" value="true"/>
198 <param name="plot_ids" value="ids.txt"/>
199 </section>
200 <output_collection name="runAQI_out" type="list" count="4"/>
201 <output_collection name="sr_out" type="list" count="11"/>
202 </test>
203 </tests>
204 <help><![CDATA[
205 **What it does**
206
207 CRAQ (Clipping Reveals Assembly Quality) is a reference-free genome assembly quality evaluator.
208 It identifies potential errors in assembled sequences by analysing how reads clip (fail to align continuously)
209 at specific positions - without requiring a reference genome.
210
211 CRAQ produces two key quality scores:
212
213 - **R-AQI** (Regional Assembly Quality Index): captures small-scale errors such as indels and local misassemblies detected by short reads
214 - **S-AQI** (Structural Assembly Quality Index): captures large-scale structural errors such as chimeric joins and inversions detected by long reads
215
216 -----
217
218 **Inputs**
219
220 +---------------------------+----------+----------------------------------------------------------+
221 | Input | Required | Description |
222 +===========================+==========+==========================================================+
223 | Assembly FASTA | Yes | Genome assembly to evaluate in FASTA format |
224 +---------------------------+----------+----------------------------------------------------------+
225 | SMS long-read data | No* | PacBio or Nanopore data as a coordinate-sorted BAM, |
226 | | | OR one or more FASTQ/FASTQ.GZ sequence files |
227 +---------------------------+----------+----------------------------------------------------------+
228 | NGS short-read data | No* | Illumina data as a coordinate-sorted BAM, |
229 | | | OR one or more FASTQ/FASTQ.GZ sequence files |
230 +---------------------------+----------+----------------------------------------------------------+
231
232 \* At least one of SMS or NGS input must be provided. Using both together gives the most complete assessment.
233
234 .. class:: warningmark
235
236 If providing sequence files (FASTQ) rather than alignments, CRAQ will perform the mapping internally
237 using minimap2 for SMS and BWA for NGS. Ensure the correct mapping preset is selected under
238 **Other Parameters** when using raw reads.
239
240 -----
241
242 **Outputs**
243
244 **1) AQI Results** *(always produced)*
245
246 The primary output collection containing:
247
248 - ``AQI_summary.txt`` - final R-AQI and S-AQI scores summarising overall assembly quality
249 - ``regional_statistics.txt`` - per-region breakdown of error counts and coverage
250 - ``circos_plot.pdf`` - visualisation of quality metrics across the assembly *(only if plotting is enabled)*
251
252 **2) Long Read Outputs** *(produced when SMS input is provided)*
253
254 - Filtered long-read alignment in BAM format with index
255 - Putative structural error (CSE) breakpoint coordinates
256 - Heterozygous variant (CSH) breakpoint coordinates flagged by long reads
257
258 **3) Short Read Outputs** *(produced when NGS input is provided)*
259
260 - Filtered short-read alignment in BAM format with index
261 - Putative regional error (CRE) coordinates flagged by short reads
262 - Heterozygous variant (CRH) coordinates from short-read clipping patterns
263
264 **4) Regional Error Regions** *(advanced output, optional)*
265
266 BED files with precise coordinates of:
267
268 - CRE (Clipping-based Regional Errors): local assembly errors detected by short reads
269 - CRH (Clipping-based Regional Heterozygous variants): heterozygous positions in regional context
270
271 **5) Structural Error Regions** *(advanced output, optional)*
272
273 BED files with precise coordinates of:
274
275 - CSE (Clipping-based Structural Errors): large-scale misassemblies detected by long reads
276 - CSH (Clipping-based Structural Heterozygous variants): heterozygous structural positions
277 - Low-coverage regions and ambiguous breakpoints
278
279 -----
280
281 **Interpreting AQI Scores**
282
283 Both R-AQI and S-AQI are scored from 0 to 100, where higher is better:
284
285 +------------+-------------------+------------------------------------------+
286 | Score | Quality | Interpretation |
287 +============+===================+==========================================+
288 | 90 – 100 | Excellent | Very few errors, high-confidence assembly|
289 +------------+-------------------+------------------------------------------+
290 | 70 – 89 | Good | Minor errors, suitable for most analyses |
291 +------------+-------------------+------------------------------------------+
292 | 50 – 69 | Moderate | Noticeable errors, use with caution |
293 +------------+-------------------+------------------------------------------+
294 | < 50 | Poor | Significant errors, reassembly advised |
295 +------------+-------------------+------------------------------------------+
296
297 ]]></help>
298 <expand macro="citations"/>
299 <expand macro="creators"/>
300 </tool>