flye: flye.xml comparison

comparison flye.xml @ 10:7066276883d6 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flye commit 3507b06e5d0149d514ede3d1a56c082e89f14f89

author	bgruening
date	Mon, 26 Aug 2024 13:54:16 +0000
parents	5f2671cec451
children	6e5b72b4e1fb

comparison

equal deleted inserted replaced

-:5f2671cec451
+:7066276883d6
 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
 <description>de novo assembler for single molecule sequencing reads</description>
 <macros>
 <import>macros.xml</import>
 </macros>
+<expand macro="edam_ontology"/>
+<expand macro="xrefs"/>
 <expand macro="requirements" />
-<expand macro="edam_ontology"/>
 <version_command>flye --version</version_command>
 <command detect_errors="exit_code"><![CDATA[
 #for $counter, $input in enumerate($inputs):
 #if $input.is_of_type('fastqsanger', 'fastq'):
 #set $ext = 'fastq'
 #elif $input.is_of_type('fasta.gz'):
 #set $ext = 'fasta.gz'
 #elif $input.is_of_type('fasta'):
 #set $ext = 'fasta'
 #end if
-ln -s '$input' ./input_${counter}.${ext} &&
+ln -sf '$input' ./input_${counter}.${ext} &&
 #end for
 flye
 $mode_conditional.mode
 #for $counter, $input in enumerate($inputs):
 ./input_${counter}.$ext
 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/>
 </inputs>
 <outputs>
 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/>
 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/>
-<data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
+<data name="assembly_gfa" format="gfa" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/>
 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log">
 <filter>generate_log</filter>
 </data>
 </outputs>
 <param name="mode" value="--pacbio-raw"/>
 <param name="iterations" value="0"/>
 <param name="generate_log" value="true"/>
 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/>
 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/>
-<output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/>
+<output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="gfa" compare="diff" lines_diff="10"/>
-<output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/>
+<output name="consensus" ftype="fasta">
+<assert_contents>
+<has_line line=">contig_1"/>
+</assert_contents>
+</output>
 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/>
 </test>
 <!--Test 02: nano raw-->
 <test expect_num_outputs="4">
 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/>
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="803" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
 <has_size value="35047" delta="100"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="1840" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
 <has_size value="420752" delta="100"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="367" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
 <has_size value="418729" delta="100"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="1248" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
-<has_size value="420252" delta="100"/>
+<has_size value="419414" delta="1000"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <assert_contents>
-<has_size value="427129" delta="100"/>
+<has_size value="426277" delta="1000"/>
 </assert_contents>
 </output>
 </test>
 <!--Test 06: hifi error option-->
 <test expect_num_outputs="4">
 <has_size value="286" delta="100"/>
 </assert_contents>
 </output>
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
-<has_size value="1273" delta="100"/>
+<has_size value="1248" delta="500"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
-<has_size value="420252" delta="100"/>
+<has_size value="420254" delta="2000"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <assert_contents>
-<has_size value="427129" delta="100"/>
+<has_size value="427131" delta="2000"/>
 </assert_contents>
 </output>
 </test>
 <!--Test 07: keep haplotypes-->
 <test expect_num_outputs="4">
 </conditional>
 <param name="min_overlap" value="1000"/>
 <param name="keep-haplotypes" value="true"/>
 <output name="assembly_info" ftype="tabular">
 <assert_contents>
-<has_size value="286" delta="100"/>
+<has_size value="286" delta="200"/>
 </assert_contents>
 </output>
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
-<has_size value="1273" delta="100"/>
+<has_size value="1273" delta="500"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
-<has_size value="420252" delta="100"/>
+<has_size value="420254" delta="3000"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <assert_contents>
-<has_size value="427129" delta="100"/>
+<has_size value="427131" delta="3000"/>
 </assert_contents>
 </output>
 </test>
 <!--Test 08: scaffolding mode-->
 <test expect_num_outputs="4">
 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/>
 <param name="mode" value="--nano-hq"/>
 <param name="min_overlap" value="1000"/>
-<param name="scaffolding" value="true"/>
+<param name="scaffold" value="true"/>
 <output name="assembly_info" ftype="tabular">
 <assert_contents>
 <has_size value="286" delta="100"/>
 </assert_contents>
 </output>
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="1248" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
-<has_size value="420252" delta="100"/>
+<has_size value="419414" delta="2000"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 <assert_contents>
-<has_size value="427129" delta="100"/>
+<has_size value="426277" delta="2000"/>
 </assert_contents>
 </output>
 </test>
 <!--Test 09: test no-alt-contigs parameter-->
 <test expect_num_outputs="4">
 <output name="assembly_graph" ftype="graph_dot">
 <assert_contents>
 <has_size value="217" delta="100"/>
 </assert_contents>
 </output>
-<output name="assembly_gfa" ftype="txt">
+<output name="assembly_gfa" ftype="gfa">
 <assert_contents>
 <has_size value="5110" delta="100"/>
 </assert_contents>
 </output>
 <output name="consensus" ftype="fasta">
 </assert_contents>
 </output>
 </test>
 </tests>
 <help><![CDATA[
-.. class:: infomark
 **Purpose**
 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies.
 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents
 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome
 assembly.
 ----
-.. class:: infomark
 **Quick usage**
 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads
 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily
 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* option enables the mode for metagenome/uneven coverage assembly.
 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by
 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs.
 ----
-.. class:: infomark
 **Outputs**
 The main output files are:
-::
+* Final assembly: contains contigs and possibly scaffolds (see below).
+* Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
-- Final assembly: contains contigs and possibly scaffolds (see below).
+* Extra information about contigs (such as length or coverage).
-- Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
-- Extra information about contigs (such as length or coverage).
 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus,
 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to unitig-contig relation in
 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file.
 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns.
 assembly_info.txt file (below) contains additional information about how scaffolds were formed.
 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows:
-::
+*  Contig/scaffold id
+*  Length
-- Contig/scaffold id
+*  Coverage
-- Length
+*  Is circular, (Y)es or (N)o
-- Coverage
+*  Is repetitive, (Y)es or (N)o
-- Is circular, (Y)es or (N)o
+*  Multiplicity (based on coverage)
-- Is repetitive, (Y)es or (N)o
+*  Alternative group
-- Multiplicity (based on coverage)
+*  Graph path (graph path corresponding to this contig/scaffold).
-- Alternative group
-- Graph path (graph path corresponding to this contig/scaffold).
+Scaffold gaps are marked with `??` symbols, and `*` symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
+group ID. Primary contigs are marked by `*`.
-Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
-group ID. Primary contigs are marked by *.
 ----
-.. class:: infomark
 **Algorithm Description**
 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows:
-::
+*  K-mer counting / erroneous k-mer pre-filtering
+*  Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
-- K-mer counting / erroneous k-mer pre-filtering
+*  Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
-- Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
-- Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft
 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows:
-::
+*  Repeat graph is constructed from the (possibly misassembled) contigs
+*  In this graph all repeats longer than minimum overlap are collapsed
-- Repeat graph is constructed from the (possibly misassembled) contigs
+*  The algorithm resolves repeats using the read information and graph structure
-- In this graph all repeats longer than minimum overlap are collapsed
+*  The unbranching paths in the graph are output as contigs
-- The algorithm resolves repeats using the read information and graph structure
-- The unbranching paths in the graph are output as contigs
 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies.
 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors:
-::
+*  Alignment of all reads to the current assembly using minimap2
+*  Partition the alignment into mini-alignments (bubbles)
-- Alignment of all reads to the current assembly using minimap2
+*  Error correction of each bubble using a maximum likelihood approach
-- Partition the alignment into mini-alignments (bubbles)
-- Error correction of each bubble using a maximum likelihood approach
 The polishing steps could be repeated, which might slightly increase quality for some datasets.
 ]]></help>

Mercurial > repos > bgruening > flye

comparison flye.xml @ 10:7066276883d6 draft