changeset 20:35d32c807c23 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/ncbi_datasets commit 5a65a62588a36d757f96681bf72f537c12c91beb
author iuc
date Fri, 26 Dec 2025 17:16:51 +0000
parents ced734560c9d
children
files datasets_gene.xml datasets_genome.xml macros.xml test-data/GCF_000007445.1.genomic.gtf test-data/genome.2.GCF_000013305.1.genomic.gtf
diffstat 5 files changed, 220 insertions(+), 144 deletions(-) [+]
line wrap: on
line diff
--- a/datasets_gene.xml	Mon Mar 17 11:05:34 2025 +0000
+++ b/datasets_gene.xml	Fri Dec 26 17:16:51 2025 +0000
@@ -4,7 +4,7 @@
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"></expand>
+    <expand macro="requirements"/>
     <expand macro="version_command"/>
     <command><![CDATA[
 #import re
@@ -41,7 +41,7 @@
 
 #if $filters.fasta_filter_cond.fasta_filter_select
     #if $filters.fasta_filter_cond.fasta_filter_select == 'text'
-        --fasta-filter #echo ",".join(f"'{x}'" for x in $filters.fasta_filter_cond.fasta_filter.split(',') if x)
+        --fasta-filter #echo ",".join(f"'{x}'" for x in str($filters.fasta_filter_cond.fasta_filter).split(',') if x)
     #else
         --fasta-filter-file '$filters.fasta_filter_cond.fasta_filter_file'
     #end if
@@ -97,8 +97,8 @@
                     <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters">
-                                <add value=" " />
-                                <add value="-" />
+                                <add value=" "/>
+                                <add value="-"/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -109,8 +109,8 @@
                     <param argument="--taxon-filter" type="text" value="" label="Limit gene sequences and annotation report file to specified taxon" help="any rank, only available for WP accessions">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters">
-                                <add value=" " />
-                                <add value="-" />
+                                <add value=" "/>
+                                <add value="-"/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -133,7 +133,7 @@
                     <param argument="--fasta-filter" type="text" label="RefSeq nucleotide and protein accessions" help="Comma separated">
                         <sanitizer invalid_char="">
                             <valid initial="string.letters,string.digits">
-                                <add value="," />
+                                <add value=","/>
                             </valid>
                         </sanitizer>
                     </param>
@@ -209,7 +209,7 @@
             <filter>file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']</filter>
         </data>
         <data name="threep_utr_fasta" label="NCBI Gene Datasets: 3' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/3p_utr.fna">
-            <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
+            <filter>file_choices['kingdom_cond']['include'] and "3p-utr" in file_choices['kingdom_cond']['include']</filter>
         </data>
         <data name="fivep_utr_fasta" label="NCBI Gene Datasets: 5' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/5p_utr.fna">
             <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
@@ -235,12 +235,12 @@
             </output>
             <output name="rna_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
@@ -263,12 +263,12 @@
             </output>
             <output name="rna_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
@@ -284,6 +284,7 @@
             </conditional>
             <section name="file_choices">
                 <conditional name="kingdom_cond">
+                    <param name="kingdom_sel" value="gene"/>
                     <param name="include" value="gene,cds"/>
                 </conditional>
             </section>
@@ -297,17 +298,17 @@
             </output>
             <output name="gene_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="cds_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
         <!-- 4: datasets download gene symbol tp53 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="symbol"/>
                 <conditional name="text_or_file">
@@ -315,11 +316,6 @@
                     <param name="accession" value="tp53"/>
                 </conditional>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -361,17 +357,17 @@
             </output>
             <output name="threep_utr_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="fivep_utr_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
         </test>
         <!-- 6: datasets download gene symbol brca1 \-\-ortholog -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="symbol"/>
                 <conditional name="text_or_file">
@@ -380,11 +376,6 @@
                 </conditional>
                 <param name="ortholog" value="rodentia"/>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="rat"/>
@@ -395,7 +386,7 @@
             </output>
         </test>
         <!-- 7: datasets download gene accession NP_000483.3 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -403,11 +394,6 @@
                     <param name="accession" value="NP_000483.3"/>
                 </conditional>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -417,7 +403,7 @@
             </output>
         </test>
         <!-- 8: datasets download gene accession NM_000546.6 NM_000492.4 + ortholog-->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -426,11 +412,6 @@
                 </conditional>
                 <param name="ortholog" value="all"/>
             </conditional>
-            <section name="file_choices">
-                <conditional name="kingdom_cond">
-                    <param name="include" value=""/>
-                </conditional>
-            </section>
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
@@ -439,7 +420,6 @@
                 </assert_contents>
             </output>
         </test>
-
         <!-- 9: datasets download gene accession WP_003249567.1 + include_flanks_bp -->
         <test expect_num_outputs="4">
             <conditional name="query|subcommand">
@@ -466,24 +446,23 @@
             </output>
             <output name="gene_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="gene_flanks">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <output name="protein_fasta">
                 <assert_contents>
-                    <has_text text=">"/>
+                    <has_text text="&gt;"/>
                 </assert_contents>
             </output>
             <assert_command>
                 <has_text text="include-flanks-bp 100"/>
             </assert_command>
-        </test> 
-
+        </test>
         <!-- 10: datasets download gene taxon human   -->
         <!-- <test expect_num_outputs="1">
             <conditional name="query|subcommand">
@@ -534,15 +513,60 @@
             </output>
         </test> -->
     </tests>
-    <help>
-<![CDATA[
-**Download Gene Datasets from NCBI**
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Downloads gene data from NCBI using the `datasets`_ command-line tool.
+Retrieve gene sequences, transcripts, proteins, and annotation reports.
+
+**Query Options**
+
+=============  ================================================================
+Method         Description
+=============  ================================================================
+Gene ID        NCBI Gene ID (e.g., 672 for BRCA1)
+Symbol         Gene symbol with taxon (e.g., TP53 in human)
+Accession      RefSeq nucleotide (NM\_) or protein (NP\_/WP\_) accession
+Taxon          All genes for a taxon (large downloads)
+=============  ================================================================
+
+----
+
+**Key Options**
+
+- **Ortholog retrieval**: Get orthologous genes across taxa (vertebrates/insects)
+- **Taxon filter**: Limit WP\_ accession results to specific organisms
+- **Flanking sequence**: Include nucleotides upstream/downstream (WP\_ only)
+- **FASTA filter**: Subset output to specific accessions
 
-Download a gene dataset (gene sequence, transcipt, amino acid sequences, 
-nucleotide coding sequences, 5'-UTR, 3'-UTR) as well as gene and gene
-product reports. Genes can be referred by gene id, symbol, accession,
-or taxon.
-]]>
-    </help>
+**Outputs (Eukaryote)**
+
+- **Gene Data Report**: Tabular metadata (ID, symbol, description, coordinates)
+- **Gene Product Report**: Detailed transcript/protein information
+- **Sequences**: Gene, RNA, protein, CDS, 5'/3' UTR FASTA files
+
+**Outputs (Prokaryote)**
+
+Prokaryotic genes (WP\_ accessions) use a different report format with:
+accession, description, EC number, gene symbol, protein info.
+
+**Examples**
+
+Download human BRCA1::
+
+    Query by: Gene ID
+    Gene ID: 672
+
+Download TP53 orthologs in rodents::
+
+    Query by: Symbol
+    Symbol: tp53
+    Ortholog: rodentia
+
+
+.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/
+]]></help>
     <expand macro="citations"/>
 </tool>
--- a/datasets_genome.xml	Mon Mar 17 11:05:34 2025 +0000
+++ b/datasets_genome.xml	Fri Dec 26 17:16:51 2025 +0000
@@ -4,9 +4,14 @@
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"></expand>
+    <expand macro="requirements"/>
     <expand macro="version_command"/>
-    <command><![CDATA[
+    <stdio>
+        <regex match="Warning" source="stderr" level="warning" description=""/>
+        <regex match="skipping" source="stderr" level="warning" description=""/>
+        <regex match="ERROR" level="fatal"/>
+    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
 #import re
 @SETUP_CERTIFICATES@
 datasets download genome $query.subcommand.download_by
@@ -41,7 +46,7 @@
 @RELEASED_BEFORE@
 @RELEASED_AFTER@
 #for search_term in $filters.search:
-    --search '$filters.search_term'
+    --search '$search_term.search'
 #end for
 --no-progressbar
 --dehydrated
@@ -116,7 +121,6 @@
             </param>
             <expand macro="released_options"/>
             <expand macro="released_options" before_or_after="after"/>
-
             <repeat name="search" title="Add search terms">
                 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
             </repeat>
@@ -137,35 +141,35 @@
     <outputs>
         <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
         <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
         </collection>
         <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
         </collection>
         <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
         </collection>
         <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
         </collection>
     </outputs>
@@ -175,8 +179,10 @@
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="21"/>
+                <param name="released_before" value="01/01/2018"/>
+            </section>
             <section name="file_choices">
                 <!-- include a sequence (which should be downloaded as fasta.gz)
                      and one non-sequence (which should be decompressed) output -->
@@ -184,15 +190,15 @@
             </section>
             <output name="genome_data_report">
                 <assert_contents>
-                    <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
-                    <has_n_lines n="142"/>
+                    <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
+                    <has_n_lines min="140"/>
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
-            <output_collection name="rna_fasta" type="list" count="1">
+            <output_collection name="rna_fasta" type="list">
                 <element name="GCF_000306695.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -212,28 +218,25 @@
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
-                <param name="taxon_positional" value="human"/>
+                <param name="taxon_positional" value="Norway rat"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="assembly_level" value="chromosome,complete"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="MT"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="genome"/>
                 <param name="decompress" value="true"/>
             </section>
-            <output_collection name="genome_fasta" type="list:list" count="12">
-                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
+            <output_collection name="genome_fasta" type="list:list" count="9">
+                <expand macro="genome_fasta_assert" el1="GCA_000001895.4" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_015227675.2" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_036323735.1" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_041222355.1" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_045687965.1" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_045687995.1" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_045688005.1" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000001895.5" el2="chrMT" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCF_015227675.2" el2="chrMT" expression="&gt;"/>
                 <!-- According to  https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
                 <!--
                 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
@@ -242,7 +245,7 @@
             </output_collection>
             <output name="genome_data_report">
                 <assert_contents>
-                    <has_text text="Homo sapiens"/>
+                    <has_text text="Rattus norvegicus"/>
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
@@ -253,10 +256,12 @@
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="assembly_level" value="chromosome,complete"/>
-            <param name="assembly_source" value="refseq"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="21"/>
+                <param name="assembly_level" value="chromosome,complete"/>
+                <param name="assembly_source" value="refseq"/>
+                <param name="released_before" value="01/01/2018"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="genome"/>
                 <param name="decompress" value="true"/>
@@ -288,7 +293,9 @@
                     <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2007"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2007"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report,gtf,cds"/>
                 <param name="decompress" value="true"/>
@@ -300,7 +307,7 @@
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
-            <output_collection name="sequence_report" type="list" count="2" >
+            <output_collection name="sequence_report" type="list" count="2">
                 <element name="GCF_000007445.1">
                     <assert_contents>
                         <has_text text="GCF_000007445.1"/>
@@ -316,7 +323,7 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="genomic_gtf" type="list">
+            <output_collection name="genomic_gtf" type="list" count="2">
                 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
             </output_collection>
@@ -333,7 +340,9 @@
                     <param name="inputfile" value="accessions.txt"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2007"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2007"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report,gff3,gbff"/>
                 <param name="decompress" value="true"/>
@@ -355,7 +364,6 @@
                 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
             </output_collection>
         </test>
-
         <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
@@ -365,8 +373,10 @@
                     <param name="accession" value="GCF_000001405"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2015"/>
-            <param name="assembly_version" value="all"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2015"/>
+                <param name="assembly_version" value="all"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report"/>
             </section>
@@ -395,19 +405,19 @@
                 <param name="decompress" value="true"/>
             </section>
             <output_collection name="genome_fasta" type="list:list" count="1">
-                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
             </output_collection>
             <output_collection name="protein_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
             <output_collection name="rna_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -437,7 +447,7 @@
             <output_collection name="protein_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" ftype="fasta.gz">
                     <assert_contents>
-                        <has_size value="1845038" delta="2000"/>
+                        <has_size value="1847862" delta="2000"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -463,44 +473,90 @@
             </section>
             <output_collection name="sequence_report" type="list" count="2"/>
             <output_collection name="genome_fasta" type="list:list" count="2">
-                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression="&gt;NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
             </output_collection>
         </test>
         <!-- tax_exact_match should filter out strains
              https://github.com/ncbi/datasets/issues/187 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="4932"/>
                 <param name="tax_exact_match" value="true"/>
             </conditional>
-            <param name="include" value=""/>
             <output name="genome_data_report">
                 <assert_contents>
-                   <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
+                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- test search filter -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="taxon"/>
+                <param name="taxon_positional" value="Streptococcus"/>
+            </conditional>
+            <section name="filters">
+                <repeat name="search">
+                    <param name="search" value="pyogenes"/>
+                </repeat>
+            </section>
+            <section name="file_choices">
+                <param name="include" value_json="null"/>
+            </section>
+            <output name="genome_data_report">
+                <assert_contents>
+                    <has_text text="pyogenes"/>
                 </assert_contents>
             </output>
         </test>
     </tests>
-    <help>
-<![CDATA[
-**Download Genome Datasets from NCBI**
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
 
-Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
-Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
+Downloads genome assemblies from NCBI using the `datasets`_ command-line tool.
+Retrieves genome sequences, annotations, and metadata by accession or taxon.
+
+**Query Options**
 
-The download is a three step process:
+- **By Accession**: NCBI Assembly (GCF\_/GCA\_) or BioProject accession
+- **By Taxon**: Taxonomy ID, scientific name, or common name
+
+**Filters**
 
-1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL)
-2. The metadata is transformed into a tabular (TSV) file
-3. The data is hydrated (the actual data is downloaded)
+====================  ===============================================
+Filter                Description
+====================  ===============================================
+Reference only        Limit to reference/representative assemblies
+Annotated only        Include only genomes with annotations
+Assembly level        Chromosome, complete, contig, or scaffold
+Assembly source       RefSeq (GCF\_) or GenBank (GCA\_)
+Exclude atypical      Remove atypical assemblies (e.g., partial)
+MAG filter            Include/exclude metagenome-assembled genomes
+Date range            Filter by release date
+====================  ===============================================
+
+----
+
+.. class:: warningmark
 
-The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
-Thereby its possible to inspect the metadata prior to the actual data download. Also this
-allows to use the tool for querying data sets (and their accessions) of interest which
-can then be downloaded in a second call using the accessions.
-]]>
-    </help>
+**Note**: The "Reference only" filter returns only RefSeq (GCF\_) assemblies.
+If a taxon has only GenBank (GCA\_) assemblies, this filter will return no results
+with a misleading error message. This is an NCBI Datasets bug (not a Galaxy bug).
+
+**Outputs**
+
+- **Data Report**: Tabular metadata for matching assemblies
+- **Genome FASTA**: Genomic sequences (nested collection by accession)
+- **Annotation files**: GFF3, GTF, GenBank flat files
+- **Protein/RNA/CDS**: Amino acid and nucleotide sequences
+- **Sequence Report**: Per-sequence metadata (chromosome, length, etc.)
+
+.. _datasets: https://www.ncbi.nlm.nih.gov/datasets/
+
+]]></help>
     <expand macro="citations"/>
 </tool>
--- a/macros.xml	Mon Mar 17 11:05:34 2025 +0000
+++ b/macros.xml	Fri Dec 26 17:16:51 2025 +0000
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@TOOL_VERSION@">17.1.0</token>
+    <token name="@TOOL_VERSION@">18.13.0</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">23.0</token>
     <token name="@LICENSE@">MIT</token>
@@ -11,8 +11,9 @@
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">ncbi-datasets-cli</requirement>
-            <requirement type="package" version="2025.1.31">ca-certificates</requirement>
-            <requirement type="package" version="6.0">unzip</requirement>
+            <requirement type="package" version="2025.8.3">ca-certificates</requirement>
+             <!-- The unzip requirement below is commented out because it was causing "skipping: [..] need PK compat. v4.5 (can do v2.1)" -->
+             <!-- <requirement type="package" version="6.0">unzip</requirement> -->
         </requirements>
     </xml>
     <xml name="bio_tools">
@@ -94,7 +95,7 @@
     <xml name="genome_includes">
         <option value="genome" selected="true">genomic sequence (genome)</option>
         <option value="rna">transcript (rna)</option>
-        <option value="protein">amnio acid sequences (protein)</option>
+        <option value="protein">amino acid sequences (protein)</option>
         <option value="cds">nucleotide coding sequences (cds)</option>
         <option value="gff3">general feature file (gff3)</option>
         <option value="gtf">gene transfer format (gtf)</option>
@@ -104,7 +105,7 @@
     </xml>
     <xml name="gene_includes">
         <option value="gene">gene sequence (gene)</option>
-        <option value="protein" selected="true">amnio acid sequences (protein)</option>
+        <option value="protein" selected="true">amino acid sequences (protein)</option>
         <yield/>
     </xml>
 
@@ -401,11 +402,8 @@
     </xml>
     <xml name="citations">
         <citations>
-            <citation type="bibtex">@misc{NCBI,
-                author = "{NCBI}",
-                title = "NCBI Datasets",
-                year = "2022",
-                url = "https://github.com/ncbi/datasets"}
+            <citation type="doi">
+                10.1038/s41597-024-03571-y
             </citation>
         </citations>
     </xml>
--- a/test-data/GCF_000007445.1.genomic.gtf	Mon Mar 17 11:05:34 2025 +0000
+++ b/test-data/GCF_000007445.1.genomic.gtf	Fri Dec 26 17:16:51 2025 +0000
@@ -1,6 +1,5 @@
-#!annotation-source NCBI RefSeq 
 NC_004431.1	RefSeq	gene	190	255	.	+	.	gene_id "C_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "C_RS00005"; old_locus_tag "c5491"; 
-NC_004431.1	Protein Homology	CDS	190	252	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_004431.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_004431.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
+NC_004431.1	Protein Homology	CDS	190	252	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
+NC_004431.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
+NC_004431.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "C_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "C_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
 NC_004431.1	RefSeq	gene	453	911	.	+	.	gene_id "C_RS00010"; transcript_id ""; gbkey "Gene"; gene "tnpA"; gene_biotype "protein_coding"; locus_tag "C_RS00010"; old_locus_tag "c0002"; 
--- a/test-data/genome.2.GCF_000013305.1.genomic.gtf	Mon Mar 17 11:05:34 2025 +0000
+++ b/test-data/genome.2.GCF_000013305.1.genomic.gtf	Fri Dec 26 17:16:51 2025 +0000
@@ -1,6 +1,5 @@
-#!annotation-source NCBI RefSeq 
 NC_008253.1	RefSeq	gene	190	255	.	+	.	gene_id "ECP_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "ECP_RS00005"; old_locus_tag "ECP_0001"; 
-NC_008253.1	Protein Homology	CDS	190	252	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_008253.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_008253.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
+NC_008253.1	Protein Homology	CDS	190	252	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
+NC_008253.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
+NC_008253.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; db_xref "GenBank:WP_001386572.1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; exon_number "1"; 
 NC_008253.1	RefSeq	gene	336	2798	.	+	.	gene_id "ECP_RS00010"; transcript_id ""; gbkey "Gene"; gene "thrA"; gene_biotype "protein_coding"; locus_tag "ECP_RS00010"; old_locus_tag "ECP_0002";