changeset 3:c87df3f9e19d draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
author iuc
date Thu, 27 Jan 2022 08:20:15 +0000
parents 2753a5786114
children 41c18b994108
files datasets_gene.xml datasets_genome.xml datasets_ortholog.xml datasets_virus_genome.xml datasets_virus_protein.xml macros.xml test-data/GCF_000007445.1.genome.fa test-data/GCF_000007445.1.genomic.gbff test-data/GCF_000007445.1.genomic.gff test-data/GCF_000007445.1.protein.fa test-data/GCF_000013305.1.genome.fa test-data/GCF_000013305.1.genomic.gbff test-data/GCF_000013305.1.genomic.gff test-data/GCF_000013305.1.genomic.gtf test-data/GCF_000013305.1.protein.fa test-data/genome.2.GCF_000007445.1.genomic.cds test-data/genome.2.GCF_000007445.1.seq.rpt.jsonl test-data/genome.2.GCF_000013305.1.genomic.cds test-data/genome.2.GCF_000013305.1.genomic.gtf test-data/genome.2.GCF_000013305.1.seq.rpt.jsonl test-data/genome.3.GCF_000007445.1.genomic.gbff test-data/genome.3.GCF_000007445.1.genomic.gff test-data/genome.3.GCF_000007445.1.seq.rpt.jsonl test-data/genome.3.GCF_000013305.1.genomic.gbff test-data/genome.3.GCF_000013305.1.genomic.gff test-data/genome.3.GCF_000013305.1.seq.rpt.jsonl test-data/human_chrom_21_dehydrated.zip
diffstat 27 files changed, 243 insertions(+), 820 deletions(-) [+]
line wrap: on
line diff
--- a/datasets_gene.xml	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,206 +0,0 @@
-<tool id="datasets_download_gene" name="NCBI datasets download gene" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download genes from NCBI</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements"></expand>
-    <command><![CDATA[
-@SETUP_CERTIFICATES@
-datasets download gene $subcommand.download_by
-#if $subcommand.download_by != 'taxon':
-    #if $subcommand.text_or_file.text_or_file == 'text':
-        #if $subcommand.download_by == 'gene-id':
-            $subcommand.text_or_file.accession
-        #else if $subcommand.download_by == 'taxon':
-            '$subcommand.taxon'
-        #else
-            #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x)
-        #end if
-        #if $subcommand.download_by == 'accession' and $subcommand.taxon_filter:
-            --taxon-filter '$subcommand.taxon_filter'
-        #end if
-    #else
-        --inputfile '$subcommand.text_or_file.inputfile'
-    #end if
-#else:
-    '$subcommand.taxon'
-#end if
-@EXCLUDES_GENE@
-#if $subcommand.download_by == 'accession' and $subcommand.include_flanks_bp:
-    --include-flanks-bp $subcommand.include_flanks_bp
-#end if
-&& 7z x ncbi_dataset.zip
-]]></command>
-    <inputs>
-        <conditional name="subcommand">
-            <param name="download_by" type="select" label="Choose how to find genomes to download">
-                <option value="gene-id">Download a gene dataset by NCBI Gene ID</option>
-                <option value="symbol">Download a gene dataset by gene symbol</option>
-                <option value="accession">Download a gene dataset by RefSeq nucleotide or protein accession</option>
-                <option value="taxon">Download a gene dataset by taxon</option>
-            </param>
-            <when value="gene-id">
-                <expand macro="text_or_file" what="gene-id" what_extended="NCBI Gene ID" help="Should be valid NCBI Gene ID">
-                    <sanitizer invalid_char="">
-                        <valid initial="string.digits">
-                            <add value=" " />
-                        </valid>
-                    </sanitizer>
-                </expand>
-            </when>
-            <when value="symbol">
-                <expand macro="text_or_file" what="symbol" what_extended="gene symbol" help="Should be valid gene symbol"/>
-                <param argument="--taxon" type="text" value="human" label="Specify a species name" help="Species name can be common or scientific name or species-level NCBI Taxonomy ID"/>
-            </when>
-            <when value="accession">
-                <expand macro="text_or_file" what="accession" what_extended="RefSeq nucleotide or protein accession" help="Should be RefSeq nucleotide or protein accession"/>
-                <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Include gene flanking sequence, limited to prokaryotic genes" help="If not specified flanking gene sequences will not be downloaded. Accession must start with WP"/>
-                <param argument="--taxon-filter" type="text" optional="true" label="limit genes to a specified taxon" help="any rank"/>
-            </when>
-            <when value="taxon">
-                <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param>
-            </when>
-        </conditional>
-        <expand macro="excludes_gene"></expand>
-        <conditional name="limit_fasta" label="Limit fasta by accession?">
-            <param name="limit" type="select" label="Select limit method">
-                <option value="none">None</option>
-                <option value="text">Enter list of accessions</option>
-                <option value="file">Read list of accessions from file</option>
-            </param>
-            <when value="none">
-            </when>
-            <when value="text">
-                <param argument="--fasta-filter" type="text" label="Limit gene fasta download to these accessions"/>
-            </when>
-            <when value="file">
-                <param argument="--fasta-filter-file" type="data" format="txt" label="File of accessions to limit gene fasta download"/>
-            </when>
-        </conditional>
-    </inputs>
-    <outputs>
-        <data name="gene_fasta" format="fasta" label="NCBI datasets gene: gene fasta" from_work_dir="ncbi_dataset/data/gene.fna">
-            <filter>not exclude_gene</filter>
-        </data>
-        <data name="protein_fasta" format="fasta" label="NCBI datasets gene: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa">
-            <filter>not exclude_protein</filter>
-        </data>
-        <data name="rna_fasta" format="fasta" label="NCBI datasets gene: rna fasta" from_work_dir="ncbi_dataset/data/rna.fna">
-            <filter>not exclude_rna</filter>
-        </data>
-        <data name="gene_flanks" format="fasta" label="NCBI datasets gene: flanking sequence fasta" from_work_dir="ncbi_dataset/data/gene_flank.fna">
-            <filter><![CDATA[subcommand.get('include_flanks_bp')]]></filter>
-        </data>
-    </outputs>
-    <tests>
-        <test title="test download by gene-id" num_outputs="3">
-            <conditional name="subcommand">
-                <param name="download_by" value="gene-id"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="472 672"></param>
-                </conditional>
-            </conditional>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
-                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
-                </assert_contents>
-            </output>
-        </test>
-        <test title="test download by gene-id, test sanitizer" num_outputs="3">
-            <conditional name="subcommand">
-                <param name="download_by" value="gene-id"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="472 672"></param>
-                </conditional>
-            </conditional>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
-                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
-                </assert_contents>
-            </output>
-            <assert_command>
-                <not_has_text text="exit"/>
-            </assert_command>
-        </test>
-        <test title="test download by gene symbol" num_outputs="3">
-            <conditional name="subcommand">
-                <param name="download_by" value="symbol"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="BRCA1 ATM"></param>
-                </conditional>
-            </conditional>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/>
-                    <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/>
-                </assert_contents>
-            </output>
-        </test>
-        <test title="test download by accession" num_outputs="3">
-            <conditional name="subcommand">
-                <param name="download_by" value="accession"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="NM_000546.6 NM_000492.4"></param>
-                </conditional>
-            </conditional>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="GTAGTAGGTCTTTGGCATTAGGAGCTTGAGCCCAGACGGCCCTAGCAGGGACCCCAGCGCCCGAGAGACC"/>
-                    <has_line line="CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGC"/>
-                </assert_contents>
-            </output>
-            <assert_command>
-                <has_text text="'NM_000546.6' 'NM_000492.4'"/>
-            </assert_command>
-        </test>
-        <test title="test download by accession with flanking sequence" num_outputs="4">
-            <conditional name="subcommand">
-                <param name="download_by" value="accession"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="WP_004675351.1"></param>
-                </conditional>
-                <param name="include_flanks_bp" value="10"/>
-            </conditional>
-            <output name="gene_flanks">
-                <assert_contents>
-                    <has_line line="gccctgccgcATGATCGATCTGATGCCGACGAGCGAGGAACAGGCGGCGGCGATCGTCCGCACCCATGCG"/>
-                </assert_contents>
-            </output>
-            <assert_command>
-                <has_text text="--include-flanks-bp 10"/>
-            </assert_command>
-        </test>
-        <test title="test download by taxon" num_outputs="1">
-            <conditional name="subcommand">
-                <param name="download_by" value="taxon"></param>
-                <param name="taxon" value="Mycobacterium tuberculosis H37Rv"></param>
-            </conditional>
-            <param name="exclude_rna" value="true"/>
-            <param name="exclude_protein" value="true"/>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="GTGGCGCTGAATATCAAAGACCCTGAGGTAGACCGACTAGCCGCCGAACTCGCTGACCGGCTGCACACCA"/>
-                </assert_contents>
-            </output>
-        </test>
-    </tests>
-    <help>
-Download a gene dataset including gene, transcript and protein sequence, a data table and a data report. Gene datasets can be specified by NCBI Gene ID, symbol or RefSeq accession. Datasets are downloaded as a zip file.
-
-The default gene dataset includes the following files:
- * gene.fna (gene sequences)
- * rna.fna (transcript sequences)
- * protein.faa (protein sequences)
- * data_report.jsonl (data report with gene metadata)
- * data_table.tsv (data table with gene metadata, one transcript per row)
- * dataset_catalog.json (a list of files and file types included in the dataset)
-    </help>
-
-</tool>
--- a/datasets_genome.xml	Thu Jul 15 15:45:43 2021 +0000
+++ b/datasets_genome.xml	Thu Jan 27 08:20:15 2022 +0000
@@ -1,131 +1,170 @@
-<tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download assembled genomes from NCBI</description>
+<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
+    <description>download genome sequence, annotation and metadata</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"></expand>
     <command><![CDATA[
 @SETUP_CERTIFICATES@
-datasets download genome $subcommand.download_by
-#if $subcommand.download_by == 'accession':
-    #if $subcommand.text_or_file.text_or_file == 'text':
-        #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x)
+datasets download genome $query.subcommand.download_by
+#if $query.subcommand.download_by == 'accession':
+    #if $query.subcommand.text_or_file.text_or_file == 'text':
+        #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
     #else
-        --inputfile '$subcommand.text_or_file.inputfile'
+        --inputfile '$query.subcommand.text_or_file.inputfile'
     #end if
 #else:
-    '$subcommand.taxon'
+    '$query.subcommand.taxon'
+#end if
+$filters.reference
+$filters.annotated
+#if $filters.assembly_level:
+--assembly_level $filters.assembly_level
 #end if
-$annotated
-$dehydrated
-#if $assembly_level:
---assembly_level $assembly_level
+#if $filters.assembly_source:
+--assembly_source $filters.assembly_source
 #end if
-#if $assembly_source:
---assembly_source $assembly_source
+#if $filters.chromosomes:
+--chromosomes '$filters.chromosomes'
 #end if
---chromosomes '$chromosomes'
 @EXCLUDES_GENOME@
 @INCLUDES_GENOME@
-$reference
 @RELEASED_BEFORE@
 @RELEASED_SINCE@
-#for search_term in $search:
-    --search '$search_term'
+#for search_term in $filters.search:
+    --search '$search_term.search'
 #end for
-#if not $dehydrated:
-    && 7z x ncbi_dataset.zip
+#if $uncompressed
+&& unzip ncbi_dataset.zip
+#else
+&& unzip -l ncbi_dataset.zip > ncbi_dataset.txt
 #end if
 ]]></command>
     <inputs>
-        <conditional name="subcommand">
-            <param name="download_by" type="select" label="Choose how to find genomes to download">
-                <option value="accession">Download by NCBI assembly or BioProject accession</option>
-                <option value="taxon">Download by taxon</option>
-            </param>
-            <when value="accession">
-                <expand macro="text_or_file"/>
-            </when>
-            <when value="taxon">
-                <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param>
-            </when>
-        </conditional>
-        <expand macro="annotation"></expand>
-        <expand macro="dehydrated"></expand>
-        <expand macro="assembly_level"></expand>
-        <expand macro="assembly_source"></expand>
-        <expand macro="chromosomes"></expand>
-        <expand macro="excludes_genome"></expand>
-        <expand macro="includes_genome"></expand>
-        <expand macro="released_options"></expand>
-        <expand macro="released_options" before_or_after="since"></expand>
-        <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
-        <repeat name="search" title="Add search terms">
-            <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
-        </repeat>
+        <section name="query" title="Query" expanded="true">
+            <conditional name="subcommand">
+                <param name="download_by" type="select" label="Choose how to find genomes to download">
+                    <option value="accession">Download by NCBI assembly or BioProject accession</option>
+                    <option value="taxon">Download by taxon</option>
+                </param>
+                <when value="accession">
+                    <expand macro="text_or_file"/>
+                </when>
+                <when value="taxon">
+                    <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param>
+                </when>
+            </conditional>
+        </section>
+        <section name="filters" title="Filters and Limit">
+            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>            
+            <expand macro="annotation"></expand>
+            <expand macro="assembly_level"></expand>
+            <expand macro="assembly_source"></expand>
+            <expand macro="chromosomes"></expand>
+            <expand macro="released_options"></expand>
+            <expand macro="released_options" before_or_after="since"></expand>
+
+            <repeat name="search" title="Add search terms">
+                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
+            </repeat>
+        </section>
+        <section name="file_choices" title="File Choices">
+            <expand macro="excludes_genome"></expand>
+            <expand macro="includes_genome"></expand>
+        </section>
+        <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
     </inputs>
     <outputs>
-        <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip">
-            <filter>dehydrated</filter>
+        <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
+            <filter>not uncompressed</filter>
+        </data>
+        <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
+            <filter>not uncompressed</filter>
         </data>
-        <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_seq</filter>
+        <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
+            <filter>uncompressed</filter>
+        </data>
+        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed</filter>
         </collection>
-        <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_protein</filter>
+        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/.*(?&lt;!cds_from)(chr|unplaced|_genomic)*fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_seq']</filter>
         </collection>
-        <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list">
+        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_genomic_cds']</filter>
+        </collection>
+        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and not exclude_gff3</filter>
+            <filter>uncompressed and file_choices['exclude_gff3']</filter>
+        </collection>
+        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_rna']</filter>
         </collection>
-        <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list">
+        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['exclude_protein']</filter>
+        </collection>
+        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
+            <filter>uncompressed and file_choices['include_gbff']</filter>
+        </collection>
+        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and include_gtf</filter>
-        </collection>
-        <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets>
-            <filter>not dehydrated and include_gbff</filter>
+            <filter>uncompressed and file_choices['include_gtf']</filter>
         </collection>
     </outputs>
     <tests>
-        <test title="test dehydrated download by taxon">
-            <conditional name="subcommand">
+        <test expect_num_outputs="2">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"></param>
                 <param name="text_or_file" value="text"></param>
                 <param name="taxon" value="human"></param>
             </conditional>
             <param name="chromosomes" value="21"></param>
-            <param name="dehydrated" value="true"/>
+            <param name="uncompressed" value="false"/>
             <param name="released_before" value="01/01/2018"></param>
-            <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/>
+            <output name="archive_contents">
+                <assert_contents>
+                    <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
+                </assert_contents>
+            </output>
         </test>
-        <test title="test download by comma-separated accession">
-            <conditional name="subcommand">
+        <test expect_num_outputs="5">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="accession"></param>
                 <conditional name="text_or_file">
                     <param name="text_or_file" value="text"></param>
                     <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param>
                 </conditional>
             </conditional>
-            <param name="dehydrated" value="false"/>
+            <param name="uncompressed" value="true"/>
             <param name="released_before" value="01/01/2007"></param>
-            <output_collection name="genome_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/>
+            <param name="exclude_genomic_cds" value="true"/>
+            <param name="include_gtf" value="true"/>
+            <output name="genome_data_report">
+                <assert_contents>
+                    <has_text text="GCF_000013305.1"/>
+                </assert_contents>
+            </output>
+            <output_collection name="sequence_report" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
             </output_collection>
-            <output_collection name="protein_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/>
+            <output_collection name="genomic_gtf" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
+                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
             </output_collection>
-            <output_collection name="genomic_gff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/>
+            <output_collection name="genomic_cds" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
             </output_collection>
         </test>
-        <test title="test download by accessions listed in file">
-            <conditional name="subcommand">
+        <test expect_num_outputs="4">
+            <conditional name="query|subcommand">
                 <param name="download_by" value="accession"></param>
                 <conditional name="text_or_file">
                     <param name="text_or_file" value="file"></param>
@@ -133,43 +172,44 @@
                 </conditional>
             </conditional>
             <param name="include_gbff" value="true"/>
-            <param name="include_gtf" value="true"/>
-            <param name="dehydrated" value="false"/>
-            <param name="released_before" value="01/01/2007"></param>
-            <output_collection name="genome_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/>
-            </output_collection>
-            <output_collection name="protein_fasta" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/>
+            <param name="exclude_seq" value="false"/>
+            <param name="exclude_gff3" value="true"/>
+            <param name="uncompressed" value="true"/>
+            <param name="released_before" value="01/02/2007"></param>
+            <output name="genome_data_report">
+                <assert_contents>
+                   <has_text text="SAMN02604181"/>
+                </assert_contents>
+            </output>
+            <output_collection name="sequence_report" type="list">
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/>
             </output_collection>
             <output_collection name="genomic_gff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/>
-            </output_collection>
-            <output_collection name="genomic_gtf" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
+                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
             </output_collection>
             <output_collection name="genomic_gbff" type="list">
-                <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/>
-                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/>
+                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
             </output_collection>
         </test>
     </tests>
     <help>
+<![CDATA[
+**Download Genome Datasets from NCBI**
 
 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
 
-The default genome dataset includes the following files (if available):
-* genomic.fna (genomic sequences)
-* rna.fna (transcript sequences)
-* protein.faa (protein sequences)
-* genomic.gff (genome annotation in gff3 format)
-* data_report.jsonl (data report with genome assembly and annotation metadata)
-* dataset_catalog.json (a list of files and file types included in the dataset)
+The default genome dataset includes the following files (if available):
+ * genomic.fna (genomic sequences)
+ * rna.fna (transcript sequences)
+ * protein.faa (protein sequences)
+ * genomic.gff (genome annotation in gff3 format)
+ * assembly_data_report.jsonl (data report with genome assembly and annotation metadata)
+ * dataset_catalog.json (a list of files and file types included in the dataset)
+]]>
     </help>
 
 </tool>
--- a/datasets_ortholog.xml	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,143 +0,0 @@
-<tool id="datasets_download_ortholog" name="NCBI datasets download ortholog" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download an ortholog dataset</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements"></expand>
-    <command><![CDATA[
-@SETUP_CERTIFICATES@
-datasets download ortholog $subcommand.download_by
-#if $subcommand.download_by != 'taxon':
-    #if $subcommand.text_or_file.text_or_file == 'text':
-        #if $subcommand.download_by == 'gene-id':
-            $subcommand.text_or_file.accession
-        #else
-            #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x)
-        #end if
-    #else
-        --inputfile '$subcommand.text_or_file.inputfile'
-    #end if
-#else:
-    '$subcommand.taxon'
-#end if
-@EXCLUDES_GENE@
-#if $taxon_filter:
-    --taxon-filter '$taxon_filter'
-#end if
-&& 7z x ncbi_dataset.zip
-]]></command>
-    <inputs>
-        <conditional name="subcommand">
-            <param name="download_by" type="select" label="Choose how to find ortholog dataset to download">
-                <option value="gene-id">Download a ortholog dataset by NCBI Gene ID</option>
-                <option value="symbol">Download a ortholog dataset by gene symbol</option>
-                <option value="accession">Download a orthologsdataset by RefSeq nucleotide or protein accession</option>
-            </param>
-            <when value="gene-id">
-                <expand macro="text_or_file" what="gene-id" what_extended="NCBI Gene ID" help="Should be valid NCBI Gene ID">
-                    <sanitizer invalid_char="">
-                        <valid initial="string.digits">
-                            <add value=" " />
-                        </valid>
-                    </sanitizer>
-                </expand>
-            </when>
-            <when value="symbol">
-                <expand macro="text_or_file" what="symbol" what_extended="gene symbol" help="Should be valid gene symbol"/>
-                <param argument="--taxon" type="text" value="human" label="Specify a species name" help="Species name can be common or scientific name or species-level NCBI Taxonomy ID"/>
-            </when>
-            <when value="accession">
-                <expand macro="text_or_file" what="accession" what_extended="RefSeq nucleotide or protein accession" help="Should be RefSeq nucleotide or protein accession"/>
-            </when>
-        </conditional>
-        <param argument="--taxon-filter" type="text" optional="true" label="limit genes to a specified taxon" help="any rank"/>
-        <expand macro="excludes_gene"></expand>
-    </inputs>
-    <outputs>
-        <data name="gene_fasta" format="fasta" label="NCBI datasets ortholog: gene fasta" from_work_dir="ncbi_dataset/data/gene.fna">
-            <filter>not exclude_gene</filter>
-        </data>
-        <data name="protein_fasta" format="fasta" label="NCBI datasets ortholog: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa">
-            <filter>not exclude_protein</filter>
-        </data>
-        <data name="rna_fasta" format="fasta" label="NCBI datasets ortholog: rna fasta" from_work_dir="ncbi_dataset/data/rna.fna">
-            <filter>not exclude_rna</filter>
-        </data>
-    </outputs>
-    <tests>
-        <test title="test download by gene-id">
-            <conditional name="subcommand">
-                <param name="download_by" value="gene-id"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="472 672"></param>
-                </conditional>
-            </conditional>
-            <param name="taxon_filter" value="Puma"/>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/>
-                    <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/>
-                </assert_contents>
-            </output>
-        </test>
-        <test title="test download by gene-id, test sanitizer">
-            <conditional name="subcommand">
-                <param name="download_by" value="gene-id"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="472 672"></param>
-                </conditional>
-            </conditional>
-            <param name="taxon_filter" value="Puma"/>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/>
-                    <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/>
-                </assert_contents>
-            </output>
-            <assert_command>
-                <not_has_text text="exit"/>
-            </assert_command>
-        </test>
-        <test title="test download by gene symbol">
-            <conditional name="subcommand">
-                <param name="download_by" value="symbol"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="BRCA1 ATM"></param>
-                </conditional>
-            </conditional>
-            <param name="taxon_filter" value="Puma"/>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/>
-                    <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/>
-                </assert_contents>
-            </output>
-        </test>
-        <test title="test download by accession">
-            <conditional name="subcommand">
-                <param name="download_by" value="accession"></param>
-                <conditional name="text_or_file">
-                    <param name="text_or_file" value="text"></param>
-                    <param name="accession" value="NM_000546.6 NM_000492.4"></param>
-                </conditional>
-            </conditional>
-            <param name="taxon_filter" value="Puma"/>
-            <output name="gene_fasta">
-                <assert_contents>
-                    <has_line line="ATGCAGGAGCCGCCATTGGAACTCACCATCGAGCCCCCTCTGAGCCAGGAGACATTTTCGGAATTGTGGA"/>
-                    <has_line line="AGTTGGAAGCAAATGACATCACTGCGGGTCAGAGAAAAAGGGGCGAGCAGCCTGCGCCAGAAGAGTAGGG"/>
-                </assert_contents>
-            </output>
-            <assert_command>
-                <has_text text="'NM_000546.6' 'NM_000492.4'"/>
-            </assert_command>
-        </test>
-    </tests>
-    <help>
-        Download an ortholog dataset including gene, transcript and protein sequence, a data table and a data report. Ortholog data is calculated by NCBI for vertebrates and insects. Ortholog datasets can be specified by NCBI Gene ID, symbol or RefSeq accession.)
-    </help>
-
-</tool>
--- a/datasets_virus_genome.xml	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,130 +0,0 @@
-<tool id="datasets_virus_genome" name="NCBI datasets download virus genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download a coronavirus genome dataset including genome, CDS and protein sequence, annotation
-        and a detailed data report</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements"></expand>
-    <command><![CDATA[
-@SETUP_CERTIFICATES@
-datasets download virus genome taxon '$taxon'
-$annotated
-$complete_only
-@EXCLUDES_VIRUS_GENOME@
-@INCLUDES_VIRUS_GENOME@
-#if str($geo_location):
-    --geo-location '$geo_location'
-#end if
-#if str($host):
-    --host '$host'
-#end if
-#if str($lineage):
-    --host '$lineage'
-#end if
-$refseq
-@RELEASED_SINCE@
-&& 7z x ncbi_dataset.zip
-]]></command>
-    <inputs>
-        <param name="taxon" type="text" label="Enter taxon" help="e.g. sars-cov-2, coronaviridae"></param>
-        <expand macro="annotation"></expand>
-        <param argument="--complete-only" truevalue="--complete-only" falsevalue="" type="boolean" label="limit to complete coronavirus genomes?"/>
-        <expand macro="excludes_virus_genome"></expand>
-        <expand macro="includes_virus_genome"></expand>
-        <param argument="--geo-location" type="text" label="Limit to coronavirus genomes isolated from a specified geographic location" help="Continent, country or U.S. state"/>
-        <param argument="--host" type="text" label="Limit to coronavirus genomes isolated from a specified host" help="NCBI Taxonomy ID, scientific or common name at any taxonomic rank"/>
-        <param argument="--lineage" type="text" label="Limit to SARS-CoV-2 genomes classified as the specified lineage (variant) by pangolin using the pangoLEARN algorithm" />
-        <param argument="--refseq" type="boolean" truevalue="--refseq" falsevalue="" label="Limit to RefSeq coronavirus genomes"/>
-        <expand macro="released_options" before_or_after="since"></expand>
-    </inputs>
-    <outputs>
-        <data name="genomic_fasta" format="fasta" label="NCBI datasets virus genome: genomic fasta" from_work_dir="ncbi_dataset/data/genomic.fna">
-            <filter>not exclude_seq</filter>
-        </data>
-        <data name="genomic_genbank" format="fasta" label="NCBI datasets virus genome: genomic genbank" from_work_dir="ncbi_dataset/data/genomic.gbff">
-            <filter>include_gbff</filter>
-        </data>
-        <data name="cds_fasta" format="fasta" label="NCBI datasets virus genome: CDS fasta" from_work_dir="ncbi_dataset/data/cds.fna">
-            <filter>not exclude_cds</filter>
-        </data>
-        <data name="protein_fasta" format="fasta" label="NCBI datasets virus genome: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa">
-            <filter>not exclude_protein</filter>
-        </data>
-        <data name="protein_genbank" format="fasta" label="NCBI datasets virus genome: protein genbank" from_work_dir="ncbi_dataset/data/protein.gpff">
-            <filter>not exclude_gpff</filter>
-        </data>
-        <collection name="protein_structure" type="list" format="pdb" label="NCBI datasets virus genome: protein structure">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\.pdb" ext="pdb" directory="ncbi_dataset/data/pdb"></discover_datasets>
-            <filter>not exclude_pdb</filter>
-        </collection>
-    </outputs>
-    <tests>
-        <test title="Test download of PDB collection">
-            <param name="taxon" value="sars-cov-2"/>
-            <param name="exclude_seq" value="true"/>
-            <param name="exclude_cds" value="true"/>
-            <param name="exclude_protein" value="true"/>
-            <param name="exclude_gpff" value="true"/>
-            <param name="released_since" value="07/07/2021"/>
-            <param name="refseq" value="true"/>
-            <output_collection name="protein_structure" type="list">
-                <element name="6VYB" checksum="sha256$307a56951050faa61f4b57e6b8ceabb7ca743125058421c232746f1820484069"/>
-                <element name="6VYO" checksum="sha256$1dab20880b7ae913da336e8a6dba838689256e63a3faaaaa439b7bd7f3651eaf"/>
-                <element name="6W37" checksum="sha256$f115326ed4b3f7b332b44790c1a3ca769deb2d440dae68ce4ccae8e650dd1d7e"/>
-                <element name="6W4H" checksum="sha256$a49bc40b5652664b7e01e562279786520b0abcf6e34a7d7f603d6e429afbf384"/>
-                <element name="6W9C" checksum="sha256$0f845885e5a9d41e42628c3b05b194ecbac59f38211927eb924e80c830190753"/>
-                <element name="6W9Q" checksum="sha256$c5d34126464ac47738c8883f03455cf3d73ba41a6aa9e0c40e92bca411321ed7"/>
-            </output_collection>
-        </test>
-        <test title="Test download of non-collection elements" expect_num_outputs="5">
-            <param name="taxon" value="sars-cov-2"/>
-            <param name="include_gbff" value="true"/>
-            <param name="exclude_pdb" value="true"/>
-            <param name="refseq" value="true"/>
-            <output name="genomic_fasta">
-                <assert_contents>
-                    <has_line line="ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA"/>
-                </assert_contents>
-            </output>
-            <output name="genomic_genbank">
-                <assert_contents>
-                    <has_line line="ACCESSION   NC_045512"/>
-                </assert_contents>
-            </output>
-            <output name="cds_fasta">
-                <assert_contents>
-                    <has_line line="AGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTA"/>
-                </assert_contents>
-            </output>
-            <output name="protein_fasta">
-                <assert_contents>
-                    <has_line line="MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLEQPYVF"/>
-                </assert_contents>
-            </output>
-            <output name="protein_genbank">
-                <assert_contents>
-                    <has_line line="ACCESSION   YP_009724389"/>
-                </assert_contents>
-            </output>
-        </test>
-    </tests>
-    <help>
-Download a coronavirus genome dataset including genome, CDS and protein sequence, annotation
-and a detailed data report. Coronavirus genome datasets are limited to the Coronaviridae family
-including SARS-CoV-2. Coronavirus genome datasets can be specified by taxon. Datasets are
-downloaded as a zip file.
-
-The default coronavirus genome dataset includes the following files (if available):
-* genomic.fna (genomic sequences)
-* cds.fna (nucleotide coding sequences)
-* protein.faa (protein sequences)
-* protein.gpff (protein sequence and annotation in GenPept flat file format)
-* protein structures in PDB format
-* data_report.jsonl (data report with viral metadata)
-* virus_dataset.md (README containing details on sequence file data content and other information)
-* dataset_catalog.json (a list of files and file types included in the dataset)
-
-Refer to NCBI's [command line quickstart](https://www.ncbi.nlm.nih.gov/datasets/docs/quickstarts/command-line-tools/) documentation for information about getting started with the command-line tools.
-    </help>
-
-</tool>
--- a/datasets_virus_protein.xml	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-<tool id="datasets_virus_protein" name="NCBI datasets download virus protein" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@">
-    <description>Download a coronavirus protein dataset</description>
-    <macros>
-        <import>macros.xml</import>
-    </macros>
-    <expand macro="requirements"></expand>
-    <command><![CDATA[
-@SETUP_CERTIFICATES@
-datasets download virus protein
-#for $protein in $proteins:
-    $protein
-#end for
-$annotated
-$complete_only
-@EXCLUDES_VIRUS_PROTEIN@
-#if str($geo_location):
-    --geo-location '$geo_location'
-#end if
-#if str($host):
-    --host '$host'
-#end if
-#if str($lineage):
-    --host '$lineage'
-#end if
-$refseq
-@RELEASED_SINCE@
-&& 7z x ncbi_dataset.zip
-]]></command>
-    <inputs>
-        <param name="proteins" type="select" multiple="true" label="Select viral protein(s)">
-            <option value="ORF1ab">ORF1ab</option>
-            <option value="ORF1a">ORF1a</option>
-            <option value="nsp1">nsp1</option>
-            <option value="nsp2">nsp2</option>
-            <option value="nsp3">nsp3</option>
-            <option value="nsp4">nsp4</option>
-            <option value="nsp5">nsp5</option>
-            <option value="nsp6">nsp6</option>
-            <option value="nsp7">nsp7</option>
-            <option value="nsp8">nsp8</option>
-            <option value="nsp9">nsp9</option>
-            <option value="nsp10">nsp10</option>
-            <option value="rdrp">rdrp</option>
-            <option value="nsp11">nsp11</option>
-            <option value="nsp13">nsp13</option>
-            <option value="nsp14">nsp14</option>
-            <option value="nsp15">nsp15</option>
-            <option value="nsp16">nsp16</option>
-            <option value="S">S</option>
-            <option value="ORF3a">ORF3a</option>
-            <option value="E">E</option>
-            <option value="M">M</option>
-            <option value="ORF6">ORF6</option>
-            <option value="ORF7a">ORF7a</option>
-            <option value="ORF7b">ORF7b</option>
-            <option value="ORF8">ORF8</option>
-            <option value="N">N</option>
-            <option value="ORF10">ORF10</option>
-        </param>
-        <expand macro="annotation"></expand>
-        <param argument="--complete-only" truevalue="--complete-only" falsevalue="" type="boolean" label="limit to complete coronavirus genomes?"/>
-        <expand macro="excludes_virus_protein"></expand>
-        <param argument="--geo-location" type="text" label="Limit to coronavirus genomes isolated from a specified geographic location" help="Continent, country or U.S. state"/>
-        <param argument="--host" type="text" label="Limit to coronavirus genomes isolated from a specified host" help="NCBI Taxonomy ID, scientific or common name at any taxonomic rank"/>
-        <param argument="--lineage" type="text" label="Limit to SARS-CoV-2 genomes classified as the specified lineage (variant) by pangolin using the pangoLEARN algorithm" />
-        <param argument="--refseq" type="boolean" truevalue="--refseq" falsevalue="" label="Limit to RefSeq coronavirus genomes"/>
-        <expand macro="released_options" before_or_after="since"></expand>
-    </inputs>
-    <outputs>
-        <data name="cds_fasta" format="fasta" label="NCBI datasets virus genome: CDS fasta" from_work_dir="ncbi_dataset/data/cds.fna">
-            <filter>not exclude_cds</filter>
-        </data>
-        <data name="protein_fasta" format="fasta" label="NCBI datasets virus genome: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa">
-            <filter>not exclude_protein</filter>
-        </data>
-        <data name="protein_genbank" format="fasta" label="NCBI datasets virus genome: protein genbank" from_work_dir="ncbi_dataset/data/protein.gpff">
-            <filter>not exclude_gpff</filter>
-        </data>
-        <collection name="protein_structure" type="list" format="pdb" label="NCBI datasets virus genome: protein structure">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\.pdb" ext="pdb" directory="ncbi_dataset/data/pdb"></discover_datasets>
-            <filter>not exclude_pdb</filter>
-        </collection>
-    </outputs>
-    <tests>
-        <test title="Test download of PDB collection">
-            <param name="proteins" value="S,M"/>
-            <param name="exclude_cds" value="true"/>
-            <param name="exclude_protein" value="true"/>
-            <param name="exclude_gpff" value="true"/>
-            <param name="released_since" value="07/07/2021"/>
-            <param name="refseq" value="true"/>
-            <output_collection name="protein_structure" type="list">
-                <element name="6VYB" checksum="sha256$307a56951050faa61f4b57e6b8ceabb7ca743125058421c232746f1820484069"/>
-                <element name="6VYO" checksum="sha256$1dab20880b7ae913da336e8a6dba838689256e63a3faaaaa439b7bd7f3651eaf"/>
-                <element name="6W37" checksum="sha256$f115326ed4b3f7b332b44790c1a3ca769deb2d440dae68ce4ccae8e650dd1d7e"/>
-                <element name="6W4H" checksum="sha256$a49bc40b5652664b7e01e562279786520b0abcf6e34a7d7f603d6e429afbf384"/>
-                <element name="6W9C" checksum="sha256$0f845885e5a9d41e42628c3b05b194ecbac59f38211927eb924e80c830190753"/>
-                <element name="6W9Q" checksum="sha256$c5d34126464ac47738c8883f03455cf3d73ba41a6aa9e0c40e92bca411321ed7"/>
-            </output_collection>
-        </test>
-        <test title="Test download of non-collection elements" expect_num_outputs="3">
-            <param name="exclude_pdb" value="true"/>
-            <param name="proteins" value="S,M"/>
-            <param name="refseq" value="true"/>
-            <output name="cds_fasta">
-                <assert_contents>
-                    <has_line line="ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAAT"/>
-                </assert_contents>
-            </output>
-            <output name="protein_fasta">
-                <assert_contents>
-                    <has_line line="MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV"/>
-                </assert_contents>
-            </output>
-            <output name="protein_genbank">
-                <assert_contents>
-                    <has_line line="ACCESSION   YP_009724390"/>
-                </assert_contents>
-            </output>
-        </test>
-    </tests>
-    <help>
-Downloads a coronavirus SARS-CoV-2 protein dataset
-    </help>
-
-</tool>
--- a/macros.xml	Thu Jul 15 15:45:43 2021 +0000
+++ b/macros.xml	Thu Jan 27 08:20:15 2022 +0000
@@ -1,11 +1,11 @@
 <macros>
-    <token name="@TOOL_VERSION@">11.25.1</token>
+    <token name="@TOOL_VERSION@">12.27.1</token>
     <token name="@PROFILE@">20.01</token>
     <token name="@LICENSE@">MIT</token>
     <token name="@PROFILE_AND_LICENSE@">profile="@PROFILE@" license="@LICENSE@"</token>
     <token name="@SETUP_CERTIFICATES@"><![CDATA[
 ## If running in container use certificate from ca-certificates instead of outdated / missing container certificates
-[ -f /usr/local/ssl/cacert.pem ] && export export SSL_CERT_FILE="/usr/local/ssl/cacert.pem";
+[ -f /usr/local/ssl/cacert.pem ] && export SSL_CERT_FILE="/usr/local/ssl/cacert.pem";
         ]]></token>
     <xml name="requirements">
         <requirements>
@@ -51,7 +51,7 @@
         </conditional>
     </xml>
     <xml name="chromosomes">
-        <param argument="--chromosomes" type="text" value="all" label="Limit chromosomes to a comma-delimited list of chromosomes">
+        <param argument="--chromosomes" type="text" label="Limit chromosomes to a comma-delimited list of chromosomes">
             <sanitizer invalid_char="">
                 <valid initial="string.letters,string.digits">
                     <add value="_" />
@@ -61,14 +61,25 @@
             </sanitizer>
         </param>
     </xml>
+    <xml name="include" token_include_what="gbff" token_include_label="Include GenBank flat file sequence and annotation, if available">
+        <param argument="--include-@INCLUDE_WHAT@" type="boolean" truevalue="--include-@INCLUDE_WHAT@" falsevalue="" label="@INCLUDE_LABEL@" />
+    </xml>
+    <xml name="includes_genome">
+        <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation, if available"/>
+        <expand macro="include" include_what="gtf" include_label="Include gtf annotation file, if available"/>
+    </xml>
     <xml name="exclude" token_exclude_what="gff3" token_exclude_label="Exclude gff3 annotation file">
         <param argument="--exclude-@EXCLUDE_WHAT@" type="boolean" truevalue="--exclude-@EXCLUDE_WHAT@" falsevalue="" label="@EXCLUDE_LABEL@" />
     </xml>
+    <xml name="anti-exclude" token_exclude_what="gff3" token_exclude_label="Include gff3 annotation file" token_checked="false">
+        <param argument="--exclude-@EXCLUDE_WHAT@" type="boolean" falsevalue="--exclude-@EXCLUDE_WHAT@" truevalue="" label="@EXCLUDE_LABEL@" checked="@CHECKED@"/>
+    </xml>
     <xml name="excludes_genome">
-        <expand macro="exclude" exclude_what="gff3" exclude_label="Exclude gff3 annotation file"/>
-        <expand macro="exclude" exclude_what="protein" exclude_label="Exclude protein sequence file"/>
-        <expand macro="exclude" exclude_what="rna" exclude_label="Exclude transcript sequence file"/>
-        <expand macro="exclude" exclude_what="seq" exclude_label="Exclude genomic sequence file"/>
+        <expand macro="anti-exclude" exclude_what="seq" exclude_label="Include genomic sequence file" checked="true"/>
+        <expand macro="anti-exclude" exclude_what="gff3" exclude_label="Include gff3 annotation file"/>
+        <expand macro="anti-exclude" exclude_what="genomic-cds" exclude_label="Include cds from genomic sequence file"/>
+        <expand macro="anti-exclude" exclude_what="protein" exclude_label="Include protein sequence file"/>
+        <expand macro="anti-exclude" exclude_what="rna" exclude_label="Include transcript sequence file"/>
     </xml>
     <xml name="excludes_gene">
         <expand macro="exclude" exclude_what="gene" exclude_label="Exclude gene sequence file"/>
@@ -87,31 +98,24 @@
             <expand macro="exclude" exclude_what="seq" exclude_label="Exclude genomic sequence file"/>
         </expand>
     </xml>
-    <token name="@EXCLUDES_GENOME@">$exclude_gff3 $exclude_protein $exclude_rna $exclude_seq</token>
+    <token name="@EXCLUDES_GENOME@">$file_choices.exclude_gff3 $file_choices.exclude_genomic_cds $file_choices.exclude_protein $file_choices.exclude_rna $file_choices.exclude_seq</token>
     <token name="@EXCLUDES_GENE@">$exclude_gene $exclude_protein $exclude_rna</token>
     <token name="@EXCLUDES_VIRUS_PROTEIN@">$exclude_protein $exclude_pdb $exclude_gpff $exclude_cds</token>
     <token name="@EXCLUDES_VIRUS_GENOME@">$exclude_seq @EXCLUDES_VIRUS_PROTEIN@</token>
-    <xml name="include" token_include_what="gbff" token_include_label="Include GenBank flat file sequence and annotation, if available">
-        <param argument="--include-@INCLUDE_WHAT@" type="boolean" truevalue="--include-@INCLUDE_WHAT@" falsevalue="" label="@INCLUDE_LABEL@" />
-    </xml>
-    <xml name="includes_genome">
-        <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation, if available"/>
-        <expand macro="include" include_what="gtf" include_label="Include gtf annotation file, if available"/>
-    </xml>
     <xml name="includes_virus_genome">
         <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation"/>
     </xml>
-    <token name="@INCLUDES_GENOME@">$include_gbff $include_gtf</token>
+    <token name="@INCLUDES_GENOME@">$file_choices.include_gbff $file_choices.include_gtf</token>
     <token name="@INCLUDES_VIRUS_GENOME@">$include_gbff</token>
     <xml name="released_options" token_released_what="genomes" token_before_or_after="before">
         <param argument="--released-@BEFORE_OR_AFTER@" type="text" optional="true" label="Only include @RELEASED_WHAT@ that have been released @BEFORE_OR_AFTER@ a specified date (MM/DD/YYYY)"></param>
     </xml>
-    <token name="@RELEASED_BEFORE@">#if $released_before:
---released-before '$released_before'
+    <token name="@RELEASED_BEFORE@">#if $filters.released_before:
+--released-before '$filters.released_before'
 #end if
     </token>
-    <token name="@RELEASED_SINCE@">#if $released_since:
---released-since '$released_since'
+    <token name="@RELEASED_SINCE@">#if $filters.released_since:
+--released-since '$filters.released_since'
 #end if
     </token>
-</macros>
\ No newline at end of file
+</macros>
--- a/test-data/GCF_000007445.1.genome.fa	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
->NC_004431.1 Escherichia coli CFT073, complete sequence
-AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTG
-GTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGAC
-AGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
-AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTGTGCACAGAAA
-ACCCCCAGCTAGGCTGGGGGTTCCGGAAAGCTTTCAGCTTTGAGCCAGTTATTAAAACCCCTTTTGATTTGTTAAAACAC
-CTTGCGGTCTGGCAACTGCAAGTGTCAAACAAGAAATCAAAAGGGGGTCCCAATGGGGAACGAAAAGAGCTTAGCGCACA
-CCCGATGGAACTGTAAATATCACATAGTATTTGCGCCAAAATACCGAAGACAGGTGTTCTACAGAGAGAAGCGTAGAGCA
-ATAGGCTGTATTTTGAGAAAGCTGTGTGAGTGGAAAAGTGTACGGATTCTGGAAGCTGAATGCTGTGCAGATCATATCCA
-TATGCTTGTGGAGATCCCGCCCAAAATGAGCGTATCAGGCTTTATGGGATATCTGAAAGGGAAAAGCAGTCTGATGCCTT
--- a/test-data/GCF_000007445.1.genomic.gbff	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-LOCUS       NC_004431            5231428 bp    DNA     circular CON 13-MAY-2021
-DEFINITION  Escherichia coli CFT073, complete sequence.
-ACCESSION   NC_004431 NZ_AE016755 NZ_AE016756 NZ_AE016757 NZ_AE016758
-            NZ_AE016759 NZ_AE016760 NZ_AE016761 NZ_AE016762 NZ_AE016763
-            NZ_AE016764 NZ_AE016765 NZ_AE016766 NZ_AE016767 NZ_AE016768
-            NZ_AE016769 NZ_AE016770 NZ_AE016771 NZ_AE016772
-VERSION     NC_004431.1
-DBLINK      BioProject: PRJNA224116
-            BioSample: SAMN02604094
-            Assembly: GCF_000007445.1
--- a/test-data/GCF_000007445.1.genomic.gff	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-##gff-version 3
-#!gff-spec-version 1.21
-#!processor NCBI annotwriter
-#!genome-build ASM744v1
-#!genome-build-accession NCBI_Assembly:GCF_000007445.1
-#!annotation-date 05/06/2021 17:43:00
-#!annotation-source NCBI RefSeq 
-##sequence-region NC_004431.1 1 5231428
-##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=199310
-NC_004431.1	RefSeq	region	1	5231428	.	+	.	ID=NC_004431.1:1..5231428;Dbxref=taxon:199310;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=CFT073
--- a/test-data/GCF_000007445.1.protein.fa	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
->WP_000002278.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia]
-MANLSGYNFAYLDEQTKRMIRRAILKAVAIPGYQVPFGGREMPMPYGWGTGGIQLTASVIGESDVLKVIDQGADDTTNAV
-SIRNFFKRVTGVNTTERTDDATLIQTRHRIPETPLTEDQIIIFQVPIPEPLRFIEPRETETRTMHALEEYGVMQVKLYED
-IARFGHIATTYAYPVKVNGRYVMDPSPIPKFDNPKMDMMPALQLFGAGREKRIYAVPPFTHVESLDFDDHPFTVQQWDEP
-CAICGSTHSYLDEVVLDDAGNRMFVCSDTDYCRQQNEAKSQ
->WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae]
-MANMFALILVIATLVTGILWCVDKFFFAPKRRERQAAAQAAAGDSLDKATLKKVAPKPGWLETGASVFPVLAIVLIVRSF
-IYEPFQIPSGSMMPTLLIGDFILVEKFAYGIKDPIYQKTLIETGHPKRGDIVVFKYPEDPKLDYIKRAVGLPGDKVTYDP
-VSKELTIQPGCSSGQACENALPVTYSNVEPSDFVQTFSRRNGGEATSGFFEVPKNETKENGIRLSERKETLGDVTHRILT
-VPIAQDQVGMYYQQPGQQLATWIVPPGQYFMMGDNRDNSADSRYWGFVPEANLVGRATAIWMSFDKQEGEWPTGVRLSRI
--- a/test-data/GCF_000013305.1.genome.fa	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
->NC_008253.1 Escherichia coli 536, complete sequence
-AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTG
-GTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGAC
-AGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
-AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTCGACCAAAGGT
-AACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGA
-TATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCATCTGGTAG
-CGATGATTGAAAAAACCATTAGCGGTCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTCTG
-ACGGGACTCGCCGCCGCCCAGCCGGGATTTCCGCTGGCACAATTGAAAACTTTCGTCGACCAGGAATTTGCCCAAATAAA
-ACATGTCCTGCATGGCATCAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAA
--- a/test-data/GCF_000013305.1.genomic.gbff	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-LOCUS       NC_008253            4938920 bp    DNA     circular CON 13-MAY-2021
-DEFINITION  Escherichia coli 536, complete sequence.
-ACCESSION   NC_008253
-VERSION     NC_008253.1
-DBLINK      BioProject: PRJNA224116
-            BioSample: SAMN02604181
-            Assembly: GCF_000013305.1
-KEYWORDS    RefSeq.
-SOURCE      Escherichia coli 536
-  ORGANISM  Escherichia coli 536
--- a/test-data/GCF_000013305.1.genomic.gff	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-##gff-version 3
-#!gff-spec-version 1.21
-#!processor NCBI annotwriter
-#!genome-build ASM1330v1
-#!genome-build-accession NCBI_Assembly:GCF_000013305.1
-#!annotation-date 05/06/2021 17:31:48
-#!annotation-source NCBI RefSeq 
-##sequence-region NC_008253.1 1 4938920
-##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=362663
-NC_008253.1	RefSeq	region	1	4938920	.	+	.	ID=NC_008253.1:1..4938920;Dbxref=taxon:362663;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;serogroup=O6:K15:H31;strain=536
--- a/test-data/GCF_000013305.1.genomic.gtf	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-#gtf-version 2.2
-#!genome-build ASM1330v1
-#!genome-build-accession NCBI_Assembly:GCF_000013305.1
-#!annotation-date 05/06/2021 17:31:48
-#!annotation-source NCBI RefSeq 
-NC_008253.1	RefSeq	gene	190	255	.	+	.	gene_id "ECP_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "ECP_RS00005"; old_locus_tag "ECP_0001"; 
-NC_008253.1	Protein Homology	CDS	190	252	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_008253.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_008253.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
-NC_008253.1	RefSeq	gene	336	2798	.	+	.	gene_id "ECP_RS00010"; transcript_id ""; gbkey "Gene"; gene "thrA"; gene_biotype "protein_coding"; locus_tag "ECP_RS00010"; old_locus_tag "ECP_0002"; 
--- a/test-data/GCF_000013305.1.protein.fa	Thu Jul 15 15:45:43 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
->WP_000002277.1 alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia coli]
-MANLSGYNFAYLDEQTKRMIRRAILKAVAIPGYQVPFGGREMPMPYGWGTGGIQLTASVIGESDVLKVIDQGADDTTNAV
-SIRNFFKRVTGVNTTERTDDATLIQTRHRIPETPLTEDQIIIFQVPIPEPLRFIEPRETETRTMHALEEYGVMQVKLYED
-IARFGHIATTYAYPVKVNGCYVMDPSPIPKFDNPKMNMMPALQLFGAGREKRIYAVPPFTRVESLDFDDHPFTVQQWNEP
-CAICGSTHSYLDEVVLDDAGNRMFVCSDTDYCRQQSEAKSQ
->WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae]
-MANMFALILVIATLVTGILWCVDKFFFAPKRRERQAAAQAAAGDSLDKATLKKVAPKPGWLETGASVFPVLAIVLIVRSF
-IYEPFQIPSGSMMPTLLIGDFILVEKFAYGIKDPIYQKTLIETGHPKRGDIVVFKYPEDPKLDYIKRAVGLPGDKVTYDP
-VSKELTIQPGCSSGQACENALPVTYSNVEPSDFVQTFSRRNGGEATSGFFEVPKNETKENGIRLSERKETLGDVTHRILT
-VPIAQDQVGMYYQQPGQQLATWIVPPGQYFMMGDNRDNSADSRYWGFVPEANLVGRATAIWMSFDKQEGEWPTGVRLSRI
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.2.GCF_000007445.1.genomic.cds	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+>lcl|NC_004431.1_cds_WP_001386572.1_1 [gene=thrL] [locus_tag=C_RS00005] [protein=thr operon leader peptide] [protein_id=WP_001386572.1] [location=190..255] [gbkey=CDS]
+ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA
+>lcl|NC_004431.1_cds_WP_000526115.1_2 [gene=tnpA] [locus_tag=C_RS00010] [protein=IS200/IS605-like element IS200C family transposase] [protein_id=WP_000526115.1] [location=453..911] [gbkey=CDS]
+ATGGGGAACGAAAAGAGCTTAGCGCACACCCGATGGAACTGTAAATATCACATAGTATTTGCGCCAAAATACCGAAGACA
+GGTGTTCTACAGAGAGAAGCGTAGAGCAATAGGCTGTATTTTGAGAAAGCTGTGTGAGTGGAAAAGTGTACGGATTCTGG
+AAGCTGAATGCTGTGCAGATCATATCCATATGCTTGTGGAGATCCCGCCCAAAATGAGCGTATCAGGCTTTATGGGATAT
+CTGAAAGGGAAAAGCAGTCTGATGCCTTACGAGCAGTTTGGTGATTTGAAATTCAAATACAGGAACAGGGAGTTCTGGTG
+CAGAGGGTATTACGTCGATACGGTGGGTAAGAACACGGCGAAGATACAGGATTACATAAAGCACCAGCTTGAAGAGGATA
+AAATGGGAGAGCAGTTATCGATTCCCTATCCGGGCAGCCCGTTTACGGGCCGTAAGTAA
+>lcl|NC_004431.1_cds_WP_001264710.1_3 [gene=thrA] [locus_tag=C_RS00015] [protein=bifunctional aspartate kinase/homoserine dehydrogenase I] [protein_id=WP_001264710.1] [location=1048..3510] [gbkey=CDS]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.2.GCF_000007445.1.seq.rpt.jsonl	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,1 @@
+{"assemblyUnit":"GCF_000007455.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2640553","genbankAccession":"AE014075.1","length":5231428,"refseqAccession":"NC_004431.1","sortOrder":1}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.2.GCF_000013305.1.genomic.cds	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+>lcl|NC_008253.1_cds_WP_001386572.1_1 [gene=thrL] [locus_tag=ECP_RS00005] [protein=thr operon leader peptide] [protein_id=WP_001386572.1] [location=190..255] [gbkey=CDS]
+ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA
+>lcl|NC_008253.1_cds_WP_001264707.1_2 [gene=thrA] [locus_tag=ECP_RS00010] [protein=bifunctional aspartate kinase/homoserine dehydrogenase I] [protein_id=WP_001264707.1] [location=336..2798] [gbkey=CDS]
+ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGATATTCTGGAAAGCAA
+TGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCATCTGGTAGCGATGATTGAAAAAA
+CCATTAGCGGTCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTCTGACGGGACTCGCCGCC
+GCCCAGCCGGGATTTCCGCTGGCACAATTGAAAACTTTCGTCGACCAGGAATTTGCCCAAATAAAACATGTCCTGCATGG
+CATCAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTA
+TGGCCGGCGTGTTAGAAGCGCGTGGTCACAACGTTACCGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGTCATTAC
+CTCGAATCTACCGTTGATATTGCTGAATCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGACCACATGGTGCTGAT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.2.GCF_000013305.1.genomic.gtf	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+#gtf-version 2.2
+#!genome-build ASM1330v1
+#!genome-build-accession NCBI_Assembly:GCF_000013305.1
+#!annotation-date 05/06/2021 17:31:48
+#!annotation-source NCBI RefSeq 
+NC_008253.1	RefSeq	gene	190	255	.	+	.	gene_id "ECP_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "ECP_RS00005"; old_locus_tag "ECP_0001"; 
+NC_008253.1	Protein Homology	CDS	190	252	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
+NC_008253.1	Protein Homology	start_codon	190	192	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
+NC_008253.1	Protein Homology	stop_codon	253	255	.	+	0	gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; 
+NC_008253.1	RefSeq	gene	336	2798	.	+	.	gene_id "ECP_RS00010"; transcript_id ""; gbkey "Gene"; gene "thrA"; gene_biotype "protein_coding"; locus_tag "ECP_RS00010"; old_locus_tag "ECP_0002"; 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.2.GCF_000013305.1.seq.rpt.jsonl	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,1 @@
+{"assemblyUnit":"GCF_000013315.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2495020","genbankAccession":"CP000247.1","length":4938920,"refseqAccession":"NC_008253.1","sortOrder":1}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000007445.1.genomic.gbff	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+LOCUS       NC_004431            5231428 bp    DNA     circular CON 13-MAY-2021
+DEFINITION  Escherichia coli CFT073, complete sequence.
+ACCESSION   NC_004431 NZ_AE016755 NZ_AE016756 NZ_AE016757 NZ_AE016758
+            NZ_AE016759 NZ_AE016760 NZ_AE016761 NZ_AE016762 NZ_AE016763
+            NZ_AE016764 NZ_AE016765 NZ_AE016766 NZ_AE016767 NZ_AE016768
+            NZ_AE016769 NZ_AE016770 NZ_AE016771 NZ_AE016772
+VERSION     NC_004431.1
+DBLINK      BioProject: PRJNA224116
+            BioSample: SAMN02604094
+            Assembly: GCF_000007445.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000007445.1.genomic.gff	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
+#!genome-build ASM744v1
+#!genome-build-accession NCBI_Assembly:GCF_000007445.1
+#!annotation-date 05/06/2021 17:43:00
+#!annotation-source NCBI RefSeq 
+##sequence-region NC_004431.1 1 5231428
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=199310
+NC_004431.1	RefSeq	region	1	5231428	.	+	.	ID=NC_004431.1:1..5231428;Dbxref=taxon:199310;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=CFT073
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000007445.1.seq.rpt.jsonl	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,1 @@
+{"assemblyUnit":"GCF_000007455.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2640553","genbankAccession":"AE014075.1","length":5231428,"refseqAccession":"NC_004431.1","sortOrder":1}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000013305.1.genomic.gbff	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+LOCUS       NC_008253            4938920 bp    DNA     circular CON 13-MAY-2021
+DEFINITION  Escherichia coli 536, complete sequence.
+ACCESSION   NC_008253
+VERSION     NC_008253.1
+DBLINK      BioProject: PRJNA224116
+            BioSample: SAMN02604181
+            Assembly: GCF_000013305.1
+KEYWORDS    RefSeq.
+SOURCE      Escherichia coli 536
+  ORGANISM  Escherichia coli 536
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000013305.1.genomic.gff	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,10 @@
+##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
+#!genome-build ASM1330v1
+#!genome-build-accession NCBI_Assembly:GCF_000013305.1
+#!annotation-date 05/06/2021 17:31:48
+#!annotation-source NCBI RefSeq 
+##sequence-region NC_008253.1 1 4938920
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=362663
+NC_008253.1	RefSeq	region	1	4938920	.	+	.	ID=NC_008253.1:1..4938920;Dbxref=taxon:362663;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;serogroup=O6:K15:H31;strain=536
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome.3.GCF_000013305.1.seq.rpt.jsonl	Thu Jan 27 08:20:15 2022 +0000
@@ -0,0 +1,1 @@
+{"assemblyUnit":"GCF_000013315.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2495020","genbankAccession":"CP000247.1","length":4938920,"refseqAccession":"NC_008253.1","sortOrder":1}
Binary file test-data/human_chrom_21_dehydrated.zip has changed