Mercurial > repos > iuc > ncbi_datasets
changeset 3:c87df3f9e19d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 800d16f3bd40266d8734f4572988cb2b306b4fd3"
| author | iuc |
|---|---|
| date | Thu, 27 Jan 2022 08:20:15 +0000 |
| parents | 2753a5786114 |
| children | 41c18b994108 |
| files | datasets_gene.xml datasets_genome.xml datasets_ortholog.xml datasets_virus_genome.xml datasets_virus_protein.xml macros.xml test-data/GCF_000007445.1.genome.fa test-data/GCF_000007445.1.genomic.gbff test-data/GCF_000007445.1.genomic.gff test-data/GCF_000007445.1.protein.fa test-data/GCF_000013305.1.genome.fa test-data/GCF_000013305.1.genomic.gbff test-data/GCF_000013305.1.genomic.gff test-data/GCF_000013305.1.genomic.gtf test-data/GCF_000013305.1.protein.fa test-data/genome.2.GCF_000007445.1.genomic.cds test-data/genome.2.GCF_000007445.1.seq.rpt.jsonl test-data/genome.2.GCF_000013305.1.genomic.cds test-data/genome.2.GCF_000013305.1.genomic.gtf test-data/genome.2.GCF_000013305.1.seq.rpt.jsonl test-data/genome.3.GCF_000007445.1.genomic.gbff test-data/genome.3.GCF_000007445.1.genomic.gff test-data/genome.3.GCF_000007445.1.seq.rpt.jsonl test-data/genome.3.GCF_000013305.1.genomic.gbff test-data/genome.3.GCF_000013305.1.genomic.gff test-data/genome.3.GCF_000013305.1.seq.rpt.jsonl test-data/human_chrom_21_dehydrated.zip |
| diffstat | 27 files changed, 243 insertions(+), 820 deletions(-) [+] |
line wrap: on
line diff
--- a/datasets_gene.xml Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,206 +0,0 @@ -<tool id="datasets_download_gene" name="NCBI datasets download gene" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download genes from NCBI</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"></expand> - <command><![CDATA[ -@SETUP_CERTIFICATES@ -datasets download gene $subcommand.download_by -#if $subcommand.download_by != 'taxon': - #if $subcommand.text_or_file.text_or_file == 'text': - #if $subcommand.download_by == 'gene-id': - $subcommand.text_or_file.accession - #else if $subcommand.download_by == 'taxon': - '$subcommand.taxon' - #else - #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) - #end if - #if $subcommand.download_by == 'accession' and $subcommand.taxon_filter: - --taxon-filter '$subcommand.taxon_filter' - #end if - #else - --inputfile '$subcommand.text_or_file.inputfile' - #end if -#else: - '$subcommand.taxon' -#end if -@EXCLUDES_GENE@ -#if $subcommand.download_by == 'accession' and $subcommand.include_flanks_bp: - --include-flanks-bp $subcommand.include_flanks_bp -#end if -&& 7z x ncbi_dataset.zip -]]></command> - <inputs> - <conditional name="subcommand"> - <param name="download_by" type="select" label="Choose how to find genomes to download"> - <option value="gene-id">Download a gene dataset by NCBI Gene ID</option> - <option value="symbol">Download a gene dataset by gene symbol</option> - <option value="accession">Download a gene dataset by RefSeq nucleotide or protein accession</option> - <option value="taxon">Download a gene dataset by taxon</option> - </param> - <when value="gene-id"> - <expand macro="text_or_file" what="gene-id" what_extended="NCBI Gene ID" help="Should be valid NCBI Gene ID"> - <sanitizer invalid_char=""> - <valid initial="string.digits"> - <add value=" " /> - </valid> - </sanitizer> - </expand> 
- </when> - <when value="symbol"> - <expand macro="text_or_file" what="symbol" what_extended="gene symbol" help="Should be valid gene symbol"/> - <param argument="--taxon" type="text" value="human" label="Specify a species name" help="Species name can be common or scientific name or species-level NCBI Taxonomy ID"/> - </when> - <when value="accession"> - <expand macro="text_or_file" what="accession" what_extended="RefSeq nucleotide or protein accession" help="Should be RefSeq nucleotide or protein accession"/> - <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Include gene flanking sequence, limited to prokaryotic genes" help="If not specified flanking gene sequences will not be downloaded. Accession must start with WP"/> - <param argument="--taxon-filter" type="text" optional="true" label="limit genes to a specified taxon" help="any rank"/> - </when> - <when value="taxon"> - <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param> - </when> - </conditional> - <expand macro="excludes_gene"></expand> - <conditional name="limit_fasta" label="Limit fasta by accession?"> - <param name="limit" type="select" label="Select limit method"> - <option value="none">None</option> - <option value="text">Enter list of accessions</option> - <option value="file">Read list of accessions from file</option> - </param> - <when value="none"> - </when> - <when value="text"> - <param argument="--fasta-filter" type="text" label="Limit gene fasta download to these accessions"/> - </when> - <when value="file"> - <param argument="--fasta-filter-file" type="data" format="txt" label="File of accessions to limit gene fasta download"/> - </when> - </conditional> - </inputs> - <outputs> - <data name="gene_fasta" format="fasta" label="NCBI datasets gene: gene fasta" from_work_dir="ncbi_dataset/data/gene.fna"> - <filter>not exclude_gene</filter> - </data> - <data name="protein_fasta" format="fasta" label="NCBI datasets 
gene: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa"> - <filter>not exclude_protein</filter> - </data> - <data name="rna_fasta" format="fasta" label="NCBI datasets gene: rna fasta" from_work_dir="ncbi_dataset/data/rna.fna"> - <filter>not exclude_rna</filter> - </data> - <data name="gene_flanks" format="fasta" label="NCBI datasets gene: flanking sequence fasta" from_work_dir="ncbi_dataset/data/gene_flank.fna"> - <filter><![CDATA[subcommand.get('include_flanks_bp')]]></filter> - </data> - </outputs> - <tests> - <test title="test download by gene-id" num_outputs="3"> - <conditional name="subcommand"> - <param name="download_by" value="gene-id"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="472 672"></param> - </conditional> - </conditional> - <output name="gene_fasta"> - <assert_contents> - <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/> - <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/> - </assert_contents> - </output> - </test> - <test title="test download by gene-id, test sanitizer" num_outputs="3"> - <conditional name="subcommand"> - <param name="download_by" value="gene-id"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="472 672"></param> - </conditional> - </conditional> - <output name="gene_fasta"> - <assert_contents> - <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/> - <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/> - </assert_contents> - </output> - <assert_command> - <not_has_text text="exit"/> - </assert_command> - </test> - <test title="test download by gene symbol" num_outputs="3"> - <conditional name="subcommand"> - <param name="download_by" value="symbol"></param> - <conditional name="text_or_file"> - <param 
name="text_or_file" value="text"></param> - <param name="accession" value="BRCA1 ATM"></param> - </conditional> - </conditional> - <output name="gene_fasta"> - <assert_contents> - <has_line line="CCGCGTCCGCGCTTACCCAATACAAGCCGGGCTACGTCCGAGGGTAACAACATGATCAAAACCACAGCAG"/> - <has_line line="GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGC"/> - </assert_contents> - </output> - </test> - <test title="test download by accession" num_outputs="3"> - <conditional name="subcommand"> - <param name="download_by" value="accession"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="NM_000546.6 NM_000492.4"></param> - </conditional> - </conditional> - <output name="gene_fasta"> - <assert_contents> - <has_line line="GTAGTAGGTCTTTGGCATTAGGAGCTTGAGCCCAGACGGCCCTAGCAGGGACCCCAGCGCCCGAGAGACC"/> - <has_line line="CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGC"/> - </assert_contents> - </output> - <assert_command> - <has_text text="'NM_000546.6' 'NM_000492.4'"/> - </assert_command> - </test> - <test title="test download by accession with flanking sequence" num_outputs="4"> - <conditional name="subcommand"> - <param name="download_by" value="accession"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="WP_004675351.1"></param> - </conditional> - <param name="include_flanks_bp" value="10"/> - </conditional> - <output name="gene_flanks"> - <assert_contents> - <has_line line="gccctgccgcATGATCGATCTGATGCCGACGAGCGAGGAACAGGCGGCGGCGATCGTCCGCACCCATGCG"/> - </assert_contents> - </output> - <assert_command> - <has_text text="--include-flanks-bp 10"/> - </assert_command> - </test> - <test title="test download by taxon" num_outputs="1"> - <conditional name="subcommand"> - <param name="download_by" value="taxon"></param> - <param name="taxon" value="Mycobacterium tuberculosis H37Rv"></param> - 
</conditional> - <param name="exclude_rna" value="true"/> - <param name="exclude_protein" value="true"/> - <output name="gene_fasta"> - <assert_contents> - <has_line line="GTGGCGCTGAATATCAAAGACCCTGAGGTAGACCGACTAGCCGCCGAACTCGCTGACCGGCTGCACACCA"/> - </assert_contents> - </output> - </test> - </tests> - <help> -Download a gene dataset including gene, transcript and protein sequence, a data table and a data report. Gene datasets can be specified by NCBI Gene ID, symbol or RefSeq accession. Datasets are downloaded as a zip file. - -The default gene dataset includes the following files: - * gene.fna (gene sequences) - * rna.fna (transcript sequences) - * protein.faa (protein sequences) - * data_report.jsonl (data report with gene metadata) - * data_table.tsv (data table with gene metadata, one transcript per row) - * dataset_catalog.json (a list of files and file types included in the dataset) - </help> - -</tool>
--- a/datasets_genome.xml Thu Jul 15 15:45:43 2021 +0000 +++ b/datasets_genome.xml Thu Jan 27 08:20:15 2022 +0000 @@ -1,131 +1,170 @@ -<tool id="datasets_download_genome" name="NCBI datasets download genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download assembled genomes from NCBI</description> +<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> + <description>download genome sequence, annotation and metadata</description> <macros> <import>macros.xml</import> </macros> <expand macro="requirements"></expand> <command><![CDATA[ @SETUP_CERTIFICATES@ -datasets download genome $subcommand.download_by -#if $subcommand.download_by == 'accession': - #if $subcommand.text_or_file.text_or_file == 'text': - #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) +datasets download genome $query.subcommand.download_by +#if $query.subcommand.download_by == 'accession': + #if $query.subcommand.text_or_file.text_or_file == 'text': + #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) #else - --inputfile '$subcommand.text_or_file.inputfile' + --inputfile '$query.subcommand.text_or_file.inputfile' #end if #else: - '$subcommand.taxon' + '$query.subcommand.taxon' +#end if +$filters.reference +$filters.annotated +#if $filters.assembly_level: +--assembly_level $filters.assembly_level #end if -$annotated -$dehydrated -#if $assembly_level: ---assembly_level $assembly_level +#if $filters.assembly_source: +--assembly_source $filters.assembly_source #end if -#if $assembly_source: ---assembly_source $assembly_source +#if $filters.chromosomes: +--chromosomes '$filters.chromosomes' #end if ---chromosomes '$chromosomes' @EXCLUDES_GENOME@ @INCLUDES_GENOME@ -$reference @RELEASED_BEFORE@ @RELEASED_SINCE@ -#for search_term in $search: - --search '$search_term' +#for search_term in $filters.search: + --search 
'$filters.search_term' #end for -#if not $dehydrated: - && 7z x ncbi_dataset.zip +#if $uncompressed +&& unzip ncbi_dataset.zip +#else +&& unzip -l ncbi_dataset.zip > ncbi_dataset.txt #end if ]]></command> <inputs> - <conditional name="subcommand"> - <param name="download_by" type="select" label="Choose how to find genomes to download"> - <option value="accession">Download by NCBI assembly or BioProject accession</option> - <option value="taxon">Download by taxon</option> - </param> - <when value="accession"> - <expand macro="text_or_file"/> - </when> - <when value="taxon"> - <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurs, etc."></param> - </when> - </conditional> - <expand macro="annotation"></expand> - <expand macro="dehydrated"></expand> - <expand macro="assembly_level"></expand> - <expand macro="assembly_source"></expand> - <expand macro="chromosomes"></expand> - <expand macro="excludes_genome"></expand> - <expand macro="includes_genome"></expand> - <expand macro="released_options"></expand> - <expand macro="released_options" before_or_after="since"></expand> - <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> - <repeat name="search" title="Add search terms"> - <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> - </repeat> + <section name="query" title="Query" expanded="true"> + <conditional name="subcommand"> + <param name="download_by" type="select" label="Choose how to find genomes to download"> + <option value="accession">Download by NCBI assembly or BioProject accession</option> + <option value="taxon">Download by taxon</option> + </param> + <when value="accession"> + <expand macro="text_or_file"/> + </when> + <when value="taxon"> + <param name="taxon" 
type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."></param> + </when> + </conditional> + </section> + <section name="filters" title="Filters and Limit"> + <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/> + <expand macro="annotation"></expand> + <expand macro="assembly_level"></expand> + <expand macro="assembly_source"></expand> + <expand macro="chromosomes"></expand> + <expand macro="released_options"></expand> + <expand macro="released_options" before_or_after="since"></expand> + + <repeat name="search" title="Add search terms"> + <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> + </repeat> + </section> + <section name="file_choices" title="File Choices"> + <expand macro="excludes_genome"></expand> + <expand macro="includes_genome"></expand> + </section> + <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> </inputs> <outputs> - <data name="dehydrated_archive" format="zip" label="Dehydrated Archive" from_work_dir="ncbi_dataset.zip"> - <filter>dehydrated</filter> + <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> + <filter>not uncompressed</filter> + </data> + <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> + <filter>not uncompressed</filter> </data> - <collection name="genome_fasta" label="NCBI genome datasets: genome fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/.*_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_seq</filter> + <data name="genome_data_report" format="json" label="NCBI 
Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> + <filter>uncompressed</filter> + </data> + <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed</filter> </collection> - <collection name="protein_fasta" label="NCBI genome datasets: protein fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_protein</filter> + <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/.*(?<!cds_from)(chr|unplaced|_genomic)*fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_seq']</filter> </collection> - <collection name="genomic_gff" label="NCBI genome datasets: genomic gff" type="list"> + <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_genomic_cds']</filter> + </collection> + <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and not exclude_gff3</filter> + <filter>uncompressed and file_choices['exclude_gff3']</filter> + </collection> + 
<collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_rna']</filter> </collection> - <collection name="genomic_gtf" label="NCBI genome datasets: genomic gtf" type="list"> + <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['exclude_protein']</filter> + </collection> + <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> + <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> + <filter>uncompressed and file_choices['include_gbff']</filter> + </collection> + <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and include_gtf</filter> - </collection> - <collection name="genomic_gbff" label="NCBI genome datasets: genomic gbff" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="genbank" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"></discover_datasets> - <filter>not dehydrated and include_gbff</filter> + <filter>uncompressed and file_choices['include_gtf']</filter> </collection> </outputs> <tests> - <test title="test dehydrated download by taxon"> - <conditional name="subcommand"> + <test expect_num_outputs="2"> + <conditional name="query|subcommand"> 
<param name="download_by" value="taxon"></param> <param name="text_or_file" value="text"></param> <param name="taxon" value="human"></param> </conditional> <param name="chromosomes" value="21"></param> - <param name="dehydrated" value="true"/> + <param name="uncompressed" value="false"/> <param name="released_before" value="01/01/2018"></param> - <output name="dehydrated_archive" value="human_chrom_21_dehydrated.zip" compare="sim_size" delta="10000"/> + <output name="archive_contents"> + <assert_contents> + <has_text text="ncbi_dataset/data/dataset_catalog.json"/> + </assert_contents> + </output> </test> - <test title="test download by comma-separated accession"> - <conditional name="subcommand"> + <test expect_num_outputs="5"> + <conditional name="query|subcommand"> <param name="download_by" value="accession"></param> <conditional name="text_or_file"> <param name="text_or_file" value="text"></param> <param name="accession" value="GCF_000013305.1 GCF_000007445.1"></param> </conditional> </conditional> - <param name="dehydrated" value="false"/> + <param name="uncompressed" value="true"/> <param name="released_before" value="01/01/2007"></param> - <output_collection name="genome_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> + <param name="exclude_genomic_cds" value="true"/> + <param name="include_gtf" value="true"/> + <output name="genome_data_report"> + <assert_contents> + <has_text text="GCF_000013305.1"/> + </assert_contents> + </output> + <output_collection name="sequence_report" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> </output_collection> - <output_collection name="protein_fasta" type="list"> - <element name="GCF_000013305.1" 
file="GCF_000013305.1.protein.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> + <output_collection name="genomic_gtf" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> + <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> </output_collection> - <output_collection name="genomic_gff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> + <output_collection name="genomic_cds" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> </output_collection> </test> - <test title="test download by accessions listed in file"> - <conditional name="subcommand"> + <test expect_num_outputs="4"> + <conditional name="query|subcommand"> <param name="download_by" value="accession"></param> <conditional name="text_or_file"> <param name="text_or_file" value="file"></param> @@ -133,43 +172,44 @@ </conditional> </conditional> <param name="include_gbff" value="true"/> - <param name="include_gtf" value="true"/> - <param name="dehydrated" value="false"/> - <param name="released_before" value="01/01/2007"></param> - <output_collection name="genome_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genome.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genome.fa" compare="contains"/> - </output_collection> - <output_collection name="protein_fasta" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.protein.fa" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.protein.fa" compare="contains"/> + <param name="exclude_seq" value="false"/> + 
<param name="exclude_gff3" value="true"/> + <param name="uncompressed" value="true"/> + <param name="released_before" value="01/02/2007"></param> + <output name="genome_data_report"> + <assert_contents> + <has_text text="SAMN02604181"/> + </assert_contents> + </output> + <output_collection name="sequence_report" type="list"> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.seq.rpt.jsonl" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.seq.rpt.jsonl" compare="contains"/> </output_collection> <output_collection name="genomic_gff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gff" compare="contains"/> - </output_collection> - <output_collection name="genomic_gtf" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gtf" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> + <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> + <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> </output_collection> <output_collection name="genomic_gbff" type="list"> - <element name="GCF_000013305.1" file="GCF_000013305.1.genomic.gbff" compare="contains"/> - <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gbff" compare="contains"/> + <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> + <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> </output_collection> </test> </tests> <help> +<![CDATA[ +**Download Genome Datasets from NCBI** Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. 
Datasets are downloaded as a zip file. -The default genome dataset includes the following files (if available): -* genomic.fna (genomic sequences) -* rna.fna (transcript sequences) -* protein.faa (protein sequences) -* genomic.gff (genome annotation in gff3 format) -* data_report.jsonl (data report with genome assembly and annotation metadata) -* dataset_catalog.json (a list of files and file types included in the dataset) +The default genome dataset includes the following files (if available): + * genomic.fna (genomic sequences) + * rna.fna (transcript sequences) + * protein.faa (protein sequences) + * genomic.gff (genome annotation in gff3 format) + * data_report.jsonl (data report with genome assembly and annotation metadata) + * dataset_catalog.json (a list of files and file types included in the dataset) +]]> </help> </tool>
--- a/datasets_ortholog.xml Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,143 +0,0 @@ -<tool id="datasets_download_ortholog" name="NCBI datasets download ortholog" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download an ortholog dataset</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"></expand> - <command><![CDATA[ -@SETUP_CERTIFICATES@ -datasets download ortholog $subcommand.download_by -#if $subcommand.download_by != 'taxon': - #if $subcommand.text_or_file.text_or_file == 'text': - #if $subcommand.download_by == 'gene-id': - $subcommand.text_or_file.accession - #else - #echo " ".join(f"'{x}'" for x in $subcommand.text_or_file.accession.split(' ') if x) - #end if - #else - --inputfile '$subcommand.text_or_file.inputfile' - #end if -#else: - '$subcommand.taxon' -#end if -@EXCLUDES_GENE@ -#if $taxon_filter: - --taxon-filter '$taxon_filter' -#end if -&& 7z x ncbi_dataset.zip -]]></command> - <inputs> - <conditional name="subcommand"> - <param name="download_by" type="select" label="Choose how to find ortholog dataset to download"> - <option value="gene-id">Download a ortholog dataset by NCBI Gene ID</option> - <option value="symbol">Download a ortholog dataset by gene symbol</option> - <option value="accession">Download a orthologsdataset by RefSeq nucleotide or protein accession</option> - </param> - <when value="gene-id"> - <expand macro="text_or_file" what="gene-id" what_extended="NCBI Gene ID" help="Should be valid NCBI Gene ID"> - <sanitizer invalid_char=""> - <valid initial="string.digits"> - <add value=" " /> - </valid> - </sanitizer> - </expand> - </when> - <when value="symbol"> - <expand macro="text_or_file" what="symbol" what_extended="gene symbol" help="Should be valid gene symbol"/> - <param argument="--taxon" type="text" value="human" label="Specify a species name" help="Species name can be common or scientific name or species-level 
NCBI Taxonomy ID"/> - </when> - <when value="accession"> - <expand macro="text_or_file" what="accession" what_extended="RefSeq nucleotide or protein accession" help="Should be RefSeq nucleotide or protein accession"/> - </when> - </conditional> - <param argument="--taxon-filter" type="text" optional="true" label="limit genes to a specified taxon" help="any rank"/> - <expand macro="excludes_gene"></expand> - </inputs> - <outputs> - <data name="gene_fasta" format="fasta" label="NCBI datasets ortholog: gene fasta" from_work_dir="ncbi_dataset/data/gene.fna"> - <filter>not exclude_gene</filter> - </data> - <data name="protein_fasta" format="fasta" label="NCBI datasets ortholog: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa"> - <filter>not exclude_protein</filter> - </data> - <data name="rna_fasta" format="fasta" label="NCBI datasets ortholog: rna fasta" from_work_dir="ncbi_dataset/data/rna.fna"> - <filter>not exclude_rna</filter> - </data> - </outputs> - <tests> - <test title="test download by gene-id"> - <conditional name="subcommand"> - <param name="download_by" value="gene-id"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="472 672"></param> - </conditional> - </conditional> - <param name="taxon_filter" value="Puma"/> - <output name="gene_fasta"> - <assert_contents> - <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/> - <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/> - </assert_contents> - </output> - </test> - <test title="test download by gene-id, test sanitizer"> - <conditional name="subcommand"> - <param name="download_by" value="gene-id"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="472 672"></param> - </conditional> - </conditional> - <param name="taxon_filter" value="Puma"/> - <output name="gene_fasta"> - 
<assert_contents> - <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/> - <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/> - </assert_contents> - </output> - <assert_command> - <not_has_text text="exit"/> - </assert_command> - </test> - <test title="test download by gene symbol"> - <conditional name="subcommand"> - <param name="download_by" value="symbol"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="BRCA1 ATM"></param> - </conditional> - </conditional> - <param name="taxon_filter" value="Puma"/> - <output name="gene_fasta"> - <assert_contents> - <has_line line="ATGGATTTATCTGCAGATCGTGTTGAAGAAGTACAAAGTGTCCTTAATGCTATGCAGAAAATCTTAGAGT"/> - <has_line line="GGGCAGAGGGGCGGAACTACAAGTGCGCAATCGTGGGCCGCGGCCCATTTCCCCTTCCCAGGTAAATTCG"/> - </assert_contents> - </output> - </test> - <test title="test download by accession"> - <conditional name="subcommand"> - <param name="download_by" value="accession"></param> - <conditional name="text_or_file"> - <param name="text_or_file" value="text"></param> - <param name="accession" value="NM_000546.6 NM_000492.4"></param> - </conditional> - </conditional> - <param name="taxon_filter" value="Puma"/> - <output name="gene_fasta"> - <assert_contents> - <has_line line="ATGCAGGAGCCGCCATTGGAACTCACCATCGAGCCCCCTCTGAGCCAGGAGACATTTTCGGAATTGTGGA"/> - <has_line line="AGTTGGAAGCAAATGACATCACTGCGGGTCAGAGAAAAAGGGGCGAGCAGCCTGCGCCAGAAGAGTAGGG"/> - </assert_contents> - </output> - <assert_command> - <has_text text="'NM_000546.6' 'NM_000492.4'"/> - </assert_command> - </test> - </tests> - <help> - Download an ortholog dataset including gene, transcript and protein sequence, a data table and a data report. Ortholog data is calculated by NCBI for vertebrates and insects. Ortholog datasets can be specified by NCBI Gene ID, symbol or RefSeq accession.) - </help> - -</tool>
--- a/datasets_virus_genome.xml Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -<tool id="datasets_virus_genome" name="NCBI datasets download virus genome" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download a coronavirus genome dataset including genome, CDS and protein sequence, annotation - and a detailed data report</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"></expand> - <command><![CDATA[ -@SETUP_CERTIFICATES@ -datasets download virus genome taxon '$taxon' -$annotated -$complete_only -@EXCLUDES_VIRUS_GENOME@ -@INCLUDES_VIRUS_GENOME@ -#if str($geo_location): - --geo-location '$geo_location' -#end if -#if str($host): - --host '$host' -#end if -#if str($lineage): - --host '$lineage' -#end if -$refseq -@RELEASED_SINCE@ -&& 7z x ncbi_dataset.zip -]]></command> - <inputs> - <param name="taxon" type="text" label="Enter taxon" help="e.g. sars-cov-2, coronaviridae"></param> - <expand macro="annotation"></expand> - <param argument="--complete-only" truevalue="--complete-only" falsevalue="" type="boolean" label="limit to complete coronavirus genomes?"/> - <expand macro="excludes_virus_genome"></expand> - <expand macro="includes_virus_genome"></expand> - <param argument="--geo-location" type="text" label="Limit to coronavirus genomes isolated from a specified geographic location" help="Continent, country or U.S. 
state"/> - <param argument="--host" type="text" label="Limit to coronavirus genomes isolated from a specified host" help="NCBI Taxonomy ID, scientific or common name at any taxonomic rank"/> - <param argument="--lineage" type="text" label="Limit to SARS-CoV-2 genomes classified as the specified lineage (variant) by pangolin using the pangoLEARN algorithm" /> - <param argument="--refseq" type="boolean" truevalue="--refseq" falsevalue="" label="Limit to RefSeq coronavirus genomes"/> - <expand macro="released_options" before_or_after="since"></expand> - </inputs> - <outputs> - <data name="genomic_fasta" format="fasta" label="NCBI datasets virus genome: genomic fasta" from_work_dir="ncbi_dataset/data/genomic.fna"> - <filter>not exclude_seq</filter> - </data> - <data name="genomic_genbank" format="fasta" label="NCBI datasets virus genome: genomic genbank" from_work_dir="ncbi_dataset/data/genomic.gbff"> - <filter>include_gbff</filter> - </data> - <data name="cds_fasta" format="fasta" label="NCBI datasets virus genome: CDS fasta" from_work_dir="ncbi_dataset/data/cds.fna"> - <filter>not exclude_cds</filter> - </data> - <data name="protein_fasta" format="fasta" label="NCBI datasets virus genome: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa"> - <filter>not exclude_protein</filter> - </data> - <data name="protein_genbank" format="fasta" label="NCBI datasets virus genome: protein genbank" from_work_dir="ncbi_dataset/data/protein.gpff"> - <filter>not exclude_gpff</filter> - </data> - <collection name="protein_structure" type="list" format="pdb" label="NCBI datasets virus genome: protein structure"> - <discover_datasets pattern="(?P<identifier_0>.*?)\.pdb" ext="pdb" directory="ncbi_dataset/data/pdb"></discover_datasets> - <filter>not exclude_pdb</filter> - </collection> - </outputs> - <tests> - <test title="Test download of PDB collection"> - <param name="taxon" value="sars-cov-2"/> - <param name="exclude_seq" value="true"/> - <param name="exclude_cds" 
value="true"/> - <param name="exclude_protein" value="true"/> - <param name="exclude_gpff" value="true"/> - <param name="released_since" value="07/07/2021"/> - <param name="refseq" value="true"/> - <output_collection name="protein_structure" type="list"> - <element name="6VYB" checksum="sha256$307a56951050faa61f4b57e6b8ceabb7ca743125058421c232746f1820484069"/> - <element name="6VYO" checksum="sha256$1dab20880b7ae913da336e8a6dba838689256e63a3faaaaa439b7bd7f3651eaf"/> - <element name="6W37" checksum="sha256$f115326ed4b3f7b332b44790c1a3ca769deb2d440dae68ce4ccae8e650dd1d7e"/> - <element name="6W4H" checksum="sha256$a49bc40b5652664b7e01e562279786520b0abcf6e34a7d7f603d6e429afbf384"/> - <element name="6W9C" checksum="sha256$0f845885e5a9d41e42628c3b05b194ecbac59f38211927eb924e80c830190753"/> - <element name="6W9Q" checksum="sha256$c5d34126464ac47738c8883f03455cf3d73ba41a6aa9e0c40e92bca411321ed7"/> - </output_collection> - </test> - <test title="Test download of non-collection elements" expect_num_outputs="5"> - <param name="taxon" value="sars-cov-2"/> - <param name="include_gbff" value="true"/> - <param name="exclude_pdb" value="true"/> - <param name="refseq" value="true"/> - <output name="genomic_fasta"> - <assert_contents> - <has_line line="ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA"/> - </assert_contents> - </output> - <output name="genomic_genbank"> - <assert_contents> - <has_line line="ACCESSION NC_045512"/> - </assert_contents> - </output> - <output name="cds_fasta"> - <assert_contents> - <has_line line="AGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTA"/> - </assert_contents> - </output> - <output name="protein_fasta"> - <assert_contents> - <has_line line="MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLEQPYVF"/> - </assert_contents> - </output> - <output name="protein_genbank"> - <assert_contents> - <has_line line="ACCESSION YP_009724389"/> - </assert_contents> - </output> - </test> - </tests> - 
<help> -Download a coronavirus genome dataset including genome, CDS and protein sequence, annotation -and a detailed data report. Coronavirus genome datasets are limited to the Coronaviridae family -including SARS-CoV-2. Coronavirus genome datasets can be specified by taxon. Datasets are -downloaded as a zip file. - -The default coronavirus genome dataset includes the following files (if available): -* genomic.fna (genomic sequences) -* cds.fna (nucleotide coding sequences) -* protein.faa (protein sequences) -* protein.gpff (protein sequence and annotation in GenPept flat file format) -* protein structures in PDB format -* data_report.jsonl (data report with viral metadata) -* virus_dataset.md (README containing details on sequence file data content and other information) -* dataset_catalog.json (a list of files and file types included in the dataset) - -Refer to NCBI's [command line quickstart](https://www.ncbi.nlm.nih.gov/datasets/docs/quickstarts/command-line-tools/) documentation for information about getting started with the command-line tools. - </help> - -</tool>
--- a/datasets_virus_protein.xml Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -<tool id="datasets_virus_protein" name="NCBI datasets download virus protein" profile="@PROFILE@" license="@LICENSE" version="@TOOL_VERSION@"> - <description>Download a coronavirus protein dataset</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"></expand> - <command><![CDATA[ -@SETUP_CERTIFICATES@ -datasets download virus protein -#for $protein in $proteins: - $protein -#end for -$annotated -$complete_only -@EXCLUDES_VIRUS_PROTEIN@ -#if str($geo_location): - --geo-location '$geo_location' -#end if -#if str($host): - --host '$host' -#end if -#if str($lineage): - --host '$lineage' -#end if -$refseq -@RELEASED_SINCE@ -&& 7z x ncbi_dataset.zip -]]></command> - <inputs> - <param name="proteins" type="select" multiple="true" label="Select viral protein(s)"> - <option value="ORF1ab">ORF1ab</option> - <option value="ORF1a">ORF1a</option> - <option value="nsp1">nsp1</option> - <option value="nsp2">nsp2</option> - <option value="nsp3">nsp3</option> - <option value="nsp4">nsp4</option> - <option value="nsp5">nsp5</option> - <option value="nsp6">nsp6</option> - <option value="nsp7">nsp7</option> - <option value="nsp8">nsp8</option> - <option value="nsp9">nsp9</option> - <option value="nsp10">nsp10</option> - <option value="rdrp">rdrp</option> - <option value="nsp11">nsp11</option> - <option value="nsp13">nsp13</option> - <option value="nsp14">nsp14</option> - <option value="nsp15">nsp15</option> - <option value="nsp16">nsp16</option> - <option value="S">S</option> - <option value="ORF3a">ORF3a</option> - <option value="E">E</option> - <option value="M">M</option> - <option value="ORF6">ORF6</option> - <option value="ORF7a">ORF7a</option> - <option value="ORF7b">ORF7b</option> - <option value="ORF8">ORF8</option> - <option value="N">N</option> - <option value="ORF10">ORF10</option> - </param> - <expand 
macro="annotation"></expand> - <param argument="--complete-only" truevalue="--complete-only" falsevalue="" type="boolean" label="limit to complete coronavirus genomes?"/> - <expand macro="excludes_virus_protein"></expand> - <param argument="--geo-location" type="text" label="Limit to coronavirus genomes isolated from a specified geographic location" help="Continent, country or U.S. state"/> - <param argument="--host" type="text" label="Limit to coronavirus genomes isolated from a specified host" help="NCBI Taxonomy ID, scientific or common name at any taxonomic rank"/> - <param argument="--lineage" type="text" label="Limit to SARS-CoV-2 genomes classified as the specified lineage (variant) by pangolin using the pangoLEARN algorithm" /> - <param argument="--refseq" type="boolean" truevalue="--refseq" falsevalue="" label="Limit to RefSeq coronavirus genomes"/> - <expand macro="released_options" before_or_after="since"></expand> - </inputs> - <outputs> - <data name="cds_fasta" format="fasta" label="NCBI datasets virus genome: CDS fasta" from_work_dir="ncbi_dataset/data/cds.fna"> - <filter>not exclude_cds</filter> - </data> - <data name="protein_fasta" format="fasta" label="NCBI datasets virus genome: protein fasta" from_work_dir="ncbi_dataset/data/protein.faa"> - <filter>not exclude_protein</filter> - </data> - <data name="protein_genbank" format="fasta" label="NCBI datasets virus genome: protein genbank" from_work_dir="ncbi_dataset/data/protein.gpff"> - <filter>not exclude_gpff</filter> - </data> - <collection name="protein_structure" type="list" format="pdb" label="NCBI datasets virus genome: protein structure"> - <discover_datasets pattern="(?P<identifier_0>.*?)\.pdb" ext="pdb" directory="ncbi_dataset/data/pdb"></discover_datasets> - <filter>not exclude_pdb</filter> - </collection> - </outputs> - <tests> - <test title="Test download of PDB collection"> - <param name="proteins" value="S,M"/> - <param name="exclude_cds" value="true"/> - <param name="exclude_protein" 
value="true"/> - <param name="exclude_gpff" value="true"/> - <param name="released_since" value="07/07/2021"/> - <param name="refseq" value="true"/> - <output_collection name="protein_structure" type="list"> - <element name="6VYB" checksum="sha256$307a56951050faa61f4b57e6b8ceabb7ca743125058421c232746f1820484069"/> - <element name="6VYO" checksum="sha256$1dab20880b7ae913da336e8a6dba838689256e63a3faaaaa439b7bd7f3651eaf"/> - <element name="6W37" checksum="sha256$f115326ed4b3f7b332b44790c1a3ca769deb2d440dae68ce4ccae8e650dd1d7e"/> - <element name="6W4H" checksum="sha256$a49bc40b5652664b7e01e562279786520b0abcf6e34a7d7f603d6e429afbf384"/> - <element name="6W9C" checksum="sha256$0f845885e5a9d41e42628c3b05b194ecbac59f38211927eb924e80c830190753"/> - <element name="6W9Q" checksum="sha256$c5d34126464ac47738c8883f03455cf3d73ba41a6aa9e0c40e92bca411321ed7"/> - </output_collection> - </test> - <test title="Test download of non-collection elements" expect_num_outputs="3"> - <param name="exclude_pdb" value="true"/> - <param name="proteins" value="S,M"/> - <param name="refseq" value="true"/> - <output name="cds_fasta"> - <assert_contents> - <has_line line="ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAAT"/> - </assert_contents> - </output> - <output name="protein_fasta"> - <assert_contents> - <has_line line="MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV"/> - </assert_contents> - </output> - <output name="protein_genbank"> - <assert_contents> - <has_line line="ACCESSION YP_009724390"/> - </assert_contents> - </output> - </test> - </tests> - <help> -Downloads a coronavirus SARS-CoV-2 protein dataset - </help> - -</tool>
--- a/macros.xml Thu Jul 15 15:45:43 2021 +0000 +++ b/macros.xml Thu Jan 27 08:20:15 2022 +0000 @@ -1,11 +1,11 @@ <macros> - <token name="@TOOL_VERSION@">11.25.1</token> + <token name="@TOOL_VERSION@">12.27.1</token> <token name="@PROFILE@">20.01</token> <token name="@LICENSE@">MIT</token> <token name="@PROFILE_AND_LICENSE@">profile="@PROFILE@" license="@LICENSE@"</token> <token name="@SETUP_CERTIFICATES@"><![CDATA[ ## If running in container use certificate from ca-certificates instead of outdated / missing container certificates -[ -f /usr/local/ssl/cacert.pem ] && export export SSL_CERT_FILE="/usr/local/ssl/cacert.pem"; +[ -f /usr/local/ssl/cacert.pem ] && export SSL_CERT_FILE="/usr/local/ssl/cacert.pem"; ]]></token> <xml name="requirements"> <requirements> @@ -51,7 +51,7 @@ </conditional> </xml> <xml name="chromosomes"> - <param argument="--chromosomes" type="text" value="all" label="Limit chromosomes to a comma-delimited list of chromosomes"> + <param argument="--chromosomes" type="text" label="Limit chromosomes to a comma-delimited list of chromosomes"> <sanitizer invalid_char=""> <valid initial="string.letters,string.digits"> <add value="_" /> @@ -61,14 +61,25 @@ </sanitizer> </param> </xml> + <xml name="include" token_include_what="gbff" token_include_label="Include GenBank flat file sequence and annotation, if available"> + <param argument="--include-@INCLUDE_WHAT@" type="boolean" truevalue="--include-@INCLUDE_WHAT@" falsevalue="" label="@INCLUDE_LABEL@" /> + </xml> + <xml name="includes_genome"> + <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation, if available"/> + <expand macro="include" include_what="gtf" include_label="Include gtf annotation file, if available"/> + </xml> <xml name="exclude" token_exclude_what="gff3" token_exclude_label="Exclude gff3 annotation file"> <param argument="--exclude-@EXCLUDE_WHAT@" type="boolean" truevalue="--exclude-@EXCLUDE_WHAT@" falsevalue="" 
label="@EXCLUDE_LABEL@" /> </xml> + <xml name="anti-exclude" token_exclude_what="gff3" token_exclude_label="Include gff3 annotation file" token_checked="false"> + <param argument="--exclude-@EXCLUDE_WHAT@" type="boolean" falsevalue="--exclude-@EXCLUDE_WHAT@" truevalue="" label="@EXCLUDE_LABEL@" checked="@CHECKED@"/> + </xml> <xml name="excludes_genome"> - <expand macro="exclude" exclude_what="gff3" exclude_label="Exclude gff3 annotation file"/> - <expand macro="exclude" exclude_what="protein" exclude_label="Exclude protein sequence file"/> - <expand macro="exclude" exclude_what="rna" exclude_label="Exclude transcript sequence file"/> - <expand macro="exclude" exclude_what="seq" exclude_label="Exclude genomic sequence file"/> + <expand macro="anti-exclude" exclude_what="seq" exclude_label="Include genomic sequence file" checked="true"/> + <expand macro="anti-exclude" exclude_what="gff3" exclude_label="Include gff3 annotation file"/> + <expand macro="anti-exclude" exclude_what="genomic-cds" exclude_label="Include cds from genomic sequence file"/> + <expand macro="anti-exclude" exclude_what="protein" exclude_label="Include protein sequence file"/> + <expand macro="anti-exclude" exclude_what="rna" exclude_label="Include transcript sequence file"/> </xml> <xml name="excludes_gene"> <expand macro="exclude" exclude_what="gene" exclude_label="Exclude gene sequence file"/> @@ -87,31 +98,24 @@ <expand macro="exclude" exclude_what="seq" exclude_label="Exclude genomic sequence file"/> </expand> </xml> - <token name="@EXCLUDES_GENOME@">$exclude_gff3 $exclude_protein $exclude_rna $exclude_seq</token> + <token name="@EXCLUDES_GENOME@">$file_choices.exclude_gff3 $file_choices.exclude_genomic_cds $file_choices.exclude_protein $file_choices.exclude_rna $file_choices.exclude_seq</token> <token name="@EXCLUDES_GENE@">$exclude_gene $exclude_protein $exclude_rna</token> <token name="@EXCLUDES_VIRUS_PROTEIN@">$exclude_protein $exclude_pdb $exclude_gpff $exclude_cds</token> <token 
name="@EXCLUDES_VIRUS_GENOME@">$exclude_seq @EXCLUDES_VIRUS_PROTEIN@</token> - <xml name="include" token_include_what="gbff" token_include_label="Include GenBank flat file sequence and annotation, if available"> - <param argument="--include-@INCLUDE_WHAT@" type="boolean" truevalue="--include-@INCLUDE_WHAT@" falsevalue="" label="@INCLUDE_LABEL@" /> - </xml> - <xml name="includes_genome"> - <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation, if available"/> - <expand macro="include" include_what="gtf" include_label="Include gtf annotation file, if available"/> - </xml> <xml name="includes_virus_genome"> <expand macro="include" include_what="gbff" include_label="Include GenBank flat file sequence and annotation"/> </xml> - <token name="@INCLUDES_GENOME@">$include_gbff $include_gtf</token> + <token name="@INCLUDES_GENOME@">$file_choices.include_gbff $file_choices.include_gtf</token> <token name="@INCLUDES_VIRUS_GENOME@">$include_gbff</token> <xml name="released_options" token_released_what="genomes" token_before_or_after="before"> <param argument="--released-@BEFORE_OR_AFTER@" type="text" optional="true" label="Only include @RELEASED_WHAT@ that have been released @BEFORE_OR_AFTER@ a specified date (MM/DD/YYYY)"></param> </xml> - <token name="@RELEASED_BEFORE@">#if $released_before: ---released-before '$released_before' + <token name="@RELEASED_BEFORE@">#if $filters.released_before: +--released-before '$filters.released_before' #end if </token> - <token name="@RELEASED_SINCE@">#if $released_since: ---released-since '$released_since' + <token name="@RELEASED_SINCE@">#if $filters.released_since: +--released-since '$filters.released_since' #end if </token> -</macros> \ No newline at end of file +</macros>
--- a/test-data/GCF_000007445.1.genome.fa Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->NC_004431.1 Escherichia coli CFT073, complete sequence -AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTG -GTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGAC -AGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT -AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTGTGCACAGAAA -ACCCCCAGCTAGGCTGGGGGTTCCGGAAAGCTTTCAGCTTTGAGCCAGTTATTAAAACCCCTTTTGATTTGTTAAAACAC -CTTGCGGTCTGGCAACTGCAAGTGTCAAACAAGAAATCAAAAGGGGGTCCCAATGGGGAACGAAAAGAGCTTAGCGCACA -CCCGATGGAACTGTAAATATCACATAGTATTTGCGCCAAAATACCGAAGACAGGTGTTCTACAGAGAGAAGCGTAGAGCA -ATAGGCTGTATTTTGAGAAAGCTGTGTGAGTGGAAAAGTGTACGGATTCTGGAAGCTGAATGCTGTGCAGATCATATCCA -TATGCTTGTGGAGATCCCGCCCAAAATGAGCGTATCAGGCTTTATGGGATATCTGAAAGGGAAAAGCAGTCTGATGCCTT
--- a/test-data/GCF_000007445.1.genomic.gbff Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -LOCUS NC_004431 5231428 bp DNA circular CON 13-MAY-2021 -DEFINITION Escherichia coli CFT073, complete sequence. -ACCESSION NC_004431 NZ_AE016755 NZ_AE016756 NZ_AE016757 NZ_AE016758 - NZ_AE016759 NZ_AE016760 NZ_AE016761 NZ_AE016762 NZ_AE016763 - NZ_AE016764 NZ_AE016765 NZ_AE016766 NZ_AE016767 NZ_AE016768 - NZ_AE016769 NZ_AE016770 NZ_AE016771 NZ_AE016772 -VERSION NC_004431.1 -DBLINK BioProject: PRJNA224116 - BioSample: SAMN02604094 - Assembly: GCF_000007445.1
--- a/test-data/GCF_000007445.1.genomic.gff Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -##gff-version 3 -#!gff-spec-version 1.21 -#!processor NCBI annotwriter -#!genome-build ASM744v1 -#!genome-build-accession NCBI_Assembly:GCF_000007445.1 -#!annotation-date 05/06/2021 17:43:00 -#!annotation-source NCBI RefSeq -##sequence-region NC_004431.1 1 5231428 -##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=199310 -NC_004431.1 RefSeq region 1 5231428 . + . ID=NC_004431.1:1..5231428;Dbxref=taxon:199310;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=CFT073
--- a/test-data/GCF_000007445.1.protein.fa Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->WP_000002278.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia] -MANLSGYNFAYLDEQTKRMIRRAILKAVAIPGYQVPFGGREMPMPYGWGTGGIQLTASVIGESDVLKVIDQGADDTTNAV -SIRNFFKRVTGVNTTERTDDATLIQTRHRIPETPLTEDQIIIFQVPIPEPLRFIEPRETETRTMHALEEYGVMQVKLYED -IARFGHIATTYAYPVKVNGRYVMDPSPIPKFDNPKMDMMPALQLFGAGREKRIYAVPPFTHVESLDFDDHPFTVQQWDEP -CAICGSTHSYLDEVVLDDAGNRMFVCSDTDYCRQQNEAKSQ ->WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] -MANMFALILVIATLVTGILWCVDKFFFAPKRRERQAAAQAAAGDSLDKATLKKVAPKPGWLETGASVFPVLAIVLIVRSF -IYEPFQIPSGSMMPTLLIGDFILVEKFAYGIKDPIYQKTLIETGHPKRGDIVVFKYPEDPKLDYIKRAVGLPGDKVTYDP -VSKELTIQPGCSSGQACENALPVTYSNVEPSDFVQTFSRRNGGEATSGFFEVPKNETKENGIRLSERKETLGDVTHRILT -VPIAQDQVGMYYQQPGQQLATWIVPPGQYFMMGDNRDNSADSRYWGFVPEANLVGRATAIWMSFDKQEGEWPTGVRLSRI
--- a/test-data/GCF_000013305.1.genome.fa Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->NC_008253.1 Escherichia coli 536, complete sequence -AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTG -GTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGAC -AGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT -AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTCGACCAAAGGT -AACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGA -TATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCATCTGGTAG -CGATGATTGAAAAAACCATTAGCGGTCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTCTG -ACGGGACTCGCCGCCGCCCAGCCGGGATTTCCGCTGGCACAATTGAAAACTTTCGTCGACCAGGAATTTGCCCAAATAAA -ACATGTCCTGCATGGCATCAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAA
--- a/test-data/GCF_000013305.1.genomic.gbff Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -LOCUS NC_008253 4938920 bp DNA circular CON 13-MAY-2021 -DEFINITION Escherichia coli 536, complete sequence. -ACCESSION NC_008253 -VERSION NC_008253.1 -DBLINK BioProject: PRJNA224116 - BioSample: SAMN02604181 - Assembly: GCF_000013305.1 -KEYWORDS RefSeq. -SOURCE Escherichia coli 536 - ORGANISM Escherichia coli 536
--- a/test-data/GCF_000013305.1.genomic.gff Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -##gff-version 3 -#!gff-spec-version 1.21 -#!processor NCBI annotwriter -#!genome-build ASM1330v1 -#!genome-build-accession NCBI_Assembly:GCF_000013305.1 -#!annotation-date 05/06/2021 17:31:48 -#!annotation-source NCBI RefSeq -##sequence-region NC_008253.1 1 4938920 -##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=362663 -NC_008253.1 RefSeq region 1 4938920 . + . ID=NC_008253.1:1..4938920;Dbxref=taxon:362663;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;serogroup=O6:K15:H31;strain=536
--- a/test-data/GCF_000013305.1.genomic.gtf Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#gtf-version 2.2 -#!genome-build ASM1330v1 -#!genome-build-accession NCBI_Assembly:GCF_000013305.1 -#!annotation-date 05/06/2021 17:31:48 -#!annotation-source NCBI RefSeq -NC_008253.1 RefSeq gene 190 255 . + . gene_id "ECP_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "ECP_RS00005"; old_locus_tag "ECP_0001"; -NC_008253.1 Protein Homology CDS 190 252 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; -NC_008253.1 Protein Homology start_codon 190 192 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; -NC_008253.1 Protein Homology stop_codon 253 255 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; -NC_008253.1 RefSeq gene 336 2798 . + . gene_id "ECP_RS00010"; transcript_id ""; gbkey "Gene"; gene "thrA"; gene_biotype "protein_coding"; locus_tag "ECP_RS00010"; old_locus_tag "ECP_0002";
--- a/test-data/GCF_000013305.1.protein.fa Thu Jul 15 15:45:43 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->WP_000002277.1 alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia coli] -MANLSGYNFAYLDEQTKRMIRRAILKAVAIPGYQVPFGGREMPMPYGWGTGGIQLTASVIGESDVLKVIDQGADDTTNAV -SIRNFFKRVTGVNTTERTDDATLIQTRHRIPETPLTEDQIIIFQVPIPEPLRFIEPRETETRTMHALEEYGVMQVKLYED -IARFGHIATTYAYPVKVNGCYVMDPSPIPKFDNPKMNMMPALQLFGAGREKRIYAVPPFTRVESLDFDDHPFTVQQWNEP -CAICGSTHSYLDEVVLDDAGNRMFVCSDTDYCRQQSEAKSQ ->WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] -MANMFALILVIATLVTGILWCVDKFFFAPKRRERQAAAQAAAGDSLDKATLKKVAPKPGWLETGASVFPVLAIVLIVRSF -IYEPFQIPSGSMMPTLLIGDFILVEKFAYGIKDPIYQKTLIETGHPKRGDIVVFKYPEDPKLDYIKRAVGLPGDKVTYDP -VSKELTIQPGCSSGQACENALPVTYSNVEPSDFVQTFSRRNGGEATSGFFEVPKNETKENGIRLSERKETLGDVTHRILT -VPIAQDQVGMYYQQPGQQLATWIVPPGQYFMMGDNRDNSADSRYWGFVPEANLVGRATAIWMSFDKQEGEWPTGVRLSRI
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.2.GCF_000007445.1.genomic.cds Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +>lcl|NC_004431.1_cds_WP_001386572.1_1 [gene=thrL] [locus_tag=C_RS00005] [protein=thr operon leader peptide] [protein_id=WP_001386572.1] [location=190..255] [gbkey=CDS] +ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA +>lcl|NC_004431.1_cds_WP_000526115.1_2 [gene=tnpA] [locus_tag=C_RS00010] [protein=IS200/IS605-like element IS200C family transposase] [protein_id=WP_000526115.1] [location=453..911] [gbkey=CDS] +ATGGGGAACGAAAAGAGCTTAGCGCACACCCGATGGAACTGTAAATATCACATAGTATTTGCGCCAAAATACCGAAGACA +GGTGTTCTACAGAGAGAAGCGTAGAGCAATAGGCTGTATTTTGAGAAAGCTGTGTGAGTGGAAAAGTGTACGGATTCTGG +AAGCTGAATGCTGTGCAGATCATATCCATATGCTTGTGGAGATCCCGCCCAAAATGAGCGTATCAGGCTTTATGGGATAT +CTGAAAGGGAAAAGCAGTCTGATGCCTTACGAGCAGTTTGGTGATTTGAAATTCAAATACAGGAACAGGGAGTTCTGGTG +CAGAGGGTATTACGTCGATACGGTGGGTAAGAACACGGCGAAGATACAGGATTACATAAAGCACCAGCTTGAAGAGGATA +AAATGGGAGAGCAGTTATCGATTCCCTATCCGGGCAGCCCGTTTACGGGCCGTAAGTAA +>lcl|NC_004431.1_cds_WP_001264710.1_3 [gene=thrA] [locus_tag=C_RS00015] [protein=bifunctional aspartate kinase/homoserine dehydrogenase I] [protein_id=WP_001264710.1] [location=1048..3510] [gbkey=CDS]
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.2.GCF_000007445.1.seq.rpt.jsonl Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,1 @@ +{"assemblyUnit":"GCF_000007455.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2640553","genbankAccession":"AE014075.1","length":5231428,"refseqAccession":"NC_004431.1","sortOrder":1}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.2.GCF_000013305.1.genomic.cds Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +>lcl|NC_008253.1_cds_WP_001386572.1_1 [gene=thrL] [locus_tag=ECP_RS00005] [protein=thr operon leader peptide] [protein_id=WP_001386572.1] [location=190..255] [gbkey=CDS] +ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA +>lcl|NC_008253.1_cds_WP_001264707.1_2 [gene=thrA] [locus_tag=ECP_RS00010] [protein=bifunctional aspartate kinase/homoserine dehydrogenase I] [protein_id=WP_001264707.1] [location=336..2798] [gbkey=CDS] +ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGATATTCTGGAAAGCAA +TGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCATCTGGTAGCGATGATTGAAAAAA +CCATTAGCGGTCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTCTGACGGGACTCGCCGCC +GCCCAGCCGGGATTTCCGCTGGCACAATTGAAAACTTTCGTCGACCAGGAATTTGCCCAAATAAAACATGTCCTGCATGG +CATCAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTA +TGGCCGGCGTGTTAGAAGCGCGTGGTCACAACGTTACCGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGTCATTAC +CTCGAATCTACCGTTGATATTGCTGAATCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGACCACATGGTGCTGAT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.2.GCF_000013305.1.genomic.gtf Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +#gtf-version 2.2 +#!genome-build ASM1330v1 +#!genome-build-accession NCBI_Assembly:GCF_000013305.1 +#!annotation-date 05/06/2021 17:31:48 +#!annotation-source NCBI RefSeq +NC_008253.1 RefSeq gene 190 255 . + . gene_id "ECP_RS00005"; transcript_id ""; gbkey "Gene"; gene "thrL"; gene_biotype "protein_coding"; locus_tag "ECP_RS00005"; old_locus_tag "ECP_0001"; +NC_008253.1 Protein Homology CDS 190 252 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; +NC_008253.1 Protein Homology start_codon 190 192 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; +NC_008253.1 Protein Homology stop_codon 253 255 . + 0 gene_id "ECP_RS00005"; transcript_id "unassigned_transcript_1"; gbkey "CDS"; gene "thrL"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_414542.1"; locus_tag "ECP_RS00005"; product "thr operon leader peptide"; protein_id "WP_001386572.1"; transl_table "11"; +NC_008253.1 RefSeq gene 336 2798 . + . gene_id "ECP_RS00010"; transcript_id ""; gbkey "Gene"; gene "thrA"; gene_biotype "protein_coding"; locus_tag "ECP_RS00010"; old_locus_tag "ECP_0002";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.2.GCF_000013305.1.seq.rpt.jsonl Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,1 @@ +{"assemblyUnit":"GCF_000013315.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2495020","genbankAccession":"CP000247.1","length":4938920,"refseqAccession":"NC_008253.1","sortOrder":1}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000007445.1.genomic.gbff Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +LOCUS NC_004431 5231428 bp DNA circular CON 13-MAY-2021 +DEFINITION Escherichia coli CFT073, complete sequence. +ACCESSION NC_004431 NZ_AE016755 NZ_AE016756 NZ_AE016757 NZ_AE016758 + NZ_AE016759 NZ_AE016760 NZ_AE016761 NZ_AE016762 NZ_AE016763 + NZ_AE016764 NZ_AE016765 NZ_AE016766 NZ_AE016767 NZ_AE016768 + NZ_AE016769 NZ_AE016770 NZ_AE016771 NZ_AE016772 +VERSION NC_004431.1 +DBLINK BioProject: PRJNA224116 + BioSample: SAMN02604094 + Assembly: GCF_000007445.1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000007445.1.genomic.gff Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM744v1 +#!genome-build-accession NCBI_Assembly:GCF_000007445.1 +#!annotation-date 05/06/2021 17:43:00 +#!annotation-source NCBI RefSeq +##sequence-region NC_004431.1 1 5231428 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=199310 +NC_004431.1 RefSeq region 1 5231428 . + . ID=NC_004431.1:1..5231428;Dbxref=taxon:199310;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=CFT073
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000007445.1.seq.rpt.jsonl Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,1 @@ +{"assemblyUnit":"GCF_000007455.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2640553","genbankAccession":"AE014075.1","length":5231428,"refseqAccession":"NC_004431.1","sortOrder":1}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000013305.1.genomic.gbff Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +LOCUS NC_008253 4938920 bp DNA circular CON 13-MAY-2021 +DEFINITION Escherichia coli 536, complete sequence. +ACCESSION NC_008253 +VERSION NC_008253.1 +DBLINK BioProject: PRJNA224116 + BioSample: SAMN02604181 + Assembly: GCF_000013305.1 +KEYWORDS RefSeq. +SOURCE Escherichia coli 536 + ORGANISM Escherichia coli 536
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000013305.1.genomic.gff Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM1330v1 +#!genome-build-accession NCBI_Assembly:GCF_000013305.1 +#!annotation-date 05/06/2021 17:31:48 +#!annotation-source NCBI RefSeq +##sequence-region NC_008253.1 1 4938920 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=362663 +NC_008253.1 RefSeq region 1 4938920 . + . ID=NC_008253.1:1..4938920;Dbxref=taxon:362663;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;serogroup=O6:K15:H31;strain=536
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genome.3.GCF_000013305.1.seq.rpt.jsonl Thu Jan 27 08:20:15 2022 +0000 @@ -0,0 +1,1 @@ +{"assemblyUnit":"GCF_000013315.1","assignedMoleculeLocationType":"Chromosome","chrName":"ANONYMOUS","gcCount":"2495020","genbankAccession":"CP000247.1","length":4938920,"refseqAccession":"NC_008253.1","sortOrder":1}
