changeset 23:f12a64a8a5bb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/diamond commit 3b8d4b833ee2bd2a99b23b7389def84cd3de84cb
author iuc
date Fri, 12 Dec 2025 11:13:59 +0000
parents 7058b4f32f50
children 51128934680a
files diamond.xml diamond_makedb.xml macros.xml test-data/blastdb/README test-data/blastdb/db.fasta test-data/blastdb/db.fasta.pdb test-data/blastdb/db.fasta.phr test-data/blastdb/db.fasta.pin test-data/blastdb/db.fasta.pjs test-data/blastdb/db.fasta.pnd test-data/blastdb/db.fasta.pni test-data/blastdb/db.fasta.pog test-data/blastdb/db.fasta.pos test-data/blastdb/db.fasta.pot test-data/blastdb/db.fasta.psq test-data/blastdb/db.fasta.ptf test-data/blastdb/db.fasta.pto test-data/blastdb/filter_and_map_ids.py test-data/blastdb/gen.sh test-data/blastdb/map.txt test-data/blastdb/prot.accession2taxid test-data/blastdb/taxdb.btd test-data/blastdb/taxdb.bti test-data/blastdb/taxdb.py test-data/blastdb/taxonomy4blast.sqlite3 test-data/blastdb_p.loc test-data/db-wtax.dmnd test-data/db.dmnd test-data/db.fasta test-data/db.fasta.gz test-data/diamond_results.pairwise test-data/diamond_results.tabular test-data/diamond_results.wtax.tabular test-data/diamond_results_algorithm.tabular test-data/diamond_results_freq_masking.tabular test-data/diamond_results_global_ranking.tabular test-data/diamond_results_iterate.tabular test-data/diamond_results_log_test.tabular test-data/diamond_results_max_hsps.tabular test-data/diamond_results_motif_masking.tabular test-data/diamond_results_soft_masking.tabular test-data/names.dmp test-data/ncbi_taxonomy.loc test-data/ncbi_taxonomy/README.md test-data/ncbi_taxonomy/names.dmp test-data/ncbi_taxonomy/nodes.dmp test-data/ncbi_taxonomy/prot.accession2taxid test-data/nodes.dmp test-data/nucleotide.fasta test-data/prot.accession2taxid test-data/protein.fasta test-data/protein.fasta.gz test-data/taxon.tsv tool-data/blastdb_p.loc tool-data/ncbi_taxonomy.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 57 files changed, 1941 insertions(+), 233 deletions(-) [+]
line wrap: on
line diff
--- a/diamond.xml	Mon Nov 10 15:12:32 2025 +0000
+++ b/diamond.xml	Fri Dec 12 11:13:59 2025 +0000
@@ -13,18 +13,34 @@
 <![CDATA[
 
     #if $ref_db_source.db_source == "history":
-        ln -s '$ref_db_source.reference_database' ./database.dmnd
+        ln -s '$ref_db_source.reference_database' ./database.dmnd &&
+        #set database="database.dmnd"
+    #else if $ref_db_source.db_source == "blast":
+        #import os.path
+        #set basename =  os.path.basename($ref_db_source.reference_database.fields.path)
+        #set dirname =  os.path.dirname($ref_db_source.reference_database.fields.path)
+        #set database="./db/" + basename
+        mkdir ./db &&
+        ## symlink all files in the directory containing the BLAST DB
+        ## in newer BLAST DBs there is a file taxonomy4blast.sqlite3
+        ## that is needed
+        ln -s '$dirname/'* ./db/ &&
+        #if $ref_db_source.ncbi_taxonomy
+            ## symlink names and nodes dump files from NCBI taxonomy
+            ## need to be in the same dir as the BLAST DB (which is the
+            ## reason why we need to create ./db/)
+            ln -s '$ref_db_source.ncbi_taxonomy.fields.path/nodes.dmp' ./db/nodes.dmp &&
+            ln -s '$ref_db_source.ncbi_taxonomy.fields.path/names.dmp' ./db/names.dmp &&
+        #end if
     #else:
-        ln -s '${ref_db_source.index.fields.db_path}' ./database.dmnd
+        ln -s '${ref_db_source.reference_database.fields.db_path}' ./database.dmnd &&
+        #set database="database.dmnd"
     #end if
 
-    &&
-
     diamond
         $method_cond.method_select
-        --quiet
         --threads "\${GALAXY_SLOTS:-12}"
-        --db ./database
+        --db '$database'
         --query '$query'
         #if $method_cond.method_select == "blastx"
           --query-gencode '$method_cond.query_gencode'
@@ -39,17 +55,11 @@
         #end if
 
         @OUTPUT_ARGS@
-
         #if $output_section.output.outfmt != '100'
             --compress '0'
         #end if
+
         $sens_cond.sensitivity
-        $iterate
-        $swipe
-        --algo $algo
-        #if $global_ranking
-            --global-ranking $global_ranking
-        #end if
         #if str($gapopen) != "":
           --gapopen '$gapopen'
         #end if
@@ -69,6 +79,7 @@
         #end if
 
         --id '$id'
+        --approx-id '$approx_id'    
         --query-cover '$query_cover'
         --subject-cover '$subject_cover'
         --block-size '$sens_cond.block_size'
@@ -94,22 +105,32 @@
             --max-hsps $output_section.max_hsps
         #end if
         #if $tax_cond.tax_select == 'file':
-            --taxonlist `cat '$tax_cond.taxonlistfile' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//'`
+            --taxonlist \$(cat '$tax_cond.taxonlist' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//')
         #else if  $tax_cond.tax_select == 'list':
             --taxonlist '$tax_cond.taxonlist'
         #end if
+        #if $tax_exclude_cond.tax_select == 'file':
+            --taxon-exclude \$(cat '$tax_exclude_cond.taxon_exclude' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//')
+        #else if  $tax_exclude_cond.tax_select == 'list':
+            --taxon-exclude '$tax_exclude_cond.taxon_exclude'
+        #end if
+
         #if $advanced_section.seed_cut
             --seed-cut $advanced_section.seed_cut
         #end if
         $advanced_section.freq_masking
         --motif-masking $advanced_section.motif_masking
         --soft-masking $advanced_section.soft_masking
+        $advanced_section.iterate
+        $advanced_section.swipe
+        --algo $advanced_section.algo
+        #if $advanced_section.global_ranking
+            --global-ranking $advanced_section.global_ranking
+        #end if
         --index-chunks "\${DIAMOND_INDEX_CHUNKS:-4}"
         --file-buffer-size "\${DIAMOND_FILE_BUFFER_SIZE:-67108864}"
         $log
-        
-]]>
-    </command>
+    ]]></command>
     <inputs>
         <conditional name="method_cond">
             <param name="method_select" type="select" label="Alignment mode" help="(blastp/blastx)">
@@ -117,7 +138,7 @@
                 <option value="blastx">DNA query sequences (blastx)</option>
             </param>
             <when value="blastx">
-                <param argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help="">
+                <param argument="--query-gencode" type="select" label="Genetic code" help="used for translation of query in BLASTX mode">
                     <option value="1">Standard Code</option>
                     <option value="2">Vertebrate Mitochondrial Code</option>
                     <option value="3">Yeast Mitochondrial Code</option>
@@ -144,20 +165,20 @@
                     <option value="31">Blastocrithidia Nuclear Code</option>
                     <option value="33">Cephalodiscidae Mitochondrial UAA-Tyr Code</option>
                 </param>
-                <param argument="--min-orf" type="integer" value="1" min="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature"/>
-                <param name="query_strand" argument="--strand" type="select" label="query strands to search" help="">
+                <param argument="--min-orf" type="integer" value="1" min="1" label="Minimum ORF length" help="Ignore translated sequences without an open reading frame of at least this length. By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature"/>
+                <param name="query_strand" argument="--strand" type="select" label="Query strands to search" help="">
                     <option value="both" selected="True">Both</option>
                     <option value="plus">Plus</option>
                     <option value="minus">Minus</option>
                 </param>
                 <conditional name="frameshift_cond">
-                    <param name="frameshift_select" type="select" label="Allow for frameshifts?" help="">
+                    <param name="frameshift_select" type="select" label="Allow for frameshifts" help="">
                         <option value="yes">yes</option>
                         <option value="no" selected="true">no</option>
                     </param>
                     <when value="yes">
-                        <param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
-                        <param argument="--frameshift" type="integer" value="0" label="frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively."/>
+                        <param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="Restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
+                        <param argument="--frameshift" type="integer" value="0" label="Frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively."/>
                     </when>
                     <when value="no"/>
                 </conditional>
@@ -174,17 +195,32 @@
                     <option value="2">Compositional matrix adjust conditioned on sequence properties, simplified (Yu, 2005)</option>
                     <option value="3">Compositional matrix adjust conditioned on sequence properties (Yu, 2005)</option>
                     <option value="4">Compositional matrix adjust unconditionally (Yu, 2005)</option>
+                    <option value="5">Compositional matrix adjustment conditioned on sequence properties with fallback on composition-based statistics</option>
                 </param>
             </when>
         </conditional>
         <param argument="--query" type="data" format="fasta,fasta.gz,fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz" label="Input query file in FASTA or FASTQ format"/>
         <conditional name="ref_db_source">
-            <param name="db_source" type="select" label="Will you select a reference database from your history or use a built-in index?" help="Built-ins were indexed using default options">
-                <option value="indexed">Use a built-in index</option>
+            <param name="db_source" type="select" label="Reference database source" help="">
+                <option value="blast">Use a built-in BLAST index</option>
                 <option value="history">Use one from the history</option>
+                <option value="indexed">Use a built-in DIAMOND index</option>
             </param>
+            <when value="blast">
+                <param name="reference_database" type="select" label="Reference database" help="If your database of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="blastdb_p">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+                <param name="ncbi_taxonomy" type="select" optional="true" label="NCBI taxonomy database" help="Needed for output of taxonomy columns in tabular output">
+                    <options from_data_table="ncbi_taxonomy">
+                        <validator message="No NCBI database is available. Ask your Galaxy admin" type="no_options"/>
+                    </options>
+                </param>
+            </when>
             <when value="indexed">
-                <param name="index" type="select" label="Select a reference database" help="If your database of interest is not listed, contact your Galaxy admin">
+                <param name="reference_database" type="select" label="Reference database" help="If your database of interest is not listed, contact your Galaxy admin">
                     <options from_data_table="diamond_database">
                         <filter type="sort_by" column="2"/>
                         <validator type="no_options" message="No indexes are available for the selected input dataset"/>
@@ -192,25 +228,11 @@
                 </param>
             </when>
             <when value="history">
-                <param name="reference_database" argument="--db" type="data" format="dmnd" label="Select the reference database"/>
+                <param name="reference_database" argument="--db" type="data" format="dmnd" label="Reference database"/>
             </when>
         </conditional>
-        <conditional name="tax_cond">
-            <param name="tax_select" type="select" label="Restrict search taxonomically?" help="Any taxonomic rank can be used, and only reference sequences matching one of the specified taxon ids will be searched against.">
-                <option value="no" selected="True">No</option>
-                <option value="list">List of taxids entered manually</option>
-                <option value="file">List of taxids from single column tabular file</option>
-            </param>
-            <when value="no"/>
-            <when value="list">
-                <param argument="--taxonlist" type="text" value="" label="Comma separated list of taxon ids" help="">
-                    <validator type="regex" message="Taxonlist needs to be a comma separated list of integers">[0-9,]*</validator>
-                </param>
-            </when>
-            <when value="file">
-                <param name="taxonlistfile" argument="--taxonlist" type="data" format="tabular" label="Keep alignments within the given percentage range of the top alignment score for a quer" help=""/>
-            </when>
-        </conditional>
+        <expand macro="taxon_cond_macro" argument="--taxonlist" cond_name="tax_cond" label="Restrict search taxonomically" help="Any taxonomic rank can be used. Only reference sequences included in the given taxa will be used"/>
+        <expand macro="taxon_cond_macro" argument="--taxon-exclude" cond_name="tax_exclude_cond" label="Exclude taxa from search" help="Any taxonomic rank can be used. Reference sequences included in the taxonomic rank will be excluded from the search."/>
         <conditional name="sens_cond">
             <param name="sensitivity" type="select" label="Sensitivity Mode" help="Choose one of the sensitivity modes. The default mode is mainly designed for short read alignment, i.e. finding significant matches of &gt;50 bits on 30-40aa fragments. The sensitive mode is a lot more sensitive than the default and generally recommended for aligning longer sequences. The more sensitive mode provides even more sensitivity. More sensitivity may increase computation time.">
                 <option value="--faster">Faster (--faster)</option>
@@ -223,28 +245,28 @@
                 <option value="--ultra-sensitive">Ultra Sensitive (--ultra-sensitive)</option>
             </param>
             <when value="--faster">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="--fast">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="--mid-sensitive">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="--sensitive">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="--more-sensitive">
-                <expand macro="block_size_low_sens"/>
+                <expand macro="block_size" value="2"/>
             </when>
             <when value="--very-sensitive">
-                <expand macro="block_size_hi_sens"/>
+                <expand macro="block_size" value="0.4"/>
             </when>
             <when value="--ultra-sensitive">
-                <expand macro="block_size_hi_sens"/>
+                <expand macro="block_size" value="0.4"/>
             </when>
         </conditional>
         <param argument="--matrix" type="select" label="Scoring matrix" help="In parentheses are the supported values for (gap open)/(gap extend). In brackets are default gap penalties">
@@ -260,40 +282,18 @@
         <param argument="--gapopen" type="integer" optional="True" value="" label="Gap open penalty" help="Leave empty for default (see scoring matrix)"/>
         <param argument="--gapextend" type="integer" optional="True" value="" label="Gap extension penalty" help="Leave empty for default (see scoring matrix)"/>
         <param argument="--masking" type="select" label="Masking algorithm" help="DIAMOND by default applies the tantan repeat masking algorithm to the query and target sequences as described in (Frith, 2011). This masking procedure increases the specificity of alignments and serves to filter out spurious hits. Note that when using --comp-based-stats (2,3,4), tantan masking is disabled by default.">
-            <option value="0">Disabled</option>
-            <option value="1" selected="true">Tantan</option>
+            <option value="none">Disabled</option>
+            <option value="tantan" selected="true">Tantan</option>
             <option value="seg">SEG</option>
         </param>
-        <conditional name="filter_score">
-            <param name="filter_score_select" type="select" label="Method to filter?" help="(--evalue/--min-score)">
-                <option value="evalue" selected="True">Maximum e-value to report alignments</option>
-                <option value="min-score">Minimum bit score to report alignments</option>
-            </param>
-            <when value="evalue">
-                <param argument="--evalue" type="float" value="0.001" label="Maximum expected value to keep an alignment"/>
-            </when>
-            <when value="min-score">
-                <param argument="--min-score" type="integer" value="0" label="Minimum bit score to keep an alignment" help="(--min-score)"/>
-            </when>
-        </conditional>
-        <param argument="--swipe" type="boolean" truevalue="--swipe" falsevalue="" checked="false" label="Run Exhaustive alignment against all database sequences" help="Smith Waterman alignments of all queries will be computed against all targets."/>
-        <param argument="--iterate" type="boolean" truevalue="--iterate" falsevalue="" checked="false" label="Run multiple rounds of searches with increasing sensitivity" help="The query dataset will first be searched at a lower sensitivity setting, only searching those query sequences at the target sensitivity that fail to produce a significant alignment at a lower sensitivity."/>
-        <param argument="--algo" type="select" label="Algorithm for seed search" help="Double-indexed is the main algorithm of the program, designed for large input files but less efficient for small query files. Query-indexed and improves performance for small query files. This mode will be automatically triggered based on the input. Contiguous-seed mode and further improves performance for small query files. The modes differ slightly in their sensitivity, so results are not guaranteed to be 100% identical for different settings of this option.">
-            <option value="0">Doble-indexed (0)</option>
-            <option value="1">Query-indexed (1)</option>
-            <option value="ctg">Contiguous-seed mode (ctg)</option>
-        </param>
+
         <expand macro="hit_filter_macro"/>
-        <param argument="--global-ranking" type="integer" min="0" value="" optional="true" label="Limit on the number of Smith Waterman extensions" help="Target sequences will be ranked according to their ungapped extension scores at seed hits, and gapped extensions will only be computed for the best N targets for each query. Note that this option increases memory use."/>
-        <param argument="--id" type="integer" value="0" label="Minimum identity percentage to report an alignment" help="Report only alignments above the given percentage of sequence identity"/>
-        <param argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="Report only alignments above the given percentage of query cover"/>
-        <param argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="Report only alignments above the given percentage of subject cover"/>
-        <section name="output_section" title="Output options">
-            <param argument="--max-hsps" type="integer" min="0" optional="true" label="Maximum number of HSPs" help="The maximum number of HSPs (High-Scoring Segment Pairs) per target sequence to report for each query. The default policy is to report only the highest-scoring HSP for each target, while disregarding alternative, lower-scoring HSPs that are contained in the same target."/>
+        <section name="output_section" title="Output options" expanded="true">
             <expand macro="output_type_macro">
                 <!-- Taxonomy features are not supported for the DAA format (i.e.
                         can't be used in diamond view) -->
-                <option value="staxids">unique Subject Taxonomy ID(s), separated by a ';' (in numerical order)</option>
+                <option value="staxids">Subject Taxonomy ID(s), separated by a ';' (in numerical order)</option>
+                <option value="sscinames">Subject Scientific Name(s)</option>
                 <option value="sskingdoms">Subject super kingdoms</option>
                 <option value="skingdoms">Subject kingdoms</option>
                 <option value="sphylums">Subject phylums</option>
@@ -303,20 +303,30 @@
                 <option value="--al">Output aligned queries (--al)</option>
             </param>
             <param argument="--log" type="boolean" truevalue="--log" falsevalue="" label="Output log file"/>
+            <param argument="--max-hsps" type="integer" min="0" optional="true" label="Maximum number of HSPs" help="The maximum number of HSPs (High-Scoring Segment Pairs) per target sequence to report for each query. The default policy is to report only the highest-scoring HSP for each target, while disregarding alternative, lower-scoring HSPs that are contained in the same target."/>
         </section>
 
         <section name="advanced_section" title="Advanced options" expanded="false">
             <param argument="--seed-cut" type="float" min="0" optional="true" label="Set a complexity cutoff for indexed seeds"/>
             <param argument="--freq-masking" type="boolean" truevalue="--freq-masking" falsevalue="" checked="false" label="Enable masking seeds based on frequency" help="This option is incompatible with --seed-cut"/>
             <param argument="--soft-masking" type="select" label="Soft Masking" help="Select type of soft masking">
+                <!-- https://github.com/bbuchfink/diamond/issues/916 -->
                 <option value="0" selected="True">Disabled</option>
-                <option value="seg">seg</option>
+                <!-- <option value="seg">seg</option> -->
                 <option value="tantan">tantan</option>
             </param>
             <param argument="--motif-masking" type="select" label="Softmask abundant motifs" help="Enable or disable motif masking">
                 <option value="0">Disabled</option>
                 <option value="1">Enabled</option>
             </param>
+            <param argument="--swipe" type="boolean" truevalue="--swipe" falsevalue="" checked="false" label="Run Exhaustive alignment against all database sequences" help="Smith Waterman alignments of all queries will be computed against all targets."/>
+            <param argument="--iterate" type="boolean" truevalue="--iterate" falsevalue="" checked="false" label="Run multiple rounds of searches with increasing sensitivity" help="The query dataset will first be searched at a lower sensitivity setting, only searching those query sequences at the target sensitivity that fail to produce a significant alignment at a lower sensitivity."/>
+            <param argument="--algo" type="select" label="Algorithm for seed search" help="Double-indexed is the main algorithm of the program, designed for large input files but less efficient for small query files. Query-indexed improves performance for small query files. This mode will be automatically triggered based on the input. Contiguous-seed mode further improves performance for small query files. The modes differ slightly in their sensitivity, so results are not guaranteed to be 100% identical for different settings of this option.">
+                <option value="0">Double-indexed (0)</option>
+                <option value="1">Query-indexed (1)</option>
+                <option value="ctg">Contiguous-seed mode (ctg)</option>
+            </param>
+            <param argument="--global-ranking" type="integer" min="0" value="" optional="true" label="Limit on the number of Smith Waterman extensions" help="Target sequences will be ranked according to their ungapped extension scores at seed hits, and gapped extensions will only be computed for the best N targets for each query. Note that this option increases memory use."/>
         </section>
     </inputs>
     <outputs>
@@ -332,7 +342,7 @@
         </data>
     </outputs>
     <tests>
-        <!--Test 01-->
+        <!--Test 01 al and unal output -->
         <test expect_num_outputs="3">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp"/>
@@ -354,7 +364,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="masking" value="1"/>
+            <param name="masking" value="tantan"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="max"/>
                 <param name="max_target_seqs" value="25"/>
@@ -375,12 +385,12 @@
             </output>
             <output name="alqueries">
                 <assert_contents>
-                    <has_line line="&gt;sequence more text"/>
+                    <has_line line="&gt;NP_008227.1 cytochrome c oxidase subunit I (mitochondrion) [Pongo pygmaeus]"/>
                 </assert_contents>
             </output>
             <output name="blast_tabular" file="diamond_results.tabular"/>
         </test>
-        <!--Test 02-->
+        <!--Test 02 non-gz input, taxon list, no al and unal output, simple header -->
         <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp"/>
@@ -393,19 +403,20 @@
             </conditional>
             <conditional name="tax_cond">
                 <param name="tax_select" value="list"/>
-                <param name="taxonlist" value="2"/>
+                <param name="taxonlist" value="42"/>  <!-- the taxID to use here is printed when running gen.sh (by filter_and_map_ids.py); it is not an NCBI taxID -->
             </conditional>
             <section name="output_section">
                 <conditional name="output">
                     <param name="outfmt" value="6"/>
                     <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore"/>
+                    <param name="header" value="simple"/>
                 </conditional>
             </section>
             <conditional name="sens_cond">
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="masking" value="1"/>
+            <param name="masking" value="tantan"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="max"/>
                 <param name="max_target_seqs" value="25"/>
@@ -421,7 +432,7 @@
             </conditional>
             <output name="blast_tabular" file="diamond_results.wtax.tabular"/>
         </test>
-        <!--Test 03-->
+        <!--Test 03 blastx, outfmt -->
         <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastx"/>
@@ -444,7 +455,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="masking" value="1"/>
+            <param name="masking" value="tantan"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="top"/>
                 <param name="top" value="10"/>
@@ -460,7 +471,7 @@
             </conditional>
             <output name="blast_tabular" file="diamond_results.pairwise"/>
         </test>
-        <!--Test 04-->
+        <!--Test 04 outfmt daa -->
         <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp"/>
@@ -475,9 +486,13 @@
                     <param name="outfmt" value="100"/>
                 </conditional>
             </section>
-            <output name="daa_output" file="diamond_results.daa" compare="sim_size" delta="10"/>
+            <output name="daa_output" ftype="daa">
+                <assert_contents>
+                    <has_size size="5602" delta="10"/>
+                </assert_contents>
+            </output>
         </test>
-        <!--Test 05-->
+        <!--Test 05 blastx w indexed diamond DB -->
         <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastx"/>
@@ -489,7 +504,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="output_section">
                 <conditional name="output">
@@ -500,7 +515,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="masking" value="1"/>
+            <param name="masking" value="tantan"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="top"/>
                 <param name="top" value="10"/>
@@ -524,18 +539,28 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
-            <param name="iterate" value="true"/>
+            <section name="advanced_section">
+                <param name="iterate" value="true"/>
+            </section>
             <section name="output_section">
                 <conditional name="output">
                     <param name="outfmt" value="6"/>
                     <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore"/>
+                    <param name="header" value="verbose"/>
                 </conditional>
             </section>
-            <output name="blast_tabular" file="diamond_results_iterate.tabular"/>
+            <!-- verbose header contains path -> allow for lines_diff 2, assert header line separately -->
+            <output name="blast_tabular" file="diamond_results_iterate.tabular" lines_diff="2">
+                <assert_contents>
+                    <has_text text="# Invocation: diamond blastx"/>
+                </assert_contents>
+            </output>
         </test>
         <!-- Test 07 swipe option-->
+        <!-- test disabled, see
+        https://github.com/bbuchfink/diamond/issues/915
         <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastx"/>
@@ -543,7 +568,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <param name="swipe" value="true"/>
             <section name="output_section">
@@ -553,7 +578,7 @@
                 </conditional>
             </section>
             <output name="blast_tabular" file="diamond_results_swipe.tabular"/>
-        </test>
+        </test> -->
         <!--Test 08 algo option-->
         <test expect_num_outputs="1">
             <conditional name="method_cond">
@@ -562,9 +587,11 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
-            <param name="algo" value="1"/>
+            <section name="advanced_section">
+                <param name="algo" value="1"/>
+            </section>
             <section name="output_section">
                 <conditional name="output">
                     <param name="outfmt" value="6"/>
@@ -581,9 +608,11 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
-            <param name="global_ranking" value="10"/>
+            <section name="advanced_section">
+                <param name="global_ranking" value="10"/>
+            </section>
             <section name="output_section">
                 <conditional name="output">
                     <param name="outfmt" value="6"/>
@@ -600,7 +629,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="output_section">
                 <param name="max_hsps" value="10"/>
@@ -619,7 +648,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="advanced_section">
                 <param name="seed_cut" value="100"/>
@@ -640,7 +669,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="advanced_section">
                 <param name="freq_masking" value="true"/>
@@ -661,10 +690,10 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="advanced_section">
-                <param name="motif_masking" value="1"/>
+                <param name="motif_masking" value="0"/>
             </section>
             <section name="output_section">
                 <conditional name="output">
@@ -682,7 +711,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="advanced_section">
                 <param name="soft_masking" value="0"/>
@@ -703,7 +732,7 @@
             <param name="query" value="nucleotide.fasta" ftype="fasta"/>
             <conditional name="ref_db_source">
                 <param name="db_source" value="indexed"/>
-                <param name="index" value="testDb"/>
+                <param name="reference_database" value="testDb"/>
             </conditional>
             <section name="output_section">
                 <conditional name="output">
@@ -715,10 +744,112 @@
             <output name="blast_tabular" file="diamond_results_log_test.tabular"/>
             <output name="log_file">
                 <assert_contents>
-                    <has_n_lines n="259"/>
-                    <has_text text="diamond blastx --quiet"/>
+                    <has_n_lines n="375"/>
+                    <has_text text="diamond blastx"/>
                     <has_text text="--log"/>
-                    <has_line line="Sequences = 6, letters = 1694, average length = 282"/>
+                    <has_line line="Sequences = 6, letters = 3076, average length = 512"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!--Test 16 test against cached BLAST DB + NO NCBI taxonomy which works as long as (certain) tax columns are not selected in outputs-->
+        <test expect_num_outputs="1">
+            <conditional name="method_cond">
+                <param name="method_select" value="blastp"/>
+                <param name="comp_based_stats" value="1"/>
+            </conditional>
+            <param name="query" value="protein.fasta.gz" ftype="fasta.gz"/>
+            <conditional name="ref_db_source">
+                <param name="db_source" value="blast"/>
+                <param name="reference_database" value="test"/>
+            </conditional>
+            <conditional name="tax_cond">
+                <param name="tax_select" value="list"/>
+                <param name="taxonlist" value="2,2759"/> <!-- simulate tax filtering ..  -->
+            </conditional>
+            <section name="output_section">
+                <conditional name="output">
+                    <param name="outfmt" value="6"/>
+                    <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,staxids,cigar"/>
+                </conditional>
+            </section>
+            <conditional name="sens_cond">
+                <param name="sensitivity" value=""/>
+            </conditional>
+            <param name="matrix" value="BLOSUM62"/>
+            <param name="masking" value="seg"/>
+            <conditional name="hit_filter">
+                <param name="hit_filter_select" value="max"/>
+                <param name="max_target_seqs" value="25"/>
+            </conditional>
+            <conditional name="filter_score">
+                <param name="filter_score_select" value="evalue"/>
+                <param name="evalue" value="0.001"/>
+            </conditional>
+            <param name="id" value="0"/>
+            <param name="query_cover" value="0"/>
+            <conditional name="sens_cond">
+                <param name="block_size" value="2"/>
+            </conditional>
+            <output name="blast_tabular">
+                <assert_contents>
+                    <has_n_columns n="15"/>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <assert_command>
+                <!-- ensure that NCBI taxonomy is really not used-->
+                <has_text text="nodes.dmp" negate="true"/>
+                <has_text text="names.dmp" negate="true"/>
+            </assert_command>
+        </test>
+
+        <!--Test 17 test blastp against cached BLAST DB + tax columns in output + tax filtering file (tetrapoda and ray-finned fishes should result in mouse, human, zebrafish) -->
+        <test expect_num_outputs="1">
+            <conditional name="method_cond">
+                <param name="method_select" value="blastp"/>
+                <param name="comp_based_stats" value="1"/>
+            </conditional>
+            <param name="query" value="protein.fasta.gz" ftype="fasta.gz"/>
+            <conditional name="ref_db_source">
+                <param name="db_source" value="blast"/>
+                <param name="reference_database" value="test"/>
+                <param name="ncbi_taxonomy" value="test"/>
+            </conditional>
+            <conditional name="tax_cond">
+                <param name="tax_select" value="file"/>
+                <param name="taxonlist" value="taxon.tsv"/>
+            </conditional>
+            <section name="output_section">
+                <conditional name="output">
+                    <param name="outfmt" value="6"/>
+                    <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,sskingdoms,skingdoms,sphylums,cigar"/>
+                </conditional>
+            </section>
+            <conditional name="sens_cond">
+                <param name="sensitivity" value=""/>
+            </conditional>
+            <param name="matrix" value="BLOSUM62"/>
+            <param name="masking" value="seg"/>
+            <conditional name="hit_filter">
+                <param name="hit_filter_select" value="max"/>
+                <param name="max_target_seqs" value="25"/>
+            </conditional>
+            <conditional name="filter_score">
+                <param name="filter_score_select" value="evalue"/>
+                <param name="evalue" value="0.001"/>
+            </conditional>
+            <param name="id" value="0"/>
+            <param name="query_cover" value="0"/>
+            <conditional name="sens_cond">
+                <param name="block_size" value="2"/>
+            </conditional>
+            <output name="blast_tabular">
+                <assert_contents>
+                    <has_n_columns n="17"/>
+                    <has_n_lines n="3"/>
+                    <has_text text="Metazoa" n="3"/>
+                    <has_text text="Viridiplantae" n="0"/>
                 </assert_contents>
             </output>
         </test>
--- a/diamond_makedb.xml	Mon Nov 10 15:12:32 2025 +0000
+++ b/diamond_makedb.xml	Fri Dec 12 11:13:59 2025 +0000
@@ -9,15 +9,21 @@
     <command detect_errors="aggressive">
         <!-- DB has two files, *.dmnd and *.tx -->
     <![CDATA[
+    ln -s '$infile' database.$infile.ext &&
+
     diamond makedb
         --threads \${GALAXY_SLOTS:-12}
-        --in '$infile'
+        --in database.$infile.ext
         --db ./database
 
-      #if str($tax_cond.tax_select) == 'yes':
+      #if $tax_cond.tax_select == 'yes':
         --taxonmap '$tax_cond.taxonmap'
         --taxonnodes '$tax_cond.taxonnodes'
         --taxonnames '$tax_cond.taxonnames'
+      #else if $tax_cond.tax_select == 'yes_cached':
+        --taxonmap '$tax_cond.ncbi_taxonomy.fields.path'/prot.accession2taxid
+        --taxonnodes '$tax_cond.ncbi_taxonomy.fields.path'/nodes.dmp
+        --taxonnames '$tax_cond.ncbi_taxonomy.fields.path'/names.dmp
       #end if
     ]]>
     </command>
@@ -25,7 +31,8 @@
         <param name="infile" type="data" format="fasta,fasta.gz" label="Input reference file in FASTA format"/>
         <conditional name="tax_cond">
             <param name="tax_select" type="select" label="Add taxonomic data?" help="Needs to be supplied in order to provide taxonomy features of the aligner">
-                <option value="yes">Yes</option>
+                <option value="yes_cached">Using built in NCBI taxonomy</option>
+                <option value="yes">Yes using datasets from history</option>
                 <option value="no" selected="true">No</option>
             </param>
             <when value="yes">
@@ -33,6 +40,13 @@
                 <param argument="--taxonnodes" type="data" format="tabular" label="Taxonomy nodes.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/>
                 <param argument="--taxonnames" type="data" format="tabular" label="Taxonomy names.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/>
             </when>
+            <when value="yes_cached"> <!-- taxonomy files are read from the path of the selected ncbi_taxonomy data table entry -->
+                <param name="ncbi_taxonomy" type="select" optional="true" label="NCBI taxonomy database" help="Needed for output of taxonomy columns in tabular output">
+                    <options from_data_table="ncbi_taxonomy">
+                        <validator message="No NCBI taxonomy database is available. Ask your Galaxy admin." type="no_options"/>
+                    </options>
+                </param>
+            </when>
             <when value="no"/>
         </conditional>
     </inputs>
@@ -43,11 +57,18 @@
         <test>
             <param name="infile" value="db.fasta" ftype="fasta"/>
             <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Database sequences +5"/>
+                <has_text_matching expression="Database letters +2578"/>
+            </assert_stderr>
         </test>
-
         <test>
             <param name="infile" value="db.fasta.gz" ftype="fasta.gz"/>
             <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Database sequences +5"/>
+                <has_text_matching expression="Database letters +2578"/>
+            </assert_stderr>
         </test>
         <test>
             <param name="infile" value="db.fasta" ftype="fasta"/>
@@ -57,7 +78,32 @@
                 <param name="taxonnodes" ftype="tabular" value="nodes.dmp"/>
                 <param name="taxonnames" ftype="tabular" value="names.dmp"/>
             </conditional>
+            <!-- this test uses a taxdb with consecutive taxIDs which creates the small dmnd test file -->
             <output name="outfile" value="db-wtax.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Entries in accession to taxid file +5"/>
+                <has_text_matching expression="Database accessions mapped to taxid +5"/>
+                <has_text_matching expression="Database sequences mapped to taxid +5"/>
+            </assert_stderr>
+        </test>
+        <test>
+            <param name="infile" value="db.fasta" ftype="fasta"/>
+            <conditional name="tax_cond">
+                <param name="tax_select" value="yes_cached"/>
+                <param name="ncbi_taxonomy" value="test"/>
+            </conditional>
+            <!-- note that this test uses a different taxDB (original taxIDs - not consecutive)
+                 and therefore we get a larger dmnd file -->
+            <output name="outfile">
+                <assert_contents>
+                    <has_size size="20279226"/>
+                </assert_contents>
+            </output>
+            <assert_stderr>
+                <has_text_matching expression="Entries in accession to taxid file +5"/>
+                <has_text_matching expression="Database accessions mapped to taxid +5"/>
+                <has_text_matching expression="Database sequences mapped to taxid +5"/>
+            </assert_stderr>
         </test>
     </tests>
     <help>
--- a/macros.xml	Mon Nov 10 15:12:32 2025 +0000
+++ b/macros.xml	Fri Dec 12 11:13:59 2025 +0000
@@ -1,6 +1,6 @@
 <macros>
-    <token name="@TOOL_VERSION@">2.1.13</token>
-    <token name="@VERSION_SUFFIX@">1</token>
+    <token name="@TOOL_VERSION@">2.1.16</token>
+    <token name="@VERSION_SUFFIX@">0</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">diamond</requirement>
@@ -28,45 +28,55 @@
             <when value="0"/>
             <when value="5"/>
             <when value="6">
-                <param name="fields" type="select" label="Tabular fields" help="" multiple="true">
+                <param argument="--fields" type="select" label="Tabular fields" help="" multiple="true">
                     <option value="qseqid" selected="true">Query Seq - id</option>
+                    <option value="qlen">Query sequence length</option>
                     <option value="sseqid" selected="true">Subject Seq - id</option>
                     <option value="sallseqid">All subject Seq - id(s)</option>
-                    <option value="qlen">Query sequence length</option>
                     <option value="slen">Subject sequence length</option>
+                    <option value="qstart" selected="true">Start of alignment in query</option>
+                    <option value="qend" selected="true">End of alignment in query</option>
+                    <option value="sstart" selected="true">Start of alignment in subject</option>
+                    <option value="send" selected="true">End of alignment in subject</option>
+                    <option value="qseq">Aligned part of query sequence</option>
+                    <option value="qseq_gapped">Aligned part of query sequence (with gaps)</option>
+                    <option value="qseq_translated">Translation of the aligned part of query sequence</option>
+                    <option value="full_qseq">Query sequence</option>
+                    <option value="full_qseq_mate">Query sequence of the mate</option>
+                    <option value="sseq">Aligned part of subject sequence</option>
+                    <option value="sseq_gapped">Aligned part of subject sequence (with gaps)</option>
+                    <option value="full_sseq">Subject sequence</option>
+                    <option value="evalue" selected="true">Expect value</option>
+                    <option value="bitscore" selected="true">Bit score</option>
+                    <option value="corrected_bitscore" selected="true">Bit score corrected for edge effects</option>
+                    <option value="score">Raw score</option>
+                    <option value="length" selected="true">Alignment length</option>
                     <option value="pident" selected="true">Percentage of identical matches</option>
-                    <option value="length" selected="true">Alignment length</option>
+                    <option value="approx_pident">Approximate percentage of identical matches</option>
                     <option value="nident">Number of identical matches</option>
                     <option value="mismatch" selected="true">Number of mismatches</option>
                     <option value="positive">Number of positive - scoring matches</option>
                     <option value="gapopen" selected="true">Number of gap openings</option>
                     <option value="gaps">Total number of gaps</option>
                     <option value="ppos">Percentage of positive - scoring matches</option>
-                    <option value="qstart" selected="true">Start of alignment in query</option>
-                    <option value="qend" selected="true">End of alignment in query</option>
-                    <option value="sstart" selected="true">Start of alignment in subject</option>
-                    <option value="send" selected="true">End of alignment in subject</option>
-                    <option value="qseq">Aligned part of query sequence</option>
-                    <option value="sseq">Aligned part of subject sequence</option>
-                    <option value="qseq_translated">Translation of the aligned part of query sequence</option>
-                    <option value="evalue" selected="true">Expect value</option>
-                    <option value="bitscore" selected="true">Bit score</option>
-                    <option value="score">Raw score</option>
                     <option value="qframe">Query frame</option>
                     <option value="btop">Blast traceback operations(BTOP)</option>
-                    <option value="scovhsp">Subject coverage per HSP</option>
+                    <option value="cigar">Cigar</option>
                     <option value="stitle">Subject Title</option>
                     <option value="salltitles">All Subject Title(s)</option>
                     <option value="qcovhsp">Query Coverage Per HSP</option>
+                    <option value="scovhsp">Subject coverage per HSP</option>
                     <option value="qtitle">Query title</option>
-                    <option value="full_qseq">Query sequence</option>
-                    <option value="full_sseq">Subject sequence</option>
                     <option value="qqual">Query quality values for the aligned part of the query</option>
                     <option value="full_qqual">Query quality values</option>
                     <option value="qstrand">Query strand</option>
-                    <option value="cigar">Cigar</option>
                     <yield/>
                 </param>
+                <param argument="--header" type="select" label="Use header lines">
+                    <option value="0">No</option>
+                    <option value="simple">Simple</option>
+                    <option value="verbose">Verbose</option>
+                </param>
             </when>
             <when value="100">
             </when>
@@ -79,6 +89,19 @@
         </conditional>
     </xml>
     <xml name="hit_filter_macro">
+        <conditional name="filter_score">
+            <param name="filter_score_select" type="select" label="Method to filter?" help="(--evalue/--min-score)">
+                <option value="evalue" selected="True">Maximum e-value to report alignments</option>
+                <option value="min-score">Minimum bit score to report alignments</option>
+            </param>
+            <when value="evalue">
+                <param argument="--evalue" type="float" value="0.001" label="Maximum expected value to keep an alignment"/>
+            </when>
+            <when value="min-score">
+                <param argument="--min-score" type="integer" value="0" label="Minimum bit score to keep an alignment" help="(--min-score)"/>
+            </when>
+        </conditional>
+
         <conditional name="hit_filter">
             <param name="hit_filter_select" type="select" label="Method to restrict the number of hits?">
                 <option value="max">Maximum number of target sequences</option>
@@ -91,14 +114,14 @@
                 <param argument="--top" type="integer" value="0" min="0" max="100" label="Keep alignments within the given percentage range of the top alignment score for a query" help="For example, setting this to 10 will report all alignments whose score is at most 10% lower than the best alignment score for a query."/>
             </when>
         </conditional>
+        <param argument="--id" type="float" value="0" min="0" max="100" label="Minimum identity percentage to report an alignment" help="Report only alignments above the given percentage of sequence identity"/>
+        <param argument="--approx-id" type="float" value="0" min="0" max="100" label="Minimum approx. identity% to report an alignment"/>
+        <param argument="--query-cover" type="float" value="0" min="0" max="100" label="Minimum query cover percentage to report an alignment" help="Report only alignments above the given percentage of query cover"/>
+        <param argument="--subject-cover" type="float" value="0" min="0" max="100" label="Minimum subject cover percentage to report an alignment" help="Report only alignments above the given percentage of subject cover"/>
     </xml>
-    <xml name="block_size_low_sens">
-        <param argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time"
-            help="This is the main parameter for controlling the program’s memory and disk space usage. Bigger numbers will increase the use of memory and temporary                  disk space, but also improve performance"/>
-    </xml>
-    <xml name="block_size_hi_sens">
-        <param argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time"
-            help="This is the main parameter for controlling the program’s memory and disk space usage. Bigger numbers will increase the use of memory and temporary                  disk space, but also improve performance"/>
+    <xml name="block_size" tokens="value">
+        <param argument="--block-size" type="float" value="@VALUE@" min="0" label="Block size in billions of sequence letters to be processed at a time"
+            help="This is the main parameter for controlling the program’s memory and disk space usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance"/>
     </xml>
     <xml name="citations">
         <citations>
@@ -138,6 +161,7 @@
             --out '$blast_xml'
         #else if $output_section.output.outfmt == "6"
             --outfmt '6' #echo ' '.join(str($output_section.output.fields).split(','))
+            --header $output_section.output.header
             --out '$blast_tabular'
         #else if $output_section.output.outfmt == "100"
             --outfmt '100'
@@ -158,4 +182,23 @@
             --top '$hit_filter.top'
         #end if
     </token>
+
+    <xml name="taxon_cond_macro" tokens="cond_name,label,help,argument"> <!-- conditional offering taxon restriction via a manually entered list or a tabular file -->
+        <conditional name="@COND_NAME@">
+            <param name="tax_select" type="select" label="@LABEL@" help="Any taxonomic rank can be used, and only reference sequences matching one of the specified taxon ids will be searched against.">
+                <option value="no" selected="True">No</option>
+                <option value="list">List of taxids entered manually</option>
+                <option value="file">List of taxids from single column tabular file</option>
+            </param>
+            <when value="no"/>
+            <when value="list">
+                <param argument="@ARGUMENT@" type="text" value="" label="Taxon IDs" help="Comma separated list">
+                    <validator type="regex" message="Taxonlist needs to be a comma separated list of integers">^[0-9,]*$</validator> <!-- anchored at both ends: the regex validator only matches from the start of the value, so an unanchored pattern accepts anything -->
+                </param>
+            </when>
+            <when value="file">
+                <param argument="@ARGUMENT@" type="data" format="tabular" label="Taxon id file" help="One taxon ID per line"/>
+            </when>
+        </conditional>
+    </xml>
 </macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/README	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,10 @@
+BLAST DBs come with additional taxonomic data (taxdb.btd, taxdb.bti, taxonomy4blast.sqlite3),
+which are quite large. This folder contains small test data covering a few species:
+
+Oryza sativa    4530
+Drosophila      7215
+Danio rerio     7955
+Homo sapiens    9606
+Mus musculus    10090
+
+the files have been provided to @bernt-matthias by the NCBI help desk (ticket help #247163)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/db.fasta	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,45 @@
+>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group]
+MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF
+FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS
+HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT
+MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI
+SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF
+TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF
+FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP
+WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS
+>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster]
+SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP
+IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS
+VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD
+RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL
+LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG
+LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN
+LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI
+EWYQNTPPAEHSYSELPLLTN
+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]
+MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI
+GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG
+VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT
+NVEWLHGCPPPYHTFEEPAFVQIQSN
+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM
+NLEWLYGCPPPYHTFEEPVYMKS
+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]
+MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV
+GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG
+VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST
+NLEWLHGCPPPYHTFEEPTYVKVK
Binary file test-data/blastdb/db.fasta.pdb has changed
Binary file test-data/blastdb/db.fasta.phr has changed
Binary file test-data/blastdb/db.fasta.pin has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/db.fasta.pjs	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,27 @@
+{
+  "version": "1.2",
+  "dbname": "db.fasta",
+  "dbtype": "Protein",
+  "db-version": 5,
+  "description": "cox1 blastp DB",
+  "number-of-letters": 2578,
+  "number-of-sequences": 5,
+  "last-updated": "2025-12-09T18:15:00",
+  "number-of-volumes": 1,
+  "number-of-taxids": 5,
+  "bytes-total": 52950,
+  "bytes-to-cache": 2720,
+  "files": [
+    "db.fasta.pdb",
+    "db.fasta.phr",
+    "db.fasta.pin",
+    "db.fasta.pnd",
+    "db.fasta.pni",
+    "db.fasta.pog",
+    "db.fasta.pos",
+    "db.fasta.pot",
+    "db.fasta.psq",
+    "db.fasta.ptf",
+    "db.fasta.pto"
+  ]
+}
Binary file test-data/blastdb/db.fasta.pnd has changed
Binary file test-data/blastdb/db.fasta.pni has changed
Binary file test-data/blastdb/db.fasta.pog has changed
Binary file test-data/blastdb/db.fasta.pos has changed
Binary file test-data/blastdb/db.fasta.pot has changed
Binary file test-data/blastdb/db.fasta.psq has changed
Binary file test-data/blastdb/db.fasta.ptf has changed
Binary file test-data/blastdb/db.fasta.pto has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/filter_and_map_ids.py	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# filter names and nodes dmp files by a list of given IDs
+# parent node IDs will be added if needed
+#
+# IDs will be renamed to give a consecutive set of IDs: 1,2,...
+# otherwise dmnd databases including taxonomy will be huge
+# also make sure that the order of the taxids is not changed
+
+from sys import argv
+
+names_file_name = argv[1]
+nodes_file_name = argv[2]
+prot2ids_file_name = argv[3]
+names_file_out_name = argv[4]
+nodes_file_out_name = argv[5]
+prot2ids_file_out_name = argv[6]
+
+parent = dict()  # taxid -> parent taxid, both kept as strings
+with open(nodes_file_name) as nodes_file:
+    for line in nodes_file:
+        line = line.strip().split("|")
+        parent[line[0].strip()] = line[1].strip()
+
+initial_ids = set()  # taxids referenced in the third column of the accession2taxid table
+with open(prot2ids_file_name) as prot2ids_file:
+    for i, line in enumerate(prot2ids_file):
+        if i == 0:  # skip the header line
+            continue
+        line = line.strip().split()
+        initial_ids.add(line[2].strip())
+
+ids = set()  # referenced taxids plus all of their ancestors
+while len(initial_ids):
+    taxid = initial_ids.pop()
+    if taxid in ids:  # already collected, its ancestors are queued or done
+        continue
+    ids.add(taxid)
+    # keep climbing unless this node is its own parent (i.e. the root)
+    if parent[taxid] != taxid:
+        initial_ids.add(parent[taxid])
+
+id_map = dict()  # original taxid -> new consecutive ID, numbered in names file order
+with open(names_file_name) as names_file, open(names_file_out_name, "w") as names_file_out:
+    for line in names_file:
+        line = line.strip().split("|")
+        taxid = line[0].strip()
+        if taxid not in ids:
+            continue
+        if taxid not in id_map:
+            id_map[taxid] = len(id_map) + 1
+        names_file_out.write(f'{id_map[taxid]}\t|{"|".join(line[1:])}\n')
+
+print(f'taxonlist for test 2 needs to be {id_map["33090"]}')
+
+with open(nodes_file_name) as nodes_file, open(nodes_file_out_name, "w") as nodes_file_out:
+    for line in nodes_file:
+        line = line.strip().split("|")
+        node = line[0].strip()
+        node_parent = line[1].strip()  # separate name: must not clobber the parent dict
+        if node not in ids or node_parent not in ids:
+            continue
+        nodes_file_out.write(f'{id_map[node]}\t|\t{id_map[node_parent]}\t|{"|".join(line[2:])}\n')
+
+with open(prot2ids_file_name) as prot2ids_file, open(prot2ids_file_out_name, "w") as prot2ids_file_out:
+    for i, line in enumerate(prot2ids_file):
+        if i == 0:  # copy the header line unchanged
+            prot2ids_file_out.write(line)
+            continue
+        line = line.strip().split()
+        taxid = line[2].strip()
+        line[2] = str(id_map[taxid])
+        prot2ids_file_out.write("\t".join(line) + "\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/gen.sh	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,37 @@
+#!/bin/bash
+# regenerates the taxonomy test data (BLAST DB, pruned dmp files, db-wtax.dmnd) using files relative to this folder
+set -e
+
+wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
+tar -xzf taxdump.tar.gz
+
+
+# create blast DB
+# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170
+# more precisely in the taxdb.bt* files (which are here constructed from the dmp files)
+# we also add the path to the root (guess not needed strictly)
+# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine
+sqlite3 taxonomy4blast.sqlite3 "SELECT * FROM TaxidInfo;" | sed 's/|/\n/g' | sort -n -u | sed 's/^/^/; s/$/\\s/' > grep.txt
+echo "^1234\\s" >> grep.txt
+echo "^189779\\s" >> grep.txt
+echo "^189778\\s" >> grep.txt
+echo "^203693\\s" >> grep.txt
+echo "^40117\\s" >> grep.txt
+echo "^3379134\\s" >> grep.txt
+echo "^2\\s" >> grep.txt
+
+grep -f grep.txt names.dmp > ../ncbi_taxonomy/names.dmp
+grep -f grep.txt nodes.dmp > ../ncbi_taxonomy/nodes.dmp
+
+python taxdb.py
+makeblastdb -in db.fasta -parse_seqids -blastdb_version 5 -taxid_map map.txt -title "cox1 blastp DB" -dbtype prot
+
+# create small dmnd data base with taxonomy
+# the important thing to get a small DB is to have consecutive taxIDs
+# NOTE: filter_and_map_ids modifies taxIDs (to get a small file), i.e. taxIDs will be different from tests using BLAST DB from above
+python filter_and_map_ids.py names.dmp nodes.dmp prot.accession2taxid ../names.dmp ../nodes.dmp ../prot.accession2taxid
+diamond makedb --in db.fasta --db ./database --taxonmap ../prot.accession2taxid --taxonnodes ../nodes.dmp --taxonnames ../names.dmp
+mv database.dmnd ../db-wtax.dmnd
+
+rm *.dmp readme.txt taxdump.tar.gz gc.prt
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/map.txt	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,11 @@
+# file used to store protein IDs to taxids
+# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170
+# more precisely in the taxdb.bt* files (which are here constructed from the dmp files)
+# we also add the path to the root (guess not needed strictly)
+# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine
+X 1234
+YP_514675.1 4530
+YP_009047267.1 7215
+NP_059333.1 7955
+YP_003024028.1 9606
+NP_904330.1 10090
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/prot.accession2taxid	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,6 @@
+accession	accession.version	taxid	gi
+YP_514675	YP_514675.1	4530	3950761
+YP_009047267	YP_009047267.1	7215	19893533
+NP_059333	NP_059333.1	7955	140539
+YP_003024028	YP_003024028.1	9606	4512
+NP_904330	NP_904330.1	10090	17708
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/taxdb.btd	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,1 @@
+Bacteria	eubacteria	bacteria	BacteriaNitrospira		bacteria	BacteriaEukaryota		eukaryotes	EukaryotaEmbryophyta	plants	land plants	EukaryotaMagnoliopsida	angiosperms	flowering plants	EukaryotaLaurales		flowering plants	EukaryotaLauraceae	laurel family	flowering plants	EukaryotaPersea		flowering plants	EukaryotaPersea americana		flowering plants	EukaryotaLiliopsida	monocotyledons	monocots	EukaryotaPoaceae	grass family	monocots	EukaryotaOryza		monocots	EukaryotaOryza sativa	rice	monocots	Eukaryotacommelinids		monocots	EukaryotaEumetazoa		animals	EukaryotaArthropoda		arthropods	EukaryotaHexapoda		hexapods	EukaryotaDiptera		flies	EukaryotaBrachycera		flies	EukaryotaDrosophilidae		flies	EukaryotaDrosophila	fruit fly	flies	EukaryotaPterygota		insects	EukaryotaChordata		chordates	EukaryotaVertebrata		vertebrates	EukaryotaGnathostomata		vertebrates	EukaryotaActinopterygii	fish	ray-finned fishes	EukaryotaCypriniformes		ray-finned fishes	EukaryotaDanio		ray-finned fishes	EukaryotaDanio rerio	zebra fish	ray-finned fishes	EukaryotaSarcopterygii		vertebrates	EukaryotaEutheria	placental mammals	placentals	EukaryotaPrimates		primates	EukaryotaCatarrhini		primates	EukaryotaHominidae		primates	EukaryotaHomo	humans	primates	EukaryotaHomo sapiens		primates	EukaryotaRodentia	rodent	rodents	EukaryotaMuridae		rodents	EukaryotaMus	mouse	rodents	EukaryotaMus musculus	mouse	rodents	EukaryotaCyprinoidei		ray-finned fishes	EukaryotaTeleostei		ray-finned fishes	EukaryotaOstariophysi		ray-finned fishes	EukaryotaTetrapoda		vertebrates	EukaryotaAmniota		vertebrates	EukaryotaTheria		mammals	EukaryotaViridiplantae	green plants	green plants	EukaryotaOpisthokonta		eukaryotes	EukaryotaMetazoa	multicellular animals	animals	EukaryotaBilateria		animals	EukaryotaProtostomia		animals	EukaryotaNeoptera		insects	EukaryotaEndopterygota		insects	EukaryotaDeuterostomia	deuterostomes	animals	EukaryotaStreptophyta		green plants	EukaryotaPoales		monocots	EukaryotaMurinae		rodents	EukaryotaNitrospirota		bacteria	
BacteriaMammalia		mammals	EukaryotaNeopterygii		ray-finned fishes	EukaryotaMuscomorpha		flies	EukaryotaSchizophora		flies	EukaryotaAcalyptratae		flies	EukaryotaEphydroidea		flies	EukaryotaDrosophilinae		flies	EukaryotaDrosophilini		flies	EukaryotaInsecta	true insects	insects	EukaryotaTracheophyta	vascular plants	vascular plants	EukaryotaSpermatophyta	seed plants	seed plants	EukaryotaEuphyllophyta		vascular plants	EukaryotaDicondylia		insects	EukaryotaPanarthropoda		animals	EukaryotaCraniata		chordates	EukaryotaTeleostomi		vertebrates	EukaryotaEuteleostomi		vertebrates	EukaryotaStreptophytina		green plants	EukaryotaOryzoideae		monocots	EukaryotaOryzeae		monocots	EukaryotaActinopteri		ray-finned fishes	EukaryotaClupeocephala		ray-finned fishes	EukaryotaOtophysi		ray-finned fishes	EukaryotaCypriniphysae		ray-finned fishes	EukaryotaOtomorpha		ray-finned fishes	EukaryotaNitrospirales		bacteria	BacteriaNitrospiraceae		bacteria	BacteriaPancrustacea		arthropods	EukaryotaMandibulata	mandibulates	arthropods	EukaryotaNitrospiria		bacteria	BacteriaHomininae		primates	EukaryotaMagnoliidae		flowering plants	EukaryotaEuarchontoglires		placentals	EukaryotaGlires		placentals	EukaryotaSimiiformes		primates	EukaryotaHominoidea	ape	primates	EukaryotaMuroidea		rodents	EukaryotaBOP clade		monocots	EukaryotaHaplorrhini		primates	EukaryotaCyclorrhapha		flies	EukaryotaEremoneura		flies	EukaryotaMus		rodents	EukaryotaEcdysozoa		animals	EukaryotaDipnotetrapodomorpha		vertebrates	EukaryotaBoreoeutheria		placentals	EukaryotaMesangiospermae		flowering plants	EukaryotaPetrosaviidae		monocots	EukaryotaOsteoglossocephalai		ray-finned fishes	EukaryotaOryzinae		monocots	EukaryotaMyomorpha		rodents	EukaryotaDanionidae		ray-finned fishes	EukaryotaDanioninae		ray-finned fishes	EukaryotaPseudomonadati		bacteria	Bacteria
\ No newline at end of file
Binary file test-data/blastdb/taxdb.bti has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/taxdb.py	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+taxdb.py
+
+Create taxdb.btd and taxdb.bti (NCBI/BLAST/ISAM format) from the pruned nodes.dmp
+and optional names.dmp found in ../ncbi_taxonomy/ (relative to the current directory).
+
+Usage:
+    python3 taxdb.py
+
+Output:
+    taxdb.btd
+    taxdb.bti
+
+Notes:
+- Writes integers in BIG-ENDIAN (network order) as required by the ISAM/NCBI format.
+- The btd records are written as:
+    scientific_name<TAB>common_name<TAB>blast_name<TAB>superkingdom_code
+  with no reliance on newlines for delimitation (offsets define length).
+"""
+import struct
+import sys
+from collections import defaultdict
+
+NODES_FILE = "../ncbi_taxonomy/nodes.dmp"
+NAMES_FILE = "../ncbi_taxonomy/names.dmp"   # optional
+OUT_BTD = "taxdb.btd"
+OUT_BTI = "taxdb.bti"
+
+TAXDB_MAGIC = 0x8739
+
+
+# -------------------------
+# Helpers
+# -------------------------
+def read_nodes(nodes_path):
+    """Return dicts: parent[taxid]=parent_taxid, rank[taxid]=rank"""
+    parent = {}
+    rank = {}
+    with open(nodes_path, encoding="utf-8") as fh:
+        for line in fh:
+            parts = [p.strip() for p in line.split("|")]
+            if len(parts) < 3:
+                continue
+            try:
+                taxid = int(parts[0])
+                parent_tax = int(parts[1])
+            except ValueError:
+                continue
+            parent[taxid] = parent_tax
+            rank[taxid] = parts[2]
+    return parent, rank
+
+
+def read_names(names_path):
+    """Return dict: names[taxid] = {'scientific':..., 'common':..., 'blast':...}"""
+    names = defaultdict(lambda: {"scientific": "", "common": "", "blast": ""})
+    with open(names_path, encoding="utf-8") as fh:
+        for line in fh:
+            parts = [p.strip() for p in line.split("|")]
+            if len(parts) < 4:
+                continue
+            try:
+                taxid = int(parts[0])
+            except ValueError:
+                continue
+            name_txt = parts[1]
+            name_class = parts[3]
+            if name_class == "scientific name":
+                names[taxid]["scientific"] = name_txt
+            elif name_class == "common name":
+                names[taxid]["common"] = name_txt
+            elif name_class == "blast name":
+                names[taxid]["blast"] = name_txt
+    return names
+
+
+def infer_superkingdom_code(taxid, parent, rank, sci_name_lookup):
+    """
+    Walk ancestors until rank is 'domain' (or 'superkingdom', accepted as an alternative
+    rank label) and return "Bacteria", "Archaea", "Eukaryota", "Viruses" or "Unknown".
+    """
+    seen = set()
+    cur = taxid
+    while True:
+        if cur in seen:  # cycle, e.g. a root node that is its own parent
+            return "Unknown"
+        seen.add(cur)
+        r = rank.get(cur, "")
+        if r in ("domain", "superkingdom"):
+            name = sci_name_lookup.get(cur, "").lower()
+            if "bacteria" in name:  # substring test, so "eubacteria" matches too
+                return "Bacteria"
+            if "archaea" in name:
+                return "Archaea"
+            if "eukary" in name:  # substring test, so "eukaryota" matches too
+                return "Eukaryota"
+            if "virus" in name:  # substring test, so "viruses" matches too
+                return "Viruses"
+            return "Unknown"
+        if cur not in parent:
+            return "Unknown"
+        cur = parent[cur]
+
+
+def infer_blast_name(taxid, parent, lookup):
+    """Return the lower-cased blast name of taxid or, failing that, of its nearest
+    ancestor that has one; "Unknown" if the walk runs out of parents or revisits a node."""
+    seen = set()
+    cur = taxid
+    while True:
+        if cur in seen:
+            return "Unknown"
+        seen.add(cur)
+        name = lookup.get(cur, "").lower()
+
+        if name:
+            return name
+        if cur not in parent:
+            return "Unknown"
+        cur = parent[cur]
+
+
+# -------------------------
+# Main
+# -------------------------
+def main():
+    # Read nodes.dmp
+    try:
+        parent, rank = read_nodes(NODES_FILE)
+    except FileNotFoundError:
+        print(f"Error: {NODES_FILE} not found in current directory.", file=sys.stderr)
+        sys.exit(2)
+
+    # Read names.dmp if present
+    try:
+        names = read_names(NAMES_FILE)
+    except FileNotFoundError:
+        names = defaultdict(lambda: {"scientific": "", "common": "", "blast": ""})
+        print("Warning: names.dmp not found. scientific_name will be set to the taxid.", file=sys.stderr)
+
+    # Determine the taxids to write:
+    # use taxids present in nodes.dmp (pruned set)
+    taxids = sorted(parent.keys())
+
+    if len(taxids) == 0:
+        print("No taxids found in nodes.dmp; nothing to do.", file=sys.stderr)
+        sys.exit(0)
+
+    # Build scientific-name lookup for superkingdom inference
+    sci_lookup = {}
+    for tid, rec in names.items():
+        sci_lookup[tid] = rec.get("scientific", "")
+
+    # Build blast-name lookup for blast name inference
+    bla_lookup = {}
+    for tid, rec in names.items():
+        bla_lookup[tid] = rec.get("blast", "")
+
+    # Build btd records and offsets
+    offsets = []
+    btd_buf = bytearray()
+    for tid in taxids:
+        offsets.append(len(btd_buf))
+        rec = names.get(tid, {"scientific": "", "common": "", "blast": ""})
+        sci = rec.get("scientific", "")
+        com = rec.get("common", "")
+
+        if not sci:
+            # fallback: use numeric taxid as scientific name (ensures non-empty)
+            sci = str(tid)
+
+        # infer superkingdom code from nodes.dmp and names if possible
+        sk = infer_superkingdom_code(tid, parent, rank, sci_lookup)
+        bla = infer_blast_name(tid, parent, bla_lookup)
+
+        # exactly 4 fields, tab-separated; no trailing newline required
+        record = f"{sci}\t{com}\t{bla}\t{sk}"
+        btd_buf.extend(record.encode("utf-8"))
+
+    end_offset = len(btd_buf)
+
+    # Write taxdb.btd
+    with open(OUT_BTD, "wb") as fh:
+        fh.write(btd_buf)
+
+    # Write taxdb.bti
+    with open(OUT_BTI, "wb") as fh:
+        # header: magic, count (number of real taxids), reserved[4]
+        # IMPORTANT: write all integers BIG-ENDIAN (>I)
+        fh.write(struct.pack(">I", TAXDB_MAGIC))
+        fh.write(struct.pack(">I", len(taxids)))     # n (real entries only)
+        fh.write(struct.pack(">IIII", 0, 0, 0, 0))   # reserved
+
+        # index entries: (taxid, offset) pairs
+        for tid, off in zip(taxids, offsets):
+            fh.write(struct.pack(">I", int(tid)))
+            fh.write(struct.pack(">I", int(off)))
+
+        # # sentinel entry: taxid=0, offset=end_of_btd
+        # fh.write(struct.pack(">I", 0))
+        # fh.write(struct.pack(">I", end_offset))
+
+    # Summary
+    print(f"Wrote {OUT_BTD} ({end_offset} bytes)")
+    print(f"Wrote {OUT_BTI} (header + {len(taxids)} entries)")
+    print(f"Taxids written: {len(taxids)}")
+
+
+if __name__ == "__main__":
+    main()
Binary file test-data/blastdb/taxonomy4blast.sqlite3 has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb_p.loc	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,1 @@
+test	testDB	${__HERE__}/blastdb/db.fasta
\ No newline at end of file
Binary file test-data/db-wtax.dmnd has changed
Binary file test-data/db.dmnd has changed
--- a/test-data/db.fasta	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/db.fasta	Fri Dec 12 11:13:59 2025 +0000
@@ -1,12 +1,45 @@
->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
-IENY
->gi|5524212|gb|AAD44167.1| cytochrome c [Elephas minimus minimus]
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAGGGGGGGWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKAAAAAAAAAAAAAAAAAAAAAAAAATFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALAAAAAAAAAAAAAAAAAAAAAAATIIGQMASILYFSIILAFLPIAGX
-IENY
+>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group]
+MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF
+FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS
+HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT
+MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI
+SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF
+TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF
+FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP
+WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS
+>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster]
+SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP
+IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS
+VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD
+RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL
+LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG
+LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN
+LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI
+EWYQNTPPAEHSYSELPLLTN
+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]
+MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI
+GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG
+VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT
+NVEWLHGCPPPYHTFEEPAFVQIQSN
+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM
+NLEWLYGCPPPYHTFEEPVYMKS
+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]
+MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV
+GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG
+VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST
+NLEWLHGCPPPYHTFEEPTYVKVK
Binary file test-data/db.fasta.gz has changed
--- a/test-data/diamond_results.pairwise	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results.pairwise	Fri Dec 12 11:13:59 2025 +0000
@@ -1,34 +1,136 @@
 BLASTP 2.3.0+
 
 
-Query= sequence more text
+Query= NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome
+
+Length=1540
+
+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]
+Length=513
+
+ Score = 897 bits (2318),  Expect = 0.0
+ Identities = 455/512 (88%), Positives = 490/512 (95%), Gaps = 0/512 (0%)
+ Frame = 1
 
-Length=849
+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180
+             MFADRWLFSTNHKDIGTLYLLFGA AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA
+Sbjct     1  MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 60
+
+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360
+             HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PS LLLLASA VEA
+Sbjct    61  HAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEA 120
+
+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540
+             GAGTG TVYPPLAGNYSHPGASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ
+Sbjct   121  GAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180
 
->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
-Length=284
+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720
+             TPLFV S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH
+Sbjct   181  TPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240
+
+Query   721  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900
+             PEVYILILPGFG+ISHIVT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD
+Sbjct   241  PEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVD 300
+
+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080
+             TRAYFTSAT+IIAIPTGVKVFS LATLHGSN K SAA+L ALGFIFLFTVGGLTGIVLAN
+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTVGGLTGIVLAN 360
 
- Score = 550 bits (1417),  Expect = 1.44e-205
- Identities = 283/284 (99%), Positives = 283/284 (99%), Gaps = 1/284 (0%)
+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260
+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+GGFIHWFPLFSGYTL+QTYAKIHF  +F+G
+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG 420
+
+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440
+             VNLTFFPQHFLGLSG+PRRYSDYPDAYTT NILSS GSFISLTAV+L+IF+I EAFASKR
+Sbjct   421  VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKR 480
+
+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536
+             KV ++E+PS +LEWLYGCPPPYHTFEEPVY+K
+Sbjct   481  KVLMVEEPSMNLEWLYGCPPPYHTFEEPVYMK 512
+
+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]
+Length=514
+
+ Score = 847 bits (2189),  Expect = 8.27e-315
+ Identities = 427/512 (83%), Positives = 476/512 (92%), Gaps = 0/512 (0%)
  Frame = 1
 
-Query    1  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 180
-            LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS
-Sbjct    1  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 60
+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180
+             MF +RWLFSTNHKDIGTLYLLFGA AG++GTALS+LIRAELGQPG LLG+D IYNVIVTA
+Sbjct     1  MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTA 60
+
+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360
+             HAFV+IFFMVMP++IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA
+Sbjct    61  HAFVMIFFMVMPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEA 120
+
+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540
+             GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ
+Sbjct   121  GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180
+
+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720
+             TPLFV S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH
+Sbjct   181  TPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240
 
-Query  181  AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFIL-FTMVALAGVHLTFLHETGSNNPLGL 357
-            AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFIL FTMVALAGVHLTFLHETGSNNPLGL
-Sbjct   61  AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGL 120
+Query   721  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900
+             PEVYILILPGFGIISH+VT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD
+Sbjct   241  PEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGLDVD 300
+
+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080
+             TRAYFTSAT+IIAIPTGVKVFS LATLHG N K S A+L ALGFIFLFTVGGLTGIVL+N
+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTVGGLTGIVLSN 360
+
+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260
+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLFSG+TL+ T+AK HF  +FVG
+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG 420
+
+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440
+             VN+TFFPQHFLGLSG+PRRYSDYPDAYTT N +SS GSFISLTAV+++IF+I EAFASKR
+Sbjct   421  VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKR 480
+
+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536
+             +V  +   ST+LEWL+GCPPPYHTFEEP Y+K
+Sbjct   481  EVMSVSYASTNLEWLHGCPPPYHTFEEPTYVK 512
+
+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]
+Length=516
 
-Query  358  TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 537
-            TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE
-Sbjct  121  TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 180
+ Score = 810 bits (2091),  Expect = 7.42e-300
+ Identities = 407/512 (79%), Positives = 459/512 (89%), Gaps = 0/512 (0%)
+ Frame = 1
+
+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180
+             M   RW FSTNHKDIGTLYL+FGA AG++GTALSLLIRAEL QPG LLG+D IYNVIVTA
+Sbjct     1  MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTA 60
+
+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360
+             HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA
+Sbjct    61  HAFVMIFFMVMPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEA 120
+
+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540
+             GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITT IN+KPP ISQYQ
+Sbjct   121  GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQ 180
+
+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720
+             TPLFV ++L+TAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH
+Sbjct   181  TPLFVWAVLVTAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240
 
-Query  538  WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 717
-            WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD
-Sbjct  181  WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 240
+Query   721  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900
+             PEVYILILPGFGIISH+V +Y+GKKEPFGY+G+V A+++IG LGFIV AHH+FTVG+DVD
+Sbjct   241  PEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAIGLLGFIVWAHHMFTVGMDVD 300
+
+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080
+             TRAYFTSAT+IIAIPTGVKVFS LATLHG   K    +L ALGFIFLFTVGGLTGIVLAN
+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTVGGLTGIVLAN 360
 
-Query  718  LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 849
-            LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY
-Sbjct  241  LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 284
+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260
+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLF+GYTLN  + KIHF  +F+G
+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG 420
 
+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440
+             VNLTFFPQHFLGL+G+PRRYSDYPDAY   N +SS GS ISL AVI+ +FI+ EAF +KR
+Sbjct   421  VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKR 480
+
+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536
+             +V  +E  +T++EWL+GCPPPYHTFEEP +++
+Sbjct   481  EVLSVELTATNVEWLHGCPPPYHTFEEPAFVQ 512
+
--- a/test-data/diamond_results.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	283	1	284	1.44e-205	550	100	0	0	0	94M1D189M
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	283	1	284	5.77e-150	409	100	0	0	0	105M1D178M
+NP_008227.1	gi|4512|gb|YP_003024028.1|cytochrome	95.9	512	21	0	1	512	1	512	0.0	999	99.8	0	Metazoa	Chordata	512M
+NP_008227.1	gi|17708|gb|NP_904330.1|cytochrome	89.6	512	53	0	1	512	1	512	0.0	942	99.6	0	Metazoa	Chordata	512M
+NP_008227.1	gi|140539|gb|NP_059333.1|cytochrome	84.2	512	81	0	1	512	1	512	0.0	894	99.2	0	Metazoa	Chordata	512M
+NP_008227.1	gi|19893533|gb|YP_009047267.1|cytochrome	76.2	505	120	0	3	507	1	505	1.13e-295	799	98.8	0	Metazoa	Arthropoda	505M
+NP_008227.1	gi|3950761|gb|YP_514675.1|cytochrome	68.7	511	151	4	5	507	6	515	2.93e-259	707	97.3	0	Viridiplantae	Streptophyta	44M2D214M1I202M2D18M4D24M
--- a/test-data/diamond_results.wtax.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results.wtax.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,1 +1,2 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	283	1	284	1.44e-205	550
+qseqid	sseqid	pident	length	mismatch	gapopen	qstart	qend	sstart	send	evalue	bitscore
+NP_008227.1	gi|3950761|gb|YP_514675.1|cytochrome	68.7	511	151	4	5	507	6	515	2.93e-259	707
--- a/test-data/diamond_results_algorithm.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_algorithm.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_freq_masking.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_freq_masking.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_global_ranking.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_global_ranking.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_iterate.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_iterate.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,7 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+# DIAMOND v2.1.16. http://github.com/bbuchfink/diamond
+# Invocation: diamond blastx --threads 1 --db database.dmnd --query /tmp/tmpn1890frb/files/5/2/9/dataset_529e1e94-1186-4385-a242-298cfe957f6a.dat --query-gencode 1 --strand both --min-orf 1 --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore --header verbose --out /tmp/tmpn1890frb/job_working_directory/000/16/outputs/dataset_d67e7f18-9e9b-4100-8fff-c094b2a162ab.dat --compress 0 --iterate --algo 0 --matrix BLOSUM62 --comp-based-stats 1 --masking 1 --max-target-seqs 25 --evalue 0.001 --id 0 --query-cover 0 --subject-cover 0 --block-size 2.0 --motif-masking 0 --soft-masking 0 --index-chunks 4 --file-buffer-size 67108864
+# Fields: Query ID, Subject ID, Percentage of identical matches, Alignment length, Number of mismatches, Number of gap openings, Start of alignment in query, End of alignment in query, Start of alignment in subject, End of alignment in subject, Expected value, Bit score
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_log_test.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_log_test.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_max_hsps.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_max_hsps.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_motif_masking.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_motif_masking.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/diamond_results_soft_masking.tabular	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/diamond_results_soft_masking.tabular	Fri Dec 12 11:13:59 2025 +0000
@@ -1,2 +1,5 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	849	1	284	1.44e-205	550
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	849	1	284	5.77e-150	409
+NC_001646.1:5332-6871	gi|4512|gb|YP_003024028.1|cytochrome	88.9	512	57	0	1	1536	1	512	0.0	897
+NC_001646.1:5332-6871	gi|17708|gb|NP_904330.1|cytochrome	83.4	512	85	0	1	1536	1	512	8.27e-315	847
+NC_001646.1:5332-6871	gi|140539|gb|NP_059333.1|cytochrome	79.5	512	105	0	1	1536	1	512	7.42e-300	810
+NC_001646.1:5332-6871	gi|19893533|gb|YP_009047267.1|cytochrome	71.3	505	145	0	7	1521	1	505	2.88e-263	717
+NC_001646.1:5332-6871	gi|3950761|gb|YP_514675.1|cytochrome	65.5	516	169	4	13	1536	6	520	4.52e-237	651
--- a/test-data/names.dmp	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/names.dmp	Fri Dec 12 11:13:59 2025 +0000
@@ -1,11 +1,270 @@
 1	|	all	|		|	synonym	|
 1	|	root	|		|	scientific name	|
-2	|	Bacteria	|	Bacteria <bacteria>	|	scientific name	|
-2	|	bacteria	|		|	blast name	|
-2	|	eubacteria	|		|	genbank common name	|
-2	|	Monera	|	Monera <bacteria>	|	in-part	|
-3	|	Procaryotae	|	Procaryotae <bacteria>	|	in-part	|
-3	|	Prokaryotae	|	Prokaryotae <bacteria>	|	in-part	|
-3	|	Prokaryota	|	Prokaryota <bacteria>	|	in-part	|
-3	|	prokaryote	|	prokaryote <bacteria>	|	in-part	|
-3	|	prokaryotes	|	prokaryotes <bacteria>	|	in-part	|
+2	|	Eucarya	|		|	synonym	|
+2	|	Eucaryotae	|		|	synonym	|
+2	|	Eukarya	|		|	synonym	|
+2	|	Eukaryotae	|		|	synonym	|
+2	|	Eukaryota	|		|	scientific name	|
+2	|	eukaryotes	|	eukaryotes <blast name>	|	blast name	|
+2	|	eukaryotes	|	eukaryotes <genbank common name>	|	genbank common name	|
+3	|	Embryophyta	|		|	scientific name	|
+3	|	higher plants	|		|	common name	|
+3	|	land plants	|	land plants <blast name>	|	blast name	|
+3	|	land plants	|	land plants <genbank common name>	|	genbank common name	|
+3	|	plants	|		|	common name	|
+4	|	Angiospermae	|		|	synonym	|
+4	|	angiosperms	|		|	common name	|
+4	|	flowering plants	|	flowering plants <blast name>	|	blast name	|
+4	|	flowering plants	|	flowering plants <genbank common name>	|	genbank common name	|
+4	|	Magnoliophyta	|		|	synonym	|
+4	|	Magnoliopsida	|		|	scientific name	|
+5	|	Liliopsida	|		|	scientific name	|
+5	|	monocots	|	monocots <blast name>	|	blast name	|
+5	|	monocots	|	monocots <genbank common name>	|	genbank common name	|
+5	|	Monocotyledoneae	|		|	synonym	|
+5	|	monocotyledons	|		|	common name	|
+6	|	Bambusaceae Nakai, 1943	|		|	authority	|
+6	|	Bambusaceae	|		|	synonym	|
+6	|	Gramineae	|		|	synonym	|
+6	|	grass family	|		|	common name	|
+6	|	Poaceae Barnhart, 1895	|		|	authority	|
+6	|	Poaceae	|		|	scientific name	|
+7	|	Oryza L., 1753	|		|	authority	|
+7	|	Oryza	|		|	scientific name	|
+7	|	Porteresia	|		|	includes	|
+8	|	Asian cultivated rice	|		|	genbank common name	|
+8	|	Oryza sativa L., 1753	|		|	authority	|
+8	|	Oryza sativa	|		|	scientific name	|
+8	|	red rice	|	red rice <Oryza sativa>	|	common name	|
+8	|	rice	|		|	common name	|
+9	|	Commelinidae	|		|	synonym	|
+9	|	commelinids	|		|	scientific name	|
+9	|	Commeliniflorae	|		|	synonym	|
+10	|	Eumetazoa	|		|	scientific name	|
+11	|	Arthropoda	|		|	scientific name	|
+11	|	arthropods	|	arthropods <blast name>	|	blast name	|
+11	|	arthropods	|	arthropods <genbank common name>	|	genbank common name	|
+12	|	Atelocerata	|	Atelocerata <hexapods>	|	in-part	|
+12	|	Hexapoda	|		|	scientific name	|
+12	|	hexapods	|	hexapods <blast name>	|	blast name	|
+12	|	hexapods	|	hexapods <genbank common name>	|	genbank common name	|
+12	|	Tracheata	|	Tracheata <hexapods>	|	in-part	|
+12	|	Uniramia	|	Uniramia <hexapods>	|	in-part	|
+13	|	Diptera	|		|	scientific name	|
+13	|	flies	|	flies <blast name>	|	blast name	|
+13	|	flies	|	flies <genbank common name>	|	genbank common name	|
+14	|	Brachycera	|		|	scientific name	|
+15	|	Drosophilidae	|		|	scientific name	|
+15	|	pomace flies	|		|	genbank common name	|
+16	|	Drosophila	|	Drosophila <flies,genus>	|	scientific name	|
+16	|	Drosophila Fallen, 1823	|		|	authority	|
+16	|	fruit flies	|	fruit flies <Drosophila>	|	genbank common name	|
+16	|	fruit fly	|	fruit fly <Drosophila>	|	common name	|
+17	|	Pterygota	|	Pterygota <insects>	|	scientific name	|
+17	|	winged insects	|		|	genbank common name	|
+18	|	Chordata	|		|	scientific name	|
+18	|	chordates	|	chordates <blast name>	|	blast name	|
+18	|	chordates	|	chordates <genbank common name>	|	genbank common name	|
+19	|	Vertebrata Cuvier, 1812	|		|	authority	|
+19	|	Vertebrata	|	Vertebrata <vertebrates>	|	scientific name	|
+19	|	vertebrates	|	vertebrates <blast name>	|	blast name	|
+19	|	vertebrates	|	vertebrates <genbank common name>	|	genbank common name	|
+20	|	Gnathostomata	|	Gnathostomata <vertebrates>	|	scientific name	|
+20	|	jawed vertebrates	|		|	genbank common name	|
+21	|	Actinopterygii	|		|	scientific name	|
+21	|	Actinopterygi	|		|	synonym	|
+21	|	fishes	|	fishes <ray-finned fishes>	|	common name	|
+21	|	fish	|	fish <ray-finned fishes>	|	common name	|
+21	|	Osteichthyes	|	Osteichthyes <ray-finned fishes>	|	in-part	|
+21	|	ray-finned fishes	|	ray-finned fishes <blast name>	|	blast name	|
+21	|	ray-finned fishes	|	ray-finned fishes <genbank common name>	|	genbank common name	|
+22	|	carps and others	|		|	genbank common name	|
+22	|	Cypriniformes	|		|	scientific name	|
+23	|	Brachydanio	|		|	synonym	|
+23	|	Celestichthys	|		|	synonym	|
+23	|	Danio	|		|	scientific name	|
+24	|	Brachydanio rerio frankei	|		|	synonym	|
+24	|	Brachydanio rerio	|		|	synonym	|
+24	|	Cyprinus rerio Hamilton, 1822	|		|	authority	|
+24	|	Cyprinus rerio	|		|	synonym	|
+24	|	Danio frankei	|		|	synonym	|
+24	|	Danio rerio frankei	|		|	synonym	|
+24	|	Danio rerio (Hamilton, 1822)	|		|	authority	|
+24	|	Danio rerio	|		|	scientific name	|
+24	|	leopard danio	|		|	common name	|
+24	|	zebra danio	|		|	common name	|
+24	|	zebrafish	|		|	genbank common name	|
+24	|	zebra fish	|	zebra fish <Danio rerio>	|	common name	|
+25	|	Sarcopterygii	|		|	scientific name	|
+26	|	eutherian mammals	|		|	common name	|
+26	|	Eutheria	|		|	scientific name	|
+26	|	Placentalia	|		|	synonym	|
+26	|	placental mammals	|		|	common name	|
+26	|	placentals	|	placentals <blast name>	|	blast name	|
+26	|	placentals	|	placentals <genbank common name>	|	genbank common name	|
+27	|	Primata	|		|	synonym	|
+27	|	primate	|		|	equivalent name	|
+27	|	Primates Linnaeus, 1758	|		|	authority	|
+27	|	primates	|	primates <blast name>	|	blast name	|
+27	|	primates	|	primates <genbank common name>	|	genbank common name	|
+27	|	Primates	|		|	scientific name	|
+28	|	Catarrhini	|		|	scientific name	|
+29	|	great apes	|		|	genbank common name	|
+29	|	Hominidae Gray, 1825	|		|	authority	|
+29	|	Hominidae	|		|	scientific name	|
+29	|	Pongidae	|		|	synonym	|
+30	|	Homo Linnaeus, 1758	|		|	authority	|
+30	|	Homo	|		|	scientific name	|
+30	|	humans	|		|	common name	|
+31	|	Homo sapiens Linnaeus, 1758	|		|	authority	|
+31	|	Homo sapiens	|		|	scientific name	|
+31	|	human	|		|	genbank common name	|
+32	|	rodent	|		|	common name	|
+32	|	Rodentia	|		|	scientific name	|
+32	|	rodents	|	rodents <blast name>	|	blast name	|
+32	|	rodents	|	rodents <genbank common name>	|	genbank common name	|
+33	|	Muridae	|		|	scientific name	|
+34	|	mice	|	mice <Mus>	|	genbank common name	|
+34	|	mouse	|	mouse <Mus>	|	common name	|
+34	|	Mus	|	Mus <genus>	|	scientific name	|
+35	|	Balb/c mouse	|		|	includes	|
+35	|	house mouse	|		|	genbank common name	|
+35	|	LK3 transgenic mice	|		|	includes	|
+35	|	mouse	|	mouse <Mus musculus>	|	common name	|
+35	|	Mus musculus Linnaeus, 1758	|		|	authority	|
+35	|	Mus musculus	|		|	scientific name	|
+35	|	Mus sp. 129SV	|		|	includes	|
+35	|	nude mice	|		|	includes	|
+35	|	transgenic mice	|		|	includes	|
+36	|	Cyprinoidea	|		|	synonym	|
+36	|	Cyprinoidei	|		|	scientific name	|
+37	|	Teleostei	|		|	scientific name	|
+37	|	teleost fishes	|		|	genbank common name	|
+38	|	Ostariophysi	|		|	scientific name	|
+39	|	Tetrapoda	|		|	scientific name	|
+39	|	tetrapods	|		|	genbank common name	|
+40	|	Amniota	|		|	scientific name	|
+40	|	amniotes	|		|	genbank common name	|
+41	|	Theria Parker & Haswell, 1897	|		|	authority	|
+41	|	Theria	|	Theria <mammals>	|	scientific name	|
+42	|	Chlorobionta Jeffrey, 1982	|		|	authority	|
+42	|	Chlorobionta	|		|	synonym	|
+42	|	Chlorophyta/Embryophyta group	|		|	equivalent name	|
+42	|	chlorophyte/embryophyte group	|		|	equivalent name	|
+42	|	Chloroplastida Adl et al. 2005	|		|	authority	|
+42	|	Chloroplastida	|		|	synonym	|
+42	|	green plants	|	green plants <blast name>	|	blast name	|
+42	|	green plants	|	green plants <common name>	|	common name	|
+42	|	Viridiplantae Cavalier-Smith, 1981	|		|	authority	|
+42	|	Viridiplantae	|		|	scientific name	|
+43	|	Fungi/Metazoa group	|		|	synonym	|
+43	|	Opisthokonta Cavalier-Smith 1987	|		|	authority	|
+43	|	Opisthokonta	|		|	scientific name	|
+43	|	opisthokonts	|		|	synonym	|
+44	|	Animalia	|		|	synonym	|
+44	|	animals	|	animals <blast name>	|	blast name	|
+44	|	animals	|	animals <genbank common name>	|	genbank common name	|
+44	|	metazoans	|		|	common name	|
+44	|	Metazoa	|		|	scientific name	|
+44	|	multicellular animals	|		|	common name	|
+45	|	Bilateria	|		|	scientific name	|
+46	|	Protostomia	|		|	scientific name	|
+47	|	Neoptera	|		|	scientific name	|
+48	|	Endopterygota	|		|	scientific name	|
+48	|	Holometabola	|		|	synonym	|
+49	|	deuterostomes	|		|	common name	|
+49	|	Deuterostomia	|		|	scientific name	|
+50	|	Streptophyta Bremer, 1985	|		|	authority	|
+50	|	Streptophyta	|		|	scientific name	|
+51	|	Cyperales	|		|	includes	|
+51	|	Poales	|		|	scientific name	|
+51	|	Typhales	|		|	includes	|
+52	|	Murinae	|		|	scientific name	|
+52	|	Otomyinae	|		|	includes	|
+53	|	Mammalia	|		|	scientific name	|
+53	|	mammals	|	mammals <blast name>	|	blast name	|
+53	|	mammals	|	mammals <genbank common name>	|	genbank common name	|
+54	|	Neopterygii	|		|	scientific name	|
+54	|	Neopterygi	|		|	synonym	|
+55	|	Asilomorpha	|		|	synonym	|
+55	|	Muscomorpha	|		|	scientific name	|
+56	|	Schizophora	|		|	scientific name	|
+57	|	Acalyptratae	|		|	scientific name	|
+58	|	Ephydroidea	|		|	scientific name	|
+59	|	Drosophilinae	|		|	scientific name	|
+60	|	Drosophilini	|		|	scientific name	|
+61	|	Insecta	|		|	scientific name	|
+61	|	insects	|	insects <blast name>	|	blast name	|
+61	|	insects	|	insects <genbank common name>	|	genbank common name	|
+61	|	true insects	|		|	common name	|
+62	|	Tracheophyta	|		|	scientific name	|
+62	|	Tracheophyta Sinnott ex Cavalier-Smith, 1998	|		|	authority	|
+62	|	vascular plants	|	vascular plants <blast name>	|	blast name	|
+62	|	vascular plants	|	vascular plants <common name>	|	common name	|
+63	|	seed plants	|	seed plants <blast name>	|	blast name	|
+63	|	seed plants	|	seed plants <common name>	|	common name	|
+63	|	Spermatophyta	|		|	scientific name	|
+64	|	Euphyllophyta	|		|	scientific name	|
+64	|	euphyllophytes	|		|	equivalent name	|
+65	|	Dicondylia	|		|	scientific name	|
+66	|	Panarthropoda	|		|	scientific name	|
+67	|	Craniata	|	Craniata <chordates>	|	scientific name	|
+68	|	Teleostomi	|		|	scientific name	|
+69	|	bony vertebrates	|		|	genbank common name	|
+69	|	Euteleostomi	|		|	scientific name	|
+70	|	Charophyta/Embryophyta group	|		|	synonym	|
+70	|	charophyte/embryophyte group	|		|	equivalent name	|
+70	|	Streptophytina	|		|	scientific name	|
+71	|	biota	|		|	synonym	|
+71	|	cellular organisms	|		|	scientific name	|
+72	|	Ehrhartoideae Jacq.-Fel. ex Caro, 1982	|		|	authority	|
+72	|	Ehrhartoideae	|		|	synonym	|
+72	|	Oryzoideae Kunth ex Beilschm., 1833	|		|	authority	|
+72	|	Oryzoideae	|		|	scientific name	|
+73	|	Oryzeae Dumort., 1824	|		|	authority	|
+73	|	Oryzeae	|		|	scientific name	|
+74	|	Actinopteri	|		|	scientific name	|
+75	|	Clupeocephala	|		|	scientific name	|
+76	|	Otophysa	|		|	synonym	|
+76	|	Otophysi	|		|	scientific name	|
+77	|	Cypriniphysae	|		|	scientific name	|
+77	|	Cypriniphysi	|		|	synonym	|
+78	|	Ostarioclupeomorpha	|		|	synonym	|
+78	|	Otocephala	|		|	synonym	|
+78	|	Otomorpha	|		|	scientific name	|
+79	|	Pancrustacea	|		|	scientific name	|
+80	|	Mandibulata	|		|	scientific name	|
+80	|	mandibulates	|		|	common name	|
+81	|	Homininae	|		|	scientific name	|
+81	|	Homo/Pan/Gorilla group	|		|	synonym	|
+82	|	Euarchontoglires	|		|	scientific name	|
+83	|	Glires	|		|	scientific name	|
+83	|	Rodents and rabbits	|		|	genbank common name	|
+84	|	Anthropoidea	|		|	synonym	|
+84	|	Simiiformes	|		|	scientific name	|
+85	|	ape	|	ape <primates>	|	common name	|
+85	|	apes	|		|	genbank common name	|
+85	|	Hominoidea	|		|	scientific name	|
+86	|	Muroidea	|		|	scientific name	|
+87	|	BEP clade	|		|	equivalent name	|
+87	|	BOP clade	|		|	scientific name	|
+88	|	Haplorrhini	|		|	scientific name	|
+89	|	Cyclorrhapha	|		|	scientific name	|
+90	|	Eremoneura	|		|	scientific name	|
+91	|	Mus	|	Mus <subgenus>	|	scientific name	|
+92	|	Ecdysozoa	|		|	scientific name	|
+93	|	Dipnotetrapodomorpha	|		|	scientific name	|
+94	|	Boreoeutheria	|		|	scientific name	|
+94	|	Boreotheria	|		|	synonym	|
+95	|	Mesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007	|		|	authority	|
+95	|	Mesangiospermae	|		|	scientific name	|
+96	|	Petrosaviidae	|		|	scientific name	|
+96	|	Petrosaviidae S.W.Graham & W.S.Judd, 2007	|		|	authority	|
+97	|	Osteoglossocephalai	|		|	scientific name	|
+98	|	Oryzinae Griseb., 1853	|		|	authority	|
+98	|	Oryzinae	|		|	scientific name	|
+99	|	mice and others	|		|	genbank common name	|
+99	|	Myomorpha	|		|	scientific name	|
+99	|	Sciurognathi	|	Sciurognathi <Myomorpha>	|	in-part	|
+100	|	Danionidae	|		|	scientific name	|
+101	|	Danioninae	|		|	scientific name	|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy.loc	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,1 @@
+test	testDB	${__HERE__}/ncbi_taxonomy/
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/README.md	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,2 @@
+The `*.dmp` files are automatically created by gen.sh (in the blastdb folder).
+`prot.accession2taxid` has been manually curated.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/names.dmp	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,327 @@
+2	|	Bacteria	|	Bacteria <bacteria>	|	scientific name	|
+2	|	bacteria	|	bacteria <blast name>	|	blast name	|
+2	|	bacteria	|	bacteria <genbank common name>	|	genbank common name	|
+2	|	"Bacteria" Cavalier-Smith 1987	|		|	authority	|
+2	|	Bacteria (ex Cavalier-Smith 1987)	|		|	synonym	|
+2	|	Bacteria Woese et al. 2024	|		|	synonym	|
+2	|	"Bacteriobiota" Luketa 2012	|		|	authority	|
+2	|	Bacteriobiota	|		|	synonym	|
+2	|	eubacteria	|		|	common name	|
+2	|	Monera	|	Monera <bacteria>	|	in-part	|
+2	|	Procaryotae	|	Procaryotae <bacteria>	|	in-part	|
+2	|	Prokaryotae	|	Prokaryotae <bacteria>	|	in-part	|
+2	|	Prokaryota	|	Prokaryota <bacteria>	|	in-part	|
+2	|	prokaryote	|	prokaryote <bacteria>	|	in-part	|
+2	|	prokaryotes	|	prokaryotes <bacteria>	|	in-part	|
+1234	|	Nitrospira	|	Nitrospira <Nitrospira>	|	scientific name	|
+1234	|	Nitrospira Watson et al. 1986	|		|	authority	|
+2759	|	Eucarya	|		|	synonym	|
+2759	|	Eucaryotae	|		|	synonym	|
+2759	|	Eukarya	|		|	synonym	|
+2759	|	Eukaryotae	|		|	synonym	|
+2759	|	Eukaryota	|		|	scientific name	|
+2759	|	eukaryotes	|	eukaryotes <blast name>	|	blast name	|
+2759	|	eukaryotes	|	eukaryotes <genbank common name>	|	genbank common name	|
+3193	|	Embryophyta	|		|	scientific name	|
+3193	|	higher plants	|		|	common name	|
+3193	|	land plants	|	land plants <blast name>	|	blast name	|
+3193	|	land plants	|	land plants <genbank common name>	|	genbank common name	|
+3193	|	plants	|		|	common name	|
+3398	|	Angiospermae	|		|	synonym	|
+3398	|	angiosperms	|		|	common name	|
+3398	|	flowering plants	|	flowering plants <blast name>	|	blast name	|
+3398	|	flowering plants	|	flowering plants <genbank common name>	|	genbank common name	|
+3398	|	Magnoliophyta	|		|	synonym	|
+3398	|	Magnoliopsida	|		|	scientific name	|
+3432	|	Laurales Juss. ex Bercht. & J.Presl, 1820	|		|	authority	|
+3432	|	Laurales	|		|	scientific name	|
+3432	|	Laurineae	|		|	includes	|
+3433	|	Lauraceae Juss., 1789	|		|	authority	|
+3433	|	Lauraceae	|		|	scientific name	|
+3433	|	laurel family	|		|	common name	|
+3434	|	Persea Mill., 1754	|		|	authority	|
+3434	|	Persea	|		|	scientific name	|
+3435	|	avocado	|		|	genbank common name	|
+3435	|	Laurus persea L., 1753	|		|	authority	|
+3435	|	Laurus persea	|		|	synonym	|
+3435	|	Persea americana Mill., 1768	|		|	authority	|
+3435	|	Persea americana	|		|	scientific name	|
+3435	|	Persea americana var. tolimanensis	|		|	synonym	|
+3435	|	Persea americana var. tolimanensis (Zentmyer & Schieber) Scora, 2002	|		|	authority	|
+3435	|	Persea gratissima C.F.Gaertn., 1807	|		|	authority	|
+3435	|	Persea gratissima	|		|	synonym	|
+3435	|	Persea tolimanensis	|		|	synonym	|
+3435	|	Persea tolimanensis Zentmyer & Schieber, 1990	|		|	authority	|
+4447	|	Liliopsida	|		|	scientific name	|
+4447	|	monocots	|	monocots <blast name>	|	blast name	|
+4447	|	monocots	|	monocots <genbank common name>	|	genbank common name	|
+4447	|	Monocotyledoneae	|		|	synonym	|
+4447	|	monocotyledons	|		|	common name	|
+4479	|	Bambusaceae Nakai, 1943	|		|	authority	|
+4479	|	Bambusaceae	|		|	synonym	|
+4479	|	Gramineae	|		|	synonym	|
+4479	|	grass family	|		|	common name	|
+4479	|	Poaceae Barnhart, 1895	|		|	authority	|
+4479	|	Poaceae	|		|	scientific name	|
+4527	|	Oryza L., 1753	|		|	authority	|
+4527	|	Oryza	|		|	scientific name	|
+4527	|	Porteresia	|		|	includes	|
+4530	|	Asian cultivated rice	|		|	genbank common name	|
+4530	|	Oryza sativa L., 1753	|		|	authority	|
+4530	|	Oryza sativa	|		|	scientific name	|
+4530	|	red rice	|	red rice <Oryza sativa>	|	common name	|
+4530	|	rice	|		|	common name	|
+4734	|	Commelinidae	|		|	synonym	|
+4734	|	commelinids	|		|	scientific name	|
+4734	|	Commeliniflorae	|		|	synonym	|
+6072	|	Eumetazoa	|		|	scientific name	|
+6656	|	Arthropoda	|		|	scientific name	|
+6656	|	arthropods	|	arthropods <blast name>	|	blast name	|
+6656	|	arthropods	|	arthropods <genbank common name>	|	genbank common name	|
+6960	|	Atelocerata	|	Atelocerata <hexapods>	|	in-part	|
+6960	|	Hexapoda	|		|	scientific name	|
+6960	|	hexapods	|	hexapods <blast name>	|	blast name	|
+6960	|	hexapods	|	hexapods <genbank common name>	|	genbank common name	|
+6960	|	Tracheata	|	Tracheata <hexapods>	|	in-part	|
+6960	|	Uniramia	|	Uniramia <hexapods>	|	in-part	|
+7147	|	Diptera	|		|	scientific name	|
+7147	|	flies	|	flies <blast name>	|	blast name	|
+7147	|	flies	|	flies <genbank common name>	|	genbank common name	|
+7203	|	Brachycera	|		|	scientific name	|
+7214	|	Drosophilidae	|		|	scientific name	|
+7214	|	pomace flies	|		|	genbank common name	|
+7215	|	Drosophila	|	Drosophila <flies,genus>	|	scientific name	|
+7215	|	Drosophila Fallen, 1823	|		|	authority	|
+7215	|	fruit flies	|	fruit flies <Drosophila>	|	genbank common name	|
+7215	|	fruit fly	|	fruit fly <Drosophila>	|	common name	|
+7496	|	Pterygota	|	Pterygota <insects>	|	scientific name	|
+7496	|	winged insects	|		|	genbank common name	|
+7711	|	Chordata	|		|	scientific name	|
+7711	|	chordates	|	chordates <blast name>	|	blast name	|
+7711	|	chordates	|	chordates <genbank common name>	|	genbank common name	|
+7742	|	Vertebrata Cuvier, 1812	|		|	authority	|
+7742	|	Vertebrata	|	Vertebrata <vertebrates>	|	scientific name	|
+7742	|	vertebrates	|	vertebrates <blast name>	|	blast name	|
+7742	|	vertebrates	|	vertebrates <genbank common name>	|	genbank common name	|
+7776	|	Gnathostomata	|	Gnathostomata <vertebrates>	|	scientific name	|
+7776	|	jawed vertebrates	|		|	genbank common name	|
+7898	|	Actinopterygii	|		|	scientific name	|
+7898	|	Actinopterygi	|		|	synonym	|
+7898	|	fishes	|	fishes <ray-finned fishes>	|	common name	|
+7898	|	fish	|	fish <ray-finned fishes>	|	common name	|
+7898	|	Osteichthyes	|	Osteichthyes <ray-finned fishes>	|	in-part	|
+7898	|	ray-finned fishes	|	ray-finned fishes <blast name>	|	blast name	|
+7898	|	ray-finned fishes	|	ray-finned fishes <genbank common name>	|	genbank common name	|
+7952	|	carps and others	|		|	genbank common name	|
+7952	|	Cypriniformes	|		|	scientific name	|
+7954	|	Brachydanio	|		|	synonym	|
+7954	|	Celestichthys	|		|	synonym	|
+7954	|	Danio	|		|	scientific name	|
+7955	|	Brachydanio rerio frankei	|		|	synonym	|
+7955	|	Brachydanio rerio	|		|	synonym	|
+7955	|	Cyprinus rerio Hamilton, 1822	|		|	authority	|
+7955	|	Cyprinus rerio	|		|	synonym	|
+7955	|	Danio frankei	|		|	synonym	|
+7955	|	Danio rerio frankei	|		|	synonym	|
+7955	|	Danio rerio (Hamilton, 1822)	|		|	authority	|
+7955	|	Danio rerio	|		|	scientific name	|
+7955	|	leopard danio	|		|	common name	|
+7955	|	zebra danio	|		|	common name	|
+7955	|	zebrafish	|		|	genbank common name	|
+7955	|	zebra fish	|	zebra fish <Danio rerio>	|	common name	|
+8287	|	Sarcopterygii	|		|	scientific name	|
+9347	|	eutherian mammals	|		|	common name	|
+9347	|	Eutheria	|		|	scientific name	|
+9347	|	Placentalia	|		|	synonym	|
+9347	|	placental mammals	|		|	common name	|
+9347	|	placentals	|	placentals <blast name>	|	blast name	|
+9347	|	placentals	|	placentals <genbank common name>	|	genbank common name	|
+9443	|	Primata	|		|	synonym	|
+9443	|	primate	|		|	equivalent name	|
+9443	|	Primates Linnaeus, 1758	|		|	authority	|
+9443	|	primates	|	primates <blast name>	|	blast name	|
+9443	|	primates	|	primates <genbank common name>	|	genbank common name	|
+9443	|	Primates	|		|	scientific name	|
+9526	|	Catarrhini	|		|	scientific name	|
+9604	|	great apes	|		|	genbank common name	|
+9604	|	Hominidae Gray, 1825	|		|	authority	|
+9604	|	Hominidae	|		|	scientific name	|
+9604	|	Pongidae	|		|	synonym	|
+9605	|	Homo Linnaeus, 1758	|		|	authority	|
+9605	|	Homo	|		|	scientific name	|
+9605	|	humans	|		|	common name	|
+9606	|	Homo sapiens Linnaeus, 1758	|		|	authority	|
+9606	|	Homo sapiens	|		|	scientific name	|
+9606	|	human	|		|	genbank common name	|
+9989	|	rodent	|		|	common name	|
+9989	|	Rodentia	|		|	scientific name	|
+9989	|	rodents	|	rodents <blast name>	|	blast name	|
+9989	|	rodents	|	rodents <genbank common name>	|	genbank common name	|
+10066	|	Muridae	|		|	scientific name	|
+10088	|	mice	|	mice <Mus>	|	genbank common name	|
+10088	|	mouse	|	mouse <Mus>	|	common name	|
+10088	|	Mus	|	Mus <genus>	|	scientific name	|
+10090	|	Balb/c mouse	|		|	includes	|
+10090	|	house mouse	|		|	genbank common name	|
+10090	|	LK3 transgenic mice	|		|	includes	|
+10090	|	mouse	|	mouse <Mus musculus>	|	common name	|
+10090	|	Mus musculus Linnaeus, 1758	|		|	authority	|
+10090	|	Mus musculus	|		|	scientific name	|
+10090	|	Mus sp. 129SV	|		|	includes	|
+10090	|	nude mice	|		|	includes	|
+10090	|	transgenic mice	|		|	includes	|
+30727	|	Cyprinoidea	|		|	synonym	|
+30727	|	Cyprinoidei	|		|	scientific name	|
+32443	|	Teleostei	|		|	scientific name	|
+32443	|	teleost fishes	|		|	genbank common name	|
+32519	|	Ostariophysi	|		|	scientific name	|
+32523	|	Tetrapoda	|		|	scientific name	|
+32523	|	tetrapods	|		|	genbank common name	|
+32524	|	Amniota	|		|	scientific name	|
+32524	|	amniotes	|		|	genbank common name	|
+32525	|	Theria Parker & Haswell, 1897	|		|	authority	|
+32525	|	Theria	|	Theria <mammals>	|	scientific name	|
+33090	|	Chlorobionta Jeffrey, 1982	|		|	authority	|
+33090	|	Chlorobionta	|		|	synonym	|
+33090	|	Chlorophyta/Embryophyta group	|		|	equivalent name	|
+33090	|	chlorophyte/embryophyte group	|		|	equivalent name	|
+33090	|	Chloroplastida Adl et al. 2005	|		|	authority	|
+33090	|	Chloroplastida	|		|	synonym	|
+33090	|	green plants	|	green plants <blast name>	|	blast name	|
+33090	|	green plants	|	green plants <common name>	|	common name	|
+33090	|	Viridiplantae Cavalier-Smith, 1981	|		|	authority	|
+33090	|	Viridiplantae	|		|	scientific name	|
+33154	|	Fungi/Metazoa group	|		|	synonym	|
+33154	|	Opisthokonta Cavalier-Smith 1987	|		|	authority	|
+33154	|	Opisthokonta	|		|	scientific name	|
+33154	|	opisthokonts	|		|	synonym	|
+33208	|	Animalia	|		|	synonym	|
+33208	|	animals	|	animals <blast name>	|	blast name	|
+33208	|	animals	|	animals <genbank common name>	|	genbank common name	|
+33208	|	metazoans	|		|	common name	|
+33208	|	Metazoa	|		|	scientific name	|
+33208	|	multicellular animals	|		|	common name	|
+33213	|	Bilateria	|		|	scientific name	|
+33317	|	Protostomia	|		|	scientific name	|
+33340	|	Neoptera	|		|	scientific name	|
+33392	|	Endopterygota	|		|	scientific name	|
+33392	|	Holometabola	|		|	synonym	|
+33511	|	deuterostomes	|		|	common name	|
+33511	|	Deuterostomia	|		|	scientific name	|
+35493	|	Streptophyta Bremer, 1985	|		|	authority	|
+35493	|	Streptophyta	|		|	scientific name	|
+38820	|	Cyperales	|		|	includes	|
+38820	|	Poales	|		|	scientific name	|
+38820	|	Typhales	|		|	includes	|
+39107	|	Murinae	|		|	scientific name	|
+39107	|	Otomyinae	|		|	includes	|
+40117	|	"Nitrospirae" Garrity and Holt 2001	|		|	authority	|
+40117	|	"Nitrospiraeota" Oren et al. 2015	|		|	authority	|
+40117	|	Nitrospiraeota	|		|	synonym	|
+40117	|	Nitrospirae	|		|	synonym	|
+40117	|	Nitrospira group	|		|	synonym	|
+40117	|	Nitrospirota corrig. Garrity and Holt 2021	|		|	synonym	|
+40117	|	Nitrospirota	|		|	scientific name	|
+40117	|	Thermodesulfovibrio group	|		|	synonym	|
+40674	|	Mammalia	|		|	scientific name	|
+40674	|	mammals	|	mammals <blast name>	|	blast name	|
+40674	|	mammals	|	mammals <genbank common name>	|	genbank common name	|
+41665	|	Neopterygii	|		|	scientific name	|
+41665	|	Neopterygi	|		|	synonym	|
+43733	|	Asilomorpha	|		|	synonym	|
+43733	|	Muscomorpha	|		|	scientific name	|
+43738	|	Schizophora	|		|	scientific name	|
+43741	|	Acalyptratae	|		|	scientific name	|
+43746	|	Ephydroidea	|		|	scientific name	|
+43845	|	Drosophilinae	|		|	scientific name	|
+46877	|	Drosophilini	|		|	scientific name	|
+50557	|	Insecta	|		|	scientific name	|
+50557	|	insects	|	insects <blast name>	|	blast name	|
+50557	|	insects	|	insects <genbank common name>	|	genbank common name	|
+50557	|	true insects	|		|	common name	|
+58023	|	Tracheophyta	|		|	scientific name	|
+58023	|	Tracheophyta Sinnott ex Cavalier-Smith, 1998	|		|	authority	|
+58023	|	vascular plants	|	vascular plants <blast name>	|	blast name	|
+58023	|	vascular plants	|	vascular plants <common name>	|	common name	|
+58024	|	seed plants	|	seed plants <blast name>	|	blast name	|
+58024	|	seed plants	|	seed plants <common name>	|	common name	|
+58024	|	Spermatophyta	|		|	scientific name	|
+78536	|	Euphyllophyta	|		|	scientific name	|
+78536	|	euphyllophytes	|		|	equivalent name	|
+85512	|	Dicondylia	|		|	scientific name	|
+88770	|	Panarthropoda	|		|	scientific name	|
+89593	|	Craniata	|	Craniata <chordates>	|	scientific name	|
+117570	|	Teleostomi	|		|	scientific name	|
+117571	|	bony vertebrates	|		|	genbank common name	|
+117571	|	Euteleostomi	|		|	scientific name	|
+131221	|	Charophyta/Embryophyta group	|		|	synonym	|
+131221	|	charophyte/embryophyte group	|		|	equivalent name	|
+131221	|	Streptophytina	|		|	scientific name	|
+147367	|	Ehrhartoideae Jacq.-Fel. ex Caro, 1982	|		|	authority	|
+147367	|	Ehrhartoideae	|		|	synonym	|
+147367	|	Oryzoideae Kunth ex Beilschm., 1833	|		|	authority	|
+147367	|	Oryzoideae	|		|	scientific name	|
+147380	|	Oryzeae Dumort., 1824	|		|	authority	|
+147380	|	Oryzeae	|		|	scientific name	|
+186623	|	Actinopteri	|		|	scientific name	|
+186625	|	Clupeocephala	|		|	scientific name	|
+186626	|	Otophysa	|		|	synonym	|
+186626	|	Otophysi	|		|	scientific name	|
+186627	|	Cypriniphysae	|		|	scientific name	|
+186627	|	Cypriniphysi	|		|	synonym	|
+186634	|	Ostarioclupeomorpha	|		|	synonym	|
+186634	|	Otocephala	|		|	synonym	|
+186634	|	Otomorpha	|		|	scientific name	|
+189778	|	"Nitrospirales" Garrity and Holt 2001	|		|	authority	|
+189778	|	Nitrospirales Garrity and Holt 2022	|		|	authority	|
+189778	|	Nitrospirales	|		|	scientific name	|
+189779	|	"Nitrospiraceae" Garrity and Holt 2001	|		|	authority	|
+189779	|	Nitrospiraceae Garrity and Holt 2022	|		|	authority	|
+189779	|	Nitrospiraceae	|		|	scientific name	|
+197562	|	Pancrustacea	|		|	scientific name	|
+197563	|	Mandibulata	|		|	scientific name	|
+197563	|	mandibulates	|		|	common name	|
+203693	|	"Nitrospira" Garrity and Holt 2001	|		|	authority	|
+203693	|	Nitrospira	|	Nitrospira <Nitrospiria>	|	synonym	|
+203693	|	"Nitrospiria" Cavalier-Smith 2020	|		|	authority	|
+203693	|	Nitrospiria Garrity and Holt 2022	|		|	authority	|
+203693	|	"Nitrospiria" Oren et al. 2015	|		|	authority	|
+203693	|	Nitrospiria	|		|	scientific name	|
+207598	|	Homininae	|		|	scientific name	|
+207598	|	Homo/Pan/Gorilla group	|		|	synonym	|
+232347	|	Magnoliidae Novak ex Takht., 1967	|		|	authority	|
+232347	|	Magnoliidae	|		|	scientific name	|
+232347	|	magnoliids	|		|	equivalent name	|
+314146	|	Euarchontoglires	|		|	scientific name	|
+314147	|	Glires	|		|	scientific name	|
+314147	|	Rodents and rabbits	|		|	genbank common name	|
+314293	|	Anthropoidea	|		|	synonym	|
+314293	|	Simiiformes	|		|	scientific name	|
+314295	|	ape	|	ape <primates>	|	common name	|
+314295	|	apes	|		|	genbank common name	|
+314295	|	Hominoidea	|		|	scientific name	|
+337687	|	Muroidea	|		|	scientific name	|
+359160	|	BEP clade	|		|	equivalent name	|
+359160	|	BOP clade	|		|	scientific name	|
+376913	|	Haplorrhini	|		|	scientific name	|
+480117	|	Cyclorrhapha	|		|	scientific name	|
+480118	|	Eremoneura	|		|	scientific name	|
+862507	|	Mus	|	Mus <subgenus>	|	scientific name	|
+1206794	|	Ecdysozoa	|		|	scientific name	|
+1338369	|	Dipnotetrapodomorpha	|		|	scientific name	|
+1437010	|	Boreoeutheria	|		|	scientific name	|
+1437010	|	Boreotheria	|		|	synonym	|
+1437183	|	Mesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007	|		|	authority	|
+1437183	|	Mesangiospermae	|		|	scientific name	|
+1437197	|	Petrosaviidae	|		|	scientific name	|
+1437197	|	Petrosaviidae S.W.Graham & W.S.Judd, 2007	|		|	authority	|
+1489341	|	Osteoglossocephalai	|		|	scientific name	|
+1648021	|	Oryzinae Griseb., 1853	|		|	authority	|
+1648021	|	Oryzinae	|		|	scientific name	|
+1963758	|	mice and others	|		|	genbank common name	|
+1963758	|	Myomorpha	|		|	scientific name	|
+1963758	|	Sciurognathi	|	Sciurognathi <Myomorpha>	|	in-part	|
+2743709	|	Danionidae	|		|	scientific name	|
+2743711	|	Danioninae	|		|	scientific name	|
+3379134	|	Pseudomonadati (Gibbons and Murray 1978) Oren and Goker 2024	|		|	authority	|
+3379134	|	Pseudomonadati	|		|	scientific name	|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/nodes.dmp	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,111 @@
+2	|	131567	|	domain	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
+1234	|	189779	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|	code compliant	|
+2759	|	131567	|	domain	|		|	1	|	0	|	1	|	0	|	1	|	0	|	0	|	0	|		|
+3193	|	131221	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	0	|	0	|	0	|		|
+3398	|	58024	|	class	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+3432	|	232347	|	order	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+3433	|	3432	|	family	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+3434	|	3433	|	genus	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+3435	|	3434	|	species	|	PA	|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|	code compliant; specified	|
+4447	|	1437183	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+4479	|	38820	|	family	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+4527	|	1648021	|	genus	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+4530	|	4527	|	species	|	OS	|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant; specified	|
+4734	|	1437197	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+6072	|	33208	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	0	|	1	|	0	|		|
+6656	|	88770	|	phylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+6960	|	197562	|	subphylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7147	|	33392	|	order	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7203	|	7147	|	suborder	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7214	|	43746	|	family	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7215	|	46877	|	genus	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7496	|	85512	|	subclass	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7711	|	33511	|	phylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+7742	|	89593	|	clade	|		|	10	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+7776	|	7742	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+7898	|	117571	|	superclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+7952	|	186627	|	order	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+7954	|	2743711	|	genus	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+7955	|	7954	|	species	|	DR	|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+8287	|	117571	|	superclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+9347	|	32525	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+9443	|	314146	|	order	|		|	5	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+9526	|	314293	|	parvorder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+9604	|	314295	|	family	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+9605	|	207598	|	genus	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+9606	|	9605	|	species	|	HS	|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+9989	|	314147	|	order	|		|	6	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+10066	|	337687	|	family	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+10088	|	39107	|	genus	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+10090	|	862507	|	species	|	MM	|	6	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+30727	|	7952	|	suborder	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+32443	|	41665	|	infraclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+32519	|	186634	|	subcohort	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+32523	|	1338369	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+32524	|	32523	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+32525	|	40674	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+33090	|	2759	|	kingdom	|		|	4	|	0	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+33154	|	2759	|	clade	|		|	4	|	0	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+33208	|	33154	|	kingdom	|		|	1	|	0	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+33213	|	6072	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+33317	|	33213	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+33340	|	7496	|	infraclass	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+33392	|	33340	|	cohort	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+33511	|	33213	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+35493	|	33090	|	phylum	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+38820	|	4734	|	order	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+39107	|	10066	|	subfamily	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+40117	|	3379134	|	phylum	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+40674	|	32524	|	class	|		|	2	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+41665	|	186623	|	subclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+43733	|	7203	|	infraorder	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+43738	|	480117	|	no rank	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+43741	|	43738	|	no rank	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+43746	|	43741	|	superfamily	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+43845	|	7214	|	subfamily	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|	code compliant	|
+46877	|	43845	|	tribe	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|	code compliant	|
+50557	|	6960	|	class	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+58023	|	3193	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+58024	|	78536	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+78536	|	58023	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+85512	|	50557	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+88770	|	1206794	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+89593	|	7711	|	subphylum	|		|	10	|	0	|	1	|	1	|	2	|	0	|	0	|	0	|	code compliant	|
+117570	|	7776	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+117571	|	117570	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+131221	|	35493	|	subphylum	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|	code compliant	|
+147367	|	359160	|	subfamily	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+147380	|	147367	|	tribe	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+186623	|	7898	|	class	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+186625	|	1489341	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+186626	|	32519	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+186627	|	186626	|	superorder	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+186634	|	186625	|	cohort	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+189778	|	203693	|	order	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|	code compliant	|
+189779	|	189778	|	family	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|	code compliant	|
+197562	|	197563	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+197563	|	6656	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+203693	|	40117	|	class	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|	code compliant	|
+207598	|	9604	|	subfamily	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+232347	|	1437183	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+314146	|	1437010	|	superorder	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+314147	|	314146	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+314293	|	376913	|	infraorder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+314295	|	9526	|	superfamily	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+337687	|	1963758	|	clade	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+359160	|	4479	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+376913	|	9443	|	suborder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+480117	|	480118	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+480118	|	43733	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+862507	|	10088	|	subgenus	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+1206794	|	33317	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|		|
+1338369	|	8287	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+1437010	|	9347	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+1437183	|	3398	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+1437197	|	4447	|	subclass	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|	code compliant	|
+1489341	|	32443	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+1648021	|	147380	|	subtribe	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+1963758	|	9989	|	suborder	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+2743709	|	30727	|	family	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+2743711	|	2743709	|	subfamily	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+3379134	|	2	|	kingdom	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/prot.accession2taxid	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,6 @@
+accession	accession.version	taxid	gi
+YP_514675	YP_514675.1	12	3950761
+YP_009047267	YP_009047267.1	20	19893533
+NP_059333	NP_059333.1	28	140539
+YP_003024028	YP_003024028.1	35	4512
+NP_904330	NP_904330.1	39	17708
--- a/test-data/nodes.dmp	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/nodes.dmp	Fri Dec 12 11:13:59 2025 +0000
@@ -1,3 +1,101 @@
 1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
-2	|	1	|	species	|	AC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
-3	|	1	|	species	|	AC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+2	|	71	|	domain	|		|	1	|	0	|	1	|	0	|	1	|	0	|	0	|	0	|		|
+3	|	70	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	0	|	0	|	0	|		|
+4	|	63	|	class	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+5	|	95	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+6	|	51	|	family	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+7	|	98	|	genus	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+8	|	7	|	species	|	OS	|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant; specified	|
+9	|	96	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+10	|	44	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	0	|	1	|	0	|		|
+11	|	66	|	phylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+12	|	79	|	subphylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+13	|	48	|	order	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+14	|	13	|	suborder	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+15	|	58	|	family	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+16	|	60	|	genus	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+17	|	65	|	subclass	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+18	|	49	|	phylum	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+19	|	67	|	clade	|		|	10	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+20	|	19	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+21	|	69	|	superclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+22	|	77	|	order	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+23	|	101	|	genus	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+24	|	23	|	species	|	DR	|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+25	|	69	|	superclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+26	|	41	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+27	|	82	|	order	|		|	5	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+28	|	84	|	parvorder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+29	|	85	|	family	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+30	|	81	|	genus	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+31	|	30	|	species	|	HS	|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+32	|	83	|	order	|		|	6	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+33	|	86	|	family	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+34	|	52	|	genus	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+35	|	91	|	species	|	MM	|	6	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant; specified	|
+36	|	22	|	suborder	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+37	|	54	|	infraclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+38	|	78	|	subcohort	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+39	|	93	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+40	|	39	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+41	|	53	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+42	|	2	|	kingdom	|		|	4	|	0	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+43	|	2	|	clade	|		|	4	|	0	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+44	|	43	|	kingdom	|		|	1	|	0	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+45	|	10	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+46	|	45	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+47	|	17	|	infraclass	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+48	|	47	|	cohort	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+49	|	45	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+50	|	42	|	phylum	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+51	|	9	|	order	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+52	|	33	|	subfamily	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+53	|	40	|	class	|		|	2	|	0	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+54	|	74	|	subclass	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+55	|	14	|	infraorder	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+56	|	89	|	no rank	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+57	|	56	|	no rank	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+58	|	57	|	superfamily	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+59	|	15	|	subfamily	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|	code compliant	|
+60	|	59	|	tribe	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|	code compliant	|
+61	|	12	|	class	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|	code compliant	|
+62	|	3	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+63	|	64	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+64	|	62	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+65	|	61	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+66	|	92	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+67	|	18	|	subphylum	|		|	10	|	0	|	1	|	1	|	2	|	0	|	0	|	0	|	code compliant	|
+68	|	20	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+69	|	68	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+70	|	50	|	subphylum	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|	code compliant	|
+71	|	1	|	cellular root	|	CO	|	8	|	1	|	1	|	1	|	0	|	1	|	1	|	0	|		|
+72	|	87	|	subfamily	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+73	|	72	|	tribe	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+74	|	21	|	class	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+75	|	97	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+76	|	38	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+77	|	76	|	superorder	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+78	|	75	|	cohort	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+79	|	80	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+80	|	11	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+81	|	29	|	subfamily	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+82	|	94	|	superorder	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+83	|	82	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+84	|	88	|	infraorder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+85	|	28	|	superfamily	|		|	5	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|	code compliant	|
+86	|	99	|	clade	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|		|
+87	|	6	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|		|
+88	|	27	|	suborder	|		|	5	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+89	|	90	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+90	|	55	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	1	|	0	|		|
+91	|	34	|	subgenus	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+92	|	46	|	clade	|		|	1	|	1	|	1	|	1	|	5	|	1	|	0	|	0	|		|
+93	|	25	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+94	|	26	|	clade	|		|	2	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+95	|	4	|	clade	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|		|
+96	|	5	|	subclass	|		|	4	|	1	|	1	|	1	|	1	|	1	|	1	|	0	|	code compliant	|
+97	|	37	|	clade	|		|	10	|	1	|	1	|	1	|	2	|	1	|	1	|	0	|		|
+98	|	73	|	subtribe	|		|	4	|	1	|	1	|	1	|	1	|	1	|	0	|	0	|	code compliant	|
+99	|	32	|	suborder	|		|	6	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+100	|	36	|	family	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
+101	|	100	|	subfamily	|		|	10	|	1	|	1	|	1	|	2	|	1	|	0	|	0	|	code compliant	|
--- a/test-data/nucleotide.fasta	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/nucleotide.fasta	Fri Dec 12 11:13:59 2025 +0000
@@ -1,17 +1,23 @@
->sequence more text
-CTGTGCCTGTACACCCACATCGGCAGAAACATCTACTACGGCAGCTACCTGTACAGCGAG
-ACCTGGAACACCGGCATCATGCTGCTGCTGATCACCATGGCCACCGCCTTCATGGGCTAC
-GTGCTGCCCTGGGGCCAGATGAGCTTCTGGGGCGCCACCGTGATCACCAACCTGTTCAGC
-GCCATCCCCTACATCGGCACCAACCTGGTGGAGTGGATCTGGGGCGGCTTCAGCGTGGAC
-AAGGCCACCCTGAACAGATTCTTCGCCTTCCACTTCATCCTGTTCACCATGGTGGCCCTG
-GCCGGCGTGCACCTGACCTTCCTGCACGAGACCGGCAGCAACAACCCCCTGGGCCTGACC
-AGCGACAGCGACAAGATCCCCTTCCACCCCTACTACACCATCAAGGACTTCCTGGGCCTG
-CTGATCCTGATCCTGCTGCTGCTGCTGCTGGCCCTGCTGAGCCCCGACATGCTGGGCGAC
-CCCGACAACCACATGCCCGCCGACCCCCTGAACACCCCCCTGCACATCAAGCCCGAGTGG
-TACTTCCTGTTCGCCTACGCCATCCTGAGAAGCGTGCCCAACAAGCTGGGCGGCGTGCTG
-GCCCTGTTCCTGAGCATCGTGATCCTGGGCCTGATGCCCTTCCTGCACACCAGCAAGCAC
-AGAAGCATGATGCTGAGACCCCTGAGCCAGGCCCTGTTCTGGACCCTGACCATGGACCTG
-CTGACCCTGACCTGGATCGGCAGCCAGCCCGTGGAGTACCCCTACACCATCATCGGCCAG
-ATGGCCAGCATCCTGTACTTCAGCATCATCCTGGCCTTCCTGCCCATCGCCGGCNNNATC
-GAGAACTAC
-
+>NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome
+ATGTTCGCCGACCGCTGGCTATTCTCCACGAACCACAAAGATATTGGAACGCTATACCTGTTGTTCGGCG
+CATGAGCTGGTGTCCTAGGCACTGCCCTAAGCCTCCTCATTCGTGCTGAACTAGGCCAACCCGGCAACCT
+CCTAGGTAATGACCATATTTACAATGTCATCGTCACAGCCCATGCATTCGTAATAATTTTTTTCATGGTC
+ATGCCCATAATAATTGGAGGCTTTGGCAACTGACTAGTGCCCCTGATAATTGGCGCCCCTGATATGGCAT
+TCCCGCGCATAAATAACATAAGCTTCTGACTCCTCCTCCCCTCCTTCCTCCTATTACTCGCTTCTGCTAC
+AGTAGAGGCCGGAGCAGGAACGGGCTGAACAGTCTATCCACCCCTAGCAGGAAACTACTCTCACCCAGGA
+GCCTCTGTAGACTTGACAATCTTCTCTCTACACCTAGCAGGCATTTCCTCAATTCTAGGGGCTATCAATT
+TCATTACAACAATTATTAATATAAAACCCCCTGCAATATCCCAATATCAAACTCCCCTCTTCGTCTGATC
+AATCCTGATCACAGCAGTCCTACTTCTCCTCTCCCTCCCAGTCCTAGCCGCTGGCATCACCATACTACTA
+ACAGACCGCAACTTAAATACTACATTCTTTGACCCGGCTGGAGGTGGGGATCCTATCCTATACCAACACT
+TATTCTGATTTTTCGGCCACCCTGAAGTCTACATTCTCATCCTACCAGGTTTCGGCATAATCTCCCACAT
+CGTAACACACTACTCCGGAAAAAAAGAACCATTTGGGTATATAGGCATAGTCTGAGCCATAGTCTCAATT
+GGTTTCCTGGGTTTTATCGTATGAGCCCACCACATATTCACAGTAGGGATAGACGTGGACACACGAGCCT
+ACTTCACCTCCGCTACCATAATTATTGCCATCCCCACCGGCGTCAAAGTATTTAGCTGACTCGCTACACT
+CCACGGAAGCAACACTAAATGATCTGCCGCAATCCTCTGAGCCTTAGGATTCATTTTCCTCTTCACCGTA
+GGCGGCTTAACAGGCATCGTACTGGCAAACTCATCACTAGACATCGTATTACACGATACATACTACGTTG
+TAGCCCACTTTCACTACGTCTTATCAATAGGAGCTGTATTCGCCATCATAGGAGGCTTCATCCACTGGTT
+CCCACTATTCTCAGGCTACACCTTAAACCAGACCTATGCTAAAATTCACTTCATCACCATATTTGTCGGC
+GTAAATTTAACCTTCTTCCCGCAACATTTCCTTGGCCTATCAGGTATACCCCGACGCTACTCCGATTACC
+CCGACGCATATACCACATGAAATATTTTATCATCCGCAGGCTCATTTATCTCCCTAACAGCAGTTATACT
+AATAATTTTCATAATTTGAGAAGCCTTTGCCTCAAAACGAAAAGTCCCAATAATTGAACAACCTTCCACA
+AGCCTAGAGTGGTTATACGGATGCCCCCCACCCTACCATACGTTTGAAGAACCCGTCTATATAAAACCCG
--- a/test-data/prot.accession2taxid	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/prot.accession2taxid	Fri Dec 12 11:13:59 2025 +0000
@@ -1,4 +1,6 @@
 accession	accession.version	taxid	gi
-AAD44166	AAD44166.1	2	5524211
-AAD44167	AAD44167.1	3	5524212
-
+YP_514675	YP_514675.1	8	3950761
+YP_009047267	YP_009047267.1	16	19893533
+NP_059333	NP_059333.1	24	140539
+YP_003024028	YP_003024028.1	31	4512
+NP_904330	NP_904330.1	35	17708
--- a/test-data/protein.fasta	Mon Nov 10 15:12:32 2025 +0000
+++ b/test-data/protein.fasta	Fri Dec 12 11:13:59 2025 +0000
@@ -1,9 +1,12 @@
->sequence more text
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKATLNRFFAFHFILFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
-IENY
+>NP_008227.1 cytochrome c oxidase subunit I (mitochondrion) [Pongo pygmaeus]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLLPSFLLLLASATVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGISSILGAINFITTIINMKPPAMSQYQTPLFVWSILITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTHYSGKKEPFGYMGMVWAMVSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNTKWSAAILWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLNQTYAKIHFITMFVG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSAGSFISLTAVMLMIFMIWEAFASKRKVPMIEQPST
+SLEWLYGCPPPYHTFEEPVYMKP
 >shuffled sequence that should go to unaligned
 XLPLILMLLGISPGSFEHTVAGGIWTSLMLFLPGYPGVGFLMLLVITVPALNFKFGFMLL
 LKPTTNIIKTLVLALTHADDPLSFPWLNYMPPAADFNGLFTNAGATTTLYQIPYEGSFYL
Binary file test-data/protein.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxon.tsv	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,2 @@
+32523
+7898
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb_p.loc	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,57 @@
+# This is a sample file distributed with Galaxy that is used to define a
+# list of protein domain databases, using three columns tab separated
+# (longer runs of whitespace are TAB characters):
+#
+# <unique_id>{tab}<database_caption>{tab}<base_name_path>
+#
+# The captions typically contain spaces and might end with the build date.
+# It is important that the actual database name does not have a space in
+# it, and that there are only two tabs on each line.
+#
+# You can download the NCBI provided databases as tar-balls from here:
+# ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/
+#
+# For simplicity, many Galaxy servers are configured to offer just a live
+# version of each NCBI BLAST database (updated with the NCBI provided
+# Perl scripts or similar). In this case, we recommend using the case
+# sensitive base-name of the NCBI BLAST databases as the unique id.
+# Consistent naming is important for sharing workflows between Galaxy
+# servers.
+#
+# For example, consider the NCBI Conserved Domains Database (CDD), where
+# you have downloaded and decompressed the files under the directory
+# /data/blastdb/domains/ meaning at the command line BLAST+ would be
+# run as follows and would look at the files /data/blastdb/domains/Cdd.*:
+#
+# $ rpsblast -db /data/blastdb/domains/Cdd -query ...
+#
+# In this case use Cdd (title case to match the NCBI file naming) as the
+# unique id in the first column of blastdb_d.loc, giving an entry like
+# this:
+#
+# Cdd{tab}NCBI Conserved Domains Database (CDD){tab}/data/blastdb/domains/Cdd
+#
+# Your blastdb_d.loc file should include an entry per line for each "base name"
+# you have stored. For example:
+#
+# Cdd{tab}NCBI CDD{tab}/data/blastdb/domains/Cdd
+# Kog{tab}KOG (eukaryotes){tab}/data/blastdb/domains/Kog
+# Cog{tab}COG (prokaryotes){tab}/data/blastdb/domains/Cog
+# Pfam{tab}Pfam-A{tab}/data/blastdb/domains/Pfam
+# Smart{tab}SMART{tab}/data/blastdb/domains/Smart
+# Tigr{tab}TIGR{tab}/data/blastdb/domains/Tigr
+# Prk{tab}Protein Clusters database{tab}/data/blastdb/domains/Prk
+# ...etc...
+#
+# Alternatively, rather than a "live" mirror of the NCBI databases which
+# are updated automatically, for full reproducibility the Galaxy Team
+# recommend saving date-stamped copies of the databases. In this case
+# your blastdb_d.loc file should include an entry per line for each
+# version you have stored. For example:
+#
+# Cdd_05Jun2010{tab}NCBI CDD 05 Jun 2010{tab}/data/blastdb/domains/05Jun2010/Cdd
+# Cdd_15Aug2010{tab}NCBI CDD 15 Aug 2010{tab}/data/blastdb/domains/15Aug2010/Cdd
+# ...etc...
+#
+# See also blastdb.loc which is for any nucleotide BLAST database, and
+# blastdb_p.loc which is for any protein BLAST databases.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_taxonomy.loc.sample	Fri Dec 12 11:13:59 2025 +0000
@@ -0,0 +1,5 @@
+# Tab separated fields where
+# value is unique key
+# name is descriptive name
+# path is path to directory containing names.dmp and nodes.dmp files
+#value	name	path
--- a/tool_data_table_conf.xml.sample	Mon Nov 10 15:12:32 2025 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Dec 12 11:13:59 2025 +0000
@@ -1,8 +1,19 @@
 <!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
 <tables>
-    <!-- Locations of indexes in the Bowtie mapper format -->
+
+    <table name="blastdb_p" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb_p.loc" />
+    </table>
+
     <table name="diamond_database" comment_char="#">
         <columns>value, name, db_path</columns>
         <file path="tool-data/diamond_database.loc" />
     </table>
+
+    <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ncbi_taxonomy.loc" />
+    </table>
 </tables>
--- a/tool_data_table_conf.xml.test	Mon Nov 10 15:12:32 2025 +0000
+++ b/tool_data_table_conf.xml.test	Fri Dec 12 11:13:59 2025 +0000
@@ -1,7 +1,18 @@
 <tables>
-    <!-- Locations of all fasta files required to build Diamond databases -->
+    <table name="blastdb_p" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/blastdb_p.loc" />
+    </table>
+
     <table name="diamond_database" comment_char="#">
         <columns>value, name, db_path</columns>
         <file path="${__HERE__}/test-data/diamond_database.loc" />
     </table>
+
+    <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/ncbi_taxonomy.loc" />
+    </table>
+    
 </tables>