Mercurial > repos > bgruening > diamond
changeset 23:f12a64a8a5bb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/diamond commit 3b8d4b833ee2bd2a99b23b7389def84cd3de84cb
line wrap: on
line diff
--- a/diamond.xml Mon Nov 10 15:12:32 2025 +0000 +++ b/diamond.xml Fri Dec 12 11:13:59 2025 +0000 @@ -13,18 +13,34 @@ <![CDATA[ #if $ref_db_source.db_source == "history": - ln -s '$ref_db_source.reference_database' ./database.dmnd + ln -s '$ref_db_source.reference_database' ./database.dmnd && + #set database="database.dmnd" + #else if $ref_db_source.db_source == "blast": + #import os.path + #set basename = os.path.basename($ref_db_source.reference_database.fields.path) + #set dirname = os.path.dirname($ref_db_source.reference_database.fields.path) + #set database="./db/" + basename + mkdir ./db && + ## symlink all files in the directory containing the BLAST DB + ## in newer BLAST DBs there is a file taxonomy4blast.sqlite3 + ## that is needed + ln -s '$dirname/'* ./db/ && + #if $ref_db_source.ncbi_taxonomy + ## symlink names and nodes dump files from NCBI taxonomy + ## need to be in the same dir as the BLAST DB (which is the + ## reason why we need to create ./db/) + ln -s '$ref_db_source.ncbi_taxonomy.fields.path/nodes.dmp' ./db/nodes.dmp && + ln -s '$ref_db_source.ncbi_taxonomy.fields.path/names.dmp' ./db/names.dmp && + #end if #else: - ln -s '${ref_db_source.index.fields.db_path}' ./database.dmnd + ln -s '${ref_db_source.reference_database.fields.db_path}' ./database.dmnd && + #set database="database.dmnd" #end if - && - diamond $method_cond.method_select - --quiet --threads "\${GALAXY_SLOTS:-12}" - --db ./database + --db '$database' --query '$query' #if $method_cond.method_select == "blastx" --query-gencode '$method_cond.query_gencode' @@ -39,17 +55,11 @@ #end if @OUTPUT_ARGS@ - #if $output_section.output.outfmt != '100' --compress '0' #end if + $sens_cond.sensitivity - $iterate - $swipe - --algo $algo - #if $global_ranking - --global-ranking $global_ranking - #end if #if str($gapopen) != "": --gapopen '$gapopen' #end if @@ -69,6 +79,7 @@ #end if --id '$id' + --approx-id '$approx_id' --query-cover '$query_cover' --subject-cover '$subject_cover' --block-size 
'$sens_cond.block_size' @@ -94,22 +105,32 @@ --max-hsps $output_section.max_hsps #end if #if $tax_cond.tax_select == 'file': - --taxonlist `cat '$tax_cond.taxonlistfile' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//'` + --taxonlist \$(cat '$tax_cond.taxonlist' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//') #else if $tax_cond.tax_select == 'list': --taxonlist '$tax_cond.taxonlist' #end if + #if $tax_exclude_cond.tax_select == 'file': + --taxon_exclude \$(cat '$tax_exclude_cond.taxon_exclude' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//') + #else if $tax_exclude_cond.tax_select == 'list': + --taxon_exclude '$tax_exclude_cond.taxon_exclude' + #end if + #if $advanced_section.seed_cut --seed-cut $advanced_section.seed_cut #end if $advanced_section.freq_masking --motif-masking $advanced_section.motif_masking --soft-masking $advanced_section.soft_masking + $advanced_section.iterate + $advanced_section.swipe + --algo $advanced_section.algo + #if $advanced_section.global_ranking + --global-ranking $advanced_section.global_ranking + #end if --index-chunks "\${DIAMOND_INDEX_CHUNKS:-4}" --file-buffer-size "\${DIAMOND_FILE_BUFFER_SIZE:-67108864}" $log - -]]> - </command> + ]]></command> <inputs> <conditional name="method_cond"> <param name="method_select" type="select" label="Alignment mode" help="(blastp/blastx)"> @@ -117,7 +138,7 @@ <option value="blastx">DNA query sequences (blastx)</option> </param> <when value="blastx"> - <param argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help=""> + <param argument="--query-gencode" type="select" label="Genetic code" help="used for translation of query in BLASTX mode"> <option value="1">Standard Code</option> <option value="2">Vertebrate Mitochondrial Code</option> <option value="3">Yeast Mitochondrial Code</option> @@ -144,20 +165,20 @@ <option value="31">Blastocrithidia Nuclear Code</option> <option value="33">Cephalodiscidae 
Mitochondrial UAA-Tyr Code</option> </param> - <param argument="--min-orf" type="integer" value="1" min="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature"/> - <param name="query_strand" argument="--strand" type="select" label="query strands to search" help=""> + <param argument="--min-orf" type="integer" value="1" min="1" label="Minimum ORF length" help="Ignore translated sequences without an open reading frame of at least this length. By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature"/> + <param name="query_strand" argument="--strand" type="select" label="Query strands to search" help=""> <option value="both" selected="True">Both</option> <option value="plus">Plus</option> <option value="minus">Minus</option> </param> <conditional name="frameshift_cond"> - <param name="frameshift_select" type="select" label="Allow for frameshifts?" help=""> + <param name="frameshift_select" type="select" label="Allow for frameshifts" help=""> <option value="yes">yes</option> <option value="no" selected="true">no</option> </param> <when value="yes"> - <param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. 
Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/> - <param argument="--frameshift" type="integer" value="0" label="frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively."/> + <param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="Restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. 
Note that this feature is currently only available in frameshift alignment mode"/> + <param argument="--frameshift" type="integer" value="0" label="Frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively."/> </when> <when value="no"/> </conditional> @@ -174,17 +195,32 @@ <option value="2">Compositional matrix adjust conditioned on sequence properties, simplified (Yu, 2005)</option> <option value="3">Compositional matrix adjust conditioned on sequence properties (Yu, 2005)</option> <option value="4">Compositional matrix adjust unconditionally (Yu, 2005)</option> + <option value="5">Compositional matrix adjustment conditioned on sequence properties with fallback on composition-based statistics</option> </param> </when> </conditional> <param argument="--query" type="data" format="fasta,fasta.gz,fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz" label="Input query file in FASTA or FASTQ format"/> <conditional name="ref_db_source"> - <param name="db_source" type="select" label="Will you select a reference database from your history or use a built-in index?" 
help="Built-ins were indexed using default options"> - <option value="indexed">Use a built-in index</option> + <param name="db_source" type="select" label="Reference database source" help=""> + <option value="blast">Use a built-in BLAST index</option> <option value="history">Use one from the history</option> + <option value="indexed">Use a built-in DIAMOND index</option> </param> + <when value="blast"> + <param name="reference_database" type="select" label="Reference database" help="If your database of interest is not listed, contact your Galaxy admin"> + <options from_data_table="blastdb_p"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + <param name="ncbi_taxonomy" type="select" optional="true" label="NCBI taxonomy database" help="Needed for output of taxonomy columns in tabular output"> + <options from_data_table="ncbi_taxonomy"> + <validator message="No NCBI database is available. 
Ask your Galaxy adin" type="no_options"/> + </options> + </param> + </when> <when value="indexed"> - <param name="index" type="select" label="Select a reference database" help="If your database of interest is not listed, contact your Galaxy admin"> + <param name="reference_database" type="select" label="Reference database" help="If your database of interest is not listed, contact your Galaxy admin"> <options from_data_table="diamond_database"> <filter type="sort_by" column="2"/> <validator type="no_options" message="No indexes are available for the selected input dataset"/> @@ -192,25 +228,11 @@ </param> </when> <when value="history"> - <param name="reference_database" argument="--db" type="data" format="dmnd" label="Select the reference database"/> + <param name="reference_database" argument="--db" type="data" format="dmnd" label="Reference database"/> </when> </conditional> - <conditional name="tax_cond"> - <param name="tax_select" type="select" label="Restrict search taxonomically?" help="Any taxonomic rank can be used, and only reference sequences matching one of the specified taxon ids will be searched against."> - <option value="no" selected="True">No</option> - <option value="list">List of taxids entered manually</option> - <option value="file">List of taxids from single column tabular file</option> - </param> - <when value="no"/> - <when value="list"> - <param argument="--taxonlist" type="text" value="" label="Comma separated list of taxon ids" help=""> - <validator type="regex" message="Taxonlist needs to be a comma separated list of integers">[0-9,]*</validator> - </param> - </when> - <when value="file"> - <param name="taxonlistfile" argument="--taxonlist" type="data" format="tabular" label="Keep alignments within the given percentage range of the top alignment score for a quer" help=""/> - </when> - </conditional> + <expand macro="taxon_cond_macro" argument="--taxonlist" cond_name="tax_cond" label="Restrict search taxonomically" help="Any taxonomic rank 
can be used. Only reference sequences included in the given taxa will be used"/> + <expand macro="taxon_cond_macro" argument="--taxon-exclude" cond_name="tax_exclude_cond" label="Exclude taxa from search" help="Any taxonomic rank can be used. Reference sequences included in the taxonomic rank will be excluded from the search."/> <conditional name="sens_cond"> <param name="sensitivity" type="select" label="Sensitivity Mode" help="Choose one of the sensitivity modes. The default mode is mainly designed for short read alignment, i.e. finding significant matches of >50 bits on 30-40aa fragments. The sensitive mode is a lot more sensitive than the default and generally recommended for aligning longer sequences. The more sensitive mode provides even more sensitivity. More sensitivity may increase computation time."> <option value="--faster">Faster (--faster)</option> @@ -223,28 +245,28 @@ <option value="--ultra-sensitive">Ultra Sensitive (--ultra-sensitive)</option> </param> <when value="--faster"> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value="--fast"> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value=""> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value="--mid-sensitive"> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value="--sensitive"> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value="--more-sensitive"> - <expand macro="block_size_low_sens"/> + <expand macro="block_size" value="2"/> </when> <when value="--very-sensitive"> - <expand macro="block_size_hi_sens"/> + <expand macro="block_size" value="0.4"/> </when> <when value="--ultra-sensitive"> - <expand macro="block_size_hi_sens"/> + <expand macro="block_size" value="0.4"/> </when> </conditional> <param argument="--matrix" type="select" label="Scoring matrix" 
help="In parentheses are the supported values for (gap open)/(gap extend). In brackets are default gap penalties"> @@ -260,40 +282,18 @@ <param argument="--gapopen" type="integer" optional="True" value="" label="Gap open penalty" help="Leave empty for default (see scoring matrix)"/> <param argument="--gapextend" type="integer" optional="True" value="" label="Gap extension penalty" help="Leave empty for default (see scoring matrix)"/> <param argument="--masking" type="select" label="Masking algorithm" help="DIAMOND by default applies the tantan repeat masking algorithm to the query and target sequences as described in (Frith, 2011). This masking procedure increases the specificity of alignments and serves to filter out spurious hits. Note that when using --comp-based-stats (2,3,4), tantan masking is disabled by default."> - <option value="0">Disabled</option> - <option value="1" selected="true">Tantan</option> + <option value="none">Disabled</option> + <option value="tantan" selected="true">Tantan</option> <option value="seg">SEG</option> </param> - <conditional name="filter_score"> - <param name="filter_score_select" type="select" label="Method to filter?" 
help="(--evalue/--min-score)"> - <option value="evalue" selected="True">Maximum e-value to report alignments</option> - <option value="min-score">Minimum bit score to report alignments</option> - </param> - <when value="evalue"> - <param argument="--evalue" type="float" value="0.001" label="Maximum expected value to keep an alignment"/> - </when> - <when value="min-score"> - <param argument="--min-score" type="integer" value="0" label="Minimum bit score to keep an alignment" help="(--min-score)"/> - </when> - </conditional> - <param argument="--swipe" type="boolean" truevalue="--swipe" falsevalue="" checked="false" label="Run Exhaustive alignment against all database sequences" help="Smith Waterman alignments of all queries will be computed against all targets."/> - <param argument="--iterate" type="boolean" truevalue="--iterate" falsevalue="" checked="false" label="Run multiple rounds of searches with increasing sensitivity" help="The query dataset will first be searched at a lower sensitivity setting, only searching those query sequences at the target sensitivity that fail to produce a significant alignment at a lower sensitivity."/> - <param argument="--algo" type="select" label="Algorithm for seed search" help="Double-indexed is the main algorithm of the program, designed for large input files but less efficient for small query files. Query-indexed and improves performance for small query files. This mode will be automatically triggered based on the input. Contiguous-seed mode and further improves performance for small query files. 
The modes differ slightly in their sensitivity, so results are not guaranteed to be 100% identical for different settings of this option."> - <option value="0">Doble-indexed (0)</option> - <option value="1">Query-indexed (1)</option> - <option value="ctg">Contiguous-seed mode (ctg)</option> - </param> + <expand macro="hit_filter_macro"/> - <param argument="--global-ranking" type="integer" min="0" value="" optional="true" label="Limit on the number of Smith Waterman extensions" help="Target sequences will be ranked according to their ungapped extension scores at seed hits, and gapped extensions will only be computed for the best N targets for each query. Note that this option increases memory use."/> - <param argument="--id" type="integer" value="0" label="Minimum identity percentage to report an alignment" help="Report only alignments above the given percentage of sequence identity"/> - <param argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="Report only alignments above the given percentage of query cover"/> - <param argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="Report only alignments above the given percentage of subject cover"/> - <section name="output_section" title="Output options"> - <param argument="--max-hsps" type="integer" min="0" optional="true" label="Maximum number of HSPs" help="The maximum number of HSPs (High-Scoring Segment Pairs) per target sequence to report for each query. The default policy is to report only the highest-scoring HSP for each target, while disregarding alternative, lower-scoring HSPs that are contained in the same target."/> + <section name="output_section" title="Output options" expanded="true"> <expand macro="output_type_macro"> <!-- Taxonomy features are not supported for the DAA format (i.e. 
can't be used in diamond view) --> - <option value="staxids">unique Subject Taxonomy ID(s), separated by a ';' (in numerical order)</option> + <option value="staxids">Subject Taxonomy ID(s), separated by a ';' (in numerical order)</option> + <option value="sscinames">Subject Scientific Name(s)</option> <option value="sskingdoms">Subject super kingdoms</option> <option value="skingdoms">Subject kingdoms</option> <option value="sphylums">Subject phylums</option> @@ -303,20 +303,30 @@ <option value="--al">Output aligned queries (--al)</option> </param> <param argument="--log" type="boolean" truevalue="--log" falsevalue="" label="Output log file"/> + <param argument="--max-hsps" type="integer" min="0" optional="true" label="Maximum number of HSPs" help="The maximum number of HSPs (High-Scoring Segment Pairs) per target sequence to report for each query. The default policy is to report only the highest-scoring HSP for each target, while disregarding alternative, lower-scoring HSPs that are contained in the same target."/> </section> <section name="advanced_section" title="Advanced options" expanded="false"> <param argument="--seed-cut" type="float" min="0" optional="true" label="Set a complexity cutoff for indexed seeds"/> <param argument="--freq-masking" type="boolean" truevalue="--freq-masking" falsevalue="" checked="false" label="Enable masking seeds based on frequency" help="This option is incompatible with --sed-cut"/> <param argument="--soft-masking" type="select" label="Soft Masking" help="Select type of soft masking"> + <!-- https://github.com/bbuchfink/diamond/issues/916 --> <option value="0" selected="True">Disbled</option> - <option value="seg">seg</option> + <!-- <option value="seg">seg</option> --> <option value="tantan">tantan</option> </param> <param argument="--motif-masking" type="select" label="Softmask abundant motifs" help="Enable or disable motif masking"> <option value="0">Disabled</option> <option value="1">Enabled</option> </param> + <param 
argument="--swipe" type="boolean" truevalue="--swipe" falsevalue="" checked="false" label="Run Exhaustive alignment against all database sequences" help="Smith Waterman alignments of all queries will be computed against all targets."/> + <param argument="--iterate" type="boolean" truevalue="--iterate" falsevalue="" checked="false" label="Run multiple rounds of searches with increasing sensitivity" help="The query dataset will first be searched at a lower sensitivity setting, only searching those query sequences at the target sensitivity that fail to produce a significant alignment at a lower sensitivity."/> + <param argument="--algo" type="select" label="Algorithm for seed search" help="Double-indexed is the main algorithm of the program, designed for large input files but less efficient for small query files. Query-indexed and improves performance for small query files. This mode will be automatically triggered based on the input. Contiguous-seed mode and further improves performance for small query files. The modes differ slightly in their sensitivity, so results are not guaranteed to be 100% identical for different settings of this option."> + <option value="0">Double-indexed (0)</option> + <option value="1">Query-indexed (1)</option> + <option value="ctg">Contiguous-seed mode (ctg)</option> + </param> + <param argument="--global-ranking" type="integer" min="0" value="" optional="true" label="Limit on the number of Smith Waterman extensions" help="Target sequences will be ranked according to their ungapped extension scores at seed hits, and gapped extensions will only be computed for the best N targets for each query. 
Note that this option increases memory use."/> </section> </inputs> <outputs> @@ -332,7 +342,7 @@ </data> </outputs> <tests> - <!--Test 01--> + <!--Test 01 al and unal output --> <test expect_num_outputs="3"> <conditional name="method_cond"> <param name="method_select" value="blastp"/> @@ -354,7 +364,7 @@ <param name="sensitivity" value=""/> </conditional> <param name="matrix" value="BLOSUM62"/> - <param name="masking" value="1"/> + <param name="masking" value="tantan"/> <conditional name="hit_filter"> <param name="hit_filter_select" value="max"/> <param name="max_target_seqs" value="25"/> @@ -375,12 +385,12 @@ </output> <output name="alqueries"> <assert_contents> - <has_line line=">sequence more text"/> + <has_line line=">NP_008227.1 cytochrome c oxidase subunit I (mitochondrion) [Pongo pygmaeus]"/> </assert_contents> </output> <output name="blast_tabular" file="diamond_results.tabular"/> </test> - <!--Test 02--> + <!--Test 02 non-gz input, taxon list, no al and unal output, simple header --> <test expect_num_outputs="1"> <conditional name="method_cond"> <param name="method_select" value="blastp"/> @@ -393,19 +403,20 @@ </conditional> <conditional name="tax_cond"> <param name="tax_select" value="list"/> - <param name="taxonlist" value="2"/> + <param name="taxonlist" value="42"/> <!-- the taxID needed to use here is printed during the execution of gen.sh (filter_and_map_ids.py) it is not a NCBI taxID--> </conditional> <section name="output_section"> <conditional name="output"> <param name="outfmt" value="6"/> <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore"/> + <param name="header" value="simple"/> </conditional> </section> <conditional name="sens_cond"> <param name="sensitivity" value=""/> </conditional> <param name="matrix" value="BLOSUM62"/> - <param name="masking" value="1"/> + <param name="masking" value="tantan"/> <conditional name="hit_filter"> <param name="hit_filter_select" value="max"/> 
<param name="max_target_seqs" value="25"/> @@ -421,7 +432,7 @@ </conditional> <output name="blast_tabular" file="diamond_results.wtax.tabular"/> </test> - <!--Test 03--> + <!--Test 03 blastx, outfmt --> <test expect_num_outputs="1"> <conditional name="method_cond"> <param name="method_select" value="blastx"/> @@ -444,7 +455,7 @@ <param name="sensitivity" value=""/> </conditional> <param name="matrix" value="BLOSUM62"/> - <param name="masking" value="1"/> + <param name="masking" value="tantan"/> <conditional name="hit_filter"> <param name="hit_filter_select" value="top"/> <param name="top" value="10"/> @@ -460,7 +471,7 @@ </conditional> <output name="blast_tabular" file="diamond_results.pairwise"/> </test> - <!--Test 04--> + <!--Test 04 outfmt daa --> <test expect_num_outputs="1"> <conditional name="method_cond"> <param name="method_select" value="blastp"/> @@ -475,9 +486,13 @@ <param name="outfmt" value="100"/> </conditional> </section> - <output name="daa_output" file="diamond_results.daa" compare="sim_size" delta="10"/> + <output name="daa_output" ftype="daa"> + <assert_contents> + <has_size size="5602" delta="10"/> + </assert_contents> + </output> </test> - <!--Test 05--> + <!--Test 05 blastx w indexed diamond DB --> <test expect_num_outputs="1"> <conditional name="method_cond"> <param name="method_select" value="blastx"/> @@ -489,7 +504,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="output_section"> <conditional name="output"> @@ -500,7 +515,7 @@ <param name="sensitivity" value=""/> </conditional> <param name="matrix" value="BLOSUM62"/> - <param name="masking" value="1"/> + <param name="masking" value="tantan"/> <conditional name="hit_filter"> <param name="hit_filter_select" value="top"/> <param name="top" value="10"/> @@ -524,18 +539,28 @@ <param 
name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> - <param name="iterate" value="true"/> + <section name="advanced_section"> + <param name="iterate" value="true"/> + </section> <section name="output_section"> <conditional name="output"> <param name="outfmt" value="6"/> <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore"/> + <param name="header" value="verbose"/> </conditional> </section> - <output name="blast_tabular" file="diamond_results_iterate.tabular"/> + <!-- verbose header contains path -> allow for lines_diff 2, assert header line separately --> + <output name="blast_tabular" file="diamond_results_iterate.tabular" lines_diff="2"> + <assert_contents> + <has_text text="# Invocation: diamond blastx"/> + </assert_contents> + </output> </test> <!-- Test 07 swipe option--> + <!-- + https://github.com/bbuchfink/diamond/issues/915 <test expect_num_outputs="1"> <conditional name="method_cond"> <param name="method_select" value="blastx"/> @@ -543,7 +568,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <param name="swipe" value="true"/> <section name="output_section"> @@ -553,7 +578,7 @@ </conditional> </section> <output name="blast_tabular" file="diamond_results_swipe.tabular"/> - </test> + </test> --> <!--Test 08 algo option--> <test expect_num_outputs="1"> <conditional name="method_cond"> @@ -562,9 +587,11 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" 
value="testDb"/> </conditional> - <param name="algo" value="1"/> + <section name="advanced_section"> + <param name="algo" value="1"/> + </section> <section name="output_section"> <conditional name="output"> <param name="outfmt" value="6"/> @@ -581,9 +608,11 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> - <param name="global_ranking" value="10"/> + <section name="advanced_section"> + <param name="global_ranking" value="10"/> + </section> <section name="output_section"> <conditional name="output"> <param name="outfmt" value="6"/> @@ -600,7 +629,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="output_section"> <param name="max_hsps" value="10"/> @@ -619,7 +648,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="advanced_section"> <param name="seed_cut" value="100"/> @@ -640,7 +669,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="advanced_section"> <param name="freq_masking" value="true"/> @@ -661,10 +690,10 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> 
</conditional> <section name="advanced_section"> - <param name="motif_masking" value="1"/> + <param name="motif_masking" value="0"/> </section> <section name="output_section"> <conditional name="output"> @@ -682,7 +711,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="advanced_section"> <param name="soft_masking" value="0"/> @@ -703,7 +732,7 @@ <param name="query" value="nucleotide.fasta" ftype="fasta"/> <conditional name="ref_db_source"> <param name="db_source" value="indexed"/> - <param name="index" value="testDb"/> + <param name="reference_database" value="testDb"/> </conditional> <section name="output_section"> <conditional name="output"> @@ -715,10 +744,112 @@ <output name="blast_tabular" file="diamond_results_log_test.tabular"/> <output name="log_file"> <assert_contents> - <has_n_lines n="259"/> - <has_text text="diamond blastx --quiet"/> + <has_n_lines n="375"/> + <has_text text="diamond blastx"/> <has_text text="--log"/> - <has_line line="Sequences = 6, letters = 1694, average length = 282"/> + <has_line line="Sequences = 6, letters = 3076, average length = 512"/> + </assert_contents> + </output> + </test> + + <!--Test 16 test against cached BLAST DB + NO NCBI taxonomy which works as long as (certain) tax columns are not selected in outputs--> + <test expect_num_outputs="1"> + <conditional name="method_cond"> + <param name="method_select" value="blastp"/> + <param name="comp_based_stats" value="1"/> + </conditional> + <param name="query" value="protein.fasta.gz" ftype="fasta.gz"/> + <conditional name="ref_db_source"> + <param name="db_source" value="blast"/> + <param name="reference_database" value="test"/> + </conditional> + <conditional name="tax_cond"> + <param name="tax_select" value="list"/> + <param name="taxonlist" value="2,2759"/> <!-- 
simulate tax filtering .. --> + </conditional> + <section name="output_section"> + <conditional name="output"> + <param name="outfmt" value="6"/> + <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,staxids,cigar"/> + </conditional> + </section> + <conditional name="sens_cond"> + <param name="sensitivity" value=""/> + </conditional> + <param name="matrix" value="BLOSUM62"/> + <param name="masking" value="seg"/> + <conditional name="hit_filter"> + <param name="hit_filter_select" value="max"/> + <param name="max_target_seqs" value="25"/> + </conditional> + <conditional name="filter_score"> + <param name="filter_score_select" value="evalue"/> + <param name="evalue" value="0.001"/> + </conditional> + <param name="id" value="0"/> + <param name="query_cover" value="0"/> + <conditional name="sens_cond"> + <param name="block_size" value="2"/> + </conditional> + <output name="blast_tabular"> + <assert_contents> + <has_n_columns n="15"/> + <has_n_lines n="5"/> + </assert_contents> + </output> + <assert_command> + <!-- ensure that NCBI taxonomy is really not used--> + <has_text text="nodes.dmp" negate="true"/> + <has_text text="names.dmp" negate="true"/> + </assert_command> + </test> + + <!--Test 17 test blastx against cached BLAST DB + tax columns in output + tax filtering file (tetrapoda and ray finned fished should result in mouse, human, zebra fish) --> + <test expect_num_outputs="1"> + <conditional name="method_cond"> + <param name="method_select" value="blastp"/> + <param name="comp_based_stats" value="1"/> + </conditional> + <param name="query" value="protein.fasta.gz" ftype="fasta.gz"/> + <conditional name="ref_db_source"> + <param name="db_source" value="blast"/> + <param name="reference_database" value="test"/> + <param name="ncbi_taxonomy" value="test"/> + </conditional> + <conditional name="tax_cond"> + <param name="tax_select" value="file"/> + <param name="taxonlist" value="taxon.tsv"/> + 
</conditional> + <section name="output_section"> + <conditional name="output"> + <param name="outfmt" value="6"/> + <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,sskingdoms,skingdoms,sphylums,cigar"/> + </conditional> + </section> + <conditional name="sens_cond"> + <param name="sensitivity" value=""/> + </conditional> + <param name="matrix" value="BLOSUM62"/> + <param name="masking" value="seg"/> + <conditional name="hit_filter"> + <param name="hit_filter_select" value="max"/> + <param name="max_target_seqs" value="25"/> + </conditional> + <conditional name="filter_score"> + <param name="filter_score_select" value="evalue"/> + <param name="evalue" value="0.001"/> + </conditional> + <param name="id" value="0"/> + <param name="query_cover" value="0"/> + <conditional name="sens_cond"> + <param name="block_size" value="2"/> + </conditional> + <output name="blast_tabular"> + <assert_contents> + <has_n_columns n="17"/> + <has_n_lines n="3"/> + <has_text text="Metazoa" n="3"/> + <has_text text="Viridiplantae" n="0"/> </assert_contents> </output> </test>
--- a/diamond_makedb.xml Mon Nov 10 15:12:32 2025 +0000 +++ b/diamond_makedb.xml Fri Dec 12 11:13:59 2025 +0000 @@ -9,15 +9,21 @@ <command detect_errors="aggressive"> <!-- DB has two files, *.dmnd and *.tx --> <![CDATA[ + ln -s '$infile' database.$infile.ext && + diamond makedb --threads \${GALAXY_SLOTS:-12} - --in '$infile' + --in database.$infile.ext --db ./database - #if str($tax_cond.tax_select) == 'yes': + #if $tax_cond.tax_select == 'yes': --taxonmap '$tax_cond.taxonmap' --taxonnodes '$tax_cond.taxonnodes' --taxonnames '$tax_cond.taxonnames' + #else if $tax_cond.tax_select == 'yes_cached': + --taxonmap '$tax_cond.ncbi_taxonomy.fields.path'/prot.accession2taxid + --taxonnodes '$tax_cond.ncbi_taxonomy.fields.path'/nodes.dmp + --taxonnames '$tax_cond.ncbi_taxonomy.fields.path'/names.dmp #end if ]]> </command> @@ -25,7 +31,8 @@ <param name="infile" type="data" format="fasta,fasta.gz" label="Input reference file in FASTA format"/> <conditional name="tax_cond"> <param name="tax_select" type="select" label="Add taxonomic data?" 
help="Needs to be supplied in order to provide taxonomy features of the aligner"> - <option value="yes">Yes</option> + <option value="yes_cached">Using built in NCBI taxonomy</option> + <option value="yes">Yes using datasets from history</option> <option value="no" selected="true">No</option> </param> <when value="yes"> @@ -33,6 +40,13 @@ <param argument="--taxonnodes" type="data" format="tabular" label="Taxonomy nodes.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/> <param argument="--taxonnames" type="data" format="tabular" label="Taxonomy names.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/> </when> + <when value="yes_cached"> + <param name="ncbi_taxonomy" type="select" optional="true" label="NCBI taxonomy database" help="Needed for output of taxonomy columns in tabular output"> + <options from_data_table="ncbi_taxonomy"> + <validator message="No NCBI database is available. 
Ask your Galaxy adin" type="no_options"/> + </options> + </param> + </when> <when value="no"/> </conditional> </inputs> @@ -43,11 +57,18 @@ <test> <param name="infile" value="db.fasta" ftype="fasta"/> <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/> + <assert_stderr> + <has_text_matching expression="Database sequences +5"/> + <has_text_matching expression="Database letters +2578"/> + </assert_stderr> </test> - <test> <param name="infile" value="db.fasta.gz" ftype="fasta.gz"/> <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/> + <assert_stderr> + <has_text_matching expression="Database sequences +5"/> + <has_text_matching expression="Database letters +2578"/> + </assert_stderr> </test> <test> <param name="infile" value="db.fasta" ftype="fasta"/> @@ -57,7 +78,32 @@ <param name="taxonnodes" ftype="tabular" value="nodes.dmp"/> <param name="taxonnames" ftype="tabular" value="names.dmp"/> </conditional> + <!-- this test uses a taxdb with consecutive taxIDs which creates the small dmnd test file --> <output name="outfile" value="db-wtax.dmnd" compare="sim_size" delta="2"/> + <assert_stderr> + <has_text_matching expression="Entries in accession to taxid file +5"/> + <has_text_matching expression="Database accessions mapped to taxid +5"/> + <has_text_matching expression="Database sequences mapped to taxid +5"/> + </assert_stderr> + </test> + <test> + <param name="infile" value="db.fasta" ftype="fasta"/> + <conditional name="tax_cond"> + <param name="tax_select" value="yes_cached"/> + <param name="ncbi_taxonomy" value="test"/> + </conditional> + <!-- note that this test uses a different taxDB (original taxIDs - not consecutive) + and therefore we get a larger dmnd file --> + <output name="outfile"> + <assert_contents> + <has_size size="20279226"/> + </assert_contents> + </output> + <assert_stderr> + <has_text_matching expression="Entries in accession to taxid file +5"/> + <has_text_matching expression="Database accessions mapped to 
taxid +5"/> + <has_text_matching expression="Database sequences mapped to taxid +5"/> + </assert_stderr> </test> </tests> <help>
--- a/macros.xml Mon Nov 10 15:12:32 2025 +0000 +++ b/macros.xml Fri Dec 12 11:13:59 2025 +0000 @@ -1,6 +1,6 @@ <macros> - <token name="@TOOL_VERSION@">2.1.13</token> - <token name="@VERSION_SUFFIX@">1</token> + <token name="@TOOL_VERSION@">2.1.16</token> + <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements> <requirement type="package" version="@TOOL_VERSION@">diamond</requirement> @@ -28,45 +28,55 @@ <when value="0"/> <when value="5"/> <when value="6"> - <param name="fields" type="select" label="Tabular fields" help="" multiple="true"> + <param argument="--fields" type="select" label="Tabular fields" help="" multiple="true"> <option value="qseqid" selected="true">Query Seq - id</option> + <option value="qlen">Query sequence length</option> <option value="sseqid" selected="true">Subject Seq - id</option> <option value="sallseqid">All subject Seq - id(s)</option> - <option value="qlen">Query sequence length</option> <option value="slen">Subject sequence length</option> + <option value="qstart" selected="true">Start of alignment in query</option> + <option value="qend" selected="true">End of alignment in query</option> + <option value="sstart" selected="true">Start of alignment in subject</option> + <option value="send" selected="true">End of alignment in subject</option> + <option value="qseq">Aligned part of query sequence</option> + <option value="qseq_gapped">Aligned part of query sequence (with gaps)</option> + <option value="qseq_translated">Translation of the aligned part of query sequence</option> + <option value="full_qseq">Query sequence</option> + <option value="full_qseq_mate">Query sequence of the mate</option> + <option value="sseq">Aligned part of subject sequence</option> + <option value="sseq_gapped">Aligned part of subject sequence (with gaps)</option> + <option value="full_sseq">Subject sequence</option> + <option value="evalue" selected="true">Expect value</option> + <option value="bitscore" selected="true">Bit 
score</option> + <option value="corrected_bitscore" selected="true">Bit score corrected for edge effects</option> + <option value="score">Raw score</option> + <option value="length" selected="true">Alignment length</option> <option value="pident" selected="true">Percentage of identical matches</option> - <option value="length" selected="true">Alignment length</option> + <option value="approx_pident">Approximate percentage of identical matches</option> <option value="nident">Number of identical matches</option> <option value="mismatch" selected="true">Number of mismatches</option> <option value="positive">Number of positive - scoring matches</option> <option value="gapopen" selected="true">Number of gap openings</option> <option value="gaps">Total number of gaps</option> <option value="ppos">Percentage of positive - scoring matches</option> - <option value="qstart" selected="true">Start of alignment in query</option> - <option value="qend" selected="true">End of alignment in query</option> - <option value="sstart" selected="true">Start of alignment in subject</option> - <option value="send" selected="true">End of alignment in subject</option> - <option value="qseq">Aligned part of query sequence</option> - <option value="sseq">Aligned part of subject sequence</option> - <option value="qseq_translated">Translation of the aligned part of query sequence</option> - <option value="evalue" selected="true">Expect value</option> - <option value="bitscore" selected="true">Bit score</option> - <option value="score">Raw score</option> <option value="qframe">Query frame</option> <option value="btop">Blast traceback operations(BTOP)</option> - <option value="scovhsp">Subject coverage per HSP</option> + <option value="cigar">Cigar</option> <option value="stitle">Subject Title</option> <option value="salltitles">All Subject Title(s)</option> <option value="qcovhsp">Query Coverage Per HSP</option> + <option value="scovhsp">Subject coverage per HSP</option> <option 
value="qtitle">Query title</option> - <option value="full_qseq">Query sequence</option> - <option value="full_sseq">Subject sequence</option> <option value="qqual">Query quality values for the aligned part of the query</option> <option value="full_qqual">Query quality values</option> <option value="qstrand">Query strand</option> - <option value="cigar">Cigar</option> <yield/> </param> + <param argument="--header" type="select" label="Use header lines"> + <option value="0">No</option> + <option value="simple">Simple</option> + <option value="verbose">Verbose</option> + </param> </when> <when value="100"> </when> @@ -79,6 +89,19 @@ </conditional> </xml> <xml name="hit_filter_macro"> + <conditional name="filter_score"> + <param name="filter_score_select" type="select" label="Method to filter?" help="(--evalue/--min-score)"> + <option value="evalue" selected="True">Maximum e-value to report alignments</option> + <option value="min-score">Minimum bit score to report alignments</option> + </param> + <when value="evalue"> + <param argument="--evalue" type="float" value="0.001" label="Maximum expected value to keep an alignment"/> + </when> + <when value="min-score"> + <param argument="--min-score" type="integer" value="0" label="Minimum bit score to keep an alignment" help="(--min-score)"/> + </when> + </conditional> + <conditional name="hit_filter"> <param name="hit_filter_select" type="select" label="Method to restrict the number of hits?"> <option value="max">Maximum number of target sequences</option> @@ -91,14 +114,14 @@ <param argument="--top" type="integer" value="0" min="0" max="100" label="Keep alignments within the given percentage range of the top alignment score for a query" help="For example, setting this to 10 will report all alignments whose score is at most 10% lower than the best alignment score for a query."/> </when> </conditional> + <param argument="--id" type="float" value="0" min="0" max="100" label="Minimum identity percentage to report an 
alignment" help="Report only alignments above the given percentage of sequence identity"/> + <param argument="--approx-id" type="float" value="0" min="0" max="100" label="Minimum approx. identity% to report an alignment"/> + <param argument="--query-cover" type="float" value="0" min="0" max="100" label="Minimum query cover percentage to report an alignment" help="Report only alignments above the given percentage of query cover"/> + <param argument="--subject-cover" type="float" value="0" min="0" max="100" label="Minimum subject cover percentage to report an alignment" help="Report only alignments above the given percentage of subject cover"/> </xml> - <xml name="block_size_low_sens"> - <param argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time" - help="This is the main parameter for controlling the program’s memory and disk space usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance"/> - </xml> - <xml name="block_size_hi_sens"> - <param argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time" - help="This is the main parameter for controlling the program’s memory and disk space usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance"/> + <xml name="block_size" tokens="value"> + <param argument="--block-size" type="float" value="@VALUE@" min="0" label="Block size in billions of sequence letters to be processed at a time" + help="This is the main parameter for controlling the program’s memory and disk space usage. 
Bigger numbers will increase the use of memory and temporary disk space, but also improve performance"/> </xml> <xml name="citations"> <citations> @@ -138,6 +161,7 @@ --out '$blast_xml' #else if $output_section.output.outfmt == "6" --outfmt '6' #echo ' '.join(str($output_section.output.fields).split(',')) + --header $output_section.output.header --out '$blast_tabular' #else if $output_section.output.outfmt == "100" --outfmt '100' @@ -158,4 +182,23 @@ --top '$hit_filter.top' #end if </token> + + <xml name="taxon_cond_macro" tokens="cond_name,label,help,argument"> + <conditional name="@COND_NAME@"> + <param name="tax_select" type="select" label="@LABEL@" help="Any taxonomic rank can be used, and only reference sequences matching one of the specified taxon ids will be searched against."> + <option value="no" selected="True">No</option> + <option value="list">List of taxids entered manually</option> + <option value="file">List of taxids from single column tabular file</option> + </param> + <when value="no"/> + <when value="list"> + <param argument="@ARGUMENT@" type="text" value="" label="Taxon IDss" help="Comma separated list"> + <validator type="regex" message="Taxonlist needs to be a comma separated list of integers">[0-9,]*</validator> + </param> + </when> + <when value="file"> + <param argument="@ARGUMENT@" type="data" format="tabular" label="Taxon id file" help="One taxon ID per line"/> + </when> + </conditional> + </xml> </macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/README Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,10 @@ +BLAST DBs come with additional taxonomic data: taxdb.btd, taxdb.bti, taxonomy4blast.sqlite3 +which are quite large. This folder contains small test data covering a few species: + +Oryza sativa 4530 +Drosophila 7215 +Danio rerio 7955 +Homo sapiens 9606 +Mus musculus 10090 + +The files were provided to @bernt-matthias by the NCBI help desk (ticket help #247163)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/db.fasta Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,45 @@ +>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group] +MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF +FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS +HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT +MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI +SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF +TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF +FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP +WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS +>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster] +SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP +IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS +VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD +RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL +LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG +LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN +LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI +EWYQNTPPAEHSYSELPLLTN +>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio] +MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV +MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG +ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI +GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV +GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG 
+VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT +NVEWLHGCPPPYHTFEEPAFVQIQSN +>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens] +MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV +MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG +ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI +GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV +GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG +VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM +NLEWLYGCPPPYHTFEEPVYMKS +>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus] +MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV +MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG +ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI +GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV +GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG +VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST +NLEWLHGCPPPYHTFEEPTYVKVK
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/db.fasta.pjs Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,27 @@ +{ + "version": "1.2", + "dbname": "db.fasta", + "dbtype": "Protein", + "db-version": 5, + "description": "cox1 blastp DB", + "number-of-letters": 2578, + "number-of-sequences": 5, + "last-updated": "2025-12-09T18:15:00", + "number-of-volumes": 1, + "number-of-taxids": 5, + "bytes-total": 52950, + "bytes-to-cache": 2720, + "files": [ + "db.fasta.pdb", + "db.fasta.phr", + "db.fasta.pin", + "db.fasta.pnd", + "db.fasta.pni", + "db.fasta.pog", + "db.fasta.pos", + "db.fasta.pot", + "db.fasta.psq", + "db.fasta.ptf", + "db.fasta.pto" + ] +}
#!/usr/bin/env python
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/filter_and_map_ids.py Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,73 @@

# Filter names and nodes dmp files by the taxids used in an accession2taxid
# file; parent node IDs are added if needed.
#
# IDs are renamed to give a consecutive set of IDs: 1,2,...
# otherwise dmnd databases including taxonomy will be huge.
# New IDs are assigned in order of first appearance in the names file, so the
# relative order of the taxids is not changed.
#
# Usage:
#   filter_and_map_ids.py NAMES NODES PROT2IDS NAMES_OUT NODES_OUT PROT2IDS_OUT

import sys

USAGE = (
    "usage: filter_and_map_ids.py NAMES NODES PROT2IDS "
    "NAMES_OUT NODES_OUT PROT2IDS_OUT"
)


def _read_parents(nodes_file_name):
    """Return {taxid: parent_taxid} (both as strings) from a nodes.dmp file."""
    parents = {}
    with open(nodes_file_name) as nodes_file:
        for line in nodes_file:
            fields = line.strip().split("|")
            if len(fields) < 2:
                # blank or malformed line: nothing to record
                continue
            parents[fields[0].strip()] = fields[1].strip()
    return parents


def _read_initial_ids(prot2ids_file_name):
    """Return the set of taxids (3rd column) of an accession2taxid file."""
    initial_ids = set()
    with open(prot2ids_file_name) as prot2ids_file:
        next(prot2ids_file, None)  # first line is the column header
        for line in prot2ids_file:
            fields = line.split()
            if len(fields) < 3:
                continue
            initial_ids.add(fields[2])
    return initial_ids


def _with_ancestors(initial_ids, parents):
    """Return initial_ids plus every ancestor reachable through parents.

    Already collected taxids are skipped, so shared lineages are walked only
    once and a cyclic parent relation cannot cause an endless loop.
    """
    ids = set()
    pending = set(initial_ids)
    while pending:
        taxid = pending.pop()
        if taxid in ids:
            continue
        if taxid not in parents:
            raise KeyError(f"taxid {taxid} is not present in the nodes file")
        ids.add(taxid)
        parent_id = parents[taxid]
        if parent_id != taxid:  # the root is its own parent
            pending.add(parent_id)
    return ids


def _new_id(id_map, taxid):
    """Look up the renumbered ID of taxid, failing with a readable message."""
    try:
        return id_map[taxid]
    except KeyError:
        raise KeyError(f"taxid {taxid} has no entry in the names file") from None


def _write_names(names_file_name, names_file_out_name, ids):
    """Write the name records of ids with renumbered IDs; return the ID map."""
    id_map = {}
    with open(names_file_name) as names_file, open(names_file_out_name, "w") as names_file_out:
        for line in names_file:
            fields = line.strip().split("|")
            taxid = fields[0].strip()
            if taxid not in ids:
                continue
            # first occurrence of a taxid gets the next consecutive number
            new_id = id_map.setdefault(taxid, len(id_map) + 1)
            names_file_out.write(f'{new_id}\t|{"|".join(fields[1:])}\n')
    return id_map


def _write_nodes(nodes_file_name, nodes_file_out_name, ids, id_map):
    """Write the node records whose node and parent are both in ids."""
    with open(nodes_file_name) as nodes_file, open(nodes_file_out_name, "w") as nodes_file_out:
        for line in nodes_file:
            fields = line.strip().split("|")
            if len(fields) < 2:
                continue
            node = fields[0].strip()
            parent_id = fields[1].strip()
            if node not in ids or parent_id not in ids:
                continue
            nodes_file_out.write(
                f'{_new_id(id_map, node)}\t|\t{_new_id(id_map, parent_id)}\t|{"|".join(fields[2:])}\n'
            )


def _write_prot2ids(prot2ids_file_name, prot2ids_file_out_name, id_map):
    """Copy the accession2taxid file, replacing the taxid column (3rd)."""
    with open(prot2ids_file_name) as prot2ids_file, open(prot2ids_file_out_name, "w") as prot2ids_file_out:
        for i, line in enumerate(prot2ids_file):
            if i == 0:
                # header line is copied verbatim
                prot2ids_file_out.write(line)
                continue
            fields = line.split()
            if len(fields) < 3:
                continue
            fields[2] = str(_new_id(id_map, fields[2]))
            prot2ids_file_out.write("\t".join(fields) + "\n")


def filter_and_map_ids(
    names_file_name,
    nodes_file_name,
    prot2ids_file_name,
    names_file_out_name,
    nodes_file_out_name,
    prot2ids_file_out_name,
    report_taxid="33090",
):
    """Filter and renumber a taxonomy so that it only covers the given proteins.

    Parameters are the paths of the input names.dmp, nodes.dmp and
    accession2taxid files followed by the paths of the three output files.
    report_taxid is an original taxid whose new ID is printed (it is needed
    as taxonlist value in the tool tests); if it is not part of the filtered
    taxonomy a warning is printed instead of aborting.

    Returns the mapping {original taxid: new consecutive ID}.
    Raises KeyError if a required taxid is missing from nodes or names.
    """
    parents = _read_parents(nodes_file_name)
    ids = _with_ancestors(_read_initial_ids(prot2ids_file_name), parents)
    id_map = _write_names(names_file_name, names_file_out_name, ids)

    if report_taxid in id_map:
        print(f'taxonlist for test 2 needs to be {id_map[report_taxid]}')
    else:
        print(f"Warning: taxid {report_taxid} is not part of the filtered taxonomy", file=sys.stderr)

    _write_nodes(nodes_file_name, nodes_file_out_name, ids, id_map)
    _write_prot2ids(prot2ids_file_name, prot2ids_file_out_name, id_map)
    return id_map


def main(argv=None):
    """Command line entry point; takes the six file names as arguments."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 6:
        sys.exit(USAGE)
    filter_and_map_ids(*args[:6])


if __name__ == "__main__":
    main()
#!/bin/bash
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/gen.sh Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,37 @@
#
# Regenerates the taxonomy test data: pruned names/nodes dmp files, the BLAST
# database (with taxdb.bt* files) and the small dmnd database with taxonomy.

# abort on the first failing command, also when it fails inside a pipeline
set -e
set -o pipefail

wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xzf taxdump.tar.gz


# create blast DB
# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170
# more precisely in the taxdb.bt* files (which are constructed here from the dmp files)
# we also add the path to the root (probably not strictly needed)
# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine

# one grep pattern ("^<taxid>\s") per taxid listed in the sqlite DB ...
sqlite3 taxonomy4blast.sqlite3 "SELECT * FROM TaxidInfo;" | sed 's/|/\n/g' | sort -n -u | sed 's/^/^/; s/$/\\s/' > grep.txt
# ... plus 1234 and the additional taxids mentioned above
echo "^1234\\s" >> grep.txt
echo "^189779\\s" >> grep.txt
echo "^189778\\s" >> grep.txt
echo "^203693\\s" >> grep.txt
echo "^40117\\s" >> grep.txt
echo "^3379134\\s" >> grep.txt
echo "^2\\s" >> grep.txt

grep -f grep.txt names.dmp > ../ncbi_taxonomy/names.dmp
grep -f grep.txt nodes.dmp > ../ncbi_taxonomy/nodes.dmp

# taxdb.py writes taxdb.btd and taxdb.bti from the pruned dmp files
python taxdb.py
makeblastdb -in db.fasta -parse_seqids -blastdb_version 5 -taxid_map map.txt -title "cox1 blastp DB" -dbtype prot

# create small dmnd data base with taxonomy
# the important thing to get a small DB is to have consecutive taxIDs
# NOTE: filter_and_map_ids modifies taxIDs (to get a small file), i.e. taxIDs will be different from tests using BLAST DB from above
python filter_and_map_ids.py names.dmp nodes.dmp prot.accession2taxid ../names.dmp ../nodes.dmp ../prot.accession2taxid
diamond makedb --in db.fasta --db ./database --taxonmap ../prot.accession2taxid --taxonnodes ../nodes.dmp --taxonnames ../names.dmp
mv database.dmnd ../db-wtax.dmnd

# remove the downloaded and unpacked taxdump files
rm *.dmp readme.txt taxdump.tar.gz gc.prt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/map.txt Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,11 @@ +# file used to map protein IDs to taxids +# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170 +# more precisely in the taxdb.bt* files (which are constructed here from the dmp files) +# we also add the path to the root (probably not strictly needed) +# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine +X 1234 +YP_514675.1 4530 +YP_009047267.1 7215 +NP_059333.1 7955 +YP_003024028.1 9606 +NP_904330.1 10090
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/prot.accession2taxid Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,6 @@ +accession accession.version taxid gi +YP_514675 YP_514675.1 4530 3950761 +YP_009047267 YP_009047267.1 7215 19893533 +NP_059333 NP_059333.1 7955 140539 +YP_003024028 YP_003024028.1 9606 4512 +NP_904330 NP_904330.1 10090 17708
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/taxdb.btd Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,1 @@ +Bacteria eubacteria bacteria BacteriaNitrospira bacteria BacteriaEukaryota eukaryotes EukaryotaEmbryophyta plants land plants EukaryotaMagnoliopsida angiosperms flowering plants EukaryotaLaurales flowering plants EukaryotaLauraceae laurel family flowering plants EukaryotaPersea flowering plants EukaryotaPersea americana flowering plants EukaryotaLiliopsida monocotyledons monocots EukaryotaPoaceae grass family monocots EukaryotaOryza monocots EukaryotaOryza sativa rice monocots Eukaryotacommelinids monocots EukaryotaEumetazoa animals EukaryotaArthropoda arthropods EukaryotaHexapoda hexapods EukaryotaDiptera flies EukaryotaBrachycera flies EukaryotaDrosophilidae flies EukaryotaDrosophila fruit fly flies EukaryotaPterygota insects EukaryotaChordata chordates EukaryotaVertebrata vertebrates EukaryotaGnathostomata vertebrates EukaryotaActinopterygii fish ray-finned fishes EukaryotaCypriniformes ray-finned fishes EukaryotaDanio ray-finned fishes EukaryotaDanio rerio zebra fish ray-finned fishes EukaryotaSarcopterygii vertebrates EukaryotaEutheria placental mammals placentals EukaryotaPrimates primates EukaryotaCatarrhini primates EukaryotaHominidae primates EukaryotaHomo humans primates EukaryotaHomo sapiens primates EukaryotaRodentia rodent rodents EukaryotaMuridae rodents EukaryotaMus mouse rodents EukaryotaMus musculus mouse rodents EukaryotaCyprinoidei ray-finned fishes EukaryotaTeleostei ray-finned fishes EukaryotaOstariophysi ray-finned fishes EukaryotaTetrapoda vertebrates EukaryotaAmniota vertebrates EukaryotaTheria mammals EukaryotaViridiplantae green plants green plants EukaryotaOpisthokonta eukaryotes EukaryotaMetazoa multicellular animals animals EukaryotaBilateria animals EukaryotaProtostomia animals EukaryotaNeoptera insects EukaryotaEndopterygota insects EukaryotaDeuterostomia deuterostomes animals EukaryotaStreptophyta green 
plants EukaryotaPoales monocots EukaryotaMurinae rodents EukaryotaNitrospirota bacteria BacteriaMammalia mammals EukaryotaNeopterygii ray-finned fishes EukaryotaMuscomorpha flies EukaryotaSchizophora flies EukaryotaAcalyptratae flies EukaryotaEphydroidea flies EukaryotaDrosophilinae flies EukaryotaDrosophilini flies EukaryotaInsecta true insects insects EukaryotaTracheophyta vascular plants vascular plants EukaryotaSpermatophyta seed plants seed plants EukaryotaEuphyllophyta vascular plants EukaryotaDicondylia insects EukaryotaPanarthropoda animals EukaryotaCraniata chordates EukaryotaTeleostomi vertebrates EukaryotaEuteleostomi vertebrates EukaryotaStreptophytina green plants EukaryotaOryzoideae monocots EukaryotaOryzeae monocots EukaryotaActinopteri ray-finned fishes EukaryotaClupeocephala ray-finned fishes EukaryotaOtophysi ray-finned fishes EukaryotaCypriniphysae ray-finned fishes EukaryotaOtomorpha ray-finned fishes EukaryotaNitrospirales bacteria BacteriaNitrospiraceae bacteria BacteriaPancrustacea arthropods EukaryotaMandibulata mandibulates arthropods EukaryotaNitrospiria bacteria BacteriaHomininae primates EukaryotaMagnoliidae flowering plants EukaryotaEuarchontoglires placentals EukaryotaGlires placentals EukaryotaSimiiformes primates EukaryotaHominoidea ape primates EukaryotaMuroidea rodents EukaryotaBOP clade monocots EukaryotaHaplorrhini primates EukaryotaCyclorrhapha flies EukaryotaEremoneura flies EukaryotaMus rodents EukaryotaEcdysozoa animals EukaryotaDipnotetrapodomorpha vertebrates EukaryotaBoreoeutheria placentals EukaryotaMesangiospermae flowering plants EukaryotaPetrosaviidae monocots EukaryotaOsteoglossocephalai ray-finned fishes EukaryotaOryzinae monocots EukaryotaMyomorpha rodents EukaryotaDanionidae ray-finned fishes EukaryotaDanioninae ray-finned fishes EukaryotaPseudomonadati bacteria Bacteria \ No newline at end of file
#!/usr/bin/env python3
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb/taxdb.py Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,211 @@
"""
taxdb.py

Create taxdb.btd and taxdb.bti (NCBI/BLAST/ISAM format) from the pruned
nodes.dmp and the optional names.dmp in ../ncbi_taxonomy/.

Usage:
    python3 taxdb.py

Output (written to the current directory):
    taxdb.btd
    taxdb.bti

Notes:
- Writes integers in BIG-ENDIAN (network order) as required by the ISAM/NCBI format.
- The btd records are written as:
    scientific_name<TAB>common_name<TAB>blast_name<TAB>superkingdom_name
  with no reliance on newlines for delimitation (offsets define length).
"""
import struct
import sys
from collections import defaultdict

NODES_FILE = "../ncbi_taxonomy/nodes.dmp"
NAMES_FILE = "../ncbi_taxonomy/names.dmp"  # optional
OUT_BTD = "taxdb.btd"
OUT_BTI = "taxdb.bti"

TAXDB_MAGIC = 0x8739

# Ranks that mark the top-level lineage of a taxon. The original script only
# tested "domain"; "superkingdom" is accepted as well so that dumps using that
# rank name do not end up with "Unknown" for every taxon.
# NOTE(review): viruses may carry yet another top-level rank in current dumps
# and would then be reported as "Unknown" - confirm against the taxdump readme.
SUPERKINGDOM_RANKS = ("domain", "superkingdom")


# -------------------------
# Helpers
# -------------------------
def _empty_names():
    """Return a names mapping that yields empty name records for new keys."""
    return defaultdict(lambda: {"scientific": "", "common": "", "blast": ""})


def read_nodes(nodes_path):
    """Return dicts: parent[taxid]=parent_taxid, rank[taxid]=rank

    Lines with fewer than three '|' separated fields or with non-integer
    taxids are skipped.
    """
    parent = {}
    rank = {}
    with open(nodes_path, encoding="utf-8") as fh:
        for line in fh:
            parts = [p.strip() for p in line.split("|")]
            if len(parts) < 3:
                continue
            try:
                taxid = int(parts[0])
                parent_tax = int(parts[1])
            except ValueError:
                continue
            parent[taxid] = parent_tax
            rank[taxid] = parts[2]
    return parent, rank


def read_names(names_path):
    """Return dict: names[taxid] = {'scientific':..., 'common':..., 'blast':...}

    Only the name classes "scientific name", "common name" and "blast name"
    are recorded; if a class occurs several times for a taxid the last one wins.
    """
    class_to_key = {
        "scientific name": "scientific",
        "common name": "common",
        "blast name": "blast",
    }
    names = _empty_names()
    with open(names_path, encoding="utf-8") as fh:
        for line in fh:
            parts = [p.strip() for p in line.split("|")]
            if len(parts) < 4:
                continue
            try:
                taxid = int(parts[0])
            except ValueError:
                continue
            key = class_to_key.get(parts[3])
            if key is not None:
                names[taxid][key] = parts[1]
    return names


def infer_superkingdom_code(taxid, parent, rank, sci_name_lookup):
    """
    Walk the ancestors of taxid (starting with taxid itself) until a taxon
    with a rank in SUPERKINGDOM_RANKS is found, then map its scientific name to
    one of: "Bacteria", "Archaea", "Eukaryota", "Viruses".

    Returns "Unknown" if the name matches none of these, if the walk leaves
    the known nodes, or if it runs into a cycle (e.g. the root, which is its
    own parent).
    """
    seen = set()
    cur = taxid
    while cur not in seen:
        seen.add(cur)
        if rank.get(cur, "") in SUPERKINGDOM_RANKS:
            name = sci_name_lookup.get(cur, "").lower()
            # substring tests, checked in this order
            if "bacteria" in name:
                return "Bacteria"
            if "archaea" in name:
                return "Archaea"
            if "eukary" in name:
                return "Eukaryota"
            if "virus" in name:
                return "Viruses"
            return "Unknown"
        if cur not in parent:
            return "Unknown"
        cur = parent[cur]
    return "Unknown"


def infer_blast_name(taxid, parent, lookup):
    """
    Return the lower-cased blast name of taxid or, if it has none, of its
    closest ancestor that has one.

    lookup maps taxid -> blast name ("" if the taxon has none). Returns
    "Unknown" if no taxon on the path to the root has a blast name.
    """
    seen = set()
    cur = taxid
    while cur not in seen:
        seen.add(cur)
        name = lookup.get(cur, "").lower()
        if name:
            return name
        if cur not in parent:
            return "Unknown"
        cur = parent[cur]
    return "Unknown"


def _build_btd(taxids, parent, rank, names):
    """Return (btd_bytes, offsets): the concatenated records and their start offsets."""
    # lookups used to inherit superkingdom and blast name from ancestors
    sci_lookup = {tid: rec.get("scientific", "") for tid, rec in names.items()}
    bla_lookup = {tid: rec.get("blast", "") for tid, rec in names.items()}

    offsets = []
    btd_buf = bytearray()
    for tid in taxids:
        offsets.append(len(btd_buf))
        rec = names.get(tid, {"scientific": "", "common": "", "blast": ""})
        # fallback: use numeric taxid as scientific name (ensures non-empty)
        sci = rec.get("scientific", "") or str(tid)
        com = rec.get("common", "")

        sk = infer_superkingdom_code(tid, parent, rank, sci_lookup)
        bla = infer_blast_name(tid, parent, bla_lookup)

        # exactly 4 fields, tab-separated; no trailing newline required
        btd_buf.extend(f"{sci}\t{com}\t{bla}\t{sk}".encode("utf-8"))
    return btd_buf, offsets


def _write_bti(path, taxids, offsets):
    """Write the index file: header followed by (taxid, offset) pairs."""
    with open(path, "wb") as fh:
        # header: magic, count (number of real taxids), reserved[4]
        # IMPORTANT: write all integers BIG-ENDIAN (>I)
        fh.write(struct.pack(">I", TAXDB_MAGIC))
        fh.write(struct.pack(">I", len(taxids)))  # n (real entries only)
        fh.write(struct.pack(">IIII", 0, 0, 0, 0))  # reserved

        # index entries: (taxid, offset) pairs
        for tid, off in zip(taxids, offsets):
            fh.write(struct.pack(">II", int(tid), int(off)))

        # A sentinel entry (taxid=0, offset=end_of_btd) is deliberately not
        # written; the original script had it commented out.


# -------------------------
# Main
# -------------------------
def main():
    """Read the dmp files and write taxdb.btd / taxdb.bti."""
    # Read nodes.dmp
    try:
        parent, rank = read_nodes(NODES_FILE)
    except FileNotFoundError:
        print(f"Error: {NODES_FILE} not found.", file=sys.stderr)
        sys.exit(2)

    # Read names.dmp if present
    try:
        names = read_names(NAMES_FILE)
    except FileNotFoundError:
        names = _empty_names()
        print("Warning: names.dmp not found. scientific_name will be set to the taxid.", file=sys.stderr)

    # Determine the taxids to write:
    # use taxids present in nodes.dmp (pruned set)
    taxids = sorted(parent)

    if not taxids:
        print("No taxids found in nodes.dmp; nothing to do.", file=sys.stderr)
        sys.exit(0)

    btd_buf, offsets = _build_btd(taxids, parent, rank, names)
    end_offset = len(btd_buf)

    # Write taxdb.btd
    with open(OUT_BTD, "wb") as fh:
        fh.write(btd_buf)

    # Write taxdb.bti
    _write_bti(OUT_BTI, taxids, offsets)

    # Summary
    print(f"Wrote {OUT_BTD} ({end_offset} bytes)")
    print(f"Wrote {OUT_BTI} (header + {len(taxids)} entries)")
    print(f"Taxids written: {len(taxids)}")


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastdb_p.loc Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,1 @@ +test testDB ${__HERE__}/blastdb/db.fasta \ No newline at end of file
--- a/test-data/db.fasta Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/db.fasta Fri Dec 12 11:13:59 2025 +0000 @@ -1,12 +1,45 @@ ->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV -EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG -LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL -GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX -IENY ->gi|5524212|gb|AAD44167.1| cytochrome c [Elephas minimus minimus] -LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAGGGGGGGWGQMSFWGATVITNLFSAIPYIGTNLV -EWIWGGFSVDKAAAAAAAAAAAAAAAAAAAAAAAAATFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG -LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL -GLMPFLHTSKHRSMMLRPLSQALAAAAAAAAAAAAAAAAAAAAAAATIIGQMASILYFSIILAFLPIAGX -IENY +>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group] +MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF +FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS +HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT +MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI +SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF +TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF +FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP +WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS +>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster] +SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP +IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS +VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD +RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL +LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG 
+LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN +LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI +EWYQNTPPAEHSYSELPLLTN +>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio] +MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV +MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG +ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI +GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV +GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG +VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT +NVEWLHGCPPPYHTFEEPAFVQIQSN +>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens] +MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV +MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG +ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI +GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV +GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG +VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM +NLEWLYGCPPPYHTFEEPVYMKS +>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus] +MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV +MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG +ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI +GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV +GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG +VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST 
+NLEWLHGCPPPYHTFEEPTYVKVK
--- a/test-data/diamond_results.pairwise Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results.pairwise Fri Dec 12 11:13:59 2025 +0000 @@ -1,34 +1,136 @@ BLASTP 2.3.0+ -Query= sequence more text +Query= NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome + +Length=1540 + +>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens] +Length=513 + + Score = 897 bits (2318), Expect = 0.0 + Identities = 455/512 (88%), Positives = 490/512 (95%), Gaps = 0/512 (0%) + Frame = 1 -Length=849 +Query 1 MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180 + MFADRWLFSTNHKDIGTLYLLFGA AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA +Sbjct 1 MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 60 + +Query 181 HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360 + HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PS LLLLASA VEA +Sbjct 61 HAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEA 120 + +Query 361 GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540 + GAGTG TVYPPLAGNYSHPGASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ +Sbjct 121 GAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180 ->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -Length=284 +Query 541 TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720 + TPLFV S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH +Sbjct 181 TPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240 + +Query 721 PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900 + PEVYILILPGFG+ISHIVT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD +Sbjct 241 PEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVD 300 + +Query 901 TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080 + TRAYFTSAT+IIAIPTGVKVFS LATLHGSN K SAA+L ALGFIFLFTVGGLTGIVLAN +Sbjct 301 TRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTVGGLTGIVLAN 360 - Score = 550 bits (1417), Expect = 1.44e-205 - Identities = 283/284 (99%), Positives = 
283/284 (99%), Gaps = 1/284 (0%) +Query 1081 SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260 + SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+GGFIHWFPLFSGYTL+QTYAKIHF +F+G +Sbjct 361 SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG 420 + +Query 1261 VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440 + VNLTFFPQHFLGLSG+PRRYSDYPDAYTT NILSS GSFISLTAV+L+IF+I EAFASKR +Sbjct 421 VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKR 480 + +Query 1441 KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536 + KV ++E+PS +LEWLYGCPPPYHTFEEPVY+K +Sbjct 481 KVLMVEEPSMNLEWLYGCPPPYHTFEEPVYMK 512 + +>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus] +Length=514 + + Score = 847 bits (2189), Expect = 8.27e-315 + Identities = 427/512 (83%), Positives = 476/512 (92%), Gaps = 0/512 (0%) Frame = 1 -Query 1 LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 180 - LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS -Sbjct 1 LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 60 +Query 1 MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180 + MF +RWLFSTNHKDIGTLYLLFGA AG++GTALS+LIRAELGQPG LLG+D IYNVIVTA +Sbjct 1 MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTA 60 + +Query 181 HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360 + HAFV+IFFMVMP++IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA +Sbjct 61 HAFVMIFFMVMPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEA 120 + +Query 361 GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540 + GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ +Sbjct 121 GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180 + +Query 541 TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720 + TPLFV S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH +Sbjct 181 TPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240 -Query 181 AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFIL-FTMVALAGVHLTFLHETGSNNPLGL 357 - 
AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFIL FTMVALAGVHLTFLHETGSNNPLGL -Sbjct 61 AIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGL 120 +Query 721 PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900 + PEVYILILPGFGIISH+VT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD +Sbjct 241 PEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGLDVD 300 + +Query 901 TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080 + TRAYFTSAT+IIAIPTGVKVFS LATLHG N K S A+L ALGFIFLFTVGGLTGIVL+N +Sbjct 301 TRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTVGGLTGIVLSN 360 + +Query 1081 SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260 + SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLFSG+TL+ T+AK HF +FVG +Sbjct 361 SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG 420 + +Query 1261 VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440 + VN+TFFPQHFLGLSG+PRRYSDYPDAYTT N +SS GSFISLTAV+++IF+I EAFASKR +Sbjct 421 VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKR 480 + +Query 1441 KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536 + +V + ST+LEWL+GCPPPYHTFEEP Y+K +Sbjct 481 EVMSVSYASTNLEWLHGCPPPYHTFEEPTYVK 512 + +>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio] +Length=516 -Query 358 TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 537 - TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE -Sbjct 121 TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 180 + Score = 810 bits (2091), Expect = 7.42e-300 + Identities = 407/512 (79%), Positives = 459/512 (89%), Gaps = 0/512 (0%) + Frame = 1 + +Query 1 MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180 + M RW FSTNHKDIGTLYL+FGA AG++GTALSLLIRAEL QPG LLG+D IYNVIVTA +Sbjct 1 MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTA 60 + +Query 181 HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360 + HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA +Sbjct 61 
HAFVMIFFMVMPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEA 120 + +Query 361 GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540 + GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITT IN+KPP ISQYQ +Sbjct 121 GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQ 180 + +Query 541 TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720 + TPLFV ++L+TAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH +Sbjct 181 TPLFVWAVLVTAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240 -Query 538 WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 717 - WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD -Sbjct 181 WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 240 +Query 721 PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900 + PEVYILILPGFGIISH+V +Y+GKKEPFGY+G+V A+++IG LGFIV AHH+FTVG+DVD +Sbjct 241 PEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAIGLLGFIVWAHHMFTVGMDVD 300 + +Query 901 TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080 + TRAYFTSAT+IIAIPTGVKVFS LATLHG K +L ALGFIFLFTVGGLTGIVLAN +Sbjct 301 TRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTVGGLTGIVLAN 360 -Query 718 LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 849 - LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY -Sbjct 241 LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 284 +Query 1081 SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260 + SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLF+GYTLN + KIHF +F+G +Sbjct 361 SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG 420 +Query 1261 VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440 + VNLTFFPQHFLGL+G+PRRYSDYPDAY N +SS GS ISL AVI+ +FI+ EAF +KR +Sbjct 421 VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKR 480 + +Query 1441 KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536 + +V +E +T++EWL+GCPPPYHTFEEP +++ +Sbjct 481 EVLSVELTATNVEWLHGCPPPYHTFEEPAFVQ 512 +
--- a/test-data/diamond_results.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550 100 0 0 0 94M1D189M -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 283 1 284 5.77e-150 409 100 0 0 0 105M1D178M +NP_008227.1 gi|4512|gb|YP_003024028.1|cytochrome 95.9 512 21 0 1 512 1 512 0.0 999 99.8 0 Metazoa Chordata 512M +NP_008227.1 gi|17708|gb|NP_904330.1|cytochrome 89.6 512 53 0 1 512 1 512 0.0 942 99.6 0 Metazoa Chordata 512M +NP_008227.1 gi|140539|gb|NP_059333.1|cytochrome 84.2 512 81 0 1 512 1 512 0.0 894 99.2 0 Metazoa Chordata 512M +NP_008227.1 gi|19893533|gb|YP_009047267.1|cytochrome 76.2 505 120 0 3 507 1 505 1.13e-295 799 98.8 0 Metazoa Arthropoda 505M +NP_008227.1 gi|3950761|gb|YP_514675.1|cytochrome 68.7 511 151 4 5 507 6 515 2.93e-259 707 97.3 0 Viridiplantae Streptophyta 44M2D214M1I202M2D18M4D24M
--- a/test-data/diamond_results.wtax.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results.wtax.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,1 +1,2 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550 +qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore +NP_008227.1 gi|3950761|gb|YP_514675.1|cytochrome 68.7 511 151 4 5 507 6 515 2.93e-259 707
--- a/test-data/diamond_results_algorithm.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_algorithm.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_freq_masking.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_freq_masking.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_global_ranking.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_global_ranking.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_iterate.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_iterate.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,7 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +# DIAMOND v2.1.16. http://github.com/bbuchfink/diamond +# Invocation: diamond blastx --threads 1 --db database.dmnd --query /tmp/tmpn1890frb/files/5/2/9/dataset_529e1e94-1186-4385-a242-298cfe957f6a.dat --query-gencode 1 --strand both --min-orf 1 --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore --header verbose --out /tmp/tmpn1890frb/job_working_directory/000/16/outputs/dataset_d67e7f18-9e9b-4100-8fff-c094b2a162ab.dat --compress 0 --iterate --algo 0 --matrix BLOSUM62 --comp-based-stats 1 --masking 1 --max-target-seqs 25 --evalue 0.001 --id 0 --query-cover 0 --subject-cover 0 --block-size 2.0 --motif-masking 0 --soft-masking 0 --index-chunks 4 --file-buffer-size 67108864 +# Fields: Query ID, Subject ID, Percentage of identical matches, Alignment length, Number of mismatches, Number of gap openings, Start of alignment in query, End of alignment in query, Start of alignment in subject, End of alignment in subject, Expected value, Bit score +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_log_test.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_log_test.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_max_hsps.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_max_hsps.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_motif_masking.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_motif_masking.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/diamond_results_soft_masking.tabular Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/diamond_results_soft_masking.tabular Fri Dec 12 11:13:59 2025 +0000 @@ -1,2 +1,5 @@ -sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550 -sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409 +NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897 +NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847 +NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810 +NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717 +NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
--- a/test-data/names.dmp Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/names.dmp Fri Dec 12 11:13:59 2025 +0000 @@ -1,11 +1,270 @@ 1 | all | | synonym | 1 | root | | scientific name | -2 | Bacteria | Bacteria <bacteria> | scientific name | -2 | bacteria | | blast name | -2 | eubacteria | | genbank common name | -2 | Monera | Monera <bacteria> | in-part | -3 | Procaryotae | Procaryotae <bacteria> | in-part | -3 | Prokaryotae | Prokaryotae <bacteria> | in-part | -3 | Prokaryota | Prokaryota <bacteria> | in-part | -3 | prokaryote | prokaryote <bacteria> | in-part | -3 | prokaryotes | prokaryotes <bacteria> | in-part | +2 | Eucarya | | synonym | +2 | Eucaryotae | | synonym | +2 | Eukarya | | synonym | +2 | Eukaryotae | | synonym | +2 | Eukaryota | | scientific name | +2 | eukaryotes | eukaryotes <blast name> | blast name | +2 | eukaryotes | eukaryotes <genbank common name> | genbank common name | +3 | Embryophyta | | scientific name | +3 | higher plants | | common name | +3 | land plants | land plants <blast name> | blast name | +3 | land plants | land plants <genbank common name> | genbank common name | +3 | plants | | common name | +4 | Angiospermae | | synonym | +4 | angiosperms | | common name | +4 | flowering plants | flowering plants <blast name> | blast name | +4 | flowering plants | flowering plants <genbank common name> | genbank common name | +4 | Magnoliophyta | | synonym | +4 | Magnoliopsida | | scientific name | +5 | Liliopsida | | scientific name | +5 | monocots | monocots <blast name> | blast name | +5 | monocots | monocots <genbank common name> | genbank common name | +5 | Monocotyledoneae | | synonym | +5 | monocotyledons | | common name | +6 | Bambusaceae Nakai, 1943 | | authority | +6 | Bambusaceae | | synonym | +6 | Gramineae | | synonym | +6 | grass family | | common name | +6 | Poaceae Barnhart, 1895 | | authority | +6 | Poaceae | | scientific name | +7 | Oryza L., 1753 | | authority | +7 | Oryza | | scientific name | +7 | Porteresia | | 
includes | +8 | Asian cultivated rice | | genbank common name | +8 | Oryza sativa L., 1753 | | authority | +8 | Oryza sativa | | scientific name | +8 | red rice | red rice <Oryza sativa> | common name | +8 | rice | | common name | +9 | Commelinidae | | synonym | +9 | commelinids | | scientific name | +9 | Commeliniflorae | | synonym | +10 | Eumetazoa | | scientific name | +11 | Arthropoda | | scientific name | +11 | arthropods | arthropods <blast name> | blast name | +11 | arthropods | arthropods <genbank common name> | genbank common name | +12 | Atelocerata | Atelocerata <hexapods> | in-part | +12 | Hexapoda | | scientific name | +12 | hexapods | hexapods <blast name> | blast name | +12 | hexapods | hexapods <genbank common name> | genbank common name | +12 | Tracheata | Tracheata <hexapods> | in-part | +12 | Uniramia | Uniramia <hexapods> | in-part | +13 | Diptera | | scientific name | +13 | flies | flies <blast name> | blast name | +13 | flies | flies <genbank common name> | genbank common name | +14 | Brachycera | | scientific name | +15 | Drosophilidae | | scientific name | +15 | pomace flies | | genbank common name | +16 | Drosophila | Drosophila <flies,genus> | scientific name | +16 | Drosophila Fallen, 1823 | | authority | +16 | fruit flies | fruit flies <Drosophila> | genbank common name | +16 | fruit fly | fruit fly <Drosophila> | common name | +17 | Pterygota | Pterygota <insects> | scientific name | +17 | winged insects | | genbank common name | +18 | Chordata | | scientific name | +18 | chordates | chordates <blast name> | blast name | +18 | chordates | chordates <genbank common name> | genbank common name | +19 | Vertebrata Cuvier, 1812 | | authority | +19 | Vertebrata | Vertebrata <vertebrates> | scientific name | +19 | vertebrates | vertebrates <blast name> | blast name | +19 | vertebrates | vertebrates <genbank common name> | genbank common name | +20 | Gnathostomata | Gnathostomata <vertebrates> | scientific name | +20 | jawed vertebrates | | 
genbank common name | +21 | Actinopterygii | | scientific name | +21 | Actinopterygi | | synonym | +21 | fishes | fishes <ray-finned fishes> | common name | +21 | fish | fish <ray-finned fishes> | common name | +21 | Osteichthyes | Osteichthyes <ray-finned fishes> | in-part | +21 | ray-finned fishes | ray-finned fishes <blast name> | blast name | +21 | ray-finned fishes | ray-finned fishes <genbank common name> | genbank common name | +22 | carps and others | | genbank common name | +22 | Cypriniformes | | scientific name | +23 | Brachydanio | | synonym | +23 | Celestichthys | | synonym | +23 | Danio | | scientific name | +24 | Brachydanio rerio frankei | | synonym | +24 | Brachydanio rerio | | synonym | +24 | Cyprinus rerio Hamilton, 1822 | | authority | +24 | Cyprinus rerio | | synonym | +24 | Danio frankei | | synonym | +24 | Danio rerio frankei | | synonym | +24 | Danio rerio (Hamilton, 1822) | | authority | +24 | Danio rerio | | scientific name | +24 | leopard danio | | common name | +24 | zebra danio | | common name | +24 | zebrafish | | genbank common name | +24 | zebra fish | zebra fish <Danio rerio> | common name | +25 | Sarcopterygii | | scientific name | +26 | eutherian mammals | | common name | +26 | Eutheria | | scientific name | +26 | Placentalia | | synonym | +26 | placental mammals | | common name | +26 | placentals | placentals <blast name> | blast name | +26 | placentals | placentals <genbank common name> | genbank common name | +27 | Primata | | synonym | +27 | primate | | equivalent name | +27 | Primates Linnaeus, 1758 | | authority | +27 | primates | primates <blast name> | blast name | +27 | primates | primates <genbank common name> | genbank common name | +27 | Primates | | scientific name | +28 | Catarrhini | | scientific name | +29 | great apes | | genbank common name | +29 | Hominidae Gray, 1825 | | authority | +29 | Hominidae | | scientific name | +29 | Pongidae | | synonym | +30 | Homo Linnaeus, 1758 | | authority | +30 | Homo | | 
scientific name | +30 | humans | | common name | +31 | Homo sapiens Linnaeus, 1758 | | authority | +31 | Homo sapiens | | scientific name | +31 | human | | genbank common name | +32 | rodent | | common name | +32 | Rodentia | | scientific name | +32 | rodents | rodents <blast name> | blast name | +32 | rodents | rodents <genbank common name> | genbank common name | +33 | Muridae | | scientific name | +34 | mice | mice <Mus> | genbank common name | +34 | mouse | mouse <Mus> | common name | +34 | Mus | Mus <genus> | scientific name | +35 | Balb/c mouse | | includes | +35 | house mouse | | genbank common name | +35 | LK3 transgenic mice | | includes | +35 | mouse | mouse <Mus musculus> | common name | +35 | Mus musculus Linnaeus, 1758 | | authority | +35 | Mus musculus | | scientific name | +35 | Mus sp. 129SV | | includes | +35 | nude mice | | includes | +35 | transgenic mice | | includes | +36 | Cyprinoidea | | synonym | +36 | Cyprinoidei | | scientific name | +37 | Teleostei | | scientific name | +37 | teleost fishes | | genbank common name | +38 | Ostariophysi | | scientific name | +39 | Tetrapoda | | scientific name | +39 | tetrapods | | genbank common name | +40 | Amniota | | scientific name | +40 | amniotes | | genbank common name | +41 | Theria Parker & Haswell, 1897 | | authority | +41 | Theria | Theria <mammals> | scientific name | +42 | Chlorobionta Jeffrey, 1982 | | authority | +42 | Chlorobionta | | synonym | +42 | Chlorophyta/Embryophyta group | | equivalent name | +42 | chlorophyte/embryophyte group | | equivalent name | +42 | Chloroplastida Adl et al. 
2005 | | authority | +42 | Chloroplastida | | synonym | +42 | green plants | green plants <blast name> | blast name | +42 | green plants | green plants <common name> | common name | +42 | Viridiplantae Cavalier-Smith, 1981 | | authority | +42 | Viridiplantae | | scientific name | +43 | Fungi/Metazoa group | | synonym | +43 | Opisthokonta Cavalier-Smith 1987 | | authority | +43 | Opisthokonta | | scientific name | +43 | opisthokonts | | synonym | +44 | Animalia | | synonym | +44 | animals | animals <blast name> | blast name | +44 | animals | animals <genbank common name> | genbank common name | +44 | metazoans | | common name | +44 | Metazoa | | scientific name | +44 | multicellular animals | | common name | +45 | Bilateria | | scientific name | +46 | Protostomia | | scientific name | +47 | Neoptera | | scientific name | +48 | Endopterygota | | scientific name | +48 | Holometabola | | synonym | +49 | deuterostomes | | common name | +49 | Deuterostomia | | scientific name | +50 | Streptophyta Bremer, 1985 | | authority | +50 | Streptophyta | | scientific name | +51 | Cyperales | | includes | +51 | Poales | | scientific name | +51 | Typhales | | includes | +52 | Murinae | | scientific name | +52 | Otomyinae | | includes | +53 | Mammalia | | scientific name | +53 | mammals | mammals <blast name> | blast name | +53 | mammals | mammals <genbank common name> | genbank common name | +54 | Neopterygii | | scientific name | +54 | Neopterygi | | synonym | +55 | Asilomorpha | | synonym | +55 | Muscomorpha | | scientific name | +56 | Schizophora | | scientific name | +57 | Acalyptratae | | scientific name | +58 | Ephydroidea | | scientific name | +59 | Drosophilinae | | scientific name | +60 | Drosophilini | | scientific name | +61 | Insecta | | scientific name | +61 | insects | insects <blast name> | blast name | +61 | insects | insects <genbank common name> | genbank common name | +61 | true insects | | common name | +62 | Tracheophyta | | scientific name | +62 | Tracheophyta 
Sinnott ex Cavalier-Smith, 1998 | | authority | +62 | vascular plants | vascular plants <blast name> | blast name | +62 | vascular plants | vascular plants <common name> | common name | +63 | seed plants | seed plants <blast name> | blast name | +63 | seed plants | seed plants <common name> | common name | +63 | Spermatophyta | | scientific name | +64 | Euphyllophyta | | scientific name | +64 | euphyllophytes | | equivalent name | +65 | Dicondylia | | scientific name | +66 | Panarthropoda | | scientific name | +67 | Craniata | Craniata <chordates> | scientific name | +68 | Teleostomi | | scientific name | +69 | bony vertebrates | | genbank common name | +69 | Euteleostomi | | scientific name | +70 | Charophyta/Embryophyta group | | synonym | +70 | charophyte/embryophyte group | | equivalent name | +70 | Streptophytina | | scientific name | +71 | biota | | synonym | +71 | cellular organisms | | scientific name | +72 | Ehrhartoideae Jacq.-Fel. ex Caro, 1982 | | authority | +72 | Ehrhartoideae | | synonym | +72 | Oryzoideae Kunth ex Beilschm., 1833 | | authority | +72 | Oryzoideae | | scientific name | +73 | Oryzeae Dumort., 1824 | | authority | +73 | Oryzeae | | scientific name | +74 | Actinopteri | | scientific name | +75 | Clupeocephala | | scientific name | +76 | Otophysa | | synonym | +76 | Otophysi | | scientific name | +77 | Cypriniphysae | | scientific name | +77 | Cypriniphysi | | synonym | +78 | Ostarioclupeomorpha | | synonym | +78 | Otocephala | | synonym | +78 | Otomorpha | | scientific name | +79 | Pancrustacea | | scientific name | +80 | Mandibulata | | scientific name | +80 | mandibulates | | common name | +81 | Homininae | | scientific name | +81 | Homo/Pan/Gorilla group | | synonym | +82 | Euarchontoglires | | scientific name | +83 | Glires | | scientific name | +83 | Rodents and rabbits | | genbank common name | +84 | Anthropoidea | | synonym | +84 | Simiiformes | | scientific name | +85 | ape | ape <primates> | common name | +85 | apes | | genbank 
common name | +85 | Hominoidea | | scientific name | +86 | Muroidea | | scientific name | +87 | BEP clade | | equivalent name | +87 | BOP clade | | scientific name | +88 | Haplorrhini | | scientific name | +89 | Cyclorrhapha | | scientific name | +90 | Eremoneura | | scientific name | +91 | Mus | Mus <subgenus> | scientific name | +92 | Ecdysozoa | | scientific name | +93 | Dipnotetrapodomorpha | | scientific name | +94 | Boreoeutheria | | scientific name | +94 | Boreotheria | | synonym | +95 | Mesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007 | | authority | +95 | Mesangiospermae | | scientific name | +96 | Petrosaviidae | | scientific name | +96 | Petrosaviidae S.W.Graham & W.S.Judd, 2007 | | authority | +97 | Osteoglossocephalai | | scientific name | +98 | Oryzinae Griseb., 1853 | | authority | +98 | Oryzinae | | scientific name | +99 | mice and others | | genbank common name | +99 | Myomorpha | | scientific name | +99 | Sciurognathi | Sciurognathi <Myomorpha> | in-part | +100 | Danionidae | | scientific name | +101 | Danioninae | | scientific name |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_taxonomy.loc Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,1 @@ +test testDB ${__HERE__}/ncbi_taxonomy/ \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_taxonomy/README.md Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,2 @@ +The `*.dmp` files are automatically created by gen.sh (in the blastdb folder). +`prot.accession2taxid` has been manually curated. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_taxonomy/names.dmp Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,327 @@ +2 | Bacteria | Bacteria <bacteria> | scientific name | +2 | bacteria | bacteria <blast name> | blast name | +2 | bacteria | bacteria <genbank common name> | genbank common name | +2 | "Bacteria" Cavalier-Smith 1987 | | authority | +2 | Bacteria (ex Cavalier-Smith 1987) | | synonym | +2 | Bacteria Woese et al. 2024 | | synonym | +2 | "Bacteriobiota" Luketa 2012 | | authority | +2 | Bacteriobiota | | synonym | +2 | eubacteria | | common name | +2 | Monera | Monera <bacteria> | in-part | +2 | Procaryotae | Procaryotae <bacteria> | in-part | +2 | Prokaryotae | Prokaryotae <bacteria> | in-part | +2 | Prokaryota | Prokaryota <bacteria> | in-part | +2 | prokaryote | prokaryote <bacteria> | in-part | +2 | prokaryotes | prokaryotes <bacteria> | in-part | +1234 | Nitrospira | Nitrospira <Nitrospira> | scientific name | +1234 | Nitrospira Watson et al. 1986 | | authority | +2759 | Eucarya | | synonym | +2759 | Eucaryotae | | synonym | +2759 | Eukarya | | synonym | +2759 | Eukaryotae | | synonym | +2759 | Eukaryota | | scientific name | +2759 | eukaryotes | eukaryotes <blast name> | blast name | +2759 | eukaryotes | eukaryotes <genbank common name> | genbank common name | +3193 | Embryophyta | | scientific name | +3193 | higher plants | | common name | +3193 | land plants | land plants <blast name> | blast name | +3193 | land plants | land plants <genbank common name> | genbank common name | +3193 | plants | | common name | +3398 | Angiospermae | | synonym | +3398 | angiosperms | | common name | +3398 | flowering plants | flowering plants <blast name> | blast name | +3398 | flowering plants | flowering plants <genbank common name> | genbank common name | +3398 | Magnoliophyta | | synonym | +3398 | Magnoliopsida | | scientific name | +3432 | Laurales Juss. ex Bercht. 
& J.Presl, 1820 | | authority | +3432 | Laurales | | scientific name | +3432 | Laurineae | | includes | +3433 | Lauraceae Juss., 1789 | | authority | +3433 | Lauraceae | | scientific name | +3433 | laurel family | | common name | +3434 | Persea Mill., 1754 | | authority | +3434 | Persea | | scientific name | +3435 | avocado | | genbank common name | +3435 | Laurus persea L., 1753 | | authority | +3435 | Laurus persea | | synonym | +3435 | Persea americana Mill., 1768 | | authority | +3435 | Persea americana | | scientific name | +3435 | Persea americana var. tolimanensis | | synonym | +3435 | Persea americana var. tolimanensis (Zentmyer & Schieber) Scora, 2002 | | authority | +3435 | Persea gratissima C.F.Gaertn., 1807 | | authority | +3435 | Persea gratissima | | synonym | +3435 | Persea tolimanensis | | synonym | +3435 | Persea tolimanensis Zentmyer & Schieber, 1990 | | authority | +4447 | Liliopsida | | scientific name | +4447 | monocots | monocots <blast name> | blast name | +4447 | monocots | monocots <genbank common name> | genbank common name | +4447 | Monocotyledoneae | | synonym | +4447 | monocotyledons | | common name | +4479 | Bambusaceae Nakai, 1943 | | authority | +4479 | Bambusaceae | | synonym | +4479 | Gramineae | | synonym | +4479 | grass family | | common name | +4479 | Poaceae Barnhart, 1895 | | authority | +4479 | Poaceae | | scientific name | +4527 | Oryza L., 1753 | | authority | +4527 | Oryza | | scientific name | +4527 | Porteresia | | includes | +4530 | Asian cultivated rice | | genbank common name | +4530 | Oryza sativa L., 1753 | | authority | +4530 | Oryza sativa | | scientific name | +4530 | red rice | red rice <Oryza sativa> | common name | +4530 | rice | | common name | +4734 | Commelinidae | | synonym | +4734 | commelinids | | scientific name | +4734 | Commeliniflorae | | synonym | +6072 | Eumetazoa | | scientific name | +6656 | Arthropoda | | scientific name | +6656 | arthropods | arthropods <blast name> | blast name | +6656 | 
arthropods | arthropods <genbank common name> | genbank common name | +6960 | Atelocerata | Atelocerata <hexapods> | in-part | +6960 | Hexapoda | | scientific name | +6960 | hexapods | hexapods <blast name> | blast name | +6960 | hexapods | hexapods <genbank common name> | genbank common name | +6960 | Tracheata | Tracheata <hexapods> | in-part | +6960 | Uniramia | Uniramia <hexapods> | in-part | +7147 | Diptera | | scientific name | +7147 | flies | flies <blast name> | blast name | +7147 | flies | flies <genbank common name> | genbank common name | +7203 | Brachycera | | scientific name | +7214 | Drosophilidae | | scientific name | +7214 | pomace flies | | genbank common name | +7215 | Drosophila | Drosophila <flies,genus> | scientific name | +7215 | Drosophila Fallen, 1823 | | authority | +7215 | fruit flies | fruit flies <Drosophila> | genbank common name | +7215 | fruit fly | fruit fly <Drosophila> | common name | +7496 | Pterygota | Pterygota <insects> | scientific name | +7496 | winged insects | | genbank common name | +7711 | Chordata | | scientific name | +7711 | chordates | chordates <blast name> | blast name | +7711 | chordates | chordates <genbank common name> | genbank common name | +7742 | Vertebrata Cuvier, 1812 | | authority | +7742 | Vertebrata | Vertebrata <vertebrates> | scientific name | +7742 | vertebrates | vertebrates <blast name> | blast name | +7742 | vertebrates | vertebrates <genbank common name> | genbank common name | +7776 | Gnathostomata | Gnathostomata <vertebrates> | scientific name | +7776 | jawed vertebrates | | genbank common name | +7898 | Actinopterygii | | scientific name | +7898 | Actinopterygi | | synonym | +7898 | fishes | fishes <ray-finned fishes> | common name | +7898 | fish | fish <ray-finned fishes> | common name | +7898 | Osteichthyes | Osteichthyes <ray-finned fishes> | in-part | +7898 | ray-finned fishes | ray-finned fishes <blast name> | blast name | +7898 | ray-finned fishes | ray-finned fishes <genbank common 
name> | genbank common name | +7952 | carps and others | | genbank common name | +7952 | Cypriniformes | | scientific name | +7954 | Brachydanio | | synonym | +7954 | Celestichthys | | synonym | +7954 | Danio | | scientific name | +7955 | Brachydanio rerio frankei | | synonym | +7955 | Brachydanio rerio | | synonym | +7955 | Cyprinus rerio Hamilton, 1822 | | authority | +7955 | Cyprinus rerio | | synonym | +7955 | Danio frankei | | synonym | +7955 | Danio rerio frankei | | synonym | +7955 | Danio rerio (Hamilton, 1822) | | authority | +7955 | Danio rerio | | scientific name | +7955 | leopard danio | | common name | +7955 | zebra danio | | common name | +7955 | zebrafish | | genbank common name | +7955 | zebra fish | zebra fish <Danio rerio> | common name | +8287 | Sarcopterygii | | scientific name | +9347 | eutherian mammals | | common name | +9347 | Eutheria | | scientific name | +9347 | Placentalia | | synonym | +9347 | placental mammals | | common name | +9347 | placentals | placentals <blast name> | blast name | +9347 | placentals | placentals <genbank common name> | genbank common name | +9443 | Primata | | synonym | +9443 | primate | | equivalent name | +9443 | Primates Linnaeus, 1758 | | authority | +9443 | primates | primates <blast name> | blast name | +9443 | primates | primates <genbank common name> | genbank common name | +9443 | Primates | | scientific name | +9526 | Catarrhini | | scientific name | +9604 | great apes | | genbank common name | +9604 | Hominidae Gray, 1825 | | authority | +9604 | Hominidae | | scientific name | +9604 | Pongidae | | synonym | +9605 | Homo Linnaeus, 1758 | | authority | +9605 | Homo | | scientific name | +9605 | humans | | common name | +9606 | Homo sapiens Linnaeus, 1758 | | authority | +9606 | Homo sapiens | | scientific name | +9606 | human | | genbank common name | +9989 | rodent | | common name | +9989 | Rodentia | | scientific name | +9989 | rodents | rodents <blast name> | blast name | +9989 | rodents | rodents 
<genbank common name> | genbank common name | +10066 | Muridae | | scientific name | +10088 | mice | mice <Mus> | genbank common name | +10088 | mouse | mouse <Mus> | common name | +10088 | Mus | Mus <genus> | scientific name | +10090 | Balb/c mouse | | includes | +10090 | house mouse | | genbank common name | +10090 | LK3 transgenic mice | | includes | +10090 | mouse | mouse <Mus musculus> | common name | +10090 | Mus musculus Linnaeus, 1758 | | authority | +10090 | Mus musculus | | scientific name | +10090 | Mus sp. 129SV | | includes | +10090 | nude mice | | includes | +10090 | transgenic mice | | includes | +30727 | Cyprinoidea | | synonym | +30727 | Cyprinoidei | | scientific name | +32443 | Teleostei | | scientific name | +32443 | teleost fishes | | genbank common name | +32519 | Ostariophysi | | scientific name | +32523 | Tetrapoda | | scientific name | +32523 | tetrapods | | genbank common name | +32524 | Amniota | | scientific name | +32524 | amniotes | | genbank common name | +32525 | Theria Parker & Haswell, 1897 | | authority | +32525 | Theria | Theria <mammals> | scientific name | +33090 | Chlorobionta Jeffrey, 1982 | | authority | +33090 | Chlorobionta | | synonym | +33090 | Chlorophyta/Embryophyta group | | equivalent name | +33090 | chlorophyte/embryophyte group | | equivalent name | +33090 | Chloroplastida Adl et al. 
2005 | | authority | +33090 | Chloroplastida | | synonym | +33090 | green plants | green plants <blast name> | blast name | +33090 | green plants | green plants <common name> | common name | +33090 | Viridiplantae Cavalier-Smith, 1981 | | authority | +33090 | Viridiplantae | | scientific name | +33154 | Fungi/Metazoa group | | synonym | +33154 | Opisthokonta Cavalier-Smith 1987 | | authority | +33154 | Opisthokonta | | scientific name | +33154 | opisthokonts | | synonym | +33208 | Animalia | | synonym | +33208 | animals | animals <blast name> | blast name | +33208 | animals | animals <genbank common name> | genbank common name | +33208 | metazoans | | common name | +33208 | Metazoa | | scientific name | +33208 | multicellular animals | | common name | +33213 | Bilateria | | scientific name | +33317 | Protostomia | | scientific name | +33340 | Neoptera | | scientific name | +33392 | Endopterygota | | scientific name | +33392 | Holometabola | | synonym | +33511 | deuterostomes | | common name | +33511 | Deuterostomia | | scientific name | +35493 | Streptophyta Bremer, 1985 | | authority | +35493 | Streptophyta | | scientific name | +38820 | Cyperales | | includes | +38820 | Poales | | scientific name | +38820 | Typhales | | includes | +39107 | Murinae | | scientific name | +39107 | Otomyinae | | includes | +40117 | "Nitrospirae" Garrity and Holt 2001 | | authority | +40117 | "Nitrospiraeota" Oren et al. 2015 | | authority | +40117 | Nitrospiraeota | | synonym | +40117 | Nitrospirae | | synonym | +40117 | Nitrospira group | | synonym | +40117 | Nitrospirota corrig. 
Garrity and Holt 2021 | | synonym | +40117 | Nitrospirota | | scientific name | +40117 | Thermodesulfovibrio group | | synonym | +40674 | Mammalia | | scientific name | +40674 | mammals | mammals <blast name> | blast name | +40674 | mammals | mammals <genbank common name> | genbank common name | +41665 | Neopterygii | | scientific name | +41665 | Neopterygi | | synonym | +43733 | Asilomorpha | | synonym | +43733 | Muscomorpha | | scientific name | +43738 | Schizophora | | scientific name | +43741 | Acalyptratae | | scientific name | +43746 | Ephydroidea | | scientific name | +43845 | Drosophilinae | | scientific name | +46877 | Drosophilini | | scientific name | +50557 | Insecta | | scientific name | +50557 | insects | insects <blast name> | blast name | +50557 | insects | insects <genbank common name> | genbank common name | +50557 | true insects | | common name | +58023 | Tracheophyta | | scientific name | +58023 | Tracheophyta Sinnott ex Cavalier-Smith, 1998 | | authority | +58023 | vascular plants | vascular plants <blast name> | blast name | +58023 | vascular plants | vascular plants <common name> | common name | +58024 | seed plants | seed plants <blast name> | blast name | +58024 | seed plants | seed plants <common name> | common name | +58024 | Spermatophyta | | scientific name | +78536 | Euphyllophyta | | scientific name | +78536 | euphyllophytes | | equivalent name | +85512 | Dicondylia | | scientific name | +88770 | Panarthropoda | | scientific name | +89593 | Craniata | Craniata <chordates> | scientific name | +117570 | Teleostomi | | scientific name | +117571 | bony vertebrates | | genbank common name | +117571 | Euteleostomi | | scientific name | +131221 | Charophyta/Embryophyta group | | synonym | +131221 | charophyte/embryophyte group | | equivalent name | +131221 | Streptophytina | | scientific name | +147367 | Ehrhartoideae Jacq.-Fel. 
ex Caro, 1982 | | authority | +147367 | Ehrhartoideae | | synonym | +147367 | Oryzoideae Kunth ex Beilschm., 1833 | | authority | +147367 | Oryzoideae | | scientific name | +147380 | Oryzeae Dumort., 1824 | | authority | +147380 | Oryzeae | | scientific name | +186623 | Actinopteri | | scientific name | +186625 | Clupeocephala | | scientific name | +186626 | Otophysa | | synonym | +186626 | Otophysi | | scientific name | +186627 | Cypriniphysae | | scientific name | +186627 | Cypriniphysi | | synonym | +186634 | Ostarioclupeomorpha | | synonym | +186634 | Otocephala | | synonym | +186634 | Otomorpha | | scientific name | +189778 | "Nitrospirales" Garrity and Holt 2001 | | authority | +189778 | Nitrospirales Garrity and Holt 2022 | | authority | +189778 | Nitrospirales | | scientific name | +189779 | "Nitrospiraceae" Garrity and Holt 2001 | | authority | +189779 | Nitrospiraceae Garrity and Holt 2022 | | authority | +189779 | Nitrospiraceae | | scientific name | +197562 | Pancrustacea | | scientific name | +197563 | Mandibulata | | scientific name | +197563 | mandibulates | | common name | +203693 | "Nitrospira" Garrity and Holt 2001 | | authority | +203693 | Nitrospira | Nitrospira <Nitrospiria> | synonym | +203693 | "Nitrospiria" Cavalier-Smith 2020 | | authority | +203693 | Nitrospiria Garrity and Holt 2022 | | authority | +203693 | "Nitrospiria" Oren et al. 
2015 | | authority | +203693 | Nitrospiria | | scientific name | +207598 | Homininae | | scientific name | +207598 | Homo/Pan/Gorilla group | | synonym | +232347 | Magnoliidae Novak ex Takht., 1967 | | authority | +232347 | Magnoliidae | | scientific name | +232347 | magnoliids | | equivalent name | +314146 | Euarchontoglires | | scientific name | +314147 | Glires | | scientific name | +314147 | Rodents and rabbits | | genbank common name | +314293 | Anthropoidea | | synonym | +314293 | Simiiformes | | scientific name | +314295 | ape | ape <primates> | common name | +314295 | apes | | genbank common name | +314295 | Hominoidea | | scientific name | +337687 | Muroidea | | scientific name | +359160 | BEP clade | | equivalent name | +359160 | BOP clade | | scientific name | +376913 | Haplorrhini | | scientific name | +480117 | Cyclorrhapha | | scientific name | +480118 | Eremoneura | | scientific name | +862507 | Mus | Mus <subgenus> | scientific name | +1206794 | Ecdysozoa | | scientific name | +1338369 | Dipnotetrapodomorpha | | scientific name | +1437010 | Boreoeutheria | | scientific name | +1437010 | Boreotheria | | synonym | +1437183 | Mesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007 | | authority | +1437183 | Mesangiospermae | | scientific name | +1437197 | Petrosaviidae | | scientific name | +1437197 | Petrosaviidae S.W.Graham & W.S.Judd, 2007 | | authority | +1489341 | Osteoglossocephalai | | scientific name | +1648021 | Oryzinae Griseb., 1853 | | authority | +1648021 | Oryzinae | | scientific name | +1963758 | mice and others | | genbank common name | +1963758 | Myomorpha | | scientific name | +1963758 | Sciurognathi | Sciurognathi <Myomorpha> | in-part | +2743709 | Danionidae | | scientific name | +2743711 | Danioninae | | scientific name | +3379134 | Pseudomonadati (Gibbons and Murray 1978) Oren and Goker 2024 | | authority | +3379134 | Pseudomonadati | | scientific name |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_taxonomy/nodes.dmp Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,111 @@ +2 | 131567 | domain | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +1234 | 189779 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant | +2759 | 131567 | domain | | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | | +3193 | 131221 | clade | | 4 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | | +3398 | 58024 | class | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +3432 | 232347 | order | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +3433 | 3432 | family | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +3434 | 3433 | genus | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +3435 | 3434 | species | PA | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant; specified | +4447 | 1437183 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +4479 | 38820 | family | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +4527 | 1648021 | genus | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +4530 | 4527 | species | OS | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant; specified | +4734 | 1437197 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +6072 | 33208 | clade | | 1 | 1 | 1 | 1 | 5 | 0 | 1 | 0 | | +6656 | 88770 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +6960 | 197562 | subphylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7147 | 33392 | order | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7203 | 7147 | suborder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7214 | 43746 | family | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7215 | 46877 | genus | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7496 | 85512 | subclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7711 | 33511 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +7742 | 89593 | clade | | 10 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | | +7776 | 7742 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +7898 | 117571 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 
| 0 | 0 | code compliant | +7952 | 186627 | order | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +7954 | 2743711 | genus | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +7955 | 7954 | species | DR | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +8287 | 117571 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +9347 | 32525 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +9443 | 314146 | order | | 5 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +9526 | 314293 | parvorder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +9604 | 314295 | family | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +9605 | 207598 | genus | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +9606 | 9605 | species | HS | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +9989 | 314147 | order | | 6 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +10066 | 337687 | family | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +10088 | 39107 | genus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +10090 | 862507 | species | MM | 6 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +30727 | 7952 | suborder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +32443 | 41665 | infraclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +32519 | 186634 | subcohort | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +32523 | 1338369 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +32524 | 32523 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +32525 | 40674 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +33090 | 2759 | kingdom | | 4 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +33154 | 2759 | clade | | 4 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | | +33208 | 33154 | kingdom | | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +33213 | 6072 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +33317 | 33213 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +33340 | 7496 | infraclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +33392 | 33340 | cohort | | 1 | 
1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +33511 | 33213 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +35493 | 33090 | phylum | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +38820 | 4734 | order | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +39107 | 10066 | subfamily | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +40117 | 3379134 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +40674 | 32524 | class | | 2 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +41665 | 186623 | subclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +43733 | 7203 | infraorder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +43738 | 480117 | no rank | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +43741 | 43738 | no rank | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +43746 | 43741 | superfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +43845 | 7214 | subfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant | +46877 | 43845 | tribe | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant | +50557 | 6960 | class | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +58023 | 3193 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +58024 | 78536 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +78536 | 58023 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +85512 | 50557 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +88770 | 1206794 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +89593 | 7711 | subphylum | | 10 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | code compliant | +117570 | 7776 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +117571 | 117570 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +131221 | 35493 | subphylum | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant | +147367 | 359160 | subfamily | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +147380 | 147367 | tribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +186623 | 7898 | class | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +186625 | 1489341 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +186626 | 32519 | clade | 
| 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +186627 | 186626 | superorder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +186634 | 186625 | cohort | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +189778 | 203693 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant | +189779 | 189778 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant | +197562 | 197563 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +197563 | 6656 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +203693 | 40117 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant | +207598 | 9604 | subfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +232347 | 1437183 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +314146 | 1437010 | superorder | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +314147 | 314146 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +314293 | 376913 | infraorder | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +314295 | 9526 | superfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +337687 | 1963758 | clade | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +359160 | 4479 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +376913 | 9443 | suborder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +480117 | 480118 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +480118 | 43733 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +862507 | 10088 | subgenus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +1206794 | 33317 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | | +1338369 | 8287 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +1437010 | 9347 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +1437183 | 3398 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +1437197 | 4447 | subclass | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant | +1489341 | 32443 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +1648021 | 147380 | subtribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +1963758 | 9989 | suborder | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +2743709 | 30727 
| family | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +2743711 | 2743709 | subfamily | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +3379134 | 2 | kingdom | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_taxonomy/prot.accession2taxid Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,6 @@ +accession accession.version taxid gi +YP_514675 YP_514675.1 12 3950761 +YP_009047267 YP_009047267.1 20 19893533 +NP_059333 NP_059333.1 28 140539 +YP_003024028 YP_003024028.1 35 4512 +NP_904330 NP_904330.1 39 17708
--- a/test-data/nodes.dmp Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/nodes.dmp Fri Dec 12 11:13:59 2025 +0000 @@ -1,3 +1,101 @@ 1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | -2 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | -3 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +2 | 71 | domain | | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | | +3 | 70 | clade | | 4 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | | +4 | 63 | class | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +5 | 95 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +6 | 51 | family | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +7 | 98 | genus | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +8 | 7 | species | OS | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant; specified | +9 | 96 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +10 | 44 | clade | | 1 | 1 | 1 | 1 | 5 | 0 | 1 | 0 | | +11 | 66 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +12 | 79 | subphylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +13 | 48 | order | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +14 | 13 | suborder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +15 | 58 | family | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +16 | 60 | genus | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +17 | 65 | subclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +18 | 49 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +19 | 67 | clade | | 10 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | | +20 | 19 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +21 | 69 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +22 | 77 | order | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +23 | 101 | genus | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +24 | 23 | species | DR | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +25 | 69 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +26 | 41 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +27 | 82 
| order | | 5 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +28 | 84 | parvorder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +29 | 85 | family | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +30 | 81 | genus | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +31 | 30 | species | HS | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +32 | 83 | order | | 6 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +33 | 86 | family | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +34 | 52 | genus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +35 | 91 | species | MM | 6 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified | +36 | 22 | suborder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +37 | 54 | infraclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +38 | 78 | subcohort | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +39 | 93 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +40 | 39 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +41 | 53 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +42 | 2 | kingdom | | 4 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +43 | 2 | clade | | 4 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | | +44 | 43 | kingdom | | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +45 | 10 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +46 | 45 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +47 | 17 | infraclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +48 | 47 | cohort | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +49 | 45 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +50 | 42 | phylum | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +51 | 9 | order | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +52 | 33 | subfamily | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +53 | 40 | class | | 2 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +54 | 74 | subclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +55 | 14 | infraorder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +56 | 89 | no rank | | 
1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +57 | 56 | no rank | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +58 | 57 | superfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +59 | 15 | subfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant | +60 | 59 | tribe | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant | +61 | 12 | class | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant | +62 | 3 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +63 | 64 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | | +64 | 62 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +65 | 61 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +66 | 92 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +67 | 18 | subphylum | | 10 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | code compliant | +68 | 20 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +69 | 68 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +70 | 50 | subphylum | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant | +71 | 1 | cellular root | CO | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +72 | 87 | subfamily | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +73 | 72 | tribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +74 | 21 | class | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +75 | 97 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +76 | 38 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +77 | 76 | superorder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +78 | 75 | cohort | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +79 | 80 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +80 | 11 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +81 | 29 | subfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +82 | 94 | superorder | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +83 | 82 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +84 | 88 | infraorder | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +85 | 28 | superfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant | +86 | 99 | clade | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | | +87 | 6 | clade | | 4 | 
1 | 1 | 1 | 1 | 1 | 0 | 0 | | +88 | 27 | suborder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +89 | 90 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +90 | 55 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | | +91 | 34 | subgenus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +92 | 46 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | | +93 | 25 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +94 | 26 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +95 | 4 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | +96 | 5 | subclass | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant | +97 | 37 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | | +98 | 73 | subtribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant | +99 | 32 | suborder | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +100 | 36 | family | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant | +101 | 100 | subfamily | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
--- a/test-data/nucleotide.fasta Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/nucleotide.fasta Fri Dec 12 11:13:59 2025 +0000 @@ -1,17 +1,23 @@ ->sequence more text -CTGTGCCTGTACACCCACATCGGCAGAAACATCTACTACGGCAGCTACCTGTACAGCGAG -ACCTGGAACACCGGCATCATGCTGCTGCTGATCACCATGGCCACCGCCTTCATGGGCTAC -GTGCTGCCCTGGGGCCAGATGAGCTTCTGGGGCGCCACCGTGATCACCAACCTGTTCAGC -GCCATCCCCTACATCGGCACCAACCTGGTGGAGTGGATCTGGGGCGGCTTCAGCGTGGAC -AAGGCCACCCTGAACAGATTCTTCGCCTTCCACTTCATCCTGTTCACCATGGTGGCCCTG -GCCGGCGTGCACCTGACCTTCCTGCACGAGACCGGCAGCAACAACCCCCTGGGCCTGACC -AGCGACAGCGACAAGATCCCCTTCCACCCCTACTACACCATCAAGGACTTCCTGGGCCTG -CTGATCCTGATCCTGCTGCTGCTGCTGCTGGCCCTGCTGAGCCCCGACATGCTGGGCGAC -CCCGACAACCACATGCCCGCCGACCCCCTGAACACCCCCCTGCACATCAAGCCCGAGTGG -TACTTCCTGTTCGCCTACGCCATCCTGAGAAGCGTGCCCAACAAGCTGGGCGGCGTGCTG -GCCCTGTTCCTGAGCATCGTGATCCTGGGCCTGATGCCCTTCCTGCACACCAGCAAGCAC -AGAAGCATGATGCTGAGACCCCTGAGCCAGGCCCTGTTCTGGACCCTGACCATGGACCTG -CTGACCCTGACCTGGATCGGCAGCCAGCCCGTGGAGTACCCCTACACCATCATCGGCCAG -ATGGCCAGCATCCTGTACTTCAGCATCATCCTGGCCTTCCTGCCCATCGCCGGCNNNATC -GAGAACTAC - +>NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome +ATGTTCGCCGACCGCTGGCTATTCTCCACGAACCACAAAGATATTGGAACGCTATACCTGTTGTTCGGCG +CATGAGCTGGTGTCCTAGGCACTGCCCTAAGCCTCCTCATTCGTGCTGAACTAGGCCAACCCGGCAACCT +CCTAGGTAATGACCATATTTACAATGTCATCGTCACAGCCCATGCATTCGTAATAATTTTTTTCATGGTC +ATGCCCATAATAATTGGAGGCTTTGGCAACTGACTAGTGCCCCTGATAATTGGCGCCCCTGATATGGCAT +TCCCGCGCATAAATAACATAAGCTTCTGACTCCTCCTCCCCTCCTTCCTCCTATTACTCGCTTCTGCTAC +AGTAGAGGCCGGAGCAGGAACGGGCTGAACAGTCTATCCACCCCTAGCAGGAAACTACTCTCACCCAGGA +GCCTCTGTAGACTTGACAATCTTCTCTCTACACCTAGCAGGCATTTCCTCAATTCTAGGGGCTATCAATT +TCATTACAACAATTATTAATATAAAACCCCCTGCAATATCCCAATATCAAACTCCCCTCTTCGTCTGATC +AATCCTGATCACAGCAGTCCTACTTCTCCTCTCCCTCCCAGTCCTAGCCGCTGGCATCACCATACTACTA +ACAGACCGCAACTTAAATACTACATTCTTTGACCCGGCTGGAGGTGGGGATCCTATCCTATACCAACACT +TATTCTGATTTTTCGGCCACCCTGAAGTCTACATTCTCATCCTACCAGGTTTCGGCATAATCTCCCACAT +CGTAACACACTACTCCGGAAAAAAAGAACCATTTGGGTATATAGGCATAGTCTGAGCCATAGTCTCAATT 
+GGTTTCCTGGGTTTTATCGTATGAGCCCACCACATATTCACAGTAGGGATAGACGTGGACACACGAGCCT +ACTTCACCTCCGCTACCATAATTATTGCCATCCCCACCGGCGTCAAAGTATTTAGCTGACTCGCTACACT +CCACGGAAGCAACACTAAATGATCTGCCGCAATCCTCTGAGCCTTAGGATTCATTTTCCTCTTCACCGTA +GGCGGCTTAACAGGCATCGTACTGGCAAACTCATCACTAGACATCGTATTACACGATACATACTACGTTG +TAGCCCACTTTCACTACGTCTTATCAATAGGAGCTGTATTCGCCATCATAGGAGGCTTCATCCACTGGTT +CCCACTATTCTCAGGCTACACCTTAAACCAGACCTATGCTAAAATTCACTTCATCACCATATTTGTCGGC +GTAAATTTAACCTTCTTCCCGCAACATTTCCTTGGCCTATCAGGTATACCCCGACGCTACTCCGATTACC +CCGACGCATATACCACATGAAATATTTTATCATCCGCAGGCTCATTTATCTCCCTAACAGCAGTTATACT +AATAATTTTCATAATTTGAGAAGCCTTTGCCTCAAAACGAAAAGTCCCAATAATTGAACAACCTTCCACA +AGCCTAGAGTGGTTATACGGATGCCCCCCACCCTACCATACGTTTGAAGAACCCGTCTATATAAAACCCG
--- a/test-data/prot.accession2taxid Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/prot.accession2taxid Fri Dec 12 11:13:59 2025 +0000 @@ -1,4 +1,6 @@ accession accession.version taxid gi -AAD44166 AAD44166.1 2 5524211 -AAD44167 AAD44167.1 3 5524212 - +YP_514675 YP_514675.1 8 3950761 +YP_009047267 YP_009047267.1 16 19893533 +NP_059333 NP_059333.1 24 140539 +YP_003024028 YP_003024028.1 31 4512 +NP_904330 NP_904330.1 35 17708
--- a/test-data/protein.fasta Mon Nov 10 15:12:32 2025 +0000 +++ b/test-data/protein.fasta Fri Dec 12 11:13:59 2025 +0000 @@ -1,9 +1,12 @@ ->sequence more text -LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV -EWIWGGFSVDKATLNRFFAFHFILFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG -LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL -GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX -IENY +>NP_008227.1 cytochrome c oxidase subunit I (mitochondrion) [Pongo pygmaeus] +MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV +MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLLPSFLLLLASATVEAGAGTGWTVYPPLAGNYSHPG +ASVDLTIFSLHLAGISSILGAINFITTIINMKPPAMSQYQTPLFVWSILITAVLLLLSLPVLAAGITMLL +TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTHYSGKKEPFGYMGMVWAMVSI +GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNTKWSAAILWALGFIFLFTV +GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLNQTYAKIHFITMFVG +VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSAGSFISLTAVMLMIFMIWEAFASKRKVPMIEQPST +SLEWLYGCPPPYHTFEEPVYMKP >shuffled sequence that should go to unaligned XLPLILMLLGISPGSFEHTVAGGIWTSLMLFLPGYPGVGFLMLLVITVPALNFKFGFMLL LKPTTNIIKTLVLALTHADDPLSFPWLNYMPPAADFNGLFTNAGATTTLYQIPYEGSFYL
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxon.tsv Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,2 @@ +32523 +7898 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blastdb_p.loc Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,57 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of protein domain databases, using three columns tab separated +# (longer whitespace are TAB characters): +# +# <unique_id>{tab}<database_caption>{tab}<base_name_path> +# +# The captions typically contain spaces and might end with the build date. +# It is important that the actual database name does not have a space in +# it, and that there are only two tabs on each line. +# +# You can download the NCBI provided databases as tar-balls from here: +# ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ +# +# For simplicity, many Galaxy servers are configured to offer just a live +# version of each NCBI BLAST database (updated with the NCBI provided +# Perl scripts or similar). In this case, we recommend using the case +# sensitive base-name of the NCBI BLAST databases as the unique id. +# Consistent naming is important for sharing workflows between Galaxy +# servers. +# +# For example, consider the NCBI Conserved Domains Database (CDD), where +# you have downloaded and decompressed the files under the directory +# /data/blastdb/domains/ meaning at the command line BLAST+ would be +# run as follows and would look at the files /data/blastdb/domains/Cdd.*: +# +# $ rpsblast -db /data/blastdb/domains/Cdd -query ... +# +# In this case use Cdd (title case to match the NCBI file naming) as the +# unique id in the first column of blastdb_d.loc, giving an entry like +# this: +# +# Cdd{tab}NCBI Conserved Domains Database (CDD){tab}/data/blastdb/domains/Cdd +# +# Your blastdb_d.loc file should include an entry per line for each "base name" +# you have stored. 
For example: +# +# Cdd{tab}NCBI CDD{tab}/data/blastdb/domains/Cdd +# Kog{tab}KOG (eukaryotes){tab}/data/blastdb/domains/Kog +# Cog{tab}COG (prokaryotes){tab}/data/blastdb/domains/Cog +# Pfam{tab}Pfam-A{tab}/data/blastdb/domains/Pfam +# Smart{tab}SMART{tab}/data/blastdb/domains/Smart +# Tigr{tab}TIGR{tab}/data/blastdb/domains/Tigr +# Prk{tab}Protein Clusters database{tab}/data/blastdb/domains/Prk +# ...etc... +# +# Alternatively, rather than a "live" mirror of the NCBI databases which +# are updated automatically, for full reproducibility the Galaxy Team +# recommend saving date-stamped copies of the databases. In this case +# your blastdb_d.loc file should include an entry per line for each +# version you have stored. For example: +# +# Cdd_05Jun2010{tab}NCBI CDD 05 Jun 2010{tab}/data/blastdb/domains/05Jun2010/Cdd +# Cdd_15Aug2010{tab}NCBI CDD 15 Aug 2010{tab}/data/blastdb/domains/15Aug2010/Cdd +# ...etc... +# +# See also blastdb.loc which is for any nucleotide BLAST database, and +# blastdb_p.loc which is for any protein BLAST database. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_taxonomy.loc.sample Fri Dec 12 11:13:59 2025 +0000 @@ -0,0 +1,5 @@ +# Tab separated fields where +# value is unique key +# name is descriptive name +# path is path to directory containing names.dmp and nodes.dmp files +#value name path
--- a/tool_data_table_conf.xml.sample Mon Nov 10 15:12:32 2025 +0000 +++ b/tool_data_table_conf.xml.sample Fri Dec 12 11:13:59 2025 +0000 @@ -1,8 +1,19 @@ <!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> <tables> - <!-- Locations of indexes in the Bowtie mapper format --> + + <table name="blastdb_p" comment_char="#" allow_duplicate_entries="False"> + <columns>value, name, path</columns> + <file path="tool-data/blastdb_p.loc" /> + </table> + <table name="diamond_database" comment_char="#"> <columns>value, name, db_path</columns> <file path="tool-data/diamond_database.loc" /> </table> + + <!-- Locations of taxonomy data downloaded from NCBI --> + <table name="ncbi_taxonomy" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/ncbi_taxonomy.loc" /> + </table> </tables>
--- a/tool_data_table_conf.xml.test Mon Nov 10 15:12:32 2025 +0000 +++ b/tool_data_table_conf.xml.test Fri Dec 12 11:13:59 2025 +0000 @@ -1,7 +1,18 @@ <tables> - <!-- Locations of all fasta files required to build Diamond databases --> + <table name="blastdb_p" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/blastdb_p.loc" /> + </table> + <table name="diamond_database" comment_char="#"> <columns>value, name, db_path</columns> <file path="${__HERE__}/test-data/diamond_database.loc" /> </table> + + <!-- Locations of taxonomy data downloaded from NCBI --> + <table name="ncbi_taxonomy" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/ncbi_taxonomy.loc" /> + </table> + </tables>
