# HG changeset patch
# User iuc
# Date 1724245979 0
# Node ID b82ce29791e710c304ea3e5e36c95d5f581522ea
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
diff -r 000000000000 -r b82ce29791e7 blast2tsv.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2tsv.py	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+
+
+# Name: blast2tsv
+# Author(s): Sebastien Theil, Marie Lefebvre - INRAE
+# Aims: Convert blast xml output to tsv and add taxonomy
+
+
+import argparse
+import csv
+import logging as log
+import os
+
+from Bio import Entrez
+from Bio import SeqIO
+from Bio.Blast import NCBIXML
+from ete3 import NCBITaxa
+
+ncbi = NCBITaxa()  # taxonomy handle created at import time; _read_xml uses it for lineage lookups
+
+
+def main():
+    """Parse CLI options, set up logging, then convert the BLAST XML hits to TSV."""
+    cli_args = _set_options()
+    _set_log_level(cli_args.verbosity)
+    _write_tsv(cli_args, _read_xml(cli_args))
+
+
+def _guess_database(accession):
+    """Return the Entrez database name to query, chosen from the accession's format."""
+    # purely numeric identifiers are looked up in the taxonomy database
+    if accession.isdigit():
+        return 'taxonomy'
+    prefix_to_db = {'AC': 'nuccore', 'NC': 'nuccore', 'NG': 'nuccore',
+                    'NT': 'nuccore', 'NW': 'nuccore', 'NZ': 'nuccore',
+                    'AP': 'protein', 'NP': 'protein', 'YP': 'protein',
+                    'XP': 'protein', 'WP': 'protein', 'OX': 'nuccore'}
+    prefix = accession[0:2]
+    if prefix in prefix_to_db:
+        return prefix_to_db[prefix]
+    # unknown two-letter prefix: warn and fall back to the nucleotide database
+    log.warning("DB not found for " + accession + ". Set to nuccore.")
+    return 'nuccore'
+
+
+def _read_xml(options):
+    """
+    Parse the BLAST XML file and return {query_id: hit dict}, or {query_id: [query_length]}
+    when the hit fails the thresholds. Later alignments of a query overwrite earlier ones.
+    """
+    log.info("Read XML file.")
+    xml_results = {}
+    with open(options.xml_file, 'r') as results:  # handle is closed once parsing ends or fails
+        records = NCBIXML.parse(results)
+        for blast_record in records:
+            for aln in blast_record.alignments:
+                hit_count = 1
+                for hit in aln.hsps:
+                    hsp = {}
+                    if hit_count == 1:
+                        first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0  # strand
+                        cumul_hit_identity = hit.identities if hit.identities else 0
+                        cumul_hit_score = hit.bits  # hit score
+                        cumul_hit_evalue = hit.expect  # evalue
+                        cumul_hit_length = hit.align_length if hit.align_length is not None else 0
+                        hit_count = hit_count + 1
+                    else:
+                        # all HSPs in different strand than 1st HSPs will be discarded.
+                        if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
+                            cumul_hit_identity = cumul_hit_identity + hit.identities
+                            cumul_hit_length = cumul_hit_length + hit.align_length
+                            cumul_hit_evalue = cumul_hit_evalue + hit.expect
+                            cumul_hit_score = cumul_hit_score + hit.bits
+                            hit_count = hit_count + 1
+                if hit_count == 1:
+                    final_hit_count = hit_count
+                elif hit_count > 1:
+                    final_hit_count = hit_count - 1  # hit_count runs one ahead of the HSPs kept
+                hsp["evalue"] = cumul_hit_evalue / final_hit_count  # mean e-value of the kept HSPs; smaller is better
+                hsp["query_id"] = blast_record.query  # or query_id
+                hsp["query_length"] = blast_record.query_length  # length of the query
+                hsp["accession"] = aln.accession.replace("ref|", "")
+                hsp["description"] = aln.hit_def
+                hsp["hit_length"] = aln.length  # length of the hit
+                hsp["hsp_length"] = hit.align_length  # alignment length of the last HSP iterated
+                hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
+                if cumul_hit_length == 0:
+                    hsp["percentIdentity"] = round(cumul_hit_identity, 1)  # identity percentage
+                else:
+                    hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1)  # identity percentage
+                hsp["score"] = cumul_hit_score  # The higher the bit-score, the better the sequence similarity
+                hsp["num_hsps"] = final_hit_count
+                hsp["hit_cumul_length"] = cumul_hit_length
+                hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
+                db = _guess_database(hsp["accession"])
+                try:
+                    handle = Entrez.esummary(db=db, id=hsp["accession"])  # NOTE(review): Entrez.email is never set in this file — confirm
+                    taxid = str(int(Entrez.read(handle)[0]['TaxId']))
+                    handle.close()
+                    log.info("Taxid found for " + hsp["accession"])
+                    lineage = ncbi.get_lineage(taxid)
+                    names = ncbi.get_taxid_translator(lineage)
+                    ordered = [names[tid] for tid in lineage]
+                    taxonomy = ordered[1:]  # drop the first lineage entry
+                    hsp["tax_id"] = taxid
+                    hsp["taxonomy"] = ';'.join(taxonomy)
+                    hsp["organism"] = taxonomy[-1]
+                except RuntimeError:
+                    hsp["tax_id"] = ""
+                    hsp["taxonomy"] = ""
+                    hsp["organism"] = ""
+                    log.warning(f"RuntimeError - Taxid not found for {hsp['accession']}")
+                except Exception as err:
+                    hsp["tax_id"] = ""
+                    hsp["taxonomy"] = ""
+                    hsp["organism"] = ""
+                    log.warning(f"Taxid not found for {hsp['accession']}. The error is {err}")
+                if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
+                        hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
+                    xml_results[hsp["query_id"]] = hsp
+                else:
+                    xml_results[hsp["query_id"]] = [hsp["query_length"]]
+
+    return xml_results
+
+
+def _get_overlap_value(algo, hsp, type, qlength):
+    """
+    Set hsp or hit overlap values for hit and query
+    Return array [query_overlap, hit_overlap]
+
+    algo: BLAST flavour; BLASTX, TBLASTN and TBLASTX apply a factor 3 as coded below.
+    hsp: dict read for "hit_length", "hsp_length" and (type other than 'hsp') "hit_cumul_length".
+    type: 'hsp' to use the HSP length, anything else to use the cumulated hit length.
+    qlength: query length; a falsy value leaves the query overlap at 0.
+    Both percentages are capped at 100 and rounded to 0 decimals.
+    """
+    q_align_len = qlength
+    h_align_len = hsp["hsp_length"] if type == 'hsp' else hsp["hit_cumul_length"]
+
+    # Defaults apply when a length is missing or zero, so neither value is ever unbound.
+    query_overlap = 0
+    hit_overlap = 0
+    if algo == 'BLASTX':
+        if q_align_len:
+            # NOTE(review): always 300 before capping — confirm the intended numerator
+            query_overlap = (q_align_len * 3 / q_align_len) * 100
+        if hsp["hit_length"]:
+            hit_overlap = (h_align_len / hsp["hit_length"]) * 100
+    elif algo == 'TBLASTN':
+        if q_align_len:
+            query_overlap = (q_align_len / q_align_len) * 100
+        if hsp["hit_length"]:
+            hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100
+    elif algo == 'TBLASTX':
+        if q_align_len:
+            query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100
+        if hsp["hit_length"]:
+            hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100
+    else:
+        if q_align_len:
+            query_overlap = (q_align_len / q_align_len) * 100
+        if hsp["hit_length"]:
+            hit_overlap = (h_align_len / hsp["hit_length"]) * 100
+    query_overlap = min(query_overlap, 100)  # cap at 100 %
+    hit_overlap = min(hit_overlap, 100)
+
+    return [round(query_overlap, 0), round(hit_overlap, 0)]
+
+
+def _write_tsv(options, hits):
+    """
+    Write output: one TSV line per query, then build the abundance files from that TSV
+    """
+    # first column of the read-count file -> its remaining columns (the read number)
+    if options.rn_file is not None:
+        with open(options.rn_file) as rn:
+            rows = (line.split('\t') for line in rn)
+            rn_list = {row[0]: row[1:] for row in rows}
+    # given a path, SeqIO opens and closes the FASTA file itself
+    fasta = SeqIO.to_dict(SeqIO.parse(options.fasta_file, 'fasta'))
+    headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
+    os.makedirs(options.output, exist_ok=True)
+    tsv_file = options.output + "/blast2tsv_output.tab"
+    log.info("Write output file: " + tsv_file)
+    with open(tsv_file, "w+") as f:
+        f.write(headers)
+        for h in hits:
+            if options.rn_file is not None:
+                read_nb = ''.join(rn_list[h]).replace("\n", "")
+            else:
+                read_nb = ''
+            if len(hits[h]) > 1:
+                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
+                f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
+                f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
+                f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
+                f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
+                f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
+                if h in fasta:
+                    f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
+                else:
+                    f.write(hits[h]["taxonomy"] + "\t\"\"")
+                f.write("\n")
+            else:  # single-element entry [query_length]: only the length is written
+                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
+                f.write("\n")
+    # the TSV is closed (and flushed) before being read back
+    _create_abundance(options, tsv_file)
+
+
+def _create_abundance(options, tsv_file):
+    """
+    extract values from tsv files
+    and create abundance files
+
+    options: parsed CLI options; only options.output (the output directory) is read.
+    tsv_file: path of the TSV to read; its first line is skipped as header.
+
+    Rows with 16 columns and a non-empty taxonomy (column 15) are grouped by taxonomy.
+    Per taxonomy, reads (column 3, empty counts as 0) and contigs are summed and written
+    to blast2tsv_reads.txt and blast2tsv_contigs.txt, with the taxonomy split on ';'
+    into tab-separated columns.
+    """
+    log.info("Calculating abundance.")
+    abundance = dict()
+    with open(tsv_file, 'r') as current_file:
+        log.debug("Reading " + tsv_file)
+        csv_reader = csv.reader(current_file, delimiter='\t')
+        next(csv_reader, None)  # headers
+        for row in csv_reader:
+            # no annotation
+            if len(row) != 16 or row[14] == "":
+                log.debug("No annotations for contig " + row[1])
+                continue
+            nb_reads = row[2]
+            if nb_reads == "":
+                current_reads_nb = 0
+                log.debug("No reads number for " + row[1])
+            else:
+                current_reads_nb = int(nb_reads)
+            taxonomy = row[14]
+            if taxonomy not in abundance:
+                # init reads for this taxo
+                abundance[taxonomy] = {"reads_nb": 0, "contigs_nb": 0}
+            # add reads
+            abundance[taxonomy]["reads_nb"] += current_reads_nb
+            abundance[taxonomy]["contigs_nb"] += 1
+    log.debug(abundance)
+    # both files hold one line per taxonomy: the count, then the lineage levels
+    # reads per taxonomy
+    reads_path = options.output + "/blast2tsv_reads.txt"
+    with open(reads_path, "w+") as reads_file:
+        for taxo in abundance:
+            reads_file.write(str(abundance[taxo]["reads_nb"]))
+            reads_file.write("\t")
+            reads_file.write("\t".join(taxo.split(";")))
+            reads_file.write("\n")
+    log.info("Abundance file created " + reads_path)
+    # contigs per taxonomy
+    contigs_path = options.output + "/blast2tsv_contigs.txt"
+    with open(contigs_path, "w+") as contigs_file:
+        for taxo in abundance:
+            contigs_file.write(str(abundance[taxo]["contigs_nb"]))
+            contigs_file.write("\t")
+            contigs_file.write("\t".join(taxo.split(";")))
+            contigs_file.write("\n")
+    log.info("Abundance file created " + contigs_path)
+
+
+def _set_options():
+    """Declare the blast2tsv command-line interface and return the parsed arguments."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-x', '--xml', dest='xml_file', required=True, help='XML files with results of blast')
+    cli.add_argument('-rn', '--read-count', dest='rn_file', help='Tab-delimited file associating seqID with read number.')
+    cli.add_argument('-c', '--contigs', dest='fasta_file', required=True, help='FASTA file with contigs sequence.')
+    cli.add_argument('-me', '--max_evalue', dest='max_evalue', type=float, default=0.0001, help='Max evalue')
+    cli.add_argument('-qov', '--min_query_overlap', dest='min_qov', type=int, default=5, help='Minimum query overlap')
+    cli.add_argument('-mhov', '--min_hit_overlap', dest='min_hov', type=int, default=5, help='Minimum hit overlap')
+    cli.add_argument('-s', '--min_score', dest='min_score', type=int, default=30, help='Minimum score')
+    cli.add_argument('-a', '--algo', dest='algo', type=str, default='BLASTX', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).')
+    cli.add_argument('-o', '--out', dest='output', type=str, default='./blast2tsv', help='The output file (.csv).')
+    cli.add_argument('-v', '--verbosity', type=int, choices=[1, 2, 3, 4], default=1, help='Verbose level')
+    return cli.parse_args()
+
+
+def _set_log_level(verbosity):
+    """Configure logging: INFO for verbosity 1-2, DEBUG with file:line prefix for 3-4."""
+    if verbosity >= 3:
+        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
+        log.basicConfig(level=log.DEBUG, format=log_format)
+    else:  # levels 1 and 2; every accepted level now configures logging
+        log.basicConfig(level=log.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
+
+
+if __name__ == "__main__":  # run the conversion only when executed as a script
+    main()
diff -r 000000000000 -r b82ce29791e7 macros.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,30 @@
+
+    
+        
+            virannot 
+         
+     
+    
+        
+            biopython 
+            ete3 
+            clustalo 
+            curl 
+            r-base 
+            pyaml 
+            openpyxl 
+            xlsxwriter 
+            xlrd 
+            pandas 
+            krona 
+            zip 
+             
+     
+    
+        
+            10.1094/PBIOMES-07-19-0037-A 
+         
+     
+ 
diff -r 000000000000 -r b82ce29791e7 otu.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/otu.py	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+
+
+# Name: virAnnot_otu
+# Author: Marie Lefebvre - INRAE
+# Requirements: Ete3 toolkit and external apps
+# Aims: Create viral OTUs based on RPS and Blast annotations
+
+
+import argparse
+import csv
+import logging as log
+import os
+import random
+import re
+
+import pandas as pd
+import xlsxwriter
+from Bio import SeqIO
+from Bio.Align.Applications import ClustalOmegaCommandline
+from ete3 import NodeStyle, SeqGroup, SeqMotifFace, Tree, TreeStyle
+
+
+def main():
+    """
+    1 - retrieve info (sequence, query_id, taxo) from RPS file
+    2 - align protein sequences of the same domain, calculate
+    matrix of distances, generate trees
+    3 - get statistics (read number) per otu
+    4 - create HTML report
+    """
+    cli_args = _set_options()
+    _set_log_level(cli_args.verbosity)
+    hits = _cut_sequence(cli_args)
+    # steps 2 to 4 all take (options, hits) and run in this order
+    for step in (_align_sequences, _get_stats, _create_html):
+        step(cli_args, hits)
+
+
+def _cut_sequence(options):
+    """
+    Retrieve viral hits and sequences from RPS files
+
+    Returns {cdd_id: {query_id: hit dict, "short_description": str, "full_description": str}}
+    where each hit dict holds the cut nucleotide sequence, its translation, the description
+    and, when available, the read number ("nb") and taxonomy taken from the blast file.
+    The RPS, FASTA and blast lists are sorted and then paired by position.
+    """
+    log.info("Cut sequences")
+    collection = {}
+    # sort once up front; entry i of each list is assumed to describe the same sample
+    options.rps.sort()
+    options.fasta.sort()
+    if options.blast is not None:
+        options.blast.sort()
+    # i keeps track of the rps file index to use the corresponding fasta file
+    for i, rps_file in enumerate(options.rps):
+        log.debug("Reading rps file " + str(rps_file))
+        with open(rps_file[0], 'r') as rps_current_file:
+            rps_reader = csv.reader(rps_current_file, delimiter='\t')
+            next(rps_reader, None)  # headers
+            for row in rps_reader:
+                if row[1] == "no_hit":
+                    continue
+                query_id = row[0]
+                cdd_id = row[2]
+                startQ = int(row[5])
+                endQ = int(row[6])
+                frame = float(row[7])
+                description = row[8]
+                superkingdom = row[9]
+                # if contig is viral then retrieve sequence
+                if not re.search("Viruses", superkingdom):
+                    continue
+                seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)
+                if seq is None:
+                    # contig missing from the FASTA file: skip it instead of crashing on None
+                    log.warning("No sequence found for " + query_id + ", contig skipped")
+                    continue
+                # a slice end beyond the sequence length simply stops at the last base
+                seq = seq[startQ - 1:endQ]
+                if frame < 0:
+                    seq = seq.reverse_complement()
+                prot = seq.translate()
+                if len(prot) < options.min_protein_length:
+                    continue
+                # long enough: register the hit under its domain
+                log.debug("Add " + query_id + " to collection")
+                if cdd_id not in collection:
+                    collection[cdd_id] = {}
+                hit = {"nuccleotide": seq, "protein": prot, "full_description": description}
+                collection[cdd_id][query_id] = hit
+                if options.blast is not None:
+                    with open(options.blast[i][0], 'r') as blast_current_file:
+                        blast_reader = csv.reader(blast_current_file, delimiter='\t')
+                        for b_query in blast_reader:
+                            if b_query[1] == query_id:
+                                hit["nb"] = b_query[2]
+                                # taxonomy is read from column 15 when the row has more than 10 columns
+                                hit["taxonomy"] = b_query[14] if len(b_query) > 10 else "Unknown"
+                            else:
+                                # defaults until (or unless) a matching row is met
+                                hit.setdefault("nb", 0)
+                                hit.setdefault("taxonomy", "Unknown")
+                else:
+                    log.info("No blast file")
+                    hit["taxonomy"] = "Unknown"
+                    hit["nb"] = 0
+
+                # keep pfamXXX and RdRp 1; a description without comma is kept whole
+                collection[cdd_id]["short_description"] = "".join(description.split(",")[:2])
+                collection[cdd_id]["full_description"] = description
+    return collection
+
+
+def _retrieve_fasta_seq(fasta_file, query_id):
+    """
+    From fasta file retrieve specific sequence with id
+    Returns the record's sequence, or None (after a warning) when the id is absent.
+    """
+    # given a path, SeqIO opens and closes the file itself
+    contigs_list = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
+    if query_id not in contigs_list:
+        log.warning("KeyError for " + query_id + " file " + fasta_file)
+        return None
+    return contigs_list[query_id].seq
+
+
+def _create_tree(tree, fasta, out, color):
+    """
+    Create phylogenic tree from multiple alignments
+
+    tree: tree file loaded with ete3 Tree; nothing is rendered when it cannot be opened.
+    fasta: FASTA file loaded with ete3 SeqGroup; its sequences are drawn next to the leaves.
+    out: path handed to Tree.render.
+    color: tab-delimited file giving a background color per leaf name.
+    """
+    try:
+        # only checks that the tree file can be opened; Tree() parses it below
+        with open(tree, 'r'):
+            pass
+    except IOError:
+        log.info("Unknown file: " + tree + ". You may have less than 2 sequences to align.")
+        return
+
+    seqs = SeqGroup(fasta, format="fasta")
+    t = Tree(tree)
+    ts = TreeStyle()
+    ts.show_branch_length = True
+    colors = _parse_color_file(color)
+    # attach the aligned sequence to every leaf and color the leaves listed in the color file
+    for name in t.get_leaf_names():
+        seqFace = SeqMotifFace(seqs.get_seq(name), seq_format="()")
+        for leaf in t.get_leaves_by_name(name):
+            if name in colors:
+                ns = NodeStyle()
+                ns['bgcolor'] = colors[name]
+                leaf.set_style(ns)
+            leaf.add_face(seqFace, 0, 'aligned')
+
+    t.render(out, tree_style=ts)
+
+
+def _parse_color_file(file):
+    """
+    Read a tab-delimited file and return {first column: second column}
+    """
+    with open(file) as fh:
+        # a name listed twice keeps its last color
+        colors = {row[0]: row[1] for row in csv.reader(fh, delimiter="\t")}
+
+    return colors
+
+
+def _align_sequences(options, hits_collection):
+    """
+    Align hit sequences with pfam reference
+
+    For every cdd_id a sub-directory of options.output receives the protein FASTA, the
+    per-contig colors, the Clustal Omega alignment and guide tree, the tree image and the
+    identity matrix; seek_otu.R is given the matrix. A single contig becomes OTU_1 directly.
+    """
+    import shutil
+    import subprocess
+
+    log.info("Align sequences")
+    if not os.path.exists(options.output):
+        os.mkdir(options.output)
+    color_by_sample = {}
+    for cdd_id in hits_collection:
+        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
+        if not os.path.exists(cdd_output):
+            os.mkdir(cdd_output)
+        file_seq_to_align = cdd_output + "/seq_to_align.fasta"
+        file_color_config = cdd_output + "/color_config.txt"
+        log.info("Writing to " + file_seq_to_align)
+        count = 0  # count number of contig per domain
+        # "w" truncates any FASTA left by a previous run
+        with open(file_seq_to_align, "w") as f, open(file_color_config, "w+") as f_c:
+            for query_id in hits_collection[cdd_id]:
+                if query_id in ["short_description", "full_description"]:
+                    continue
+                sample = query_id.split("_")[0]  # get sample from SAMPLE_IdCONTIG
+                # same color for each contig of the same sample
+                if sample not in color_by_sample:
+                    color_by_sample[sample] = "#" + ''.join(random.choice('ABCDEF0123456789') for _ in range(6))
+                f.write(">" + query_id + "\n")
+                f.write(str(hits_collection[cdd_id][query_id]["protein"]) + "\n")
+                f_c.write(query_id + '\t' + color_by_sample[sample] + '\n')
+                count += 1
+        file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
+        tree_file = cdd_output + '/tree.dnd'
+        file_cluster = cdd_output + '/otu_cluster.csv'
+        # create alignment for domain with more than 1 contigs
+        if count > 1:
+            log.info("Run clustal omega...")
+            clustalo_cmd = ClustalOmegaCommandline("clustalo", infile=file_seq_to_align, outfile=file_seq_aligned,
+                                                   guidetree_out=tree_file, seqtype="protein", force=True)
+            log.debug(clustalo_cmd)
+            stdout, stderr = clustalo_cmd()
+            log.debug(stdout + stderr)
+
+            # create tree plot with colors
+            file_matrix = cdd_output + "/identity_matrix.csv"
+            log.info("Create tree...")
+            _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config)
+            _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id)
+            log.info("Retrieve OTUs...")
+            # argument list instead of a shell string: paths with spaces stay intact
+            otu_cmd = [os.path.join(options.tool_path, 'seek_otu.R'), file_matrix, file_cluster, str(options.perc)]
+            log.debug(otu_cmd)
+            subprocess.call(otu_cmd)
+        # only one contig
+        else:
+            log.debug("Copy " + file_seq_to_align + " to " + file_seq_aligned)
+            shutil.copyfile(file_seq_to_align, file_seq_aligned)
+
+            with open(file_cluster, "w+") as f:
+                f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',')
+
+
+def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id):
+    """
+    Calculate pairwise identity (percent) between aligned protein sequences
+    from a cdd_id
+
+    options: unused here, kept for the call signature.
+    file_seq_aligned: FASTA alignment; every pair of records is compared column by column.
+    file_matrix: output file, one line per record: id, then the identity percentages.
+    cdd_id: only used in the log message.
+
+    Columns holding 'X' or a gap in either sequence are ignored; pairs with fewer than 20
+    compared columns get an identity of 0.
+    """
+    log.info("Compute pairwise distance of " + cdd_id)
+    # parse the alignment once instead of once per record
+    records = list(SeqIO.parse(file_seq_aligned, "fasta"))
+    matrix = {}
+    for k1 in records:
+        row = []
+        for k2 in records:
+            identic = 0
+            compared = 0
+            for keep_pos, base in enumerate(k1):
+                base2 = k2[keep_pos]
+                # unknown residue or gap in either sequence: position not compared
+                if base in ('X', '-') or base2 in ('X', '-'):
+                    continue
+                # identity
+                if base == base2:
+                    identic += 1
+                compared += 1
+            # set minimum overlap to 20
+            if compared < 20:
+                percentIdentity = 0
+            else:
+                # share of identical residues among the compared columns
+                percentIdentity = (identic / compared) * 100
+            row.append(percentIdentity)
+        matrix[k1.id] = row
+    log.debug("Write " + file_matrix)
+    # one matrix line per record id, followed by its identities
+    with open(file_matrix, "w+") as f:
+        for seq_id, identities in matrix.items():
+            f.write(seq_id + ',' + ', '.join(map(str, identities)) + "\n")
+
+
+def _get_stats(options, hits_collection):
+    """
+    Retrieve annotation and number of reads for each OTU; write them to
+    otu_stats.xlsx (one sheet per domain) and to one TSV per domain directory
+    """
+    file_xlsx = options.output + '/otu_stats.xlsx'  # workbook path inside the output directory
+    workbook = xlsxwriter.Workbook(file_xlsx)
+    log.info("Writing stats to " + file_xlsx)
+    for cdd_id in hits_collection:
+        otu_collection = {}
+        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
+        worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # NOTE(review): xlsxwriter restricts sheet names — confirm short_description always complies
+        file_cluster = cdd_output + '/otu_cluster.csv'
+        with open(file_cluster, 'r') as clust:
+            otu_reader = csv.reader(clust, delimiter=',')
+            samples_list = []
+            for row in otu_reader:
+                contigs_list = row[2:len(row) - 1]  # remove last empty column
+                otu_collection[row[0]] = {}  # key -> otu number
+                otu_collection[row[0]]['contigs_list'] = contigs_list
+                for contig in contigs_list:
+                    sample = contig.split('_')[0]
+                    samples_list.append(sample) if sample not in samples_list else samples_list  # first-seen order, no duplicates
+                    if sample not in otu_collection[row[0]]:  # both branches fill the same fields; this one also creates the sample dict
+                        otu_collection[row[0]][sample] = {}
+                        otu_collection[row[0]][sample][contig] = {}
+                        # add read number of the contig and annotation
+                        if 'nb' in hits_collection[cdd_id][contig]:
+                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                        else:
+                            otu_collection[row[0]][sample][contig]['nb'] = 0
+                        if 'taxonomy' in hits_collection[cdd_id][contig]:
+                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
+                        else:
+                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
+                    else:
+                        otu_collection[row[0]][sample][contig] = {}
+                        # add read number of the contig and annotation
+                        if 'nb' in hits_collection[cdd_id][contig]:
+                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                        else:
+                            otu_collection[row[0]][sample][contig]['nb'] = 0
+                        if 'taxonomy' in hits_collection[cdd_id][contig]:
+                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
+                        else:
+                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
+                    if 'taxonomy' in hits_collection[cdd_id][contig]:
+                        otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]  # overwritten per contig: the last one wins
+                    else:
+                        otu_collection[row[0]]['global_taxonomy'] = 'unknown'
+
+        # calculate total number of reads for each sample of each OTU
+        for otu in otu_collection:
+            for sample in otu_collection[otu]:
+                if sample not in ['contigs_list', 'global_taxonomy']:
+                    total_nb_read = 0
+                    for contig in otu_collection[otu][sample]:
+                        total_nb_read += int(otu_collection[otu][sample][contig]['nb'])  # NOTE(review): int() fails on an empty 'nb' string — confirm read counts are always present
+                    otu_collection[otu][sample]['total_nb_read'] = total_nb_read
+        row = 0
+        column = 0
+        item = '#OTU_name'
+        worksheet.write(row, column, item)
+        for samp in samples_list:
+            column += 1
+            worksheet.write(row, column, samp)
+        worksheet.write(row, column + 1, 'taxonomy')
+        worksheet.write(row, column + 2, 'contigs_list')
+        row = 1
+        # column = 0
+        for otu in otu_collection:
+            if isinstance(otu_collection[otu], dict):
+                column = 0
+                worksheet.write(row, column, otu)
+                # prepare table with 0 in each cells
+                for sample in otu_collection[otu]:  # repeated per key; every pass writes the same zeros
+                    column = 1
+                    for samp in samples_list:
+                        worksheet.write(row, column, 0)
+                        column += 1
+                # fill in table with nb of read for each sample and each OTU
+                for sample in otu_collection[otu]:
+                    column = 1
+                    for samp in samples_list:
+                        if samp == sample:
+                            worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
+                        column += 1
+                worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))  # ';' replaced by spaces in the taxonomy cell
+                worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
+                row += 1
+    workbook.close()
+    read_file = pd.ExcelFile(file_xlsx)  # re-read the workbook to export each sheet as TSV
+    for sheet in read_file.sheet_names:
+        cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab"  # assumes the sheet name equals short_description, i.e. the domain directory
+        data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None)
+        data_xls.to_csv(cluster_nb_reads_file, encoding='utf-8', index=False, sep='\t')
+
+
+def _create_html(options, hits_collection):
+    """
+    Create HTML file with all results
+
+    Writes options.output/map.txt (one line of relative result paths per cdd_id) and
+    hands it to rps2tree_html.py found in options.tool_path.
+    """
+    import subprocess
+
+    # create mapping file with all informations to use to create HTML report
+    map_file_path = options.output + "/map.txt"
+    headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
+    with open(map_file_path, "w+") as map_file:  # "w+" truncates an existing map file
+        map_file.write("\t".join(headers))
+        for cdd_id in hits_collection:
+            cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
+            columns = [cdd_id,
+                       cdd_output + '/seq_aligned.final_tree.fa',
+                       cdd_output + '/tree.dnd.png',
+                       cdd_output + '/otu_cluster.csv',
+                       cdd_output + "/cluster_nb_reads_files.tab",
+                       cdd_output + "/identity_matrix.csv",
+                       cdd_output, hits_collection[cdd_id]["full_description"]]
+            map_file.write("\t".join(columns) + "\n")
+    log.info("Writing HTML report")
+    html_cmd = [os.path.join(options.tool_path, 'rps2tree_html.py'), '-m', map_file_path, '-o', options.output]
+    log.debug(html_cmd)
+    subprocess.call(html_cmd)  # argument list: paths with spaces reach the script intact
+
+
+def _set_options():
+    """Declare the OTU command-line interface and return the parsed arguments."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-b', '--blast', dest='blast', action='append', nargs='+', help='TAB blast file from blast2ecsv module.')
+    cli.add_argument('-r', '--rps', dest='rps', action='append', nargs='+', required=True, help='TAB rpsblast file from rps2ecsv module.')
+    cli.add_argument('-f', '--fasta', dest='fasta', action='append', nargs='+', required=True, help='FASTA file with contigs')
+    cli.add_argument('-p', '--percentage', dest='perc', type=int, default=90, help='Percentage similarity threshold for OTUs cutoff.')
+    cli.add_argument('-vp', '--viral_portion', dest='viral_portion', type=float, default=0.3, help='Minimun portion of viral sequences in RPS domain to be included.')
+    cli.add_argument('-mpl', '--min_protein_length', dest='min_protein_length', type=int, default=100, help='Minimum query protein length.')
+    cli.add_argument('-tp', '--tool_path', dest='tool_path', type=str, default='./', help='Path to otu_seek.R')
+    cli.add_argument('-o', '--out', dest='output', type=str, default='./Rps2tree_OTU', help='The output directory')
+    cli.add_argument('-rgb', '--rgb-conf', dest='file_rgb', type=str, default='rgb.txt', help='Color palette for contigs coloration')
+    cli.add_argument('-v', '--verbosity', type=int, choices=[1, 2, 3, 4], default=1, help='Verbose level')
+    return cli.parse_args()
+
+
+def _set_log_level(verbosity):
+    """Configure logging: INFO for verbosity 1-2, DEBUG with file:line prefix for 3-4."""
+    if verbosity >= 3:
+        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
+        log.basicConfig(level=log.DEBUG, format=log_format)
+    else:  # levels 1 and 2; every accepted level now configures logging
+        log.basicConfig(level=log.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
+
+
+if __name__ == "__main__":  # run the OTU pipeline only when executed as a script
+    main()
diff -r 000000000000 -r b82ce29791e7 rps2tree_html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rps2tree_html.py	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import logging
+import sys
+
+
+# Configure the root logger at INFO level and create this module's logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """
+    Render the report and write it to <out>/index.html
+
+    Parses the command line, loads the data returned by
+    _read_map_file(options.map), renders it with _print_html and saves the
+    resulting string as index.html inside options.out.
+    """
+    options = _set_options()
+    data, headers = _read_map_file(options.map)
+    html = _print_html(data, headers, options.out)
+    index_file = options.out + '/index.html'
+    # The context manager closes the handle even if the write fails
+    with open(index_file, mode='w') as fh:
+        fh.write(html)
+
+
+def _get_google_script_headers(data, headers, out_dir):
+    """
+    Return the text fragment that _print_html appends to the page
+
+    The arguments are not used; the result is always the same string made
+    of two newline-terminated pieces.
+    """
+    # NOTE(review): both literals are empty in this copy, so only the two
+    # newlines are emitted - confirm that no markup is missing here.
+    fragments = ['' + "\n", '' + "\n"]
+    return ''.join(fragments)
+
+
+def _get_google_js(data, headers, out_dir):
+    """
+    Build one Google Visualization table script per entry of data
+
+    For every entry, the tab-separated file named by its
+    'cluster_nb_reads_files' key is read from out_dir with _parse_csv and
+    turned into JavaScript that fills a DataTable and draws it in the
+    element '<cdd_id>_<description>_div' (dashes replaced by underscores).
+
+    The first column and the last two columns of each row are written as
+    quoted strings; every column in between is copied verbatim.
+    Cells that are None or missing from a row are emitted as the
+    JavaScript literal null instead of raising TypeError or KeyError, and
+    backslashes and single quotes in quoted values are escaped so the
+    generated script stays valid. The headers argument is not used.
+
+    Returns a tuple (chart_names, java_scripts) of two parallel lists.
+    """
+    def _js_string(value):
+        # Render a cell as a single-quoted JavaScript string literal
+        if value is None:
+            return 'null'
+        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"
+
+    def _js_number(value):
+        # Unquoted cells are copied exactly as they appear in the file
+        return 'null' if value is None else value
+
+    string_columns = ('#OTU_name', 'taxonomy', 'contigs_list', 'seq_list')
+    java_scripts = []
+    chart_names = []
+    for cdd in data:
+        chart_name = cdd['cdd_id'] + '_' + cdd['description']
+        chart_names.append(chart_name)
+        mat, head = _parse_csv(out_dir + '/' + cdd['cluster_nb_reads_files'])
+        parts = ['var data = new google.visualization.DataTable();' + "\n"]
+        for el in head:
+            col_type = 'string' if el in string_columns else 'number'
+            parts.append('data.addColumn(\'' + col_type + '\', ' + _js_string(el) + ');' + "\n")
+        parts.append('data.addRows([' + "\n")
+        rows = []
+        for row in mat:
+            cells = [_js_string(row.get(head[0]))]
+            cells.extend(_js_number(row.get(head[i])) for i in range(1, len(head) - 2))
+            cells.append(_js_string(row.get(head[len(head) - 2])))
+            cells.append(_js_string(row.get(head[len(head) - 1])))
+            rows.append('[' + ','.join(cells) + ']')
+        if rows:
+            # Rows are separated by a comma and each one ends its own line
+            parts.append(",\n".join(rows) + "\n")
+        parts.append(']);' + "\n")
+        parts.append('var table = new google.visualization.Table(document.getElementById(\'' + chart_name.replace('-', '_') + '_div' + '\'));' + "\n")
+        parts.append('table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});' + "\n")
+        java_scripts.append(''.join(parts))
+    return chart_names, java_scripts
+
+
+def _parse_csv(file):
+    """
+    Read a tab-separated file into a list of row dictionaries
+
+    The first line supplies the column names. Each following line becomes
+    a dictionary keyed by column name; empty cells and the literal 'null'
+    are stored as None. Cells beyond the last header column are ignored.
+
+    Returns a tuple (matrix, headers); both are empty lists when the file
+    contains no lines at all.
+    """
+    # newline='' lets the csv module handle line endings itself, and the
+    # context manager closes the handle, which was previously left open
+    with open(file, newline='') as fh:
+        rows = list(csv.reader(fh, delimiter="\t"))
+    if not rows:
+        return [], []
+    headers = rows[0]
+    matrix = []
+    for row in rows[1:]:
+        # zip stops at the shorter side, so a long row cannot index past headers
+        matrix.append({name: (None if cell in ('', 'null') else cell)
+                       for name, cell in zip(headers, row)})
+    return matrix, headers
+
+
+def _print_html(data, headers, out_dir):
+    """
+    Assemble the text of the report page and return it as one string
+
+    Concatenates fixed page fragments with the output of
+    _get_google_script_headers and _print_data, in that order.
+    """
+    # NOTE(review): most literals below are empty and two of them are split
+    # across lines, which looks like markup was lost from this copy - confirm
+    # against the original script before relying on this output.
+    html = '' + "\n"
+    html += '
' + "\n"
+    html += '' + 'rps2tree' + ' '
+    html += _get_google_script_headers(data, headers, out_dir)
+    html += '' + "\n"
+    html += '' + "\n"
+    html += '
rps2tree ' + "\n"
+    html += '' + "\n"
+    html += _print_data(data, headers)
+    html += '' + "\n"
+    html += '' + "\n"
+    html += '' + "\n"
+    return html
+
+
+def _print_data(data, headers):
+    html = ''
+    for cdd in data:
+        html += '' + cdd['cdd_id'] + ' ' + cdd['description'] + ' ' + "\n"
+        html += '' + cdd['full_description'] + '' + '
' + "\n"
+        html += '
' + "\n"
+        html += '' + "\n"
+        html += '' + "\n"
+        html += '' + cdd['align_files'] + ' ' + "\n"
+        html += '' + "\n"
+        html += '' + cdd['cluster_files'] + ' ' + "\n"
+        html += '' + "\n"
+        html += '' + cdd['cluster_nb_reads_files'] + ' ' + "\n"
+        html += '' + "\n"
+        html += '' + cdd['pairwise_files'] + ' ' + "\n"
+        html += '' + "\n"
+        html += '' + "\n"
+        html += '
+  rpstblastn 
+  RPSTBLASTN 2.14.1+ 
+  Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402. 
+  /data/db/databases/blast/2018-01-22/pfam/Pfam 
+  Query_1 
+  NODE_1_length_506_cov_10.687361 
+  506 
+  
+    
+      BLOSUM62 
+      0.0001 
+      11 
+      1 
+      F 
+     
+   
+
+
+  1 
+  Query_1 
+  NODE_1_length_506_cov_10.687361 
+  506 
+
+ 
+  
+    
+      16305 
+      2821655 
+      80 
+      133518440 
+      0.083353464 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  2 
+  Query_2 
+  NODE_2_length_429_cov_3.631016 
+  429 
+
+ 
+  
+    
+      16305 
+      2821655 
+      78 
+      100741225 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  3 
+  Query_3 
+  NODE_3_length_365_cov_1.074194 
+  365 
+
+ 
+  
+    
+      16305 
+      2821655 
+      75 
+      73543880 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  4 
+  Query_4 
+  NODE_4_length_351_cov_1.547297 
+  351 
+
+ 
+  
+    
+      16305 
+      2821655 
+      74 
+      69448655 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  5 
+  Query_5 
+  NODE_5_length_344_cov_3.273356 
+  344 
+
+ 
+  
+    
+      16305 
+      2821655 
+      72 
+      69203190 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  6 
+  Query_6 
+  NODE_6_length_338_cov_1.314488 
+  338 
+
+ 
+  
+    
+      16305 
+      2821655 
+      70 
+      70572810 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  7 
+  Query_7 
+  NODE_7_length_335_cov_1.714286 
+  335 
+
+ 
+  
+    
+      16305 
+      2821655 
+      70 
+      68892505 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  8 
+  Query_8 
+  NODE_8_length_331_cov_0.862319 
+  331 
+
+ 
+  
+    
+      16305 
+      2821655 
+      69 
+      69561010 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  9 
+  Query_9 
+  NODE_9_length_324_cov_2.141264 
+  324 
+
+ 
+  
+    
+      16305 
+      2821655 
+      67 
+      70898020 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  10 
+  Query_10 
+  NODE_10_length_324_cov_1.371747 
+  324 
+
+ 
+  
+    
+      16305 
+      2821655 
+      67 
+      70898020 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  11 
+  Query_11 
+  NODE_11_length_317_cov_1.125954 
+  317 
+
+ 
+  
+    
+      16305 
+      2821655 
+      65 
+      70473200 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  12 
+  Query_12 
+  NODE_12_length_311_cov_1.535156 
+  311 
+
+ 
+  
+    
+      16305 
+      2821655 
+      64 
+      69347265 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  13 
+  Query_13 
+  NODE_13_length_295_cov_0.945833 
+  295 
+
+
+  1 
+  gnl|CDD|316155 
+  pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4. 
+  316155 
+  184 
+  
+    
+      1 
+      38.6378 
+      91 
+      2.277e-05 
+      159 
+      269 
+      128 
+      168 
+      3 
+      0 
+      17 
+      22 
+      4 
+      41 
+      FIKQYGLPFNPVIAPEDAELTDEQIQSYINTA---NS-FFN 
+      FAKKYNLPIKPVIKPEDGDLPDIMTEAYTEEGILVNSGEFD 
+      F K+Y LP  PVI PED +L D   ++Y       NS  F+ 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      60 
+      70047490 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  14 
+  Query_14 
+  NODE_14_length_294_cov_1.891213 
+  294 
+
+ 
+  
+    
+      16305 
+      2821655 
+      60 
+      70047490 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  15 
+  Query_15 
+  NODE_15_length_280_cov_1.413333 
+  280 
+
+ 
+  
+    
+      16305 
+      2821655 
+      56 
+      70617275 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  16 
+  Query_16 
+  NODE_16_length_278_cov_0.901345 
+  278 
+
+
+  1 
+  gnl|CDD|306845 
+  pfam00421, PSII, Photosystem II protein.   
+  306845 
+  500 
+  
+    
+      1 
+      132.634 
+      334 
+      7.65615e-39 
+      34 
+      270 
+      388 
+      466 
+      1 
+      0 
+      52 
+      61 
+      0 
+      79 
+      SRYSVEQVGVTVEFYGGELNGVSYSDPATVKKYARRAQLGEIFELDRATLKSDGVFRSSPRGWFTFGHASFALLFFQTH 
+      SGYSLEQTGVTVQFYGGELNGQTFTDPWQVKRYARHAQLGELNSVDRVTTESDGVFRVSPRGWLAFSHFCFALLFFFGH 
+      S YS+EQ GVTV+FYGGELNG +++DP  VK+YAR AQLGE+  +DR T +SDGVFR SPRGW  F H  FALLFF  H 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      55 
+      71220560 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  17 
+  Query_17 
+  NODE_17_length_277_cov_1.540541 
+  277 
+
+ 
+  
+    
+      16305 
+      2821655 
+      55 
+      71220560 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  18 
+  Query_18 
+  NODE_18_length_274_cov_3.872146 
+  274 
+
+ 
+  
+    
+      16305 
+      2821655 
+      55 
+      69295680 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  19 
+  Query_19 
+  NODE_19_length_271_cov_0.879630 
+  271 
+
+
+  1 
+  gnl|CDD|306845 
+  pfam00421, PSII, Photosystem II protein.   
+  306845 
+  500 
+  
+    
+      1 
+      56.3644 
+      136 
+      1.69015e-11 
+      82 
+      252 
+      165 
+      224 
+      1 
+      0 
+      23 
+      34 
+      23 
+      70 
+      ITNLTLNPSVIFGYL-------------LKSPFGGEGWIVSVDDLEDIIGGHVWLGSICILGGIWHILTK 
+      VSDPTLDPGVIYGYTGHVQPVAPVWGAEGFSPFGPGG----------IVGHHIAAGILGIIGGIFHITTR 
+      +++ TL+P VI+GY                SPFG  G          I+G H+  G + I+GGI+HI T+ 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      54 
+      69882660 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  20 
+  Query_20 
+  NODE_20_length_267_cov_1.429245 
+  267 
+
+
+  1 
+  gnl|CDD|287774 
+  pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function. 
+  287774 
+  70 
+  
+    
+      1 
+      48.4966 
+      115 
+      7.70073e-10 
+      165 
+      233 
+      22 
+      44 
+      -2 
+      0 
+      21 
+      22 
+      0 
+      23 
+      GAIQVRSNVDPTFYSLVGSGRSG 
+      GAIQVRSHVDLTFYSLVGSGRSG 
+      GAIQVRS+VD TFYSLVGSGRSG 
+     
+    
+      2 
+      43.1038 
+      101 
+      7.90575e-08 
+      88 
+      222 
+      22 
+      70 
+      -1 
+      0 
+      23 
+      27 
+      4 
+      49 
+      GEIQCRSN----FLFTRGIRAVREGPPWLLSSRESIHPLSVYGQLSLEH 
+      GAIQVRSHVDLTFYSLVGSGRSGGGPPALLFSREHIHLISVWGAISLAH 
+      G IQ RS+    F    G      GPP LL SRE IH +SV+G +SL H 
+     
+    
+      3 
+      42.7186 
+      100 
+      1.07047e-07 
+      86 
+      175 
+      41 
+      70 
+      -3 
+      0 
+      17 
+      18 
+      0 
+      30 
+      GGPGGTTMAPLFSRIHTSLISIWTAISRAQ 
+      GRSGGGPPALLFSREHIHLISVWGAISLAH 
+      G  GG   A LFSR H  LIS+W AIS A  
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      53 
+      70469640 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  21 
+  Query_21 
+  NODE_21_length_263_cov_1.177885 
+  263 
+
+ 
+  
+    
+      16305 
+      2821655 
+      52 
+      69082825 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  22 
+  Query_22 
+  NODE_22_length_262_cov_1.053140 
+  262 
+
+
+  1 
+  gnl|CDD|306604 
+  pfam00124, Photo_RC, Photosynthetic reaction centre protein.   
+  306604 
+  258 
+  
+    
+      1 
+      99.6256 
+      249 
+      4.94039e-28 
+      31 
+      228 
+      77 
+      142 
+      1 
+      0 
+      27 
+      41 
+      0 
+      66 
+      SVACYMGREWEVSFRLGMRPWIAVAYSAPVAAATAVFLIYPIGQGSFSDGMPLGISGTFNFMIVFQ 
+      AFISWWLREYEIARKLGMGPHIAWAFSAAIAAYLSLGLIRPILMGSWSEGFPLGIFPHLDWTSNFS 
+      +   +  RE+E++ +LGM P IA A+SA +AA  ++ LI PI  GS+S+G PLGI    ++   F  
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      52 
+      69082825 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  23 
+  Query_23 
+  NODE_23_length_260_cov_1.590244 
+  260 
+
+ 
+  
+    
+      16305 
+      2821655 
+      51 
+      69653500 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  24 
+  Query_24 
+  NODE_24_length_258_cov_0.935961 
+  258 
+
+
+  1 
+  gnl|CDD|307679 
+  pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. 
+  307679 
+  305 
+  
+    
+      1 
+      65.0021 
+      159 
+      8.38713e-15 
+      27 
+      230 
+      226 
+      296 
+      3 
+      0 
+      28 
+      38 
+      3 
+      71 
+      AHHLVAITRGEA--ENCKHRSFGPFEATASESLAKL-CPDYPICLPVPYDVINKVYRYLRTLKKPDVQSPH 
+      AHHLFKITRGDGLTLKPDSRTFGPFEAVLLPKIFVPRVLNYIRGKPIPLTVVNKLFSYLRSLKKRVVINGM 
+      AHHL  ITRG+        R+FGPFEA     +      +Y    P+P  V+NK++ YLR+LKK  V +   
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      51 
+      69653500 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  25 
+  Query_25 
+  NODE_25_length_256_cov_0.945274 
+  256 
+
+ 
+  
+    
+      16305 
+      2821655 
+      50 
+      70224175 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  26 
+  Query_26 
+  NODE_26_length_256_cov_0.895522 
+  256 
+
+ 
+  
+    
+      16305 
+      2821655 
+      50 
+      70224175 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  27 
+  Query_27 
+  NODE_27_length_254_cov_0.793970 
+  254 
+
+ 
+  
+    
+      16305 
+      2821655 
+      49 
+      70794850 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  28 
+  Query_28 
+  NODE_28_length_253_cov_1.313131 
+  253 
+
+ 
+  
+    
+      16305 
+      2821655 
+      49 
+      70794850 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  29 
+  Query_29 
+  NODE_29_length_250_cov_0.851282 
+  250 
+
+
+  1 
+  gnl|CDD|278700 
+  pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.   
+  278700 
+  29 
+  
+    
+      1 
+      42.0012 
+      99 
+      7.31211e-08 
+      79 
+      165 
+      1 
+      29 
+      1 
+      0 
+      17 
+      19 
+      0 
+      29 
+      MTIDRTYPIFTVRWLAVHGLAVPTVSFLG 
+      GERPFSYPITTVRWWAIHALTVPTVFFLG 
+           +YPI TVRW A+H L VPTV FLG 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      48 
+      71365525 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  30 
+  Query_30 
+  NODE_30_length_249_cov_1.298969 
+  249 
+
+ 
+  
+    
+      16305 
+      2821655 
+      48 
+      71365525 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  31 
+  Query_31 
+  NODE_31_length_249_cov_0.979381 
+  249 
+
+ 
+  
+    
+      16305 
+      2821655 
+      48 
+      71365525 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  32 
+  Query_32 
+  NODE_32_length_248_cov_0.979275 
+  248 
+
+ 
+  
+    
+      16305 
+      2821655 
+      48 
+      69326510 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  33 
+  Query_33 
+  NODE_33_length_245_cov_1.000000 
+  245 
+
+ 
+  
+    
+      16305 
+      2821655 
+      47 
+      69880880 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  34 
+  Query_34 
+  NODE_34_length_245_cov_1.000000 
+  245 
+
+
+  1 
+  gnl|CDD|250270 
+  pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. 
+  250270 
+  441 
+  
+    
+      1 
+      45.7137 
+      109 
+      6.42106e-08 
+      44 
+      211 
+      164 
+      219 
+      2 
+      0 
+      21 
+      31 
+      0 
+      56 
+      KGMCAIFSPIFKELKNRLKSVLDIKYMYADGLRPDQLSERMSQIGAGKYFIENDME 
+      KLVTAYFSPIFRELFERLLYVLKPKVVFPTGMTSSLIAERFEFLDASEDFLEIDFS 
+      K + A FSPIF+EL  RL  VL  K ++  G+    ++ER   + A + F+E D   
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      47 
+      69880880 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  35 
+  Query_35 
+  NODE_35_length_242_cov_0.818182 
+  242 
+
+ 
+  
+    
+      16305 
+      2821655 
+      46 
+      70435250 
+      0.08832702 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  36 
+  Query_36 
+  NODE_36_length_240_cov_1.259459 
+  240 
+
+ 
+  
+    
+      16305 
+      2821655 
+      46 
+      70435250 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  37 
+  Query_37 
+  NODE_37_length_239_cov_1.032609 
+  239 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      70989620 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  38 
+  Query_38 
+  NODE_38_length_239_cov_1.032609 
+  239 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      70989620 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  39 
+  Query_39 
+  NODE_39_length_238_cov_1.038251 
+  238 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      70989620 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  40 
+  Query_40 
+  NODE_40_length_238_cov_0.879781 
+  238 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      70989620 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  41 
+  Query_41 
+  NODE_41_length_236_cov_1.049724 
+  236 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      68901690 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  42 
+  Query_42 
+  NODE_42_length_235_cov_1.855556 
+  235 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      68901690 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  43 
+  Query_43 
+  NODE_43_length_234_cov_1.061453 
+  234 
+
+ 
+  
+    
+      16305 
+      2821655 
+      45 
+      68901690 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  44 
+  Query_44 
+  NODE_44_length_232_cov_1.073446 
+  232 
+
+ 
+  
+    
+      16305 
+      2821655 
+      44 
+      69439755 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  45 
+  Query_45 
+  NODE_45_length_232_cov_1.073446 
+  232 
+
+ 
+  
+    
+      16305 
+      2821655 
+      44 
+      69439755 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  46 
+  Query_46 
+  NODE_46_length_229_cov_1.091954 
+  229 
+
+
+  1 
+  gnl|CDD|306604 
+  pfam00124, Photo_RC, Photosynthetic reaction centre protein.   
+  306604 
+  258 
+  
+    
+      1 
+      86.1436 
+      214 
+      4.26406e-23 
+      24 
+      194 
+      87 
+      143 
+      -3 
+      0 
+      25 
+      36 
+      0 
+      57 
+      ELSFRLGMRPWIAVAYSAPVAAATAVFLIYPIGQGSFSDGMPLGISGTFNFMIVFQA 
+      EIARKLGMGPHIAWAFSAAIAAYLSLGLIRPILMGSWSEGFPLGIFPHLDWTSNFSY 
+      E++ +LGM P IA A+SA +AA  ++ LI PI  GS+S+G PLGI    ++   F   
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      43 
+      69977820 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  47 
+  Query_47 
+  NODE_47_length_229_cov_0.816092 
+  229 
+
+
+  1 
+  gnl|CDD|306687 
+  pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.   
+  306687 
+  717 
+  
+    
+      1 
+      61.3066 
+      149 
+      1.79906e-13 
+      124 
+      222 
+      1 
+      33 
+      1 
+      0 
+      22 
+      24 
+      0 
+      33 
+      FSRTLAKGPDTTTWIWNLHADAHDLNSQTHPTQ 
+      FSRDLAQGPKTTTWIWNLHATAHDFESHDGDTE 
+      FSR LA+GP TTTWIWNLHA AHD  S    T+ 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      43 
+      69977820 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  48 
+  Query_48 
+  NODE_48_length_227_cov_1.273256 
+  227 
+
+ 
+  
+    
+      16305 
+      2821655 
+      42 
+      70515885 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  49 
+  Query_49 
+  NODE_49_length_227_cov_1.017442 
+  227 
+
+ 
+  
+    
+      16305 
+      2821655 
+      42 
+      70515885 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  50 
+  Query_50 
+  NODE_50_length_226_cov_2.269006 
+  226 
+
+
+  1 
+  gnl|CDD|306845 
+  pfam00421, PSII, Photosystem II protein.   
+  306845 
+  500 
+  
+    
+      1 
+      89.1064 
+      221 
+      2.77182e-23 
+      31 
+      213 
+      160 
+      227 
+      1 
+      0 
+      41 
+      45 
+      7 
+      68 
+      GPGIWVSDP-------YGLTGTVQPVNPAWGVEGFDPFVPGGIASHHIAAGTLGILAGLFHLSVRSPQ 
+      GLGTWVSDPTLDPGVIYGYTGHVQPVAPVWGAEGFSPFGPGGIVGHHIAAGILGIIGGIFHITTRPPG 
+      G G WVSDP       YG TG VQPV P WG EGF PF PGGI  HHIAAG LGI+ G+FH++ R P  
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      42 
+      70515885 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  51 
+  Query_51 
+  NODE_51_length_225_cov_1.117647 
+  225 
+
+ 
+  
+    
+      16305 
+      2821655 
+      42 
+      70515885 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  52 
+  Query_52 
+  NODE_52_length_225_cov_0.917647 
+  225 
+
+ 
+  
+    
+      16305 
+      2821655 
+      42 
+      70515885 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  53 
+  Query_53 
+  NODE_53_length_223_cov_3.303571 
+  223 
+
+ 
+  
+    
+      16305 
+      2821655 
+      41 
+      71053950 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  54 
+  Query_54 
+  NODE_54_length_223_cov_1.803571 
+  223 
+
+ 
+  
+    
+      16305 
+      2821655 
+      41 
+      71053950 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  55 
+  Query_55 
+  NODE_55_length_216_cov_1.596273 
+  216 
+
+ 
+  
+    
+      16305 
+      2821655 
+      40 
+      69422560 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  56 
+  Query_56 
+  NODE_56_length_216_cov_1.180124 
+  216 
+
+
+  1 
+  gnl|CDD|306795 
+  pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. 
+  306795 
+  291 
+  
+    
+      1 
+      42.6815 
+      101 
+      5.23486e-07 
+      18 
+      188 
+      52 
+      110 
+      3 
+      0 
+      16 
+      28 
+      4 
+      60 
+      GLCGLFGSTEPTLNFEILTNQSYPVALEI---IFYIGFFLAFAVKLPIIPLHTWLPDTHE 
+      GISLMYNYTG-TLSFTELSKALFNGLNSWGLLLLFLLILVGFLFKSAQVPFHTWLPDAYE 
+      G+  ++  T  TL+F  L+   +         + ++   + F  K   +P HTWLPD +E 
+     
+   
+ 
+ 
+  
+    
+      16305 
+      2821655 
+      40 
+      69422560 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  57 
+  Query_57 
+  NODE_57_length_216_cov_0.869565 
+  216 
+
+ 
+  
+    
+      16305 
+      2821655 
+      40 
+      69422560 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  58 
+  Query_58 
+  NODE_58_length_215_cov_1.668750 
+  215 
+
+ 
+  
+    
+      16305 
+      2821655 
+      39 
+      69944320 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  59 
+  Query_59 
+  NODE_59_length_215_cov_1.187500 
+  215 
+
+ 
+  
+    
+      16305 
+      2821655 
+      39 
+      69944320 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  60 
+  Query_60 
+  NODE_60_length_215_cov_0.843750 
+  215 
+
+ 
+  
+    
+      16305 
+      2821655 
+      39 
+      69944320 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  61 
+  Query_61 
+  NODE_61_length_214_cov_1.232704 
+  214 
+
+ 
+  
+    
+      16305 
+      2821655 
+      39 
+      69944320 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  62 
+  Query_62 
+  NODE_62_length_213_cov_0.936709 
+  213 
+
+ 
+  
+    
+      16305 
+      2821655 
+      39 
+      69944320 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  63 
+  Query_63 
+  NODE_63_length_210_cov_1.225806 
+  210 
+
+ 
+  
+    
+      16305 
+      2821655 
+      38 
+      70466080 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  64 
+  Query_64 
+  NODE_64_length_208_cov_1.019608 
+  208 
+
+ 
+  
+    
+      16305 
+      2821655 
+      37 
+      70987840 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+ 
+ 
+
diff -r 000000000000 -r b82ce29791e7 test-data/blast2tsv_output.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast2tsv_output.tab	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,13 @@
+#algo	query_id	nb_reads	query_length	accession	description	organism	percentIdentity	nb_hsps	queryOverlap	hitOverlap	evalue	score	tax_id	taxonomy	sequence
+TBLASTX	NODE_13_length_295_cov_0.945833		295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
+TBLASTX	NODE_16_length_278_cov_0.901345		278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
+TBLASTX	NODE_19_length_271_cov_0.879630		271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_20_length_267_cov_1.429245		267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_22_length_262_cov_1.053140		262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_24_length_258_cov_0.935961		258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_29_length_250_cov_0.851282		250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000		245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_46_length_229_cov_1.091954		229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_47_length_229_cov_0.816092		229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_50_length_226_cov_2.269006		226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
+TBLASTX	NODE_56_length_216_cov_1.180124		216	306795	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.		26.7	1	100	62.0	5.23486e-07	42.6815			GTGTTGGGTGTGTTTGGGGTCTATGTGGTTTATTTGGTTCTACTGAACCAACATTAAATTTTGAAATATTAACTAATCAGTCCTATCCTGTGGCCTTGGAAATAATATTTTATATTGGATTTTTTCTTGCTTTTGCTGTAAAATTACCAATCATACCCCTACATACATGGTTACCAGATACCCACGAGAGCCAAACACACCCAACACAGGTTAGAC
diff -r 000000000000 -r b82ce29791e7 test-data/blast2tsv_output_with_rn.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast2tsv_output_with_rn.tab	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,13 @@
+#algo	query_id	nb_reads	query_length	accession	description	organism	percentIdentity	nb_hsps	queryOverlap	hitOverlap	evalue	score	tax_id	taxonomy	sequence
+TBLASTX	NODE_13_length_295_cov_0.945833	264	295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
+TBLASTX	NODE_16_length_278_cov_0.901345	377	278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
+TBLASTX	NODE_19_length_271_cov_0.879630	67	271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_22_length_262_cov_1.053140	262	262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_24_length_258_cov_0.935961	101	258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_29_length_250_cov_0.851282	428	250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_46_length_229_cov_1.091954	471	229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_47_length_229_cov_0.816092	470	229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_50_length_226_cov_2.269006	315	226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
+TBLASTX	NODE_56_length_216_cov_1.180124	166	216	306795	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.		26.7	1	100	62.0	5.23486e-07	42.6815			GTGTTGGGTGTGTTTGGGGTCTATGTGGTTTATTTGGTTCTACTGAACCAACATTAAATTTTGAAATATTAACTAATCAGTCCTATCCTGTGGCCTTGGAAATAATATTTTATATTGGATTTTTTCTTGCTTTTGCTGTAAAATTACCAATCATACCCCTACATACATGGTTACCAGATACCCACGAGAGCCAAACACACCCAACACAGGTTAGAC
diff -r 000000000000 -r b82ce29791e7 test-data/blast2tsv_read_nb.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast2tsv_read_nb.tab	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,64 @@
+NODE_1_length_506_cov_10.687361	56
+NODE_2_length_429_cov_3.631016	301
+NODE_3_length_365_cov_1.074194	23
+NODE_4_length_351_cov_1.547297	183
+NODE_5_length_344_cov_3.273356	220
+NODE_6_length_338_cov_1.314488	121
+NODE_7_length_335_cov_1.714286	6
+NODE_8_length_331_cov_0.862319	322
+NODE_9_length_324_cov_2.141264	153
+NODE_10_length_324_cov_1.371747	235
+NODE_11_length_317_cov_1.125954	136
+NODE_12_length_311_cov_1.535156	196
+NODE_13_length_295_cov_0.945833	264
+NODE_14_length_294_cov_1.891213	155
+NODE_15_length_280_cov_1.413333	348
+NODE_16_length_278_cov_0.901345	377
+NODE_17_length_277_cov_1.540541	160
+NODE_18_length_274_cov_3.872146	25
+NODE_19_length_271_cov_0.879630	67
+NODE_20_length_267_cov_1.429245	2
+NODE_21_length_263_cov_1.177885	361
+NODE_22_length_262_cov_1.053140	262
+NODE_23_length_260_cov_1.590244	316
+NODE_24_length_258_cov_0.935961	101
+NODE_25_length_256_cov_0.945274	46
+NODE_26_length_256_cov_0.895522	153
+NODE_27_length_254_cov_0.793970	127
+NODE_28_length_253_cov_1.313131	20
+NODE_29_length_250_cov_0.851282	428
+NODE_30_length_249_cov_1.298969	249
+NODE_31_length_249_cov_0.979381	445
+NODE_32_length_248_cov_0.979275	496
+NODE_33_length_245_cov_1.000000	281
+NODE_34_length_245_cov_1.000000	183
+NODE_35_length_242_cov_0.818182	222
+NODE_36_length_240_cov_1.259459	179
+NODE_37_length_239_cov_1.032609	98
+NODE_38_length_239_cov_1.032609	405
+NODE_39_length_238_cov_1.038251	426
+NODE_40_length_238_cov_0.879781	105
+NODE_41_length_236_cov_1.049724	225
+NODE_42_length_235_cov_1.855556	440
+NODE_43_length_234_cov_1.061453	106
+NODE_44_length_232_cov_1.073446	136
+NODE_45_length_232_cov_1.073446	430
+NODE_46_length_229_cov_1.091954	471
+NODE_47_length_229_cov_0.816092	470
+NODE_48_length_227_cov_1.273256	450
+NODE_49_length_227_cov_1.017442	190
+NODE_50_length_226_cov_2.269006	315
+NODE_51_length_225_cov_1.117647	384
+NODE_52_length_225_cov_0.917647	405
+NODE_53_length_223_cov_3.303571	414
+NODE_54_length_223_cov_1.803571	355
+NODE_55_length_216_cov_1.596273	317
+NODE_56_length_216_cov_1.180124	166
+NODE_57_length_216_cov_0.869565	247
+NODE_58_length_215_cov_1.668750	267
+NODE_59_length_215_cov_1.187500	163
+NODE_60_length_215_cov_0.843750	124
+NODE_61_length_214_cov_1.232704	103
+NODE_62_length_213_cov_0.936709	421
+NODE_63_length_210_cov_1.225806	290
+NODE_64_length_208_cov_1.019608	498
\ No newline at end of file
diff -r 000000000000 -r b82ce29791e7 test-data/blast2tsv_reads.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast2tsv_reads.txt	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,6 @@
+0	Viruses	 Monodnaviria	 Shotokuvirae	 Cossaviricota	 Papovaviricetes	 Zurhausenvirales	 Papillomaviridae	 Firstpapillomavirinae	 Upsilonpapillomavirus	 Upsilonpapillomavirus 2
+0	cellular organisms	 Bacteria	 Thermodesulfobacteriota	 Desulfovibrionia	 Desulfovibrionales	 Desulfovibrionaceae	 Desulfovibrio	 unclassified Desulfovibrio
+0	cellular organisms	 Eukaryota	 Opisthokonta	 Metazoa	 Eumetazoa	 Bilateria	 Protostomia	 Ecdysozoa	 Panarthropoda	 Arthropoda	 Mandibulata	 Pancrustacea	 Hexapoda	 Insecta	 Dicondylia	 Pterygota	 Neoptera	 Polyneoptera	 Dictyoptera	 Blattodea	 Blattoidea	 Termitoidae	 Rhinotermitidae	 Heterotermitinae	 Heterotermes	 unclassified Heterotermes
+0	cellular organisms	 Eukaryota	 Opisthokonta	 Metazoa	 Eumetazoa	 Bilateria	 Protostomia	 Spiralia	 Lophotrochozoa	 Nemertea	 Enopla	 Hoplonemertea	 Monostilifera	 Eumonostilifera	 Tetrastemmatidae	 Tetrastemma
+0	cellular organisms	 Archaea	 environmental samples
+0	cellular organisms	 Bacteria	 Terrabacteria group	 Actinomycetota	 Actinomycetes	 Mycobacteriales	 Nocardiaceae	 Nocardia	 unclassified Nocardia
diff -r 000000000000 -r b82ce29791e7 test-data/blast2tsv_reads_with_rn.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blast2tsv_reads_with_rn.txt	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,13 @@
+#algo	query_id	nb_reads	query_length	accession	description	organism	percentIdentity	nb_hsps	queryOverlap	hitOverlap	evalue	score	tax_id	taxonomy	sequence
+TBLASTX	NODE_13_length_295_cov_0.945833	264	295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
+TBLASTX	NODE_16_length_278_cov_0.901345	377	278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
+TBLASTX	NODE_19_length_271_cov_0.879630	67	271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_22_length_262_cov_1.053140	262	262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_24_length_258_cov_0.935961	101	258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_29_length_250_cov_0.851282	428	250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_46_length_229_cov_1.091954	471	229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_47_length_229_cov_0.816092	470	229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_50_length_226_cov_2.269006	315	226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
+TBLASTX	NODE_56_length_216_cov_1.180124	166	216	306795	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.		26.7	1	100	62.0	5.23486e-07	42.6815			GTGTTGGGTGTGTTTGGGGTCTATGTGGTTTATTTGGTTCTACTGAACCAACATTAAATTTTGAAATATTAACTAATCAGTCCTATCCTGTGGCCTTGGAAATAATATTTTATATTGGATTTTTTCTTGCTTTTGCTGTAAAATTACCAATCATACCCCTACATACATGGTTACCAGATACCCACGAGAGCCAAACACACCCAACACAGGTTAGAC
diff -r 000000000000 -r b82ce29791e7 test-data/index.html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/index.html	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,235 @@
+
+
+rps2tree 
+
+
+
+
rps2tree 
+
+
pfam02123 pfam02123_RdRP_4 
+
pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.
+
+
+
+
+
+
pfam02123_RdRP_4/seq_aligned.final_tree.fa 
+
+
pfam02123_RdRP_4/otu_cluster.csv 
+
+
pfam02123_RdRP_4/cluster_nb_reads_files.tab 
+
+
pfam02123_RdRP_4/identity_matrix.csv 
+
+
+
+
pfam00680 pfam00680_RdRP_1 
+
pfam00680, RdRP_1, RNA dependent RNA polymerase.  
+
+
+
+
+
+
pfam00680_RdRP_1/seq_aligned.final_tree.fa 
+
+
pfam00680_RdRP_1/otu_cluster.csv 
+
+
pfam00680_RdRP_1/cluster_nb_reads_files.tab 
+
+
pfam00680_RdRP_1/identity_matrix.csv 
+
+
+
+
pfam00665 pfam00665_rve 
+
pfam00665, rve, Integrase core domain.  Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain pfam02022. This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain pfam00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyzes the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site.
+
+
+
+
+
+
pfam00665_rve/seq_aligned.final_tree.fa 
+
+
pfam00665_rve/otu_cluster.csv 
+
+
pfam00665_rve/cluster_nb_reads_files.tab 
+
+
pfam00665_rve/identity_matrix.csv 
+
+
+
+
pfam01443 pfam01443_Viral_helicase1 
+
pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase.  Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis.
+
+
+
+
+
+
pfam01443_Viral_helicase1/seq_aligned.final_tree.fa 
+
+
pfam01443_Viral_helicase1/otu_cluster.csv 
+
+
pfam01443_Viral_helicase1/cluster_nb_reads_files.tab 
+
+
pfam01443_Viral_helicase1/identity_matrix.csv 
+
+
+
+
pfam00078 pfam00078_RVT_1 
+
pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.
+
+
+
+
+
+
pfam00078_RVT_1/seq_aligned.final_tree.fa 
+
+
pfam00078_RVT_1/otu_cluster.csv 
+
+
pfam00078_RVT_1/cluster_nb_reads_files.tab 
+
+
pfam00078_RVT_1/identity_matrix.csv 
+
+
+
+
pfam01787 pfam01787_Ilar_coat 
+
pfam01787, Ilar_coat, Ilarvirus coat protein.  This family consists of various coat proteins from the ilarviruses part of the Bromoviridae, members include apple mosaic virus and prune dwarf virus. The ilarvirus coat protein is required to initiate replication of the viral genome in host plants. Members of the Bromoviridae have a positive stand ssRNA genome with no DNA stage in there replication.
+
+
+
+
+
+
pfam01787_Ilar_coat/seq_aligned.final_tree.fa 
+
+
pfam01787_Ilar_coat/otu_cluster.csv 
+
+
pfam01787_Ilar_coat/cluster_nb_reads_files.tab 
+
+
pfam01787_Ilar_coat/identity_matrix.csv 
+
+
+
+
pfam01573 pfam01573_Bromo_MP 
+
pfam01573, Bromo_MP, Bromovirus movement protein.  
+
+
+
+
+
+
pfam01573_Bromo_MP/seq_aligned.final_tree.fa 
+
+
pfam01573_Bromo_MP/otu_cluster.csv 
+
+
pfam01573_Bromo_MP/cluster_nb_reads_files.tab 
+
+
pfam01573_Bromo_MP/identity_matrix.csv 
+
+
+
+
+
+  rpstblastn 
+  RPSTBLASTN 2.10.1+ 
+  Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402. 
+  /home/tcandresse/work/pfam/Pfam 
+  ds2020-267_269 
+  No definition line 
+  259 
+  
+    
+      BLOSUM62 
+      0.001 
+      11 
+      1 
+      F 
+     
+   
+
+
+  1 
+  ds2020-267_269 
+  No definition line 
+  259 
+
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      75910968 
+      0.050055168 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  2 
+  ds2020-267_1242 
+  No definition line 
+  59 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      57087172 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  3 
+  ds2020-267_333 
+  No definition line 
+  248 
+
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      75683825 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  4 
+  ds2020-267_1111 
+  No definition line 
+  70 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      69105524 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  5 
+  ds2020-267_560 
+  No definition line 
+  222 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  6 
+  ds2020-267_120 
+  No definition line 
+  339 
+
+
+  1 
+  gnl|CDD|374695 
+  pfam16639, Apocytochr_F_N, Apocytochrome F, N-terminal.  This is the N-terminal domain of cytochrome f. It is a soluble lumen-side domain. 
+  374695 
+  154 
+  
+    
+      1 
+      91.1926 
+      227 
+      2.20279e-25 
+      197 
+      325 
+      112 
+      154 
+      -3 
+      0 
+      30 
+      37 
+      0 
+      43 
+      MVIGPVPGQKYSEITFPILSPDPATKKDVHFLKYPIYVGGNRG 
+      LIVGPLPGDQYQEIVFPVLSPDPATDKSVHFGKYPVYVGGNRG 
+      +++GP+PG +Y EI FP+LSPDPAT K VHF KYP+YVGGNRG 
+     
+   
+ 
+
+  2 
+  gnl|CDD|366578 
+  pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal.  This is a sub-family of cytochrome C. See pfam00034. 
+  366578 
+  115 
+  
+    
+      1 
+      34.0526 
+      79 
+      0.000848733 
+      116 
+      163 
+      1 
+      16 
+      -3 
+      0 
+      10 
+      13 
+      0 
+      16 
+      NNNVYNATAAGIVSKI 
+      NNNVFTASAAGTISAI 
+      NNNV+ A+AAG +S I 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      70 
+      75261094 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  7 
+  ds2020-267_374 
+  No definition line 
+  242 
+
+
+  1 
+  gnl|CDD|365890 
+  pfam00124, Photo_RC, Photosynthetic reaction centre protein.   
+  365890 
+  260 
+  
+    
+      1 
+      42.9994 
+      102 
+      5.09126e-07 
+      21 
+      125 
+      91 
+      125 
+      3 
+      0 
+      12 
+      19 
+      0 
+      35 
+      SVQLRPYNAIAFSGPIAVFVSVFLIYPLGQSGWFF 
+      KLGMGPHVAWAFSAAIAAYLSLGLIRPILMGSWSE 
+       + + P+ A AFS  IA ++S+ LI P+    W   
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      46 
+      74130676 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  8 
+  ds2020-267_470 
+  No definition line 
+  230 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  9 
+  ds2020-267_609 
+  No definition line 
+  218 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  10 
+  ds2020-267_128 
+  No definition line 
+  332 
+
+ 
+  
+    
+      17919 
+      3004588 
+      68 
+      75016032 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  11 
+  ds2020-267_870 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  12 
+  ds2020-267_1236 
+  No definition line 
+  59 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      57087172 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  13 
+  ds2020-267_651 
+  No definition line 
+  216 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  14 
+  ds2020-267_648 
+  No definition line 
+  216 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  15 
+  ds2020-267_847 
+  No definition line 
+  207 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  16 
+  ds2020-267_978 
+  No definition line 
+  121 
+
+ 
+  
+    
+      17919 
+      3004588 
+      13 
+      74834307 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  17 
+  ds2020-267_973 
+  No definition line 
+  123 
+
+ 
+  
+    
+      17919 
+      3004588 
+      14 
+      74350494 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  18 
+  ds2020-267_456 
+  No definition line 
+  232 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      75958414 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  19 
+  ds2020-267_272 
+  No definition line 
+  259 
+
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      75910968 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  20 
+  ds2020-267_1065 
+  No definition line 
+  74 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  21 
+  ds2020-267_888 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  22 
+  ds2020-267_866 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  23 
+  ds2020-267_1034 
+  No definition line 
+  76 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      75114700 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  24 
+  ds2020-267_393 
+  No definition line 
+  240 
+
+ 
+  
+    
+      17919 
+      3004588 
+      46 
+      74130676 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  25 
+  ds2020-267_1084 
+  No definition line 
+  72 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  26 
+  ds2020-267_489 
+  No definition line 
+  228 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  27 
+  ds2020-267_471 
+  No definition line 
+  230 
+
+
+  1 
+  gnl|CDD|278624 
+  pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.   
+  278624 
+  499 
+  
+    
+      1 
+      43.5512 
+      103 
+      3.12575e-07 
+      46 
+      210 
+      353 
+      401 
+      1 
+      0 
+      14 
+      23 
+      10 
+      57 
+      IL*ALTAGVPMICSPFFADQRTNCYYTCNEWGSGMEIDNTFGRAD--SMQSAEVLTA 
+      VYEAICHGVPMVGMPLFGDQMDNAKHMEAKGA--------AVTLNVLTMTSEDLLNA 
+      +  A+  GVPM+  P F DQ  N  +   +              +  +M S ++L A 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  28 
+  ds2020-267_367 
+  No definition line 
+  243 
+
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      73521430 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  29 
+  ds2020-267_535 
+  No definition line 
+  224 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  30 
+  ds2020-267_1201 
+  No definition line 
+  61 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  31 
+  ds2020-267_710 
+  No definition line 
+  213 
+
+
+  1 
+  gnl|CDD|366480 
+  pfam01127, Sdh_cyt, Succinate dehydrogenase/Fumarate reductase transmembrane subunit.  This family includes a transmembrane protein from both the Succinate dehydrogenase and Fumarate reductase complexes. 
+  366480 
+  122 
+  
+    
+      1 
+      33.1145 
+      76 
+      0.000723904 
+      13 
+      126 
+      8 
+      45 
+      1 
+      0 
+      13 
+      19 
+      0 
+      38 
+      NSDLLIYKPQLTSTFPISHRISGAFLVTIVLFFYLLCL 
+      SPHLGLYRAHLGTWLSILHRITGVALFVLGLIHLLLWL 
+      +  L +Y+  L +   I HRI+G  L  + L   LL L 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  32 
+  ds2020-267_904 
+  No definition line 
+  142 
+
+ 
+  
+    
+      17919 
+      3004588 
+      19 
+      74595556 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  33 
+  ds2020-267_692 
+  No definition line 
+  214 
+
+
+  1 
+  gnl|CDD|366242 
+  pfam00680, RdRP_1, RNA dependent RNA polymerase.   
+  366242 
+  470 
+  
+    
+      1 
+      37.3361 
+      87 
+      4.79875e-05 
+      70 
+      180 
+      82 
+      115 
+      1 
+      0 
+      16 
+      21 
+      3 
+      37 
+      FTFEDATLDEAINGVEDLDYFDSLVIGTSEGYPYVLE 
+      ADLGDLSVSEAINGA---EGFDALNKDTSPGLPYILE 
+          D ++ EAING    + FD+L   TS G PY+LE 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  34 
+  ds2020-267_1114 
+  No definition line 
+  69 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      69105524 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  35 
+  ds2020-267_813 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  36 
+  ds2020-267_390 
+  No definition line 
+  241 
+
+ 
+  
+    
+      17919 
+      3004588 
+      46 
+      74130676 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  37 
+  ds2020-267_217 
+  No definition line 
+  275 
+
+ 
+  
+    
+      17919 
+      3004588 
+      54 
+      75367594 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  38 
+  ds2020-267_654 
+  No definition line 
+  216 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  39 
+  ds2020-267_956 
+  No definition line 
+  126 
+
+ 
+  
+    
+      17919 
+      3004588 
+      15 
+      73866681 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  40 
+  ds2020-267_1228 
+  No definition line 
+  60 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  41 
+  ds2020-267_608 
+  No definition line 
+  218 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  42 
+  ds2020-267_412 
+  No definition line 
+  238 
+
+ 
+  
+    
+      17919 
+      3004588 
+      45 
+      74739922 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  43 
+  ds2020-267_211 
+  No definition line 
+  276 
+
+ 
+  
+    
+      17919 
+      3004588 
+      55 
+      74704591 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  44 
+  ds2020-267_419 
+  No definition line 
+  236 
+
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  45 
+  ds2020-267_606 
+  No definition line 
+  219 
+
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  46 
+  ds2020-267_817 
+  No definition line 
+  208 
+
+
+  1 
+  gnl|CDD|377540 
+  pfam05656, DUF805, Protein of unknown function (DUF805).  This family consists of several bacterial proteins of unknown function. 
+  377540 
+  108 
+  
+    
+      1 
+      39.1746 
+      92 
+      3.45664e-06 
+      86 
+      190 
+      72 
+      106 
+      -1 
+      0 
+      15 
+      19 
+      0 
+      35 
+      TGWLSLRMQIPALDLIFMIYLFAAKGTEGNNDYGP 
+      SGWWLLLGLIPIIGLIVLLVLLCLPGTPGPNRYGP 
+      +GW  L   IP + LI ++ L    GT G N YGP 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  47 
+  ds2020-267_1207 
+  No definition line 
+  61 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  48 
+  ds2020-267_98 
+  No definition line 
+  379 
+
+
+  1 
+  gnl|CDD|374428 
+  pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase.  This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases. 
+  374428 
+  247 
+  
+    
+      1 
+      107.677 
+      270 
+      1.33948e-30 
+      131 
+      280 
+      108 
+      157 
+      -1 
+      0 
+      39 
+      42 
+      0 
+      50 
+      NMAFLSKVGDNSIDILEANVIIQISSHAGSRRQEAQRLGRILRAKGKLQD 
+      NTIFLSKVGDTSIDLPEANVLIQISSHFGSRRQEAQRLGRILRAKRRSND 
+      N  FLSKVGD SID+ EANV+IQISSH GSRRQEAQRLGRILRAK +  D 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      76 
+      82137200 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  78 
+  ds2020-267_1141 
+  No definition line 
+  66 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      66100936 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  79 
+  ds2020-267_212 
+  No definition line 
+  276 
+
+ 
+  
+    
+      17919 
+      3004588 
+      55 
+      74704591 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  80 
+  ds2020-267_261 
+  No definition line 
+  260 
+
+
+  1 
+  gnl|CDD|376444 
+  pfam01051, Rep_3, Initiator Replication protein.  This protein is an initiator of plasmid replication. RepB possesses nicking-closing (topoisomerase I) like activity. It is also able to perform a strand transfer reaction on ssDNA that contains its target. This family also includes RepA which is an E.coli protein involved in plasmid replication. The RepA protein binds to DNA repeats that flank the repA gene. 
+  376444 
+  221 
+  
+    
+      1 
+      76.5582 
+      189 
+      1.77523e-19 
+      26 
+      217 
+      94 
+      157 
+      -2 
+      0 
+      28 
+      42 
+      0 
+      64 
+      RWVDKIGYIDDLGCVELVFASDVIPLITRLEQRFTEYDIDQVSNLQSKYAVRLYELLVQWRSTG 
+      LWVGYIISAKGEGKVEIEFSPDLKPYLLELKKNFTKYELKEFLKLKSKYSIRLYELLKQYRSTG 
+       WV  I      G VE+ F+ D+ P +  L++ FT+Y++ +   L+SKY++RLYELL Q+RSTG 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      75910968 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  81 
+  ds2020-267_689 
+  No definition line 
+  214 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  82 
+  ds2020-267_892 
+  No definition line 
+  181 
+
+ 
+  
+    
+      17919 
+      3004588 
+      30 
+      74010540 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  83 
+  ds2020-267_1243 
+  No definition line 
+  59 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      57087172 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  84 
+  ds2020-267_521 
+  No definition line 
+  225 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  85 
+  ds2020-267_773 
+  No definition line 
+  210 
+
+
+  1 
+  gnl|CDD|376583 
+  pfam01641, SelR, SelR domain.  Methionine sulfoxide reduction is an important process, by which cells regulate biological processes and cope with oxidative stress. MsrA, a protein involved in the reduction of methionine sulfoxides in proteins, has been known for four decades and has been extensively characterized with respect to structure and function. However, recent studies revealed that MsrA is only specific for methionine-S-sulfoxides. Because oxidized methionines occur in a mixture of R and S isomers in vivo, it was unclear how stereo-specific MsrA could be responsible for the reduction of all protein methionine sulfoxides. It appears that a second methionine sulfoxide reductase, SelR, evolved that is specific for methionine-R-sulfoxides, the activity that is different but complementary to that of MsrA. Thus, these proteins, working together, could reduce both stereoisomers of methionine sulfoxide. This domain is found both in SelR proteins and fused with the peptide methionine sulfoxide reductase enzymatic domain pfam01625. The domain has two conserved cysteine and histidines. The domain binds both selenium and zinc. The final cysteine is found to be replaced by the rare amino acid selenocysteine in some members of the family. This family has methionine-R-sulfoxide reductase activity. 
+  376583 
+  120 
+  
+    
+      1 
+      110.138 
+      277 
+      5.23903e-34 
+      16 
+      174 
+      6 
+      58 
+      1 
+      0 
+      32 
+      36 
+      0 
+      53 
+      LTDVQYYVTQQNGTERPFSHEYDHQFEPGIYVDIVSGEPLFSSSDKYDSGCGW 
+      LTPEQYRVLREKGTERPFTGEYWDNKEPGIYVCAGCGTPLFSSDTKFDSGCGW 
+      LT  QY V ++ GTERPF+ EY    EPGIYV    G PLFSS  K+DSGCGW 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  86 
+  ds2020-267_619 
+  No definition line 
+  217 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  87 
+  ds2020-267_675 
+  No definition line 
+  215 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  88 
+  ds2020-267_974 
+  No definition line 
+  122 
+
+ 
+  
+    
+      17919 
+      3004588 
+      13 
+      74834307 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  89 
+  ds2020-267_912 
+  No definition line 
+  135 
+
+ 
+  
+    
+      17919 
+      3004588 
+      17 
+      75599020 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  90 
+  ds2020-267_1054 
+  No definition line 
+  75 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      75114700 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  91 
+  ds2020-267_287 
+  No definition line 
+  256 
+
+
+  1 
+  gnl|CDD|376293 
+  pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I.   
+  376293 
+  433 
+  
+    
+      1 
+      97.2464 
+      243 
+      2.8946e-26 
+      13 
+      237 
+      326 
+      400 
+      1 
+      0 
+      47 
+      57 
+      0 
+      75 
+      NTPGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFHYWVGKIFGRIYPETLGQIHFWITFFGVNLTFFPMHFLGL 
+      ALPPVNYYVHDTYFVVAHFHYVLFGGVVFALFAGIYYWFPKLTGRMYSERLGKLHFWLLFIGFNLTFFPMHILGL 
+        P ++  +HDTY+VVAHFHYVL  G VFALFAG +YW  K+ GR+Y E LG++HFW+ F G NLTFFPMH LGL 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      73802330 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  92 
+  ds2020-267_139 
+  No definition line 
+  320 
+
+
+  1 
+  gnl|CDD|368641 
+  pfam05860, Haemagg_act, haemagglutination activity domain.  This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins. 
+  368641 
+  118 
+  
+    
+      1 
+      59.967 
+      146 
+      1.34887e-13 
+      167 
+      298 
+      56 
+      99 
+      2 
+      0 
+      20 
+      26 
+      0 
+      44 
+      GEAKIILGQVNSTSPSQLAGYTEIAGGKAELVIANPAGITCSGG 
+      GAASNILNRVTGGNPSQIQGYIEVAGANANVFLANPNGIIFGGN 
+      G A  IL +V   +PSQ+ GY E+AG  A + +ANP GI   G  
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      65 
+      75433973 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  93 
+  ds2020-267_763 
+  No definition line 
+  211 
+
+
+  1 
+  gnl|CDD|376349 
+  pfam00557, Peptidase_M24, Metallopeptidase family M24.  This family contains metallopeptidases. It also contains non-peptidase homologs such as the N terminal domain of Spt16 which is a histone H3-H4 binding module. 
+  376349 
+  206 
+  
+    
+      1 
+      34.896 
+      81 
+      0.000231782 
+      15 
+      74 
+      76 
+      95 
+      -3 
+      0 
+      12 
+      14 
+      0 
+      20 
+      FLIDAGASFHGYASDITRIY 
+      VLIDVGAEYDGYCSDITRTF 
+       LID GA + GY SDITR + 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  94 
+  ds2020-267_624 
+  No definition line 
+  217 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  95 
+  ds2020-267_70 
+  No definition line 
+  445 
+
+ 
+  
+    
+      17919 
+      3004588 
+      79 
+      109640103 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  96 
+  ds2020-267_1209 
+  No definition line 
+  61 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  97 
+  ds2020-267_196 
+  No definition line 
+  283 
+
+ 
+  
+    
+      17919 
+      3004588 
+      57 
+      73378585 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  98 
+  ds2020-267_1102 
+  No definition line 
+  71 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      69105524 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  99 
+  ds2020-267_346 
+  No definition line 
+  246 
+
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      75683825 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  100 
+  ds2020-267_612 
+  No definition line 
+  218 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  101 
+  ds2020-267_607 
+  No definition line 
+  219 
+
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.04777374 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  102 
+  ds2020-267_571 
+  No definition line 
+  221 
+
+
+  1 
+  gnl|CDD|366135 
+  pfam00501, AMP-binding, AMP-binding enzyme.   
+  366135 
+  361 
+  
+    
+      1 
+      43.7436 
+      104 
+      2.61467e-07 
+      34 
+      201 
+      1 
+      56 
+      1 
+      0 
+      17 
+      29 
+      0 
+      56 
+      LERSATVYGDCPSLIYNDTTYTWTQTHRRCIRVASSISSLVIKSRHVVSVLSPNTP 
+      LERQAARTPDKTALVGEGRRLTYRELDERANRLAAGLRALGVKKGDRVAILLPNSP 
+      LER A    D  +L+      T+ +   R  R+A+ + +L +K    V++L PN+P 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  103 
+  ds2020-267_592 
+  No definition line 
+  219 
+
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  104 
+  ds2020-267_764 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  105 
+  ds2020-267_1015 
+  No definition line 
+  85 
+
+ 
+  
+    
+      17919 
+      3004588 
+      3 
+      73770775 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  106 
+  ds2020-267_1151 
+  No definition line 
+  65 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      63096348 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  107 
+  ds2020-267_69 
+  No definition line 
+  451 
+
+ 
+  
+    
+      17919 
+      3004588 
+      79 
+      112818077 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  108 
+  ds2020-267_167 
+  No definition line 
+  298 
+
+ 
+  
+    
+      17919 
+      3004588 
+      60 
+      75248472 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  109 
+  ds2020-267_23 
+  No definition line 
+  835 
+
+ 
+  
+    
+      17919 
+      3004588 
+      85 
+      285924289 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  110 
+  ds2020-267_872 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  111 
+  ds2020-267_578 
+  No definition line 
+  221 
+
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  112 
+  ds2020-267_611 
+  No definition line 
+  218 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  113 
+  ds2020-267_565 
+  No definition line 
+  222 
+
+
+  1 
+  gnl|CDD|377172 
+  pfam03950, tRNA-synt_1c_C, tRNA synthetases class I (E and Q), anti-codon binding domain.  Other tRNA synthetase sub-families are too dissimilar to be included. This family includes only glutamyl and glutaminyl tRNA synthetases. In some organisms, a single glutamyl-tRNA synthetase aminoacylates both tRNA(Glu) and tRNA(Gln). 
+  377172 
+  174 
+  
+    
+      1 
+      49.5699 
+      119 
+      9.52435e-10 
+      53 
+      184 
+      63 
+      108 
+      -3 
+      0 
+      16 
+      27 
+      2 
+      46 
+      EIRLRNSYVLKIEEHITDDNGEVVGLTATIDPKTLGNN--PEGLVH 
+      EVRLMDAYNIKVTEVVKDEDGNVTELHCTYDGDDLGGARKVKGIIH 
+      E+RL ++Y +K+ E + D++G V  L  T D   LG     +G++H 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  114 
+  ds2020-267_468 
+  No definition line 
+  230 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  115 
+  ds2020-267_202 
+  No definition line 
+  281 
+
+ 
+  
+    
+      17919 
+      3004588 
+      56 
+      74041588 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  116 
+  ds2020-267_826 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  117 
+  ds2020-267_503 
+  No definition line 
+  226 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  118 
+  ds2020-267_197 
+  No definition line 
+  283 
+
+ 
+  
+    
+      17919 
+      3004588 
+      57 
+      73378585 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  119 
+  ds2020-267_307 
+  No definition line 
+  253 
+
+ 
+  
+    
+      17919 
+      3004588 
+      49 
+      74429495 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  120 
+  ds2020-267_427 
+  No definition line 
+  235 
+
+
+  1 
+  gnl|CDD|367360 
+  pfam03154, Atrophin-1, Atrophin-1 family.  Atrophin-1 is the protein product of the dentatorubral-pallidoluysian atrophy (DRPLA) gene. DRPLA OMIM:125370 is a progressive neurodegenerative disorder. It is caused by the expansion of a CAG repeat in the DRPLA gene on chromosome 12p. This results in an extended polyglutamine region in atrophin-1, that is thought to confer toxicity to the protein, possibly through altering its interactions with other proteins. The expansion of a CAG repeat is also the underlying defect in six other neurodegenerative disorders, including Huntington's disease. One interaction of expanded polyglutamine repeats that is thought to be pathogenic is that with the short glutamine repeat in the transcriptional coactivator CREB binding protein, CBP. This interaction draws CBP away from its usual nuclear location to the expanded polyglutamine repeat protein aggregates that are characteristic of the polyglutamine neurodegenerative disorders. This interferes with CBP-mediated transcription and causes cytotoxicity. 
+  367360 
+  980 
+  
+    
+      1 
+      34.611 
+      79 
+      0.000552392 
+      40 
+      213 
+      578 
+      636 
+      -2 
+      0 
+      19 
+      37 
+      1 
+      59 
+      KEMEEGKHKSRKEGESK-RSHRDRQREKERNGERHRDKDKDKDKRDRDSRRSEREKSSD 
+      KKREEALEKAKREAEQKAREEREREKEREKEREREREREREAERAAKASSSSHEGRMSD 
+      K+ EE   K+++E E K R  R+R++E+E+  ER R+++++ ++  + S  S   + SD 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  121 
+  ds2020-267_738 
+  No definition line 
+  212 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  122 
+  ds2020-267_1160 
+  No definition line 
+  64 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      63096348 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  123 
+  ds2020-267_1066 
+  No definition line 
+  74 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  124 
+  ds2020-267_7 
+  No definition line 
+  1772 
+
+ 
+  
+    
+      17919 
+      3004588 
+      91 
+      685605541 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  125 
+  ds2020-267_786 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  126 
+  ds2020-267_44 
+  No definition line 
+  545 
+
+ 
+  
+    
+      17919 
+      3004588 
+      81 
+      155314900 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  127 
+  ds2020-267_475 
+  No definition line 
+  229 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  128 
+  ds2020-267_928 
+  No definition line 
+  130 
+
+ 
+  
+    
+      17919 
+      3004588 
+      16 
+      73382868 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  129 
+  ds2020-267_752 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  130 
+  ds2020-267_988 
+  No definition line 
+  116 
+
+ 
+  
+    
+      17919 
+      3004588 
+      11 
+      75801933 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  131 
+  ds2020-267_265 
+  No definition line 
+  260 
+
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      75910968 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  132 
+  ds2020-267_1219 
+  No definition line 
+  60 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  133 
+  ds2020-267_115 
+  No definition line 
+  345 
+
+ 
+  
+    
+      17919 
+      3004588 
+      72 
+      73720060 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  134 
+  ds2020-267_559 
+  No definition line 
+  222 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  135 
+  ds2020-267_51 
+  No definition line 
+  500 
+
+ 
+  
+    
+      17919 
+      3004588 
+      80 
+      135111848 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  136 
+  ds2020-267_1185 
+  No definition line 
+  62 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.061499328 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  137 
+  ds2020-267_520 
+  No definition line 
+  225 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  138 
+  ds2020-267_1137 
+  No definition line 
+  66 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      66100936 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  139 
+  ds2020-267_1191 
+  No definition line 
+  61 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  140 
+  ds2020-267_548 
+  No definition line 
+  223 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  141 
+  ds2020-267_4 
+  No definition line 
+  2297 
+
+
+  1 
+  gnl|CDD|280316 
+  pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. 
+  280316 
+  465 
+  
+    
+      1 
+      187.283 
+      476 
+      1.96254e-52 
+      824 
+      1858 
+      121 
+      464 
+      -2 
+      0 
+      94 
+      136 
+      19 
+      354 
+      VRKSRLVNWEEEHKNRVSPNLAEMPEGLVYERASQLFSRSISAGKRPR-KFD-WREYWQSRWQWSAAGSIHSQYSEDDKYIFKDIYLKNKFISILAMPDMNMDSWRER----DPELHAWSSTKYEWSKLRAIYGTDVTSYVLAHFAFYNCEDVLPSPFPVGKAANDEN--VRSRVRSVLEGRTQYCVDFEDFNSQHSVQSMKAVIDAYRDTFGHFLTQEQLAAVEWTRLSLDRVIVHDNQGLKMEYNAKGTLLSGWRLTTFMNSVLNYIYTQLIVPDVVQSQNSLHNGDDVLLGSNSLEDVLLAGKNAKKHNIRLQMSK-CAYGAIAEFLRVDHKRGSKGQYLSRAMATLVHSR 
+      GRGVTNVDWEEEAKNRVDLAVVCRLVLLPMEELRAHIDAVLDELVVRRGLCDPIRLFVKNEPLWCVNGHPDHKLRE---GRLRLLSSVSLVDQLVRR--MLFEPQNNNEIAWWGSVPSKPSMKLEHGKSRAIYACDTRSYLAFEYLLAPVEKAWANKSVILNPGEGDISGFDWSVQDWKRGGVSLMLDYDDFNSQHSTESMRAVFERLR----RRLPDEPAEAADWLVCSMDSMYQLSD-GTLLAQRVPGTLKSGHRATTFINSVLNCAYAELAGAPWADVPTSIHMGDDVLEGLRTPADATSLLDKYARLGFKVNPSKQSVGHTIAEFLRVAFCSHEVRGYLARAIASLVSGN 
+       R    V+WEEE KNRV   +      L  E         +      R   D  R + ++   W   G    +  E      + +   +    ++    M  +            + +  S K E  K RAIY  D  SY+   +     E    +   +      +       V+    G     +D++DFNSQHS +SM+AV +  R      L  E   A +W   S+D +    + G  +     GTL SG R TTF+NSVLN  Y +L          S+H GDDVL G  +  D         +   ++  SK      IAEFLRV         YL+RA+A+LV    
+     
+   
+ 
+
+  2 
+  gnl|CDD|366242 
+  pfam00680, RdRP_1, RNA dependent RNA polymerase.   
+  366242 
+  470 
+  
+    
+      1 
+      44.6549 
+      106 
+      4.43825e-05 
+      995 
+      1510 
+      166 
+      342 
+      -2 
+      0 
+      44 
+      65 
+      31 
+      190 
+      KYEWSKLRAIYGTDVTSYVLAHFAFYNCEDVLPSPFPVGKAANDENVRSRVRSVLEGRTQ-----YCVDFEDFNSQHSVQSMKAVIDAYRDTFGHFLTQEQLAAVEWTRLSLDRVIVHDNQGLKMEYNAK-----GTLLSGWRLTTFMNSVLNYIYTQLIVPD-VVQSQNSLH-------NGDDVLLGSN 
+      KVQAGKTRLFWGCPVEVNLVARAVFGPFCNKIYSNALKLGIAVGINPFSRDWERLGALIRKGSDVLDVDYSAFDSTLSPFVFDLVEDIRSEFCGGL---------EPTRLALLELLSNP----IHILGGTIIKVEGGLPSGQPATSVINSILNNIYVLYALIKHTGESELDDHETIRFISYGDDNLVAVN 
+      K +  K R  +G  V   ++A   F    + + S       A   N  SR    L    +       VD+  F+S  S      V D   +  G           E TRL+L  ++ +                 G L SG   T+ +NS+LN IY    +     +S+   H        GDD L+  N 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      93 
+      899217312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  142 
+  ds2020-267_573 
+  No definition line 
+  221 
+
+ 
+  
+    
+      17919 
+      3004588 
+      40 
+      75498324 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  143 
+  ds2020-267_16 
+  No definition line 
+  1165 
+
+
+  1 
+  gnl|CDD|280316 
+  pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. 
+  280316 
+  465 
+  
+    
+      1 
+      141.829 
+      358 
+      1.58664e-38 
+      536 
+      1078 
+      288 
+      464 
+      -1 
+      0 
+      62 
+      85 
+      6 
+      182 
+      SRVRAVLEGRTQYCVDFEDFNSQHSVQSMKAVIDAYRDTFGHFLTQEQLAAVEWTRLSLNRVIVHDNQGLKMEYSAKGTLLSGWRLTTFMNSVLNYIYTQLIVPDVVKSQNSLHNGDDVLLGSNSLGDVLLAGRNAKKHNIRLQMSK-CAYGAIAEFLRVDHKRGSKGQYLSRAMATLVHSR 
+      WSVQDWKRGGVSLMLDYDDFNSQHSTESMRAVFERLR----RRLPDEPAEAADWLVCSMDSMYQLSD-GTLLAQRVPGTLKSGHRATTFINSVLNCAYAELAGAPWADVPTSIHMGDDVLEGLRTPADATSLLDKYARLGFKVNPSKQSVGHTIAEFLRVAFCSHEVRGYLARAIASLVSGN 
+        V+    G     +D++DFNSQHS +SM+AV +  R      L  E   A +W   S++ +    + G  +     GTL SG R TTF+NSVLN  Y +L          S+H GDDVL G  +  D         +   ++  SK      IAEFLRV         YL+RA+A+LV    
+     
+   
+ 
+
+  2 
+  gnl|CDD|366242 
+  pfam00680, RdRP_1, RNA dependent RNA polymerase.   
+  366242 
+  470 
+  
+    
+      1 
+      45.4253 
+      108 
+      8.1737e-06 
+      707 
+      1042 
+      231 
+      342 
+      -1 
+      0 
+      30 
+      45 
+      16 
+      120 
+      YCVDFEDFNSQHSVQSMKAVIDAYRDTFGHFLTQEQLAAVEWTRLSLNRVIVHDNQGLKMEYSAKGTLLSGWRLTTFMNSVLNYIYTQLIVPD-VVKSQNSLH-------NGDDVLLGSN 
+      LDVDYSAFDSTLSPFVFDLVEDIRS----EFCGGLEPTRLALLELLSNPIHILGGTIIKVE----GGLPSGQPATSVINSILNNIYVLYALIKHTGESELDDHETIRFISYGDDNLVAVN 
+        VD+  F+S  S      V D        F    +   +    L  N + +     +K+E    G L SG   T+ +NS+LN IY    +     +S+   H        GDD L+  N 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      88 
+      428314800 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  144 
+  ds2020-267_753 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  145 
+  ds2020-267_438 
+  No definition line 
+  234 
+
+
+  1 
+  gnl|CDD|365856 
+  pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. 
+  365856 
+  184 
+  
+    
+      1 
+      33.8002 
+      78 
+      0.000870142 
+      110 
+      220 
+      97 
+      136 
+      -3 
+      0 
+      11 
+      17 
+      3 
+      40 
+      VLPFGLKNDGAPYQRAMTALFHDMIHKE---MEVYVDDMI 
+      GLPQGLVLSPALFQLFMNELLRPLRKRAGLTLVRYADDIL 
+       LP GL    A +Q  M  L   +  +    +  Y DD++ 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  146 
+  ds2020-267_370 
+  No definition line 
+  242 
+
+
+  1 
+  gnl|CDD|376297 
+  pfam00146, NADHdh, NADH dehydrogenase.   
+  376297 
+  301 
+  
+    
+      1 
+      52.4668 
+      127 
+      2.41391e-10 
+      22 
+      111 
+      269 
+      298 
+      1 
+      0 
+      20 
+      24 
+      0 
+      30 
+      VRAAFPRYRYDQLMGLGRKVFLPLSLARVV 
+      IRATLPRFRYDQLMRLGWKVLLPLSLANLL 
+      +RA  PR+RYDQLM LG KV LPLSLA ++ 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      46 
+      74130676 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  147 
+  ds2020-267_798 
+  No definition line 
+  209 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  148 
+  ds2020-267_278 
+  No definition line 
+  258 
+
+
+  1 
+  gnl|CDD|365808 
+  pfam00012, HSP70, Hsp70 protein.  Hsp70 chaperones help to fold many proteins. Hsp70 assisted folding involves repeated cycles of substrate binding and release. Hsp70 activity is ATP dependent. Hsp70 proteins are made up of two regions: the amino terminus is the ATPase domain and the carboxyl terminus is the substrate binding region. 
+  365808 
+  598 
+  
+    
+      1 
+      77.6867 
+      192 
+      4.1355e-19 
+      50 
+      232 
+      417 
+      477 
+      2 
+      0 
+      27 
+      37 
+      0 
+      61 
+      SQTFSTAEDGQSQILLHLYRGDSAMAKSAHSLGTFQITGIAPMPRGEPSVRVEFLADTGGI 
+      SQIFSTAADNQTAVEIQVYQGEREMAPDNKLLGSFELDGIPPAPRGVPQIEVTFDIDANGI 
+      SQ FSTA D Q+ + + +Y+G+  MA     LG+F++ GI P PRG P + V F  D  GI 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      75910968 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  149 
+  ds2020-267_314 
+  No definition line 
+  252 
+
+ 
+  
+    
+      17919 
+      3004588 
+      49 
+      74429495 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  150 
+  ds2020-267_1251 
+  No definition line 
+  58 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      57087172 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  151 
+  ds2020-267_435 
+  No definition line 
+  234 
+
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  152 
+  ds2020-267_1017 
+  No definition line 
+  83 
+
+ 
+  
+    
+      17919 
+      3004588 
+      2 
+      74218750 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  153 
+  ds2020-267_1214 
+  No definition line 
+  60 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  154 
+  ds2020-267_960 
+  No definition line 
+  125 
+
+ 
+  
+    
+      17919 
+      3004588 
+      14 
+      74350494 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  155 
+  ds2020-267_772 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  156 
+  ds2020-267_937 
+  No definition line 
+  129 
+
+ 
+  
+    
+      17919 
+      3004588 
+      16 
+      73382868 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  157 
+  ds2020-267_812 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  158 
+  ds2020-267_66 
+  No definition line 
+  460 
+
+ 
+  
+    
+      17919 
+      3004588 
+      79 
+      117585038 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  159 
+  ds2020-267_425 
+  No definition line 
+  235 
+
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  160 
+  ds2020-267_1010 
+  No definition line 
+  96 
+
+ 
+  
+    
+      17919 
+      3004588 
+      6 
+      75323924 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  161 
+  ds2020-267_727 
+  No definition line 
+  213 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  162 
+  ds2020-267_745 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  163 
+  ds2020-267_512 
+  No definition line 
+  225 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  164 
+  ds2020-267_899 
+  No definition line 
+  146 
+
+ 
+  
+    
+      17919 
+      3004588 
+      20 
+      74093824 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  165 
+  ds2020-267_1020 
+  No definition line 
+  81 
+
+ 
+  
+    
+      17919 
+      3004588 
+      2 
+      74218750 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  166 
+  ds2020-267_724 
+  No definition line 
+  213 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  167 
+  ds2020-267_1163 
+  No definition line 
+  64 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      63096348 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  168 
+  ds2020-267_62 
+  No definition line 
+  464 
+
+ 
+  
+    
+      17919 
+      3004588 
+      79 
+      119174025 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  169 
+  ds2020-267_824 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  170 
+  ds2020-267_495 
+  No definition line 
+  227 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  171 
+  ds2020-267_479 
+  No definition line 
+  229 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      73724343 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  172 
+  ds2020-267_437 
+  No definition line 
+  234 
+
+ 
+  
+    
+      17919 
+      3004588 
+      44 
+      75349168 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  173 
+  ds2020-267_947 
+  No definition line 
+  127 
+
+ 
+  
+    
+      17919 
+      3004588 
+      15 
+      73866681 
+      0.065298648 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  174 
+  ds2020-267_531 
+  No definition line 
+  224 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  175 
+  ds2020-267_454 
+  No definition line 
+  232 
+
+ 
+  
+    
+      17919 
+      3004588 
+      43 
+      75958414 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  176 
+  ds2020-267_931 
+  No definition line 
+  129 
+
+ 
+  
+    
+      17919 
+      3004588 
+      16 
+      73382868 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  177 
+  ds2020-267_364 
+  No definition line 
+  243 
+
+
+  1 
+  gnl|CDD|365952 
+  pfam00216, Bac_DNA_binding, Bacterial DNA-binding protein.   
+  365952 
+  88 
+  
+    
+      1 
+      50.2082 
+      121 
+      1.5507e-10 
+      134 
+      241 
+      53 
+      88 
+      -3 
+      0 
+      23 
+      24 
+      0 
+      36 
+      ARPQRKRRNPATGEAIQIPAKKAPIFKAGKALKDAV 
+      KRAARTGRNPKTGEAITIPAKKVVKFKPGKELKEAV 
+       R  R  RNP TGEAI IPAKK   FK GK LK+AV 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      73521430 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  178 
+  ds2020-267_790 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  179 
+  ds2020-267_953 
+  No definition line 
+  126 
+
+ 
+  
+    
+      17919 
+      3004588 
+      15 
+      73866681 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  180 
+  ds2020-267_340 
+  No definition line 
+  247 
+
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      75683825 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  181 
+  ds2020-267_822 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  182 
+  ds2020-267_1241 
+  No definition line 
+  59 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      57087172 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  183 
+  ds2020-267_558 
+  No definition line 
+  222 
+
+
+  1 
+  gnl|CDD|377116 
+  pfam03737, RraA-like, Aldolase/RraA.  Members of this family include regulator of ribonuclease E activity A (RraA) and 4-hydroxy-4-methyl-2-oxoglutarate (HMG)/4-carboxy- 4-hydroxy-2-oxoadipate (CHA) aldolase, also known as RraA-like protein. RraA acts as a trans-acting modulator of RNA turnover, binding essential endonuclease RNase E and inhibiting RNA processing. RraA-like proteins seem to contain aldolase and/or decarboxylase activity either in place of or in addition to the RNase E inhibitor functions. 
+  377116 
+  147 
+  
+    
+      1 
+      57.5038 
+      140 
+      4.93695e-13 
+      57 
+      179 
+      104 
+      147 
+      -2 
+      0 
+      18 
+      28 
+      3 
+      44 
+      VFA---IPRKSNRKGVGETDIEISFGGLTINSGMYVYADNNGII 
+      VFALGTTPRGSPKKGGGEVNVPVTIGGVTVRPGDIVVADEDGVV 
+      VFA    PR S +KG GE ++ ++ GG+T+  G  V AD +G++ 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  184 
+  ds2020-267_43 
+  No definition line 
+  563 
+
+ 
+  
+    
+      17919 
+      3004588 
+      81 
+      164633794 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  185 
+  ds2020-267_702 
+  No definition line 
+  214 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  186 
+  ds2020-267_1230 
+  No definition line 
+  60 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  187 
+  ds2020-267_1186 
+  No definition line 
+  62 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  188 
+  ds2020-267_925 
+  No definition line 
+  131 
+
+ 
+  
+    
+      17919 
+      3004588 
+      16 
+      73382868 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  189 
+  ds2020-267_122 
+  No definition line 
+  338 
+
+ 
+  
+    
+      17919 
+      3004588 
+      70 
+      73510836 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  190 
+  ds2020-267_770 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  191 
+  ds2020-267_1078 
+  No definition line 
+  73 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  192 
+  ds2020-267_102 
+  No definition line 
+  375 
+
+ 
+  
+    
+      17919 
+      3004588 
+      76 
+      80494456 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  193 
+  ds2020-267_713 
+  No definition line 
+  213 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  194 
+  ds2020-267_660 
+  No definition line 
+  216 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  195 
+  ds2020-267_1147 
+  No definition line 
+  65 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      63096348 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  196 
+  ds2020-267_760 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  197 
+  ds2020-267_542 
+  No definition line 
+  224 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  198 
+  ds2020-267_11 
+  No definition line 
+  1579 
+
+ 
+  
+    
+      17919 
+      3004588 
+      91 
+      597672165 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  199 
+  ds2020-267_209 
+  No definition line 
+  277 
+
+ 
+  
+    
+      17919 
+      3004588 
+      55 
+      74704591 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  200 
+  ds2020-267_618 
+  No definition line 
+  218 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      76089651 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  201 
+  ds2020-267_1133 
+  No definition line 
+  66 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      66100936 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  202 
+  ds2020-267_939 
+  No definition line 
+  128 
+
+ 
+  
+    
+      17919 
+      3004588 
+      15 
+      73866681 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  203 
+  ds2020-267_749 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  204 
+  ds2020-267_1008 
+  No definition line 
+  96 
+
+ 
+  
+    
+      17919 
+      3004588 
+      6 
+      75323924 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  205 
+  ds2020-267_135 
+  No definition line 
+  323 
+
+ 
+  
+    
+      17919 
+      3004588 
+      66 
+      74699294 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  206 
+  ds2020-267_218 
+  No definition line 
+  274 
+
+
+  1 
+  gnl|CDD|279664 
+  pfam01348, Intron_maturas2, Type II intron maturase.  Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X. 
+  279664 
+  140 
+  
+    
+      1 
+      38.6047 
+      90 
+      1.66328e-05 
+      51 
+      257 
+      32 
+      100 
+      3 
+      0 
+      21 
+      32 
+      0 
+      69 
+      PIHVACLTNVSDGDIVNWSAGIAINPLSYYRCRDNLYQVRTIVDHQIRWSAIFTLAHKHKSSARNIILK 
+      PRSVGRWTDLDDRDILLRYNAIIRGILNYYSFADNKKRLYTRIYYILRLSCAKTLARKLKLGTVRKVIK 
+      P  V   T++ D DI+     I    L+YY   DN  ++ T + + +R S   TLA K K      ++K 
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      54 
+      75367594 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  207 
+  ds2020-267_777 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  208 
+  ds2020-267_1105 
+  No definition line 
+  71 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      69105524 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  209 
+  ds2020-267_972 
+  No definition line 
+  123 
+
+ 
+  
+    
+      17919 
+      3004588 
+      14 
+      74350494 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  210 
+  ds2020-267_1070 
+  No definition line 
+  73 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  211 
+  ds2020-267_835 
+  No definition line 
+  207 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  212 
+  ds2020-267_286 
+  No definition line 
+  257 
+
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      73802330 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  213 
+  ds2020-267_820 
+  No definition line 
+  208 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  214 
+  ds2020-267_324 
+  No definition line 
+  250 
+
+ 
+  
+    
+      17919 
+      3004588 
+      48 
+      75056660 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  215 
+  ds2020-267_363 
+  No definition line 
+  243 
+
+
+  1 
+  gnl|CDD|366086 
+  pfam00416, Ribosomal_S13, Ribosomal protein S13/S18.  This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes. 
+  366086 
+  109 
+  
+    
+      1 
+      37.3005 
+      87 
+      2.02528e-05 
+      15 
+      134 
+      2 
+      41 
+      -2 
+      0 
+      14 
+      21 
+      0 
+      40 
+      ISGARSVADEQVRIASTKIDGIGPKKAIQVRYRLGISGDI 
+      ILGTDIDGDKKVEIALTYIKGIGRRRANIILKKAGVDLDK 
+      I G     D++V IA T I GIG ++A  +  + G+  D  
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      73521430 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  216 
+  ds2020-267_674 
+  No definition line 
+  215 
+
+ 
+  
+    
+      17919 
+      3004588 
+      39 
+      73783904 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  217 
+  ds2020-267_863 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.070739016 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  218 
+  ds2020-267_1109 
+  No definition line 
+  70 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      69105524 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  219 
+  ds2020-267_45 
+  No definition line 
+  540 
+
+ 
+  
+    
+      17919 
+      3004588 
+      81 
+      153761751 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  220 
+  ds2020-267_746 
+  No definition line 
+  211 
+
+
+  1 
+  gnl|CDD|279788 
+  pfam01490, Aa_trans, Transmembrane amino acid transporter protein.  This transmembrane region is found in many amino acid transporters including UNC-47 and MTR. UNC-47 encodes a vesicular amino butyric acid (GABA) transporter, (VGAT). UNC-47 is predicted to have 10 transmembrane domains. MTR is a N system amino acid transporter system protein involved in methyltryptophan resistance. Other members of this family include proline transporters and amino acid permeases. 
+  279788 
+  410 
+  
+    
+      1 
+      35.3596 
+      82 
+      0.000177299 
+      21 
+      176 
+      358 
+      410 
+      3 
+      0 
+      11 
+      21 
+      1 
+      53 
+      WPLAIYFPVEMYFVQKKI-GSWTRKWIVLEAFSLVCFLVTVVGVIGSVQGLIS 
+      APLSFILPPLFHLKLKKTKKKSQEKLWKPDILDVICIVIGLLLMAYGVAGLIL 
+       PL+   P   +   KK       K    +   ++C ++ ++ +   V GLI  
+     
+   
+ 
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+ 
+
+  221 
+  ds2020-267_403 
+  No definition line 
+  239 
+
+ 
+  
+    
+      17919 
+      3004588 
+      45 
+      74739922 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  222 
+  ds2020-267_350 
+  No definition line 
+  245 
+
+ 
+  
+    
+      17919 
+      3004588 
+      47 
+      73521430 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  223 
+  ds2020-267_144 
+  No definition line 
+  315 
+
+ 
+  
+    
+      17919 
+      3004588 
+      65 
+      73594120 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  224 
+  ds2020-267_1049 
+  No definition line 
+  75 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      75114700 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  225 
+  ds2020-267_499 
+  No definition line 
+  227 
+
+ 
+  
+    
+      17919 
+      3004588 
+      42 
+      74315670 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  226 
+  ds2020-267_22 
+  No definition line 
+  841 
+
+ 
+  
+    
+      17919 
+      3004588 
+      85 
+      288887235 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  227 
+  ds2020-267_862 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  228 
+  ds2020-267_294 
+  No definition line 
+  255 
+
+ 
+  
+    
+      17919 
+      3004588 
+      50 
+      73802330 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  229 
+  ds2020-267_31 
+  No definition line 
+  712 
+
+ 
+  
+    
+      17919 
+      3004588 
+      84 
+      229406976 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  230 
+  ds2020-267_1019 
+  No definition line 
+  81 
+
+ 
+  
+    
+      17919 
+      3004588 
+      2 
+      74218750 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  231 
+  ds2020-267_747 
+  No definition line 
+  211 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  232 
+  ds2020-267_543 
+  No definition line 
+  223 
+
+ 
+  
+    
+      17919 
+      3004588 
+      41 
+      74906997 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  233 
+  ds2020-267_1072 
+  No definition line 
+  73 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      72110112 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  234 
+  ds2020-267_784 
+  No definition line 
+  210 
+
+ 
+  
+    
+      17919 
+      3004588 
+      38 
+      74357312 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  235 
+  ds2020-267_879 
+  No definition line 
+  206 
+
+ 
+  
+    
+      17919 
+      3004588 
+      36 
+      75504128 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  236 
+  ds2020-267_1120 
+  No definition line 
+  67 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      66100936 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  237 
+  ds2020-267_1126 
+  No definition line 
+  66 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      66100936 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  238 
+  ds2020-267_1218 
+  No definition line 
+  60 
+
+ 
+  
+    
+      17919 
+      3004588 
+      0 
+      60091760 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  239 
+  ds2020-267_915 
+  No definition line 
+  134 
+
+ 
+  
+    
+      17919 
+      3004588 
+      16 
+      76100752 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  240 
+  ds2020-267_1026 
+  No definition line 
+  79 
+
+ 
+  
+    
+      17919 
+      3004588 
+      1 
+      74666725 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+
+  241 
+  ds2020-267_845 
+  No definition line 
+  207 
+
+ 
+  
+    
+      17919 
+      3004588 
+      37 
+      74930720 
+      0.041 
+      0.267 
+      0.14 
+     
+   
+  No hits found 
+ 
+   
+ 
diff -r 000000000000 -r b82ce29791e7 virAnnot_blast2tsv.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/virAnnot_blast2tsv.xml	Wed Aug 21 13:12:59 2024 +0000
@@ -0,0 +1,87 @@
+
+    convert XML blast results to tabular file with taxonomic informations 
+    
+        macros.xml 
+     
+    
+         
+    
+        BLASTX 
+            BLASTP 
+            TBLASTX 
+            BLASTN 
+            DIAMOND 
+        
+        0 
+            0.1 
+            0.01 
+            0.001 
+            0.0001 
+        
+         
+    
+         
+    
+        
+            
+                
+                     
+             
+         
+        
+            
+                
+                     
+             
+            
+                
+                     
+             
+         
+     
+