view tools/blast_rbh/best_hits.py @ 39:64369ee3aaf0 draft

planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh commit 0a172d2516a9d7e8d8fa26dff24c0c906b9d0c6a-dirty
author peterjc
date Mon, 01 Apr 2019 08:05:31 -0400
parents
children 2bca27f8abb2
line wrap: on
line source

#!/usr/bin/env python
"""Helper code for BLAST Reciprocal Best Hit (RBH) from two BLAST tabular files.

This module defines a function best_hits to return the best hit in a BLAST
tabular file, intended for use in finding BLAST Reciprocal Best Hits (RBH).

The code in this module originated from blast_rbh.py in
https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh

Please cite the author per instructions in
https://github.com/peterjc/galaxy_blast/blob/master/tools/blast_rbh/README.rst
"""

import sys

tie_warning = 0

c_query = 0
c_match = 1
c_score = 2
c_identity = 3
c_coverage = 4
c_qlen = 5
c_length = 6

cols = "qseqid sseqid bitscore pident qcovhsp qlen length"


def best_hits(blast_tabular, min_identity=70, min_coverage=50, ignore_self=False):
    """Parse BLAST tabular output returning the best hit.

    Argument blast_tabular should be a filename, containing BLAST tabular
    output with "qseqid sseqid bitscore pident qcovhsp qlen length" as
    the columns.

    Parses the tabular file, iterating over the results as 2-tuples
    of (query name, tuple of values for the best hit).

    One hit is returned for tied best hits to the same sequence
    (e.g. repeated domains).

    Tied best hits to different sequences are NOT returned. Instead,
    global variable tie_warning is incremented.
    """
    global tie_warning
    current = None
    best_score = None
    best = None
    col_count = len(cols.split())
    with open(blast_tabular) as h:
        for line in h:
            if line.startswith("#"):
                continue
            parts = line.rstrip("\n").split("\t")
            if len(parts) != col_count:
                # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
                # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
                sys.exit(
                    "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
                    "Note the qcovhsp field was only added in version 2.2.28\n"
                    % (col_count, len(parts), line)
                )
            if (
                float(parts[c_identity]) < min_identity
                or float(parts[c_coverage]) < min_coverage
            ):
                continue
            a = parts[c_query]
            b = parts[c_match]
            if ignore_self and a == b:
                continue
            score = float(parts[c_score])
            qlen = int(parts[c_qlen])
            length = int(parts[c_length])
            # print("Considering hit for %s to %s with score %s..." % (a, b, score))
            if current is None:
                # First hit
                assert best is None
                assert best_score is None
                best = dict()
                # Now append this hit...
            elif a != current:
                # New hit
                if len(best) == 1:
                    # Unambiguous (no tied matches)
                    yield current, list(best.values())[0]
                else:
                    # print("%s has %i equally good hits: %s"
                    #       % (a, len(best), ", ".join(best)))
                    tie_warning += 1
                best = dict()
                # Now append this hit...
            elif score < best_score:
                # print("No improvement for %s, %s < %s" % (a, score, best_score))
                continue
            elif score > best_score:
                # This is better, discard old best
                best = dict()
                # Now append this hit...
            else:
                # print("Tied best hits for %s" % a)
                assert best_score == score
                # Now append this hit...
            current = a
            best_score = score
            # This will collapse two equally good hits
            # to the same target (e.g. duplicated domain)
            best[b] = (
                b,
                score,
                parts[c_score],
                parts[c_identity],
                parts[c_coverage],
                qlen,
                length,
            )
    # Best hit for final query, if unambiguous:
    if current is not None:
        if len(best) == 1:
            yield current, list(best.values())[0]
        else:
            # print("%s has %i equally good hits: %s"
            # % (a, len(best), ", ".join(best)))
            tie_warning += 1