Mercurial > repos > peterjc > blast_rbh

--- a/tools/blast_rbh/README.rst	Fri Feb 22 09:55:36 2019 -0500
+++ b/tools/blast_rbh/README.rst	Mon Apr 01 08:05:31 2019 -0400
@@ -43,9 +43,10 @@
 Manual Installation
 ===================

-There are just two files to install:
+There are just three files to install:

 - ``blast_rbh.py`` (the Python script)
+- ``best_hits.py`` (helper script, put in same directory)
 - ``blast_rbh.xml`` (the Galaxy tool definition)

 The suggested location is in a ``tools/blast_rbh/`` folder. You will then
@@ -93,6 +94,7 @@
         - Tweak Python script to work under Python 2 or Python 3.
 v0.1.12 - Use ``<command detect_errors="aggressive">`` (internal change only).
         - Single quote command line arguments (internal change only).
+v0.2.0  - Refactored to use more than one Python file (internal change only).
 ======= ======================================================================
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blast_rbh/best_hits.py	Mon Apr 01 08:05:31 2019 -0400
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+"""Helper code for BLAST Reciprocal Best Hit (RBH) from two BLAST tabular files.
+
+This module defines a function best_hits to return the best hit in a BLAST
+tabular file, intended for use in finding BLAST Reciprocal Best Hits (RBH).
+
+The code in this module originated from blast_rbh.py in
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
+
+Please cite the author per instructions in
+https://github.com/peterjc/galaxy_blast/blob/master/tools/blast_rbh/README.rst
+"""
+
+import sys
+
+tie_warning = 0
+
+c_query = 0
+c_match = 1
+c_score = 2
+c_identity = 3
+c_coverage = 4
+c_qlen = 5
+c_length = 6
+
+cols = "qseqid sseqid bitscore pident qcovhsp qlen length"
+
+
+def best_hits(blast_tabular, min_identity=70, min_coverage=50, ignore_self=False):
+    """Parse BLAST tabular output returning the best hit.
+
+    Argument blast_tabular should be a filename, containing BLAST tabular
+    output with "qseqid sseqid bitscore pident qcovhsp qlen length" as
+    the columns.
+
+    Parses the tabular file, iterating over the results as 2-tuples
+    of (query name, tuple of values for the best hit).
+
+    One hit is returned for tied best hits to the same sequence
+    (e.g. repeated domains).
+
+    Tied best hits to different sequences are NOT returned. Instead,
+    global variable tie_warning is incremented.
+    """
+    global tie_warning
+    current = None
+    best_score = None
+    best = None
+    col_count = len(cols.split())
+    with open(blast_tabular) as h:
+        for line in h:
+            if line.startswith("#"):
+                continue
+            parts = line.rstrip("\n").split("\t")
+            if len(parts) != col_count:
+                # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
+                # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
+                sys.exit(
+                    "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
+                    "Note the qcovhsp field was only added in version 2.2.28\n"
+                    % (col_count, len(parts), line)
+                )
+            if (
+                float(parts[c_identity]) < min_identity
+                or float(parts[c_coverage]) < min_coverage
+            ):
+                continue
+            a = parts[c_query]
+            b = parts[c_match]
+            if ignore_self and a == b:
+                continue
+            score = float(parts[c_score])
+            qlen = int(parts[c_qlen])
+            length = int(parts[c_length])
+            # print("Considering hit for %s to %s with score %s..." % (a, b, score))
+            if current is None:
+                # First hit
+                assert best is None
+                assert best_score is None
+                best = dict()
+                # Now append this hit...
+            elif a != current:
+                # New hit
+                if len(best) == 1:
+                    # Unambiguous (no tied matches)
+                    yield current, list(best.values())[0]
+                else:
+                    # print("%s has %i equally good hits: %s"
+                    #       % (a, len(best), ", ".join(best)))
+                    tie_warning += 1
+                best = dict()
+                # Now append this hit...
+            elif score < best_score:
+                # print("No improvement for %s, %s < %s" % (a, score, best_score))
+                continue
+            elif score > best_score:
+                # This is better, discard old best
+                best = dict()
+                # Now append this hit...
+            else:
+                # print("Tied best hits for %s" % a)
+                assert best_score == score
+                # Now append this hit...
+            current = a
+            best_score = score
+            # This will collapse two equally good hits
+            # to the same target (e.g. duplicated domain)
+            best[b] = (
+                b,
+                score,
+                parts[c_score],
+                parts[c_identity],
+                parts[c_coverage],
+                qlen,
+                length,
+            )
+    # Best hit for final query, if unambiguous:
+    if current is not None:
+        if len(best) == 1:
+            yield current, list(best.values())[0]
+        else:
+            # print("%s has %i equally good hits: %s"
+            # % (a, len(best), ", ".join(best)))
+            tie_warning += 1
--- a/tools/blast_rbh/blast_rbh.py	Fri Feb 22 09:55:36 2019 -0500
+++ b/tools/blast_rbh/blast_rbh.py	Mon Apr 01 08:05:31 2019 -0400
@@ -24,6 +24,8 @@
 import shutil
 import sys
 import tempfile
+import best_hits
+

 from optparse import OptionParser

@@ -37,7 +39,7 @@

 if "--version" in sys.argv[1:]:
     # TODO - Capture version of BLAST+ binaries too?
-    print("BLAST RBH v0.1.11")
+    print("BLAST RBH v0.2.0")
     sys.exit(0)

 try:
@@ -193,109 +195,6 @@
     b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")

-cols = "qseqid sseqid bitscore pident qcovhsp qlen length"  # Or qcovs?
-c_query = 0
-c_match = 1
-c_score = 2
-c_identity = 3
-c_coverage = 4
-c_qlen = 5
-c_length = 6
-
-tie_warning = 0
-
-
-def best_hits(blast_tabular, ignore_self=False):
-    """Iterate over BLAST tabular output, returns best hits as 2-tuples.
-
-    Each return value is (query name, tuple of value for the best hit).
-
-    Tied best hits to different sequences are NOT returned.
-
-    One hit is returned for tied best hits to the same sequence
-    (e.g. repeated domains).
-    """
-    global tie_warning
-    current = None
-    best_score = None
-    best = None
-    col_count = len(cols.split())
-    with open(blast_tabular) as h:
-        for line in h:
-            if line.startswith("#"):
-                continue
-            parts = line.rstrip("\n").split("\t")
-            if len(parts) != col_count:
-                # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
-                # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
-                sys.exit(
-                    "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
-                    "Note the qcovhsp field was only added in version 2.2.28\n"
-                    % (col_count, len(parts), line)
-                )
-            if (
-                float(parts[c_identity]) < min_identity
-                or float(parts[c_coverage]) < min_coverage
-            ):
-                continue
-            a = parts[c_query]
-            b = parts[c_match]
-            if ignore_self and a == b:
-                continue
-            score = float(parts[c_score])
-            qlen = int(parts[c_qlen])
-            length = int(parts[c_length])
-            # print("Considering hit for %s to %s with score %s..." % (a, b, score))
-            if current is None:
-                # First hit
-                assert best is None
-                assert best_score is None
-                best = dict()
-                # Now append this hit...
-            elif a != current:
-                # New hit
-                if len(best) == 1:
-                    # Unambiguous (no tied matches)
-                    yield current, list(best.values())[0]
-                else:
-                    # print("%s has %i equally good hits: %s"
-                    #       % (a, len(best), ", ".join(best)))
-                    tie_warning += 1
-                best = dict()
-                # Now append this hit...
-            elif score < best_score:
-                # print("No improvement for %s, %s < %s" % (a, score, best_score))
-                continue
-            elif score > best_score:
-                # This is better, discard old best
-                best = dict()
-                # Now append this hit...
-            else:
-                # print("Tied best hits for %s" % a)
-                assert best_score == score
-                # Now append this hit...
-            current = a
-            best_score = score
-            # This will collapse two equally good hits
-            # to the same target (e.g. duplicated domain)
-            best[b] = (
-                b,
-                score,
-                parts[c_score],
-                parts[c_identity],
-                parts[c_coverage],
-                qlen,
-                length,
-            )
-    # Best hit for final query, if unambiguous:
-    if current is not None:
-        if len(best) == 1:
-            yield current, list(best.values())[0]
-        else:
-            # print("%s has %i equally good hits: %s"
-            # % (a, len(best), ", ".join(best)))
-            tie_warning += 1
-

 def check_duplicate_ids(filename):
     """Check for duplicate identifiers in a FASTA file."""
@@ -388,18 +287,20 @@
 # print("BLAST databases prepared.")
 run(
     '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)
+    % (blast_cmd, fasta_a, db_b, a_vs_b, best_hits.cols, threads)
 )
 # print("BLAST species A vs species B done.")
 if not self_comparison:
     run(
         '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-        % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)
+        % (blast_cmd, fasta_b, db_a, b_vs_a, best_hits.cols, threads)
     )
     # print("BLAST species B vs species A done.")


-best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
+best_b_vs_a = dict(
+    best_hits.best_hits(b_vs_a, min_identity, min_coverage, self_comparison)
+)


 count = 0
@@ -410,7 +311,7 @@
 for (
     a,
     (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length),
-) in best_hits(a_vs_b, self_comparison):
+) in best_hits.best_hits(a_vs_b, min_identity, min_coverage, self_comparison):
     if b not in best_b_vs_a:
         # Match b has no best hit
         continue
@@ -443,7 +344,7 @@
     count += 1
 outfile.close()
 print("Done, %i RBH found" % count)
-if tie_warning:
+if best_hits.tie_warning:
     sys.stderr.write(
         "Warning: Sequences with tied best hits found, "
         "you may have duplicates/clusters\n"
--- a/tools/blast_rbh/blast_rbh.xml	Fri Feb 22 09:55:36 2019 -0500
+++ b/tools/blast_rbh/blast_rbh.xml	Mon Apr 01 08:05:31 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.12">
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.2.0">
     <description>from two FASTA files</description>
     <requirements>
         <requirement type="package" version="1.67">biopython</requirement>