# HG changeset patch # User peterjc # Date 1554120331 14400 # Node ID 64369ee3aaf0efa9da8432efef1f37418226e36e # Parent 632d8517dad1de60927fe7d03c0ad3d4a242f43c planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh commit 0a172d2516a9d7e8d8fa26dff24c0c906b9d0c6a-dirty diff -r 632d8517dad1 -r 64369ee3aaf0 tools/blast_rbh/README.rst --- a/tools/blast_rbh/README.rst Fri Feb 22 09:55:36 2019 -0500 +++ b/tools/blast_rbh/README.rst Mon Apr 01 08:05:31 2019 -0400 @@ -43,9 +43,10 @@ Manual Installation =================== -There are just two files to install: +There are just three files to install: - ``blast_rbh.py`` (the Python script) +- ``best_hits.py`` (helper script, put in same directory) - ``blast_rbh.xml`` (the Galaxy tool definition) The suggested location is in a ``tools/blast_rbh/`` folder. You will then @@ -93,6 +94,7 @@ - Tweak Python script to work under Python 2 or Python 3. v0.1.12 - Use ```` (internal change only). - Single quote command line arguments (internal change only). +v0.2.0 - Refactored to use more than one Python file (internal change only). ======= ====================================================================== diff -r 632d8517dad1 -r 64369ee3aaf0 tools/blast_rbh/best_hits.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/blast_rbh/best_hits.py Mon Apr 01 08:05:31 2019 -0400 @@ -0,0 +1,124 @@ +#!/usr/bin/env python +"""Helper code for BLAST Reciprocal Best Hit (RBH) from two BLAST tabular files. + +This module defines a function best_hits to return the best hit in a BLAST +tabular file, intended for use in finding BLAST Reciprocal Best Hits (RBH). + +The code in this module originated from blast_rbh.py in +https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh + +Please cite the author per instructions in +https://github.com/peterjc/galaxy_blast/blob/master/tools/blast_rbh/README.rst +""" + +import sys + +tie_warning = 0 + +c_query = 0 +c_match = 1 +c_score = 2 +c_identity = 3 +c_coverage = 4 +c_qlen = 5 +c_length = 6 + +cols = "qseqid sseqid bitscore pident qcovhsp qlen length" + + +def best_hits(blast_tabular, min_identity=70, min_coverage=50, ignore_self=False): + """Parse BLAST tabular output returning the best hit. + + Argument blast_tabular should be a filename, containing BLAST tabular + output with "qseqid sseqid bitscore pident qcovhsp qlen length" as + the columns. + + Parses the tabular file, iterating over the results as 2-tuples + of (query name, tuple of values for the best hit). + + One hit is returned for tied best hits to the same sequence + (e.g. repeated domains). + + Tied best hits to different sequences are NOT returned. Instead, + global variable tie_warning is incremented. + """ + global tie_warning + current = None + best_score = None + best = None + col_count = len(cols.split()) + with open(blast_tabular) as h: + for line in h: + if line.startswith("#"): + continue + parts = line.rstrip("\n").split("\t") + if len(parts) != col_count: + # Using NCBI BLAST+ 2.2.27 the undefined field is ignored + # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( + sys.exit( + "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" + "Note the qcovhsp field was only added in version 2.2.28\n" + % (col_count, len(parts), line) + ) + if ( + float(parts[c_identity]) < min_identity + or float(parts[c_coverage]) < min_coverage + ): + continue + a = parts[c_query] + b = parts[c_match] + if ignore_self and a == b: + continue + score = float(parts[c_score]) + qlen = int(parts[c_qlen]) + length = int(parts[c_length]) + # print("Considering hit for %s to %s with score %s..." % (a, b, score)) + if current is None: + # First hit + assert best is None + assert best_score is None + best = dict() + # Now append this hit... + elif a != current: + # New hit + if len(best) == 1: + # Unambiguous (no tied matches) + yield current, list(best.values())[0] + else: + # print("%s has %i equally good hits: %s" + # % (a, len(best), ", ".join(best))) + tie_warning += 1 + best = dict() + # Now append this hit... + elif score < best_score: + # print("No improvement for %s, %s < %s" % (a, score, best_score)) + continue + elif score > best_score: + # This is better, discard old best + best = dict() + # Now append this hit... + else: + # print("Tied best hits for %s" % a) + assert best_score == score + # Now append this hit... + current = a + best_score = score + # This will collapse two equally good hits + # to the same target (e.g. duplicated domain) + best[b] = ( + b, + score, + parts[c_score], + parts[c_identity], + parts[c_coverage], + qlen, + length, + ) + # Best hit for final query, if unambiguous: + if current is not None: + if len(best) == 1: + yield current, list(best.values())[0] + else: + # print("%s has %i equally good hits: %s" + # % (a, len(best), ", ".join(best))) + tie_warning += 1 diff -r 632d8517dad1 -r 64369ee3aaf0 tools/blast_rbh/blast_rbh.py --- a/tools/blast_rbh/blast_rbh.py Fri Feb 22 09:55:36 2019 -0500 +++ b/tools/blast_rbh/blast_rbh.py Mon Apr 01 08:05:31 2019 -0400 @@ -24,6 +24,8 @@ import shutil import sys import tempfile +import best_hits + from optparse import OptionParser @@ -37,7 +39,7 @@ if "--version" in sys.argv[1:]: # TODO - Capture version of BLAST+ binaries too? - print("BLAST RBH v0.1.11") + print("BLAST RBH v0.2.0") sys.exit(0) try: @@ -193,109 +195,6 @@ b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") -cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs? -c_query = 0 -c_match = 1 -c_score = 2 -c_identity = 3 -c_coverage = 4 -c_qlen = 5 -c_length = 6 - -tie_warning = 0 - - -def best_hits(blast_tabular, ignore_self=False): - """Iterate over BLAST tabular output, returns best hits as 2-tuples. - - Each return value is (query name, tuple of value for the best hit). - - Tied best hits to different sequences are NOT returned. - - One hit is returned for tied best hits to the same sequence - (e.g. repeated domains). - """ - global tie_warning - current = None - best_score = None - best = None - col_count = len(cols.split()) - with open(blast_tabular) as h: - for line in h: - if line.startswith("#"): - continue - parts = line.rstrip("\n").split("\t") - if len(parts) != col_count: - # Using NCBI BLAST+ 2.2.27 the undefined field is ignored - # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( - sys.exit( - "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" - "Note the qcovhsp field was only added in version 2.2.28\n" - % (col_count, len(parts), line) - ) - if ( - float(parts[c_identity]) < min_identity - or float(parts[c_coverage]) < min_coverage - ): - continue - a = parts[c_query] - b = parts[c_match] - if ignore_self and a == b: - continue - score = float(parts[c_score]) - qlen = int(parts[c_qlen]) - length = int(parts[c_length]) - # print("Considering hit for %s to %s with score %s..." % (a, b, score)) - if current is None: - # First hit - assert best is None - assert best_score is None - best = dict() - # Now append this hit... - elif a != current: - # New hit - if len(best) == 1: - # Unambiguous (no tied matches) - yield current, list(best.values())[0] - else: - # print("%s has %i equally good hits: %s" - # % (a, len(best), ", ".join(best))) - tie_warning += 1 - best = dict() - # Now append this hit... - elif score < best_score: - # print("No improvement for %s, %s < %s" % (a, score, best_score)) - continue - elif score > best_score: - # This is better, discard old best - best = dict() - # Now append this hit... - else: - # print("Tied best hits for %s" % a) - assert best_score == score - # Now append this hit... - current = a - best_score = score - # This will collapse two equally good hits - # to the same target (e.g. duplicated domain) - best[b] = ( - b, - score, - parts[c_score], - parts[c_identity], - parts[c_coverage], - qlen, - length, - ) - # Best hit for final query, if unambiguous: - if current is not None: - if len(best) == 1: - yield current, list(best.values())[0] - else: - # print("%s has %i equally good hits: %s" - # % (a, len(best), ", ".join(best))) - tie_warning += 1 - def check_duplicate_ids(filename): """Check for duplicate identifiers in a FASTA file.""" @@ -388,18 +287,20 @@ # print("BLAST databases prepared.") run( '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads) + % (blast_cmd, fasta_a, db_b, a_vs_b, best_hits.cols, threads) ) # print("BLAST species A vs species B done.") if not self_comparison: run( '%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads) + % (blast_cmd, fasta_b, db_a, b_vs_a, best_hits.cols, threads) ) # print("BLAST species B vs species A done.") -best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) +best_b_vs_a = dict( + best_hits.best_hits(b_vs_a, min_identity, min_coverage, self_comparison) +) count = 0 @@ -410,7 +311,7 @@ for ( a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length), -) in best_hits(a_vs_b, self_comparison): +) in best_hits.best_hits(a_vs_b, min_identity, min_coverage, self_comparison): if b not in best_b_vs_a: # Match b has no best hit continue @@ -443,7 +344,7 @@ count += 1 outfile.close() print("Done, %i RBH found" % count) -if tie_warning: +if best_hits.tie_warning: sys.stderr.write( "Warning: Sequences with tied best hits found, " "you may have duplicates/clusters\n" diff -r 632d8517dad1 -r 64369ee3aaf0 tools/blast_rbh/blast_rbh.xml --- a/tools/blast_rbh/blast_rbh.xml Fri Feb 22 09:55:36 2019 -0500 +++ b/tools/blast_rbh/blast_rbh.xml Mon Apr 01 08:05:31 2019 -0400 @@ -1,4 +1,4 @@ - + from two FASTA files biopython