Mercurial > repos > peterjc > blast_rbh
view tools/blast_rbh/best_hits.py @ 39:64369ee3aaf0 draft
planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh commit 0a172d2516a9d7e8d8fa26dff24c0c906b9d0c6a-dirty
author | peterjc |
---|---|
date | Mon, 01 Apr 2019 08:05:31 -0400 |
parents | |
children | 2bca27f8abb2 |
line wrap: on
line source
#!/usr/bin/env python """Helper code for BLAST Reciprocal Best Hit (RBH) from two BLAST tabular files. This module defines a function best_hits to return the best hit in a BLAST tabular file, intended for use in finding BLAST Reciprocal Best Hits (RBH). The code in this module originated from blast_rbh.py in https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh Please cite the author per instructions in https://github.com/peterjc/galaxy_blast/blob/master/tools/blast_rbh/README.rst """ import sys tie_warning = 0 c_query = 0 c_match = 1 c_score = 2 c_identity = 3 c_coverage = 4 c_qlen = 5 c_length = 6 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" def best_hits(blast_tabular, min_identity=70, min_coverage=50, ignore_self=False): """Parse BLAST tabular output returning the best hit. Argument blast_tabular should be a filename, containing BLAST tabular output with "qseqid sseqid bitscore pident qcovhsp qlen length" as the columns. Parses the tabular file, iterating over the results as 2-tuples of (query name, tuple of values for the best hit). One hit is returned for tied best hits to the same sequence (e.g. repeated domains). Tied best hits to different sequences are NOT returned. Instead, global variable tie_warning is incremented. """ global tie_warning current = None best_score = None best = None col_count = len(cols.split()) with open(blast_tabular) as h: for line in h: if line.startswith("#"): continue parts = line.rstrip("\n").split("\t") if len(parts) != col_count: # Using NCBI BLAST+ 2.2.27 the undefined field is ignored # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( sys.exit( "Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" "Note the qcovhsp field was only added in version 2.2.28\n" % (col_count, len(parts), line) ) if ( float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage ): continue a = parts[c_query] b = parts[c_match] if ignore_self and a == b: continue score = float(parts[c_score]) qlen = int(parts[c_qlen]) length = int(parts[c_length]) # print("Considering hit for %s to %s with score %s..." % (a, b, score)) if current is None: # First hit assert best is None assert best_score is None best = dict() # Now append this hit... elif a != current: # New hit if len(best) == 1: # Unambiguous (no tied matches) yield current, list(best.values())[0] else: # print("%s has %i equally good hits: %s" # % (a, len(best), ", ".join(best))) tie_warning += 1 best = dict() # Now append this hit... elif score < best_score: # print("No improvement for %s, %s < %s" % (a, score, best_score)) continue elif score > best_score: # This is better, discard old best best = dict() # Now append this hit... else: # print("Tied best hits for %s" % a) assert best_score == score # Now append this hit... current = a best_score = score # This will collapse two equally good hits # to the same target (e.g. duplicated domain) best[b] = ( b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length, ) # Best hit for final query, if unambiguous: if current is not None: if len(best) == 1: yield current, list(best.values())[0] else: # print("%s has %i equally good hits: %s" # % (a, len(best), ", ".join(best))) tie_warning += 1