Mercurial > repos > peterjc > blast_rbh
changeset 8:d3eb5cda7270 draft
Uploaded v0.1.2, self comparison and easier command line API
author | peterjc |
---|---|
date | Tue, 10 Jun 2014 10:09:50 -0400 |
parents | 493243f8eb88 |
children | 95190ebd4657 |
files | tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml tools/blast_rbh/repository_dependencies.xml |
diffstat | 4 files changed, 71 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/blast_rbh/README.rst Wed May 21 06:40:30 2014 -0400 +++ b/tools/blast_rbh/README.rst Tue Jun 10 10:09:50 2014 -0400 @@ -50,6 +50,8 @@ Version Changes ------- ---------------------------------------------------------------------- v0.1.0 - Initial public release, targetting NCBI BLAST+ 2.2.29 +v0.1.1 - Supports self-comparison, sometimes useful for spotting duplicates. +v0.1.2 - Using optparse for command line API. ======= ======================================================================
--- a/tools/blast_rbh/blast_rbh.py Wed May 21 06:40:30 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Tue Jun 10 10:09:50 2014 -0400 @@ -17,6 +17,7 @@ import sys import tempfile import shutil +from optparse import OptionParser def stop_err( msg ): sys.stderr.write("%s\n" % msg) @@ -29,38 +30,70 @@ if "--version" in sys.argv[1:]: #TODO - Capture version of BLAST+ binaries too? - print "BLAST RBH v0.1.0" + print "BLAST RBH v0.1.1" sys.exit(0) #Parse Command Line -#TODO - optparse -try: - fasta_a, fasta_b, dbtype, blast_type, min_identity, min_coverage, out_file = sys.argv[1:] -except: - stop_err("Expect 7 arguments, got %i" % (len(sys.argv) - 1)) +usage = """Use as follows: + +$ python blast_rbh.py [options] A.fasta B.fasta +""" +parser = OptionParser(usage=usage) +parser.add_option("-a", "--alphabet", dest="dbtype", + default=None, + help="Alphabet type (nucl or prot)") +parser.add_option("-t", "--task", dest="task", + default=None, + help="BLAST task (e.g. blastp, blastn, megablast)") +parser.add_option("-i","--identity", dest="min_identity", + default="0", + help="Minimum percentage identity (optional, default 0)") +parser.add_option("-c", "--coverage", dest="min_coverage", + default="0", + help="Minimum HSP coverage (optional, default 0)") +parser.add_option("-o", "--output", dest="output", + default=None, metavar="FILE", + help="Output filename") +options, args = parser.parse_args() + +if len(args) != 2: + stop_err("Expects two input FASTA filenames") +fasta_a, fasta_b = args if not os.path.isfile(fasta_a): stop_err("Missing input file for species A: %r" % fasta_a) if not os.path.isfile(fasta_b): stop_err("Missing input file for species B: %r" % fasta_b) if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): - #TODO - is this ever useful, e.g. positive control? - stop_err("Asked to compare the FASTA file to itself!") + self_comparison = True + print("Doing self comparison; ignoring self matches.") +else: + self_comparison = False + +if not options.output: + stop_err("Output filename required, e.g. -o example.tab") +out_file = options.output try: - min_identity = float(min_identity) + min_identity = float(options.min_identity) except ValueError: stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) if not (0 <= min_identity <= 100): stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) try: - min_coverage = float(min_coverage) + min_coverage = float(options.min_coverage) except ValueError: stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) if not (0 <= min_coverage <= 100): stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) +if not options.task: + stop_err("Missing BLAST task, e.g. -t blastp") +blast_type = options.task +if not options.dbtype: + stop_err("Missing database type, -a nucl, or -a prot") +dbtype = options.dbtype if dbtype == "nucl": if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: blast_cmd = "blastn -task %s" % blast_type @@ -101,7 +134,7 @@ tie_warning = 0 -def best_hits(blast_tabular): +def best_hits(blast_tabular, ignore_self=False): """Iterate over BLAST tabular output, returns best hits as 2-tuples. Each return value is (query name, tuple of value for the best hit). @@ -124,6 +157,8 @@ continue a = parts[c_query] b = parts[c_match] + if ignore_self and a == b: + continue score = float(parts[c_score]) qlen = int(parts[c_qlen]) length = int(parts[c_length]) @@ -181,13 +216,13 @@ #print("BLAST species B vs species A done.") -best_b_vs_a = dict(best_hits(b_vs_a)) +best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) count = 0 outfile = open(out_file, 'w') outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") -for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b): +for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): if b not in best_b_vs_a: #Match b has no best hit continue
--- a/tools/blast_rbh/blast_rbh.xml Wed May 21 06:40:30 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Tue Jun 10 10:09:50 2014 -0400 @@ -1,4 +1,4 @@ -<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.0"> +<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.2"> <description>from two FASTA files</description> <requirements> <requirement type="binary">makeblastdb</requirement> @@ -10,13 +10,16 @@ blast_rbh.py --version </version_command> <command interpreter="python"> -blast_rbh.py "$fasta_a" "$fasta_b" $seq.dbtype +blast_rbh.py "$fasta_a" "$fasta_b" +-a $seq.dbtype #if $seq.dbtype=="nucl" - $seq.nucl_type +-t $seq.nucl_type #else - $seq.prot_type +-t $seq.prot_type #end if -$identity $q_cover "$output" +-i $identity +-c $q_cover +-o "$output" </command> <stdio> <!-- Anything other than zero is an error --> @@ -134,7 +137,7 @@ <test> <param name="fasta_a" value="k12_ten_proteins.fasta" ftype="fasta"/> <param name="fasta_b" value="k12_edited_proteins.fasta" ftype="fasta"/> - <param name="dbtype" value="nprot"/> + <param name="dbtype" value="prot"/> <param name="nucl_type" value="blastp"/> <param name="identity" value="0.0"/> <param name="q_cover" value="0.0"/> @@ -143,12 +146,22 @@ <test> <param name="fasta_a" value="k12_edited_proteins.fasta" ftype="fasta"/> <param name="fasta_b" value="k12_ten_proteins.fasta" ftype="fasta"/> - <param name="dbtype" value="nprot"/> + <param name="dbtype" value="prot"/> <param name="nucl_type" value="blastp"/> <param name="identity" value="0.0"/> <param name="q_cover" value="0.0"/> <output name="output" file="rbh_blastp_k12.tabular" ftype="tabular"/> </test> + <!-- this tests self-comparison --> + <test> + <param name="fasta_a" value="k12_edited_proteins.fasta" ftype="fasta"/> + <param name="fasta_b" value="k12_edited_proteins.fasta" ftype="fasta"/> + <param name="dbtype" value="prot"/> + <param name="nucl_type" value="blastp"/> + <param name="identity" value="80.0"/> + <param name="q_cover" value="80.0"/> + <output name="output" file="rbh_blastp_k12_self.tabular" ftype="tabular"/> + </test> </tests> <help> **What it does**
--- a/tools/blast_rbh/repository_dependencies.xml Wed May 21 06:40:30 2014 -0400 +++ b/tools/blast_rbh/repository_dependencies.xml Tue Jun 10 10:09:50 2014 -0400 @@ -1,4 +1,4 @@ <?xml version="1.0"?> <repositories description="Requires the NCBI BLAST+ binaries."> - <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </repositories>