changeset 8:d3eb5cda7270 draft

Uploaded v0.1.2, self comparison and easier command line API
author peterjc
date Tue, 10 Jun 2014 10:09:50 -0400
parents 493243f8eb88
children 95190ebd4657
files tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml tools/blast_rbh/repository_dependencies.xml
diffstat 4 files changed, 71 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/tools/blast_rbh/README.rst	Wed May 21 06:40:30 2014 -0400
+++ b/tools/blast_rbh/README.rst	Tue Jun 10 10:09:50 2014 -0400
@@ -50,6 +50,8 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.1.0  - Initial public release, targetting NCBI BLAST+ 2.2.29
+v0.1.1  - Supports self-comparison, sometimes useful for spotting duplicates.
+v0.1.2  - Using optparse for command line API.
 ======= ======================================================================
 
 
--- a/tools/blast_rbh/blast_rbh.py	Wed May 21 06:40:30 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Tue Jun 10 10:09:50 2014 -0400
@@ -17,6 +17,7 @@
 import sys
 import tempfile
 import shutil
+from optparse import OptionParser
 
 def stop_err( msg ):
     sys.stderr.write("%s\n" % msg)
@@ -29,38 +30,70 @@
 
 if "--version" in sys.argv[1:]:
     #TODO - Capture version of BLAST+ binaries too?
-    print "BLAST RBH v0.1.0"
+    print "BLAST RBH v0.1.1"
     sys.exit(0)
 
 #Parse Command Line
-#TODO - optparse
-try:
-    fasta_a, fasta_b, dbtype, blast_type, min_identity, min_coverage, out_file = sys.argv[1:]
-except:
-    stop_err("Expect 7 arguments, got %i" % (len(sys.argv) - 1))
+usage = """Use as follows:
+
+$ python blast_rbh.py [options] A.fasta B.fasta
+"""
 
+parser = OptionParser(usage=usage)
+parser.add_option("-a", "--alphabet", dest="dbtype",
+                  default=None,
+                  help="Alphabet type (nucl or prot)")
+parser.add_option("-t", "--task", dest="task",
+                  default=None,
+                  help="BLAST task (e.g. blastp, blastn, megablast)")
+parser.add_option("-i","--identity", dest="min_identity",
+                  default="0",
+                  help="Minimum percentage identity (optional, default 0)")
+parser.add_option("-c", "--coverage", dest="min_coverage",
+                  default="0",
+                  help="Minimum HSP coverage (optional, default 0)")
+parser.add_option("-o", "--output", dest="output",
+                  default=None, metavar="FILE",
+                  help="Output filename")
+options, args = parser.parse_args()
+
+if len(args) != 2:
+    stop_err("Expects two input FASTA filenames")
+fasta_a, fasta_b = args
 if not os.path.isfile(fasta_a):
     stop_err("Missing input file for species A: %r" % fasta_a)
 if not os.path.isfile(fasta_b):
     stop_err("Missing input file for species B: %r" % fasta_b)
 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
-    #TODO - is this ever useful, e.g. positive control?
-    stop_err("Asked to compare the FASTA file to itself!")
+    self_comparison = True
+    print("Doing self comparison; ignoring self matches.")
+else:
+    self_comparison = False
+
+if not options.output:
+    stop_err("Output filename required, e.g. -o example.tab")
+out_file = options.output
 
 try:
-    min_identity = float(min_identity)
+    min_identity = float(options.min_identity)
 except ValueError:
     stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
 if not (0 <= min_identity <= 100):
     stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
 try:
-    min_coverage = float(min_coverage)
+    min_coverage = float(options.min_coverage)
 except ValueError:
     stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
 if not (0 <= min_coverage <= 100):
     stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
 
+if not options.task:
+    stop_err("Missing BLAST task, e.g. -t blastp")
+blast_type = options.task
 
+if not options.dbtype:
+    stop_err("Missing database type, -a nucl, or -a prot")
+dbtype = options.dbtype
 if dbtype == "nucl":
     if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
          blast_cmd = "blastn -task %s" % blast_type
@@ -101,7 +134,7 @@
 
 tie_warning = 0
 
-def best_hits(blast_tabular):
+def best_hits(blast_tabular, ignore_self=False):
     """Iterate over BLAST tabular output, returns best hits as 2-tuples.
 
     Each return value is (query name, tuple of value for the best hit).
@@ -124,6 +157,8 @@
                 continue
             a = parts[c_query]
             b = parts[c_match]
+            if ignore_self and a == b:
+                continue
             score = float(parts[c_score])
             qlen = int(parts[c_qlen])
             length = int(parts[c_length])
@@ -181,13 +216,13 @@
 #print("BLAST species B vs species A done.")
 
 
-best_b_vs_a = dict(best_hits(b_vs_a))
+best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
 
 
 count = 0
 outfile = open(out_file, 'w')
 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
-for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b):
+for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
     if b not in best_b_vs_a:
         #Match b has no best hit
         continue
--- a/tools/blast_rbh/blast_rbh.xml	Wed May 21 06:40:30 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Tue Jun 10 10:09:50 2014 -0400
@@ -1,4 +1,4 @@
-<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.0">
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.2">
     <description>from two FASTA files</description>
     <requirements>
             <requirement type="binary">makeblastdb</requirement>
@@ -10,13 +10,16 @@
 blast_rbh.py --version
     </version_command>
     <command interpreter="python">
-blast_rbh.py "$fasta_a" "$fasta_b" $seq.dbtype
+blast_rbh.py "$fasta_a" "$fasta_b"
+-a $seq.dbtype
 #if $seq.dbtype=="nucl"
-    $seq.nucl_type
+-t $seq.nucl_type
 #else
-    $seq.prot_type
+-t $seq.prot_type
 #end if
-$identity $q_cover "$output"
+-i $identity
+-c $q_cover
+-o "$output"
     </command>
     <stdio>
         <!-- Anything other than zero is an error -->
@@ -134,7 +137,7 @@
         <test>
             <param name="fasta_a" value="k12_ten_proteins.fasta" ftype="fasta"/>
             <param name="fasta_b" value="k12_edited_proteins.fasta" ftype="fasta"/>
-            <param name="dbtype" value="nprot"/>
+            <param name="dbtype" value="prot"/>
             <param name="nucl_type" value="blastp"/>
             <param name="identity" value="0.0"/>
             <param name="q_cover" value="0.0"/>
@@ -143,12 +146,22 @@
         <test>
             <param name="fasta_a" value="k12_edited_proteins.fasta" ftype="fasta"/>
             <param name="fasta_b" value="k12_ten_proteins.fasta" ftype="fasta"/>
-            <param name="dbtype" value="nprot"/>
+            <param name="dbtype" value="prot"/>
             <param name="nucl_type" value="blastp"/>
             <param name="identity" value="0.0"/>
             <param name="q_cover" value="0.0"/>
             <output name="output" file="rbh_blastp_k12.tabular" ftype="tabular"/>
         </test>
+        <!-- this tests self-comparison -->
+        <test>
+            <param name="fasta_a" value="k12_edited_proteins.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="k12_edited_proteins.fasta" ftype="fasta"/>
+            <param name="dbtype" value="prot"/>
+            <param name="nucl_type" value="blastp"/>
+            <param name="identity" value="80.0"/>
+            <param name="q_cover" value="80.0"/>
+            <output name="output" file="rbh_blastp_k12_self.tabular" ftype="tabular"/>
+        </test>
     </tests>
     <help>
 **What it does**
--- a/tools/blast_rbh/repository_dependencies.xml	Wed May 21 06:40:30 2014 -0400
+++ b/tools/blast_rbh/repository_dependencies.xml	Tue Jun 10 10:09:50 2014 -0400
@@ -1,4 +1,4 @@
 <?xml version="1.0"?>
 <repositories description="Requires the NCBI BLAST+ binaries.">
-    <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
 </repositories>