# HG changeset patch
# User peterjc
# Date 1413284020 14400
# Node ID 661276ad882e4030df3053ff1e8cf4351cf664df
# Parent  4733e4ea4dab25788bac44abff5e171787e8fa76
Uploaded v0.1.3, uses Biopython for making NR

diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/README.rst
--- a/tools/blast_rbh/README.rst	Thu Aug 14 07:11:59 2014 -0400
+++ b/tools/blast_rbh/README.rst	Tue Oct 14 06:53:40 2014 -0400
@@ -8,9 +8,8 @@
 This tool is a short Python script to run reciprocal BLAST searches on a
 pair of sequence files, and extract the reciprocal best hits.
 
-This is a work in progress, and builds on an earlier implementation which
-prequired the two BLAST searches be prepared in advance. Integration allows
-a much simpler user experience, and can ensure sensible filters are used.
+It is available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
 
 
 Automated Installation
@@ -54,6 +53,8 @@
 v0.1.2  - Using optparse for command line API.
         - Tool definition now embeds citation information.
         - Fixed Tool Shed dependency definition.
+v0.1.3  - Option to make FASTA files non-redundant (via Biopython dependency).
+        - Avoid extra database and BLAST search in self-comparison mode.
 ======= ======================================================================
 
 
diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/blast_rbh.py
--- a/tools/blast_rbh/blast_rbh.py	Thu Aug 14 07:11:59 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Tue Oct 14 06:53:40 2014 -0400
@@ -30,7 +30,7 @@
 
 if "--version" in sys.argv[1:]:
     #TODO - Capture version of BLAST+ binaries too?
-    print "BLAST RBH v0.1.2"
+    print "BLAST RBH v0.1.3"
     sys.exit(0)
 
 #Parse Command Line
@@ -52,6 +52,9 @@
 parser.add_option("-c", "--coverage", dest="min_coverage",
                   default="50",
                   help="Minimum HSP coverage (optional, default 50)")
+parser.add_option("--nr", dest="nr", default=False, action="store_true",
+                  help="Preprocess FASTA files to collapse identifical "
+                  "entries (make sequences non-redundant)")
 parser.add_option("-o", "--output", dest="output",
                   default=None, metavar="FILE",
                   help="Output filename")
@@ -117,10 +120,17 @@
 makeblastdb_exe = "makeblastdb"
 
 base_path = tempfile.mkdtemp()
+tmp_a = os.path.join(base_path, "SpeciesA.fasta")
 db_a = os.path.join(base_path, "SpeciesA")
-db_b = os.path.join(base_path, "SpeciesB")
 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
-b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
+if self_comparison:
+    tmp_b = tmp_a
+    db_b = db_a
+    b_vs_a = a_vs_b
+else:
+    tmp_b = os.path.join(base_path, "SpeciesB.fasta")
+    db_b = os.path.join(base_path, "SpeciesB")
+    b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")
 
 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
@@ -202,18 +212,65 @@
             #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
             tie_warning += 1
 
+def make_nr(input_fasta, output_fasta, sep=";"):
+    #TODO - seq-hash based to avoid loading everything into RAM?
+    by_seq = dict()
+    try:
+        from Bio import SeqIO
+    except KeyError:
+        stop_err("Missing Biopython")
+    for record in SeqIO.parse(input_fasta, "fasta"):
+        s = str(record.seq).upper()
+        try:
+            by_seq[s].append(record.id)
+        except KeyError:
+            by_seq[s] = [record.id]
+    unique = 0
+    representatives = dict()
+    duplicates = set()
+    for cluster in by_seq.values():
+        if len(cluster) > 1:
+            representatives[cluster[0]] = cluster
+            duplicates.update(cluster[1:])
+        else:
+            unique += 1
+    del by_seq
+    if duplicates:
+        #TODO - refactor as a generator with single SeqIO.write(...) call
+        with open(output_fasta, "w") as handle:
+            for record in SeqIO.parse(input_fasta, "fasta"):
+                if record.id in representatives:
+                    cluster = representatives[record.id]
+                    record.id = sep.join(cluster)
+                    record.description = "representing %i records" % len(cluster)
+                elif record.id in duplicates:
+                    continue
+                SeqIO.write(record, handle, "fasta")
+        print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
+    else:
+        os.symlink(os.path.abspath(input_fasta), output_fasta)
+        print("No perfect duplicates in file, %i unique entries" % unique)
 
 #print("Starting...")
+if options.nr:
+    make_nr(fasta_a, tmp_a)
+    if not self_comparison:
+        make_nr(fasta_b, tmp_b)
+    fasta_a = tmp_a
+    fasta_b = tmp_b
+
 #TODO - Report log in case of error?
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
-run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
+if not self_comparison:
+    run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
 #print("BLAST databases prepared.")
 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
     % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
 #print("BLAST species A vs species B done.")
-run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
-#print("BLAST species B vs species A done.")
+if not self_comparison:
+    run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+        % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+    #print("BLAST species B vs species A done.")
 
 
 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/blast_rbh.xml
--- a/tools/blast_rbh/blast_rbh.xml	Thu Aug 14 07:11:59 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Tue Oct 14 06:53:40 2014 -0400
@@ -1,10 +1,12 @@
-<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.2">
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.3">
     <description>from two FASTA files</description>
     <requirements>
-            <requirement type="binary">makeblastdb</requirement>
-            <requirement type="binary">blastp</requirement>
-            <requirement type="binary">blastn</requirement>
-            <requirement type="package" version="2.2.29">blast+</requirement>
+        <requirement type="package" version="1.64">biopython</requirement>
+        <requirement type="python-module">Bio</requirement>
+        <requirement type="binary">makeblastdb</requirement>
+        <requirement type="binary">blastp</requirement>
+        <requirement type="binary">blastn</requirement>
+        <requirement type="package" version="2.2.29">blast+</requirement>
     </requirements>
     <version_command interpreter="python">
 blast_rbh.py --version
@@ -17,6 +19,7 @@
 #else
 -t $seq.prot_type
 #end if
+$make_nr
 -i $identity
 -c $q_cover
 -o "$output"
@@ -61,6 +64,9 @@
         <param name="q_cover" type="float" value="50" min="0" max="100"
 	       label="Minimum percentage query coverage for BLAST matches"
 	       help="Default is 50%, use 0 for no filtering." />
+        <param name="make_nr" type="boolean" checked="false" truevalue="--nr" falsevalue=""
+	       label="Process input FASTA files to collapse identical sequences"
+               help="i.e. First make the input non-redundant" />
     </inputs>
     <outputs>
         <data name="output" format="tabular" label="BLAST RBH: $fasta_a.name vs $fasta_b.name" />
@@ -201,6 +207,13 @@
 can happen following gene duplication, or for (near) identical gene
 duplicates.
 
+The tool can optionally make the FASTA files non-redundant by replacing
+repeated identical sequences with a single representative before building
+the databases and running BLAST.
+
+Finally, the tool can be run using the same FASTA input file to look for
+RBH within the dataset. In this case, self matches are discarded.
+
 .. class:: warningmark
 
 **Note**
diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/tool_dependencies.xml
--- a/tools/blast_rbh/tool_dependencies.xml	Thu Aug 14 07:11:59 2014 -0400
+++ b/tools/blast_rbh/tool_dependencies.xml	Tue Oct 14 06:53:40 2014 -0400
@@ -1,5 +1,8 @@
 <?xml version="1.0"?>
 <tool_dependency>
+    <package name="biopython" version="1.64">
+        <repository changeset_revision="268128adb501" name="package_biopython_1_64" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
     <package name="blast+" version="2.2.29">
         <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>