# HG changeset patch
# User peterjc
# Date 1414693248 14400
# Node ID 4454596ed1273ce9b5a46a4a58ad6184cef6b0de
# Parent 661276ad882e4030df3053ff1e8cf4351cf664df
Uploaded
diff -r 661276ad882e -r 4454596ed127 test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
--- a/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue Oct 14 06:53:40 2014 -0400
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Thu Oct 30 14:20:48 2014 -0400
@@ -1,2 +1,2 @@
#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
-gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 66 57 230 97.39 559
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 22 19 230 97.39 559
diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/README.rst
--- a/tools/blast_rbh/README.rst Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/README.rst Thu Oct 30 14:20:48 2014 -0400
@@ -1,14 +1,18 @@
-Galaxy tool to find BLAST Reciprocal Best Hits (RBH)
-====================================================
+Find BLAST Reciprocal Best Hits (RBH), with Galaxy wrapper
+==========================================================
This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute
(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
See the licence text below.
This tool is a short Python script to run reciprocal BLAST searches on a
-pair of sequence files, and extract the reciprocal best hits.
+pair of sequence files, and extract the reciprocal best hits. The script
+``blast_rbh.py`` can be used directly (without Galaxy) as long as NCBI
+BLAST+ is installed.
-It is available from the Galaxy Tool Shed at:
+It comes with an optional Galaxy tool definition file ``blast_rbh.xml``
+allowing the Python script to be run from within Galaxy. It is available
+from the Galaxy Tool Shed at:
http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
@@ -33,11 +37,10 @@
-If you want to run the functional tests, include the same line in your
-``tool_conf.xml.sample`` file, and the sample test files under Galaxy's
-``test-data/`` directory. Then::
+If you want to run the functional tests, copy the sample test files under
+Galaxy's ``test-data/`` directory. Then::
- ./run_functional_tests.sh -id blast_reciprocal_best_hits
+ ./run_tests.sh -id blast_reciprocal_best_hits
You will need to have the NCBI BLAST+ binaries installed and on the ``$PATH``.
@@ -55,6 +58,10 @@
- Fixed Tool Shed dependency definition.
v0.1.3 - Option to make FASTA files non-redundant (via Biopython dependency).
- Avoid extra database and BLAST search in self-comparison mode.
+v0.1.4 - Check for duplicate FASTA identifiers (workaround for makeblastdb
+ not treating this as an error, leading to confusing RBH output).
+v0.1.5 - Clarify documentation for using the Python script outside Galaxy.
+ - Updated to depend on NCBI BLAST+ 2.2.30 via ToolShed install.
======= ======================================================================
diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/blast_rbh.py
--- a/tools/blast_rbh/blast_rbh.py Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py Thu Oct 30 14:20:48 2014 -0400
@@ -1,14 +1,16 @@
#!/usr/bin/env python
"""BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
-Takes the following command line options,
-1. FASTA filename of species A
-2. FASTA filename of species B
-3. Sequence type (prot/nucl)
-4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
-5. Minimum BLAST Percentage identity
-6. Minimum BLAST query coverage
-7. Output filename
+Run "blast_rbh.py -h" to see the help text, or read the associated
+README.rst file which is also available on GitHub at:
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
+
+This requires Python and the NCBI BLAST+ tools to be installed
+and on the $PATH.
+
+You can also run this tool via Galaxy using the "blast_rbh.xml"
+definition file. This is available as a package on the Galaxy
+Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
"""
# TODO - Output more columns, e.g. pident, qcovs, descriptions?
@@ -30,7 +32,7 @@
if "--version" in sys.argv[1:]:
#TODO - Capture version of BLAST+ binaries too?
- print "BLAST RBH v0.1.3"
+ print "BLAST RBH v0.1.5"
sys.exit(0)
#Parse Command Line
@@ -212,6 +214,24 @@
#print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
tie_warning += 1
+def check_duplicate_ids(filename):
+ # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
+ # TODO - just use Biopython's FASTA parser?
+ if not os.path.isfile(filename):
+ stop_err("Missing FASTA file %r" % filename, 2)
+ identifiers = set()
+ handle = open(filename)
+ for line in handle:
+ if line.startswith(">"):
+ # The split will also take care of the new line character,
+ # e.g. ">test\n" and ">test description here\n" both give "test"
+ seq_id = line[1:].split(None, 1)[0]
+ if seq_id in identifiers:
+ handle.close()
+ stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
+ identifiers.add(seq_id)
+ handle.close()
+
def make_nr(input_fasta, output_fasta, sep=";"):
#TODO - seq-hash based to avoid loading everything into RAM?
by_seq = dict()
@@ -252,6 +272,10 @@
print("No perfect duplicates in file, %i unique entries" % unique)
#print("Starting...")
+check_duplicate_ids(fasta_a)
+if not self_comparison:
+ check_duplicate_ids(fasta_b)
+
if options.nr:
make_nr(fasta_a, tmp_a)
if not self_comparison:
diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/blast_rbh.xml
--- a/tools/blast_rbh/blast_rbh.xml Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml Thu Oct 30 14:20:48 2014 -0400
@@ -1,4 +1,4 @@
-
+
from two FASTA files
biopython
@@ -6,7 +6,7 @@
makeblastdb
blastp
blastn
- blast+
+ blast+
blast_rbh.py --version
diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/tool_dependencies.xml
--- a/tools/blast_rbh/tool_dependencies.xml Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/tool_dependencies.xml Thu Oct 30 14:20:48 2014 -0400
@@ -3,7 +3,7 @@
-
-
+
+