changeset 13:4454596ed127 draft

Uploaded
author peterjc
date Thu, 30 Oct 2014 14:20:48 -0400
parents 661276ad882e
children 40c85a67e645
files test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml tools/blast_rbh/tool_dependencies.xml
diffstat 5 files changed, 53 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular	Tue Oct 14 06:53:40 2014 -0400
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular	Thu Oct 30 14:20:48 2014 -0400
@@ -1,2 +1,2 @@
 #A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
-gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	66	57	230	97.39	  559
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	22	19	230	97.39	  559
--- a/tools/blast_rbh/README.rst	Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/README.rst	Thu Oct 30 14:20:48 2014 -0400
@@ -1,14 +1,18 @@
-Galaxy tool to find BLAST Reciprocal Best Hits (RBH)
-====================================================
+Find BLAST Reciprocal Best Hits (RBH), with Galaxy wrapper
+==========================================================
 
 This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.
 
 This tool is a short Python script to run reciprocal BLAST searches on a
-pair of sequence files, and extract the reciprocal best hits.
+pair of sequence files, and extract the reciprocal best hits. The script
+``blast_rbh.py`` can be used directly (without Galaxy) as long as NCBI
+BLAST+ is installed.
 
-It is available from the Galaxy Tool Shed at:
+It comes with an optional Galaxy tool definition file ``blast_rbh.xml``
+allowing the Python script to be run from within Galaxy. It is available
+from the Galaxy Tool Shed at:
 http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
 
 
@@ -33,11 +37,10 @@
 
     <tool file="blast_rbh/blast_rbh.xml" />
 
-If you want to run the functional tests, include the same line in your
-``tool_conf.xml.sample`` file, and the sample test files under Galaxy's
-``test-data/`` directory. Then::
+If you want to run the functional tests, copy the sample test files under
+Galaxy's ``test-data/`` directory. Then::
 
-    ./run_functional_tests.sh -id blast_reciprocal_best_hits
+    ./run_tests.sh -id blast_reciprocal_best_hits
 
 You will need to have the NCBI BLAST+ binaries installed and on the ``$PATH``.
 
@@ -55,6 +58,10 @@
         - Fixed Tool Shed dependency definition.
 v0.1.3  - Option to make FASTA files non-redundant (via Biopython dependency).
         - Avoid extra database and BLAST search in self-comparison mode.
+v0.1.4  - Check for duplicate FASTA identifiers (workaround for makeblastdb
+          not treating this as an error, leading to confusing RBH output).
+v0.1.5  - Clarify documentation for using the Python script outside Galaxy.
+        - Updated to depend on NCBI BLAST+ 2.2.30 via ToolShed install.
 ======= ======================================================================
 
 
--- a/tools/blast_rbh/blast_rbh.py	Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Thu Oct 30 14:20:48 2014 -0400
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
 
-Takes the following command line options,
-1. FASTA filename of species A
-2. FASTA filename of species B
-3. Sequence type (prot/nucl)
-4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
-5. Minimum BLAST Percentage identity
-6. Minimum BLAST query coverage
-7. Output filename
+Run "blast_rbh.py -h" to see the help text, or read the associated
+README.rst file which is also available on GitHub at:
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
+
+This requires Python and the NCBI BLAST+ tools to be installed
+and on the $PATH.
+
+You can also run this tool via Galaxy using the "blast_rbh.xml"
+definition file. This is available as a package on the Galaxy
+Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
 """
 
 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
@@ -30,7 +32,7 @@
 
 if "--version" in sys.argv[1:]:
     #TODO - Capture version of BLAST+ binaries too?
-    print "BLAST RBH v0.1.3"
+    print "BLAST RBH v0.1.5"
     sys.exit(0)
 
 #Parse Command Line
@@ -212,6 +214,24 @@
             #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
             tie_warning += 1
 
+def check_duplicate_ids(filename):
+    # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
+    # TODO - just use Biopython's FASTA parser?
+    if not os.path.isfile(filename):
+        stop_err("Missing FASTA file %r" % filename, 2)
+    identifiers = set()
+    handle = open(filename)
+    for line in handle:
+        if line.startswith(">"):
+        # The split will also take care of the new line character,
+        # e.g. ">test\n" and ">test description here\n" both give "test"
+            seq_id = line[1:].split(None, 1)[0]
+            if seq_id in identifiers:
+                handle.close()
+                stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
+            identifiers.add(seq_id)
+    handle.close()
+
 def make_nr(input_fasta, output_fasta, sep=";"):
     #TODO - seq-hash based to avoid loading everything into RAM?
     by_seq = dict()
@@ -252,6 +272,10 @@
         print("No perfect duplicates in file, %i unique entries" % unique)
 
 #print("Starting...")
+check_duplicate_ids(fasta_a)
+if not self_comparison:
+    check_duplicate_ids(fasta_b)
+
 if options.nr:
     make_nr(fasta_a, tmp_a)
     if not self_comparison:
--- a/tools/blast_rbh/blast_rbh.xml	Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Thu Oct 30 14:20:48 2014 -0400
@@ -1,4 +1,4 @@
-<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.3">
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.5">
     <description>from two FASTA files</description>
     <requirements>
         <requirement type="package" version="1.64">biopython</requirement>
@@ -6,7 +6,7 @@
         <requirement type="binary">makeblastdb</requirement>
         <requirement type="binary">blastp</requirement>
         <requirement type="binary">blastn</requirement>
-        <requirement type="package" version="2.2.29">blast+</requirement>
+        <requirement type="package" version="2.2.30">blast+</requirement>
     </requirements>
     <version_command interpreter="python">
 blast_rbh.py --version
--- a/tools/blast_rbh/tool_dependencies.xml	Tue Oct 14 06:53:40 2014 -0400
+++ b/tools/blast_rbh/tool_dependencies.xml	Thu Oct 30 14:20:48 2014 -0400
@@ -3,7 +3,7 @@
     <package name="biopython" version="1.64">
         <repository changeset_revision="268128adb501" name="package_biopython_1_64" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
-    <package name="blast+" version="2.2.29">
-        <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    <package name="blast+" version="2.2.30">
+        <repository changeset_revision="f69b90d89b62" name="package_blast_plus_2_2_30" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>