# HG changeset patch # User peterjc # Date 1414693248 14400 # Node ID 4454596ed1273ce9b5a46a4a58ad6184cef6b0de # Parent 661276ad882e4030df3053ff1e8cf4351cf664df Uploaded diff -r 661276ad882e -r 4454596ed127 test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular --- a/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue Oct 14 06:53:40 2014 -0400 +++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Thu Oct 30 14:20:48 2014 -0400 @@ -1,2 +1,2 @@ #A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore -gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 66 57 230 97.39 559 +gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 22 19 230 97.39 559 diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/README.rst --- a/tools/blast_rbh/README.rst Tue Oct 14 06:53:40 2014 -0400 +++ b/tools/blast_rbh/README.rst Thu Oct 30 14:20:48 2014 -0400 @@ -1,14 +1,18 @@ -Galaxy tool to find BLAST Reciprocal Best Hits (RBH) -==================================================== +Find BLAST Reciprocal Best Hits (RBH), with Galaxy wrapper +========================================================== This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. This tool is a short Python script to run reciprocal BLAST searches on a -pair of sequence files, and extract the reciprocal best hits. +pair of sequence files, and extract the reciprocal best hits. The script +``blast_rbh.py`` can be used directly (without Galaxy) as long as NCBI +BLAST+ is installed. -It is available from the Galaxy Tool Shed at: +It comes with an optional Galaxy tool definition file ``blast_rbh.xml`` +allowing the Python script to be run from within Galaxy. 
It is available +from the Galaxy Tool Shed at: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh @@ -33,11 +37,10 @@ -If you want to run the functional tests, include the same line in your -``tool_conf.xml.sample`` file, and the sample test files under Galaxy's -``test-data/`` directory. Then:: +If you want to run the functional tests, copy the sample test files under +Galaxy's ``test-data/`` directory. Then:: - ./run_functional_tests.sh -id blast_reciprocal_best_hits + ./run_tests.sh -id blast_reciprocal_best_hits You will need to have the NCBI BLAST+ binaries installed and on the ``$PATH``. @@ -55,6 +58,10 @@ - Fixed Tool Shed dependency definition. v0.1.3 - Option to make FASTA files non-redundant (via Biopython dependency). - Avoid extra database and BLAST search in self-comparison mode. +v0.1.4 - Check for duplicate FASTA identifiers (workaround for makeblastdb + not treating this as an error, leading to confusing RBH output). +v0.1.5 - Clarify documentation for using the Python script outside Galaxy. + - Updated to depend on NCBI BLAST+ 2.2.30 via ToolShed install. ======= ====================================================================== diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/blast_rbh.py --- a/tools/blast_rbh/blast_rbh.py Tue Oct 14 06:53:40 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Thu Oct 30 14:20:48 2014 -0400 @@ -1,14 +1,16 @@ #!/usr/bin/env python """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. -Takes the following command line options, -1. FASTA filename of species A -2. FASTA filename of species B -3. Sequence type (prot/nucl) -4. BLAST type (e.g. blastn, or blastp) consistent with sequence type -5. Minimum BLAST Percentage identity -6. Minimum BLAST query coverage -7. 
Output filename +Run "blast_rbh.py -h" to see the help text, or read the associated +README.rst file which is also available on GitHub at: +https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh + +This requires Python and the NCBI BLAST+ tools to be installed +and on the $PATH. + +You can also run this tool via Galaxy using the "blast_rbh.xml" +definition file. This is available as a package on the Galaxy +Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh """ # TODO - Output more columns, e.g. pident, qcovs, descriptions? @@ -30,7 +32,7 @@ if "--version" in sys.argv[1:]: #TODO - Capture version of BLAST+ binaries too? - print "BLAST RBH v0.1.3" + print "BLAST RBH v0.1.5" sys.exit(0) #Parse Command Line @@ -212,6 +214,24 @@ #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) tie_warning += 1 +def check_duplicate_ids(filename): + # Copied from tools/ncbi_blast_plus/check_no_duplicates.py + # TODO - just use Biopython's FASTA parser? + if not os.path.isfile(filename): + stop_err("Missing FASTA file %r" % filename, 2) + identifiers = set() + handle = open(filename) + for line in handle: + if line.startswith(">"): + # The split will also take care of the new line character, + # e.g. ">test\n" and ">test description here\n" both give "test" + seq_id = line[1:].split(None, 1)[0] + if seq_id in identifiers: + handle.close() + stop_err("Repeated identifiers, e.g. %r" % seq_id, 3) + identifiers.add(seq_id) + handle.close() + def make_nr(input_fasta, output_fasta, sep=";"): #TODO - seq-hash based to avoid loading everything into RAM? 
by_seq = dict() @@ -252,6 +272,10 @@ print("No perfect duplicates in file, %i unique entries" % unique) #print("Starting...") +check_duplicate_ids(fasta_a) +if not self_comparison: + check_duplicate_ids(fasta_b) + if options.nr: make_nr(fasta_a, tmp_a) if not self_comparison: diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/blast_rbh.xml --- a/tools/blast_rbh/blast_rbh.xml Tue Oct 14 06:53:40 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Thu Oct 30 14:20:48 2014 -0400 @@ -1,4 +1,4 @@ - + from two FASTA files biopython @@ -6,7 +6,7 @@ makeblastdb blastp blastn - blast+ + blast+ blast_rbh.py --version diff -r 661276ad882e -r 4454596ed127 tools/blast_rbh/tool_dependencies.xml --- a/tools/blast_rbh/tool_dependencies.xml Tue Oct 14 06:53:40 2014 -0400 +++ b/tools/blast_rbh/tool_dependencies.xml Thu Oct 30 14:20:48 2014 -0400 @@ -3,7 +3,7 @@ - - + +