changeset 2:8aa6385b9a05 draft

Uploaded v0.1.0b, more verbose output
author peterjc
date Fri, 16 May 2014 08:54:35 -0400
parents a68f4e5789d7
children 347eb6bfabd7
files test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml
diffstat 5 files changed, 45 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Thu May 15 13:11:03 2014 -0400
+++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Fri May 16 08:54:35 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B	B_vs_A
-ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	 1514	 1514
+#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
+ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	 1514	92.07	86	 1514	92.07	100
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Thu May 15 13:11:03 2014 -0400
+++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Fri May 16 08:54:35 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B	B_vs_A
-sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	  701	  701
+#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
+sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	  701	96.55	100	  701	96.55	100
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Thu May 15 13:11:03 2014 -0400
+++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Fri May 16 08:54:35 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B	B_vs_A
-gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	 1474	 1474
+#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	 1474	92.07	100	 1474	92.07	86
--- a/tools/blast_rbh/blast_rbh.py	Thu May 15 13:11:03 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Fri May 16 08:54:35 2014 -0400
@@ -11,6 +11,8 @@
 7. Output filename
 """
 
+# TODO - Output more columns, e.g. pident, qcovs, descriptions?
+
 import os
 import sys
 import tempfile
@@ -69,17 +71,14 @@
     blast_exe = "blastp"
 else:
     stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
-makeblastdb_exe = "makeblastdb"
 
-def run(cmd):
-    return_code = os.system(cmd)
-    if return_code:
-        stop_err("Error %i from: %s" % (return_code, cmd))
+try:
+    threads = int(os.environ.get("GALAXY_SLOTS", "1"))
+except:
+    threads = 1
+assert 1 <= threads, threads
 
-def makeblastdb(fasta_file, dbtype, output_stem):
-    cmd = "%s -dbtype %s -in %s -out %s" % (makeblastdb_exe, dbtype, fasta_file, output_stem)
-    return run(cmd)
-
+makeblastdb_exe = "makeblastdb"
 
 base_path = tempfile.mkdtemp()
 db_a = os.path.join(base_path, "SpeciesA")
@@ -100,13 +99,13 @@
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
 print("BLAST databases prepared.")
-run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols))
+run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+    % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols, threads))
 print("BLAST species A vs species B done.")
-run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols))
+run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+    % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols, threads))
 print("BLAST species B vs species A done.")
 
-#TODO - Include identity and coverage filters...
-
 best_a_vs_b = dict()
 for line in open(a_vs_b):
     if line.startswith("#"): continue
@@ -117,8 +116,9 @@
         continue
     score = float(parts[c_score])
     if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
-        best_a_vs_b[a] = (b, score, parts[c_score])
-b_short_list = set(b for (b,score, score_str) in best_a_vs_b.values())
+        # Store bitscore as a float for sorting, original string for output
+        best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage])
+b_short_list = set(v[0] for v in best_a_vs_b.values())
 
 best_b_vs_a = dict()
 for line in open(b_vs_a):
@@ -134,22 +134,23 @@
         continue
     score = float(parts[c_score])
     if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
-        best_b_vs_a[b] = (a, score, parts[c_score])
+        # Store bitscore as a float for sorting, original string for output
+        best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage])
 #TODO - Preserve order from A vs B?
-a_short_list = sorted(set(a for (a,score,score_str) in best_b_vs_a.values()))
+a_short_list = sorted(set(v[0] for v in best_b_vs_a.values()))
 
 count = 0
 outfile = open(out_file, 'w')
-outfile.write("#A_id\tB_id\tA_vs_B\tB_vs_A\n")
+outfile.write("#A_id\tB_id\tA_vs_B bitscore\tA_vs_B pident\tA_vs_B qcovhsp\tB_vs_A bitscore\tB_vs_A pident\tB_vs_A qcovhsp\n")
 for a in a_short_list:
     b = best_a_vs_b[a][0]
     if b in best_b_vs_a and a == best_b_vs_a[b][0]:
-        outfile.write("%s\t%s\t%s\t%s\n" % (a, b, best_a_vs_b[a][2], best_b_vs_a[b][2]))
+        #Output the original string versions of the scores
+        values = [a, b] + list(best_a_vs_b[a][2:]) + list(best_b_vs_a[b][2:])
+        outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values))
         count += 1
 outfile.close()
 print "Done, %i RBH found" % count
 
-
 #Remove temp files...
 shutil.rmtree(base_path)
-
--- a/tools/blast_rbh/blast_rbh.xml	Thu May 15 13:11:03 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Fri May 16 08:54:35 2014 -0400
@@ -119,21 +119,28 @@
     <help>
 **What it does**
 
-Takes two FASTA files (species *A* and species *B*), builds a BLAST database
+Takes two FASTA files (*species A* and *species B*), builds a BLAST database
 for each, runs reciprocal BLAST searchs (*A vs B*, and *B vs A*), optionally
-filters these, and then compiles a list of the reciprocal best hits (RBH).
+filters the HSPs, and then compiles a list of the reciprocal best hits (RBH).
 
-The output from this tool is a tabular file containing four columns, with
-the order taken from input file A:
+The output from this tool is a tabular file containing eight columns, with
+information about the BLAST matches used:
 
-====== ======================
+====== =================================
 Column Description
------- ----------------------
-     1 ID from species *A*
-     2 ID from species *B*
+------ ---------------------------------
+     1 ID from *species A*
+     2 ID from *species B*
      3 Bitscore from *A vs B*
-     4 Bitscore from *B vs A*
-====== ======================
+     4 Percentage identity from *A vs B*
+     5 Query coverage from *A vs B*
+     6 Bitscore from *B vs A*
+     7 Percentage identity from *B vs A*
+     8 Query coverage from *B vs A*
+====== =================================
+
+These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp``
+values in the BLAST+ tabular output.
 
 .. class:: warningmark