changeset 4:57245c11b8cb draft

Uploaded v0.1.0d, TBLASTX support; changed output columns
author peterjc
date Mon, 19 May 2014 13:39:55 -0400
parents 347eb6bfabd7
children c84b6c21e3d4
files test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_none.tabular tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml
diffstat 6 files changed, 48 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Fri May 16 11:16:02 2014 -0400
+++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Mon May 19 13:39:55 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
-ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	 1514	92.07	86	 1514	92.07	100
+#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
+ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	 1514	92.07	86	100
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Fri May 16 11:16:02 2014 -0400
+++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Mon May 19 13:39:55 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
-sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	  701	96.55	100	  701	96.55	100
+#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
+sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	  701	96.55	100	100
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Fri May 16 11:16:02 2014 -0400
+++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Mon May 19 13:39:55 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
-gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	 1474	92.07	100	 1474	92.07	86
+#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	 1474	92.07	100	86
--- a/test-data/rbh_none.tabular	Fri May 16 11:16:02 2014 -0400
+++ b/test-data/rbh_none.tabular	Mon May 19 13:39:55 2014 -0400
@@ -1,1 +1,1 @@
-#A_id	B_id	A_vs_B bitscore	A_vs_B pident	A_vs_B qcovhsp	B_vs_A bitscore	B_vs_A pident	B_vs_A qcovhsp
+#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
--- a/tools/blast_rbh/blast_rbh.py	Fri May 16 11:16:02 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Mon May 19 13:39:55 2014 -0400
@@ -62,13 +62,16 @@
 
 
 if dbtype == "nucl":
-    if blast_type not in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
+    if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
+         blast_cmd = "blastn -task %s" % blast_type
+    elif blast_type == "tblastx":
+        blast_cmd = "tblastx"
+    else:
         stop_err("Invalid BLAST type for BLASTN: %r" % blast_type)
-    blast_exe = "blastn"
 elif dbtype == "prot":
     if blast_type not in ["blastp", "blastp-short"]:
         stop_err("Invalid BLAST type for BLASTP: %r" % blast_type)
-    blast_exe = "blastp"
+    blast_cmd = "blastp -task %s" % blast_type
 else:
     stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
 
@@ -94,17 +97,17 @@
 c_identity = 3
 c_coverage = 4
 
-print("Starting...")
+#print("Starting...")
 #TODO - Report log in case of error?
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
-print("BLAST databases prepared.")
-run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols, threads))
-print("BLAST species A vs species B done.")
-run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-    % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols, threads))
-print("BLAST species B vs species A done.")
+#print("BLAST databases prepared.")
+run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+    % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
+#print("BLAST species A vs species B done.")
+run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+    % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+#print("BLAST species B vs species A done.")
 
 best_a_vs_b = dict()
 for line in open(a_vs_b):
@@ -141,13 +144,28 @@
 
 count = 0
 outfile = open(out_file, 'w')
-outfile.write("#A_id\tB_id\tA_vs_B bitscore\tA_vs_B pident\tA_vs_B qcovhsp\tB_vs_A bitscore\tB_vs_A pident\tB_vs_A qcovhsp\n")
+outfile.write("#A_id\tB_id\tbitscore\tpident\tA qcovhsp\tB qcovhsp\n")
 for a in a_short_list:
-    b = best_a_vs_b[a][0]
-    if b in best_b_vs_a and a == best_b_vs_a[b][0]:
+    a_values = best_a_vs_b[a]
+    b = a_values[0]
+    b_values = best_b_vs_a[b]
+    if b in best_b_vs_a and a == b_values[0]:
         #Output the original string versions of the scores
-        values = [a, b] + list(best_a_vs_b[a][2:]) + list(best_b_vs_a[b][2:])
-        outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values))
+        values = [a,b]
+        #[1] = bitscore as float, [2] = bitscore as original string 
+        if a_values[1] < b_values[1]:
+            values.append(a_values[2])
+        else:
+            values.append(b_values[2])
+        #[3] = identity as string
+        if float(a_values[3]) < float(b_values[3]):
+            values.append(a_values[3])
+        else:
+            values.append(b_values[3])
+        #[4] = coverage
+        values.append(a_values[4])
+        values.append(b_values[4])
+        outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values))
         count += 1
 outfile.close()
 print "Done, %i RBH found" % count
--- a/tools/blast_rbh/blast_rbh.xml	Fri May 16 11:16:02 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Mon May 19 13:39:55 2014 -0400
@@ -43,6 +43,7 @@
                     <option value="blastn">blastn - Traditional BLASTN requiring an exact match of 11, for somewhat similar sequences</option>
                     <option value="blastn-short">blastn-short - BLASTN program optimized for sequences shorter than 50 bases</option>
                     <option value="dc-megablast">dc-megablast - Discontiguous megablast used to find more distant (e.g., interspecies) sequences</option>
+                    <option value="tblastx">tblastx - TBLASTX program using translated query against translated database (protein level matches)</option>
                 </param>
             </when>
         </conditional>
@@ -131,16 +132,16 @@
 ------ ---------------------------------
      1 ID from *species A*
      2 ID from *species B*
-     3 Bitscore from *A vs B*
-     4 Percentage identity from *A vs B*
+     3 Bitscore
+     4 Percentage identity
      5 Query coverage from *A vs B*
-     6 Bitscore from *B vs A*
-     7 Percentage identity from *B vs A*
-     8 Query coverage from *B vs A*
+     6 Query coverage from *B vs A*
 ====== =================================
 
 These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp``
-values in the BLAST+ tabular output.
+values in the BLAST+ tabular output. For the bitscore and percentage identity
+the values for *A vs B* and *B vs A* are typically the same, so their minimum
+is shown. The coverage values differ according to the query lengths.
 
 .. class:: warningmark