# HG changeset patch
# User peterjc
# Date 1400581988 14400
# Node ID c84b6c21e3d40ebe223c40d9293a2ebe88229af4
# Parent 57245c11b8cb28e67db8d440f5ac5e740e2eb8b5
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
diff -r 57245c11b8cb -r c84b6c21e3d4 test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id B_id bitscore pident A qcovhsp B qcovhsp
-ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1514 92.07 86 100
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
+ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1213 1047 86 100 1047 92.07 1514
diff -r 57245c11b8cb -r c84b6c21e3d4 test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id B_id bitscore pident A qcovhsp B qcovhsp
-sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 701 96.55 100 100
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
+sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 348 348 100 100 348 96.55 701
diff -r 57245c11b8cb -r c84b6c21e3d4 test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id B_id bitscore pident A qcovhsp B qcovhsp
-gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1474 92.07 100 86
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 100 86 1047 92.07 1474
diff -r 57245c11b8cb -r c84b6c21e3d4 test-data/rbh_none.tabular
--- a/test-data/rbh_none.tabular Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_none.tabular Tue May 20 06:33:08 2014 -0400
@@ -1,1 +1,1 @@
-#A_id B_id bitscore pident A qcovhsp B qcovhsp
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
diff -r 57245c11b8cb -r c84b6c21e3d4 test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue May 20 06:33:08 2014 -0400
@@ -0,0 +1,2 @@
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 66 57 230 97.39 559
diff -r 57245c11b8cb -r c84b6c21e3d4 tools/blast_rbh/README.rst
--- a/tools/blast_rbh/README.rst Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/README.rst Tue May 20 06:33:08 2014 -0400
@@ -62,7 +62,7 @@
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use
the following command from the Galaxy root folder::
- $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular
+ $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
Check this worked::
@@ -79,6 +79,7 @@
test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular
test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular
test-data/rbh_none.tabular
+ test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
Licence (MIT)
diff -r 57245c11b8cb -r c84b6c21e3d4 tools/blast_rbh/blast_rbh.py
--- a/tools/blast_rbh/blast_rbh.py Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py Tue May 20 06:33:08 2014 -0400
@@ -90,12 +90,14 @@
b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
log = os.path.join(base_path, "blast.log")
-cols = "qseqid sseqid bitscore pident qcovhsp" #Or qcovs?
+cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
c_query = 0
c_match = 1
c_score = 2
c_identity = 3
c_coverage = 4
+c_qlen = 5
+c_length = 6
#print("Starting...")
#TODO - Report log in case of error?
@@ -118,9 +120,11 @@
if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
continue
score = float(parts[c_score])
+ qlen = int(parts[c_qlen])
+ length = int(parts[c_length])
if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
# Store bitscore as a float for sorting, original string for output
- best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage])
+ best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
b_short_list = set(v[0] for v in best_a_vs_b.values())
best_b_vs_a = dict()
@@ -136,36 +140,38 @@
if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage:
continue
score = float(parts[c_score])
+ qlen = int(parts[c_qlen])
+ length = int(parts[c_length])
if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
# Store bitscore as a float for sorting, original string for output
- best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage])
+ best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
#TODO - Preserve order from A vs B?
a_short_list = sorted(set(v[0] for v in best_b_vs_a.values()))
count = 0
outfile = open(out_file, 'w')
-outfile.write("#A_id\tB_id\tbitscore\tpident\tA qcovhsp\tB qcovhsp\n")
+outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
for a in a_short_list:
a_values = best_a_vs_b[a]
b = a_values[0]
b_values = best_b_vs_a[b]
if b in best_b_vs_a and a == b_values[0]:
+ #Start with IDs, lengths [5], coverage [4]
+ values = [a, b, a_values[5], b_values[5], a_values[4], b_values[4]]
+ #Length was an integer so don't care about original string
+ values.append(min(a_values[6], b_values[6]))
#Output the original string versions of the scores
- values = [a,b]
+ #[3] = identity as string
+ if float(a_values[3]) < float(b_values[3]):
+ values.append(a_values[3])
+ else:
+ values.append(b_values[3])
#[1] = bitscore as float, [2] = bitscore as original string
if a_values[1] < b_values[1]:
values.append(a_values[2])
else:
values.append(b_values[2])
- #[3] = identity as string
- if float(a_values[3]) < float(b_values[3]):
- values.append(a_values[3])
- else:
- values.append(b_values[3])
- #[4] = coverage
- values.append(a_values[4])
- values.append(b_values[4])
- outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values))
+ outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
count += 1
outfile.close()
print "Done, %i RBH found" % count
diff -r 57245c11b8cb -r c84b6c21e3d4 tools/blast_rbh/blast_rbh.xml
--- a/tools/blast_rbh/blast_rbh.xml Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml Tue May 20 06:33:08 2014 -0400
@@ -108,6 +108,15 @@
+
+
+
+
+
+
+
+
+
@@ -124,24 +133,31 @@
for each, runs reciprocal BLAST searchs (*A vs B*, and *B vs A*), optionally
filters the HSPs, and then compiles a list of the reciprocal best hits (RBH).
-The output from this tool is a tabular file containing eight columns, with
+The output from this tool is a tabular file containing multiple columns, with
information about the BLAST matches used:
-====== =================================
+====== ==================================
Column Description
------- ---------------------------------
+------ ----------------------------------
1 ID from *species A*
2 ID from *species B*
- 3 Bitscore
- 4 Percentage identity
- 5 Query coverage from *A vs B*
- 6 Query coverage from *B vs A*
-====== =================================
+ 3 Length of sequence *A*
+ 4 Length of sequence *B*
+ 5 Percentage of sequence *A* covered
+ 6 Percentage of sequence *B* covered
+ 7 HSP alignment length
+ 8 HSP percentage identity
+ 9 HSP bitscore
+====== ==================================
-These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp``
-values in the BLAST+ tabular output. For the bitscore and percentage identity
-the values for *A vs B* and *B vs A* are typically the same, so their minimum
-is shown. The coverage values differ according to the query lengths.
+These values correspond to the ``qseqid``/``sseqid``, ``qlen``/``slen``,
+``qcovhsp``, ``length``, ``pident`` and ``bitscore`` values in the BLAST+
+tabular output.
+
+For the alignment length, bitscore and percentage identity the values for
+*A vs B* and *B vs A* are typically the same, so their minimum is shown.
+The coverage values are given by the HSP alignment length divided by the
+sequence length (adjusted by a factor of three for TBLASTX).
.. class:: warningmark