changeset 5:c84b6c21e3d4 draft

Uploaded v0.1.0e, test TBLASTX mode; more columns in output
author peterjc
date Tue, 20 May 2014 06:33:08 -0400
parents 57245c11b8cb
children e47960bcdccb
files test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml
diffstat 8 files changed, 59 insertions(+), 34 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular	Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
-ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	 1514	92.07	86	100
+#A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
+ENA|BC112106|BC112106.1	gi|57163782|ref|NM_001009242.1|	1213	1047	86	100	1047	92.07	 1514
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular	Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
-sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	  701	96.55	100	100
+#A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
+sp|P08100|OPSD_HUMAN	gi|57163783|ref|NP_001009242.1|	348	348	100	100	348	96.55	  701
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular	Tue May 20 06:33:08 2014 -0400
@@ -1,2 +1,2 @@
-#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
-gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	 1474	92.07	100	86
+#A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	100	86	1047	92.07	 1474
--- a/test-data/rbh_none.tabular	Mon May 19 13:39:55 2014 -0400
+++ b/test-data/rbh_none.tabular	Tue May 20 06:33:08 2014 -0400
@@ -1,1 +1,1 @@
-#A_id	B_id	bitscore	pident	A qcovhsp	B qcovhsp
+#A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular	Tue May 20 06:33:08 2014 -0400
@@ -0,0 +1,2 @@
+#A_id	B_id	A_length	B_length	A_qcovhsp	B_qcovhsp	length	pident	bitscore
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	1047	1213	66	57	230	97.39	  559
--- a/tools/blast_rbh/README.rst	Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/README.rst	Tue May 20 06:33:08 2014 -0400
@@ -62,7 +62,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use
 the following command from the Galaxy root folder::
 
-    $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular
+    $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
 
 Check this worked::
 
@@ -79,6 +79,7 @@
     test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular
     test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular
     test-data/rbh_none.tabular
+    test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
 
 
 Licence (MIT)
--- a/tools/blast_rbh/blast_rbh.py	Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py	Tue May 20 06:33:08 2014 -0400
@@ -90,12 +90,14 @@
 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")
 
-cols = "qseqid sseqid bitscore pident qcovhsp" #Or qcovs?
+cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
 c_query = 0
 c_match = 1
 c_score = 2
 c_identity = 3
 c_coverage = 4
+c_qlen = 5
+c_length = 6
 
 #print("Starting...")
 #TODO - Report log in case of error?
@@ -118,9 +120,11 @@
     if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
         continue
     score = float(parts[c_score])
+    qlen = int(parts[c_qlen])
+    length = int(parts[c_length])
     if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
         # Store bitscore as a float for sorting, original string for output
-        best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage])
+        best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
 b_short_list = set(v[0] for v in best_a_vs_b.values())
 
 best_b_vs_a = dict()
@@ -136,36 +140,38 @@
     if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage:
         continue
     score = float(parts[c_score])
+    qlen = int(parts[c_qlen])
+    length = int(parts[c_length])
     if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
         # Store bitscore as a float for sorting, original string for output
-        best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage])
+        best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
 #TODO - Preserve order from A vs B?
 a_short_list = sorted(set(v[0] for v in best_b_vs_a.values()))
 
 count = 0
 outfile = open(out_file, 'w')
-outfile.write("#A_id\tB_id\tbitscore\tpident\tA qcovhsp\tB qcovhsp\n")
+outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
 for a in a_short_list:
     a_values = best_a_vs_b[a]
     b = a_values[0]
     b_values = best_b_vs_a[b]
     if b in best_b_vs_a and a == b_values[0]:
+        #Start with IDs, lengths [5], coverage [4]
+        values = [a, b, a_values[5], b_values[5], a_values[4], b_values[4]]
+        #Length was an integer so don't care about original string
+        values.append(min(a_values[6], b_values[6]))
         #Output the original string versions of the scores
-        values = [a,b]
+        #[3] = identity as string
+        if float(a_values[3]) < float(b_values[3]):
+            values.append(a_values[3])
+        else:
+            values.append(b_values[3])
         #[1] = bitscore as float, [2] = bitscore as original string 
         if a_values[1] < b_values[1]:
             values.append(a_values[2])
         else:
             values.append(b_values[2])
-        #[3] = identity as string
-        if float(a_values[3]) < float(b_values[3]):
-            values.append(a_values[3])
-        else:
-            values.append(b_values[3])
-        #[4] = coverage
-        values.append(a_values[4])
-        values.append(b_values[4])
-        outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values))
+        outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
         count += 1
 outfile.close()
 print "Done, %i RBH found" % count
--- a/tools/blast_rbh/blast_rbh.xml	Mon May 19 13:39:55 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml	Tue May 20 06:33:08 2014 -0400
@@ -108,6 +108,15 @@
             <output name="output" file="rbh_none.tabular" ftype="tabular"/>
         </test>
         <test>
+            <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="tblastx"/>
+            <param name="identity" value="0.0"/>
+            <param name="q_cover" value="0.0"/>
+            <output name="output" file="rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/>
+        </test>
+        <test>
             <param name="fasta_a" value="three_human_mRNA.fasta" ftype="fasta"/>
             <param name="fasta_b" value="rhodopsin_nucs.fasta" ftype="fasta"/>
             <param name="dbtype" value="nucl"/>
@@ -124,24 +133,31 @@
 for each, runs reciprocal BLAST searchs (*A vs B*, and *B vs A*), optionally
 filters the HSPs, and then compiles a list of the reciprocal best hits (RBH).
 
-The output from this tool is a tabular file containing eight columns, with
+The output from this tool is a tabular file containing multiple columns, with
 information about the BLAST matches used:
 
-====== =================================
+====== ==================================
 Column Description
------- ---------------------------------
+------ ----------------------------------
      1 ID from *species A*
      2 ID from *species B*
-     3 Bitscore
-     4 Percentage identity
-     5 Query coverage from *A vs B*
-     6 Query coverage from *B vs A*
-====== =================================
+     3 Length of sequence *A*
+     4 Length of sequence *B*
+     5 Percentage of sequence *A* covered
+     6 Percentage of sequence *B* covered
+     7 HSP alignment length
+     8 HSP percentage identity
+     9 HSP bitscore
+====== ==================================
 
-These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp``
-values in the BLAST+ tabular output. For the bitscore and percentage identity
-the values for *A vs B* and *B vs A* are typically the same, so their minimum
-is shown. The coverage values differ according to the query lengths.
+These values correspond to the ``qseqid``/``sseqid``, ``qlen``/``slen``,
+``qcovhsp``, ``length``, ``pident`` and ``bitscore`` values in the BLAST+
+tabular output.
+
+For the alignment length, bitscore and percentage identity the values for
+*A vs B* and *B vs A* are typically the same, so their minimum is shown.
+The coverage values are given by the HSP alignment length divided by the
+sequence length (adjusted by a factor of three for TBLASTX).
 
 .. class:: warningmark