Mercurial > repos > peterjc > blast_rbh
changeset 5:c84b6c21e3d4 draft
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
author | peterjc |
---|---|
date | Tue, 20 May 2014 06:33:08 -0400 |
parents | 57245c11b8cb |
children | e47960bcdccb |
files | test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml |
diffstat | 8 files changed, 59 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Mon May 19 13:39:55 2014 -0400 +++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Tue May 20 06:33:08 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id bitscore pident A qcovhsp B qcovhsp -ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1514 92.07 86 100 +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore +ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1213 1047 86 100 1047 92.07 1514
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Mon May 19 13:39:55 2014 -0400 +++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Tue May 20 06:33:08 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id bitscore pident A qcovhsp B qcovhsp -sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 701 96.55 100 100 +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore +sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 348 348 100 100 348 96.55 701
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Mon May 19 13:39:55 2014 -0400 +++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue May 20 06:33:08 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id bitscore pident A qcovhsp B qcovhsp -gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1474 92.07 100 86 +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore +gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 100 86 1047 92.07 1474
--- a/test-data/rbh_none.tabular Mon May 19 13:39:55 2014 -0400 +++ b/test-data/rbh_none.tabular Tue May 20 06:33:08 2014 -0400 @@ -1,1 +1,1 @@ -#A_id B_id bitscore pident A qcovhsp B qcovhsp +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Tue May 20 06:33:08 2014 -0400 @@ -0,0 +1,2 @@ +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore +gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1047 1213 66 57 230 97.39 559
--- a/tools/blast_rbh/README.rst Mon May 19 13:39:55 2014 -0400 +++ b/tools/blast_rbh/README.rst Tue May 20 06:33:08 2014 -0400 @@ -62,7 +62,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use the following command from the Galaxy root folder:: - $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular + $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Check this worked:: @@ -79,6 +79,7 @@ test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular + test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular Licence (MIT)
--- a/tools/blast_rbh/blast_rbh.py Mon May 19 13:39:55 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Tue May 20 06:33:08 2014 -0400 @@ -90,12 +90,14 @@ b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") -cols = "qseqid sseqid bitscore pident qcovhsp" #Or qcovs? +cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? c_query = 0 c_match = 1 c_score = 2 c_identity = 3 c_coverage = 4 +c_qlen = 5 +c_length = 6 #print("Starting...") #TODO - Report log in case of error? @@ -118,9 +120,11 @@ if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: continue score = float(parts[c_score]) + qlen = int(parts[c_qlen]) + length = int(parts[c_length]) if a not in best_a_vs_b or score > best_a_vs_b[a][1]: # Store bitscore as a float for sorting, original string for output - best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage]) + best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) b_short_list = set(v[0] for v in best_a_vs_b.values()) best_b_vs_a = dict() @@ -136,36 +140,38 @@ if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage: continue score = float(parts[c_score]) + qlen = int(parts[c_qlen]) + length = int(parts[c_length]) if b not in best_b_vs_a or score > best_b_vs_a[b][1]: # Store bitscore as a float for sorting, original string for output - best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage]) + best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) #TODO - Preserve order from A vs B? a_short_list = sorted(set(v[0] for v in best_b_vs_a.values())) count = 0 outfile = open(out_file, 'w') -outfile.write("#A_id\tB_id\tbitscore\tpident\tA qcovhsp\tB qcovhsp\n") +outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") for a in a_short_list: a_values = best_a_vs_b[a] b = a_values[0] b_values = best_b_vs_a[b] if b in best_b_vs_a and a == b_values[0]: + #Start with IDs, lengths [5], coverage [4] + values = [a, b, a_values[5], b_values[5], a_values[4], b_values[4]] + #Length was an integer so don't care about original string + values.append(min(a_values[6], b_values[6])) #Output the original string versions of the scores - values = [a,b] + #[3] = identity as string + if float(a_values[3]) < float(b_values[3]): + values.append(a_values[3]) + else: + values.append(b_values[3]) #[1] = bitscore as float, [2] = bitscore as original string if a_values[1] < b_values[1]: values.append(a_values[2]) else: values.append(b_values[2]) - #[3] = identity as string - if float(a_values[3]) < float(b_values[3]): - values.append(a_values[3]) - else: - values.append(b_values[3]) - #[4] = coverage - values.append(a_values[4]) - values.append(b_values[4]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values)) + outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) count += 1 outfile.close() print "Done, %i RBH found" % count
--- a/tools/blast_rbh/blast_rbh.xml Mon May 19 13:39:55 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Tue May 20 06:33:08 2014 -0400 @@ -108,6 +108,15 @@ <output name="output" file="rbh_none.tabular" ftype="tabular"/> </test> <test> + <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> + <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> + <param name="dbtype" value="nucl"/> + <param name="nucl_type" value="tblastx"/> + <param name="identity" value="0.0"/> + <param name="q_cover" value="0.0"/> + <output name="output" file="rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/> + </test> + <test> <param name="fasta_a" value="three_human_mRNA.fasta" ftype="fasta"/> <param name="fasta_b" value="rhodopsin_nucs.fasta" ftype="fasta"/> <param name="dbtype" value="nucl"/> @@ -124,24 +133,31 @@ for each, runs reciprocal BLAST searchs (*A vs B*, and *B vs A*), optionally filters the HSPs, and then compiles a list of the reciprocal best hits (RBH). -The output from this tool is a tabular file containing eight columns, with +The output from this tool is a tabular file containing multiple columns, with information about the BLAST matches used: -====== ================================= +====== ================================== Column Description ------- --------------------------------- +------ ---------------------------------- 1 ID from *species A* 2 ID from *species B* - 3 Bitscore - 4 Percentage identity - 5 Query coverage from *A vs B* - 6 Query coverage from *B vs A* -====== ================================= + 3 Length of sequence *A* + 4 Length of sequence *B* + 5 Percentage of sequence *A* covered + 6 Percentage of sequence *B* covered + 7 HSP alignment length + 8 HSP percentage identity + 9 HSP bitscore +====== ================================== -These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp`` -values in the BLAST+ tabular output. For the bitscore and percentage identity -the values for *A vs B* and *B vs A* are typically the same, so their minimum -is shown. The coverage values differ according to the query lengths. +These values correspond to the ``qseqid``/``sseqid``, ``qlen``/``slen``, +``qcovhsp``, ``length``, ``pident`` and ``bitscore`` values in the BLAST+ +tabular output. + +For the alignment length, bitscore and percentage identity the values for +*A vs B* and *B vs A* are typically the same, so their minimum is shown. +The coverage values are given by the HSP alignment length divided by the +sequence length (adjusted by a factor of three for TBLASTX). .. class:: warningmark