Mercurial > repos > peterjc > blast_rbh
changeset 4:57245c11b8cb draft
Uploaded v0.1.0d, TBLASTX support; changed output columns
author | peterjc |
---|---|
date | Mon, 19 May 2014 13:39:55 -0400 |
parents | 347eb6bfabd7 |
children | c84b6c21e3d4 |
files | test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_none.tabular tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml |
diffstat | 6 files changed, 48 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Fri May 16 11:16:02 2014 -0400 +++ b/test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular Mon May 19 13:39:55 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id A_vs_B bitscore A_vs_B pident A_vs_B qcovhsp B_vs_A bitscore B_vs_A pident B_vs_A qcovhsp -ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1514 92.07 86 1514 92.07 100 +#A_id B_id bitscore pident A qcovhsp B qcovhsp +ENA|BC112106|BC112106.1 gi|57163782|ref|NM_001009242.1| 1514 92.07 86 100
--- a/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Fri May 16 11:16:02 2014 -0400 +++ b/test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular Mon May 19 13:39:55 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id A_vs_B bitscore A_vs_B pident A_vs_B qcovhsp B_vs_A bitscore B_vs_A pident B_vs_A qcovhsp -sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 701 96.55 100 701 96.55 100 +#A_id B_id bitscore pident A qcovhsp B qcovhsp +sp|P08100|OPSD_HUMAN gi|57163783|ref|NP_001009242.1| 701 96.55 100 100
--- a/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Fri May 16 11:16:02 2014 -0400 +++ b/test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular Mon May 19 13:39:55 2014 -0400 @@ -1,2 +1,2 @@ -#A_id B_id A_vs_B bitscore A_vs_B pident A_vs_B qcovhsp B_vs_A bitscore B_vs_A pident B_vs_A qcovhsp -gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1474 92.07 100 1474 92.07 86 +#A_id B_id bitscore pident A qcovhsp B qcovhsp +gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 1474 92.07 100 86
--- a/test-data/rbh_none.tabular Fri May 16 11:16:02 2014 -0400 +++ b/test-data/rbh_none.tabular Mon May 19 13:39:55 2014 -0400 @@ -1,1 +1,1 @@ -#A_id B_id A_vs_B bitscore A_vs_B pident A_vs_B qcovhsp B_vs_A bitscore B_vs_A pident B_vs_A qcovhsp +#A_id B_id bitscore pident A qcovhsp B qcovhsp
--- a/tools/blast_rbh/blast_rbh.py Fri May 16 11:16:02 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Mon May 19 13:39:55 2014 -0400 @@ -62,13 +62,16 @@ if dbtype == "nucl": - if blast_type not in ["megablast", "blastn", "blastn-short", "dc-megablast"]: + if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: + blast_cmd = "blastn -task %s" % blast_type + elif blast_type == "tblastx": + blast_cmd = "tblastx" + else: stop_err("Invalid BLAST type for BLASTN: %r" % blast_type) - blast_exe = "blastn" elif dbtype == "prot": if blast_type not in ["blastp", "blastp-short"]: stop_err("Invalid BLAST type for BLASTP: %r" % blast_type) - blast_exe = "blastp" + blast_cmd = "blastp -task %s" % blast_type else: stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) @@ -94,17 +97,17 @@ c_identity = 3 c_coverage = 4 -print("Starting...") +#print("Starting...") #TODO - Report log in case of error? run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) -print("BLAST databases prepared.") -run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols, threads)) -print("BLAST species A vs species B done.") -run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols, threads)) -print("BLAST species B vs species A done.") +#print("BLAST databases prepared.") +run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) +#print("BLAST species A vs species B done.") +run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) +#print("BLAST species B vs species A done.") best_a_vs_b = dict() for line in open(a_vs_b): @@ -141,13 +144,28 @@ count = 0 outfile = open(out_file, 'w') -outfile.write("#A_id\tB_id\tA_vs_B bitscore\tA_vs_B pident\tA_vs_B qcovhsp\tB_vs_A bitscore\tB_vs_A pident\tB_vs_A qcovhsp\n") +outfile.write("#A_id\tB_id\tbitscore\tpident\tA qcovhsp\tB qcovhsp\n") for a in a_short_list: - b = best_a_vs_b[a][0] - if b in best_b_vs_a and a == best_b_vs_a[b][0]: + a_values = best_a_vs_b[a] + b = a_values[0] + b_values = best_b_vs_a[b] + if b in best_b_vs_a and a == b_values[0]: #Output the original string versions of the scores - values = [a, b] + list(best_a_vs_b[a][2:]) + list(best_b_vs_a[b][2:]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values)) + values = [a,b] + #[1] = bitscore as float, [2] = bitscore as original string + if a_values[1] < b_values[1]: + values.append(a_values[2]) + else: + values.append(b_values[2]) + #[3] = identity as string + if float(a_values[3]) < float(b_values[3]): + values.append(a_values[3]) + else: + values.append(b_values[3]) + #[4] = coverage + values.append(a_values[4]) + values.append(b_values[4]) + outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(values)) count += 1 outfile.close() print "Done, %i RBH found" % count
--- a/tools/blast_rbh/blast_rbh.xml Fri May 16 11:16:02 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Mon May 19 13:39:55 2014 -0400 @@ -43,6 +43,7 @@ <option value="blastn">blastn - Traditional BLASTN requiring an exact match of 11, for somewhat similar sequences</option> <option value="blastn-short">blastn-short - BLASTN program optimized for sequences shorter than 50 bases</option> <option value="dc-megablast">dc-megablast - Discontiguous megablast used to find more distant (e.g., interspecies) sequences</option> + <option value="tblastx">tblastx - TBLASTX program using translated query against translated database (protein level matches)</option> </param> </when> </conditional> @@ -131,16 +132,16 @@ ------ --------------------------------- 1 ID from *species A* 2 ID from *species B* - 3 Bitscore from *A vs B* - 4 Percentage identity from *A vs B* + 3 Bitscore + 4 Percentage identity 5 Query coverage from *A vs B* - 6 Bitscore from *B vs A* - 7 Percentage identity from *B vs A* - 8 Query coverage from *B vs A* + 6 Query coverage from *B vs A* ====== ================================= These values correspond to the ``bitscore``, ``pident`` and ``qcovhsp`` -values in the BLAST+ tabular output. +values in the BLAST+ tabular output. For the bitscore and percentage identity +the values for *A vs B* and *B vs A* are typically the same, so their minimum +is shown. The coverage values differ according to the query lengths. .. class:: warningmark