0
|
1 #!/usr/bin/env python
|
|
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
|
|
3
|
|
4 Takes the following command line options,
|
|
5 1. FASTA filename of species A
|
|
6 2. FASTA filename of species B
|
|
7 3. Sequence type (prot/nucl)
|
|
8 4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
|
|
9 5. Minimum BLAST Percentage identity
|
|
10 6. Minimum BLAST query coverage
|
|
11 7. Output filename
|
|
12 """
|
|
13
|
|
14 import os
|
|
15 import sys
|
|
16 import tempfile
|
|
17 import shutil
|
|
18
|
|
19 def stop_err( msg ):
|
|
20 sys.stderr.write("%s\n" % msg)
|
|
21 sys.exit(1)
|
|
22
|
|
23 def run(cmd):
|
|
24 return_code = os.system(cmd)
|
|
25 if return_code:
|
|
26 stop_err("Error %i from: %s" % (return_code, cmd))
|
|
27
|
|
28 if "--version" in sys.argv[1:]:
|
|
29 #TODO - Capture version of BLAST+ binaries too?
|
|
30 print "BLAST RBH v0.1.0"
|
|
31 sys.exit(0)
|
|
32
|
|
33 #Parse Command Line
|
|
34 #TODO - optparse
|
|
35 try:
|
|
36 fasta_a, fasta_b, dbtype, blast_type, min_identity, min_coverage, out_file = sys.argv[1:]
|
|
37 except:
|
|
38 stop_err("Expect 7 arguments, got %i" % (len(sys.argv) - 1))
|
|
39
|
|
40 if not os.path.isfile(fasta_a):
|
|
41 stop_err("Missing input file for species A: %r" % fasta_a)
|
|
42 if not os.path.isfile(fasta_b):
|
|
43 stop_err("Missing input file for species B: %r" % fasta_b)
|
|
44 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
|
|
45 #TODO - is this ever useful, e.g. positive control?
|
|
46 stop_err("Asked to compare the FASTA file to itself!")
|
|
47
|
|
48 try:
|
|
49 min_identity = float(min_identity)
|
|
50 except ValueError:
|
|
51 stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
|
|
52 if not (0 <= min_identity <= 100):
|
|
53 stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
|
|
54 try:
|
|
55 min_coverage = float(min_coverage)
|
|
56 except ValueError:
|
|
57 stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
|
|
58 if not (0 <= min_coverage <= 100):
|
|
59 stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
|
|
60
|
|
61
|
|
62 if dbtype == "nucl":
|
|
63 if blast_type not in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
|
|
64 stop_err("Invalid BLAST type for BLASTN: %r" % blast_type)
|
|
65 blast_exe = "blastn"
|
|
66 elif dbtype == "prot":
|
|
67 if blast_type not in ["blastp", "blastp-short"]:
|
|
68 stop_err("Invalid BLAST type for BLASTP: %r" % blast_type)
|
|
69 blast_exe = "blastp"
|
|
70 else:
|
|
71 stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
|
|
72 makeblastdb_exe = "makeblastdb"
|
|
73
|
|
74 def run(cmd):
|
|
75 return_code = os.system(cmd)
|
|
76 if return_code:
|
|
77 stop_err("Error %i from: %s" % (return_code, cmd))
|
|
78
|
|
79 def makeblastdb(fasta_file, dbtype, output_stem):
|
|
80 cmd = "%s -dbtype %s -in %s -out %s" % (makeblastdb_exe, dbtype, fasta_file, output_stem)
|
|
81 return run(cmd)
|
|
82
|
|
83
|
|
84 base_path = tempfile.mkdtemp()
|
|
85 db_a = os.path.join(base_path, "SpeciesA")
|
|
86 db_b = os.path.join(base_path, "SpeciesB")
|
|
87 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
|
|
88 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
|
|
89 log = os.path.join(base_path, "blast.log")
|
|
90
|
|
91 cols = "qseqid sseqid bitscore pident qcovhsp" #Or qcovs?
|
|
92 c_query = 0
|
|
93 c_match = 1
|
|
94 c_score = 2
|
|
95 c_identity = 3
|
|
96 c_coverage = 4
|
|
97
|
|
98 print("Starting...")
|
|
99 #TODO - Report log in case of error?
|
|
100 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
|
|
101 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
|
|
102 print("BLAST databases prepared.")
|
|
103 run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols))
|
|
104 print("BLAST species A vs species B done.")
|
|
105 run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols))
|
|
106 print("BLAST species B vs species A done.")
|
|
107
|
|
108 #TODO - Include identity and coverage filters...
|
|
109
|
|
110 best_a_vs_b = dict()
|
|
111 for line in open(a_vs_b):
|
|
112 if line.startswith("#"): continue
|
|
113 parts = line.rstrip("\n").split("\t")
|
|
114 a = parts[c_query]
|
|
115 b = parts[c_match]
|
|
116 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
|
|
117 continue
|
|
118 score = float(parts[c_score])
|
|
119 if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
|
|
120 best_a_vs_b[a] = (b, score, parts[c_score])
|
|
121 b_short_list = set(b for (b,score, score_str) in best_a_vs_b.values())
|
|
122
|
|
123 best_b_vs_a = dict()
|
|
124 for line in open(b_vs_a):
|
|
125 if line.startswith("#"): continue
|
|
126 parts = line.rstrip("\n").split("\t")
|
|
127 b = parts[c_query]
|
|
128 a = parts[c_match]
|
|
129 if a not in best_a_vs_b:
|
|
130 continue
|
|
131 #stop_err("The A-vs-B file does not have A-ID %r found in B-vs-A file" % a)
|
|
132 if b not in b_short_list: continue
|
|
133 if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage:
|
|
134 continue
|
|
135 score = float(parts[c_score])
|
|
136 if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
|
|
137 best_b_vs_a[b] = (a, score, parts[c_score])
|
|
138 #TODO - Preserve order from A vs B?
|
|
139 a_short_list = sorted(set(a for (a,score,score_str) in best_b_vs_a.values()))
|
|
140
|
|
141 count = 0
|
|
142 outfile = open(out_file, 'w')
|
|
143 outfile.write("#A_id\tB_id\tA_vs_B\tB_vs_A\n")
|
|
144 for a in a_short_list:
|
|
145 b = best_a_vs_b[a][0]
|
|
146 if b in best_b_vs_a and a == best_b_vs_a[b][0]:
|
|
147 outfile.write("%s\t%s\t%s\t%s\n" % (a, b, best_a_vs_b[a][2], best_b_vs_a[b][2]))
|
|
148 count += 1
|
|
149 outfile.close()
|
|
150 print "Done, %i RBH found" % count
|
|
151
|
|
152
|
|
153 #Remove temp files...
|
|
154 shutil.rmtree(base_path)
|
|
155
|