annotate tools/blast_rbh/blast_rbh.py @ 0:a96608a125fb draft

Uploaded v0.1.0, first release
author peterjc
date Thu, 15 May 2014 12:54:09 -0400
parents
children 8aa6385b9a05
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
3
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
4 Takes the following command line options,
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
5 1. FASTA filename of species A
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
6 2. FASTA filename of species B
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
7 3. Sequence type (prot/nucl)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
8 4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
9 5. Minimum BLAST Percentage identity
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
10 6. Minimum BLAST query coverage
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
11 7. Output filename
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
12 """
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
13
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
14 import os
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
15 import sys
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
16 import tempfile
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
17 import shutil
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
18
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
19 def stop_err( msg ):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
20 sys.stderr.write("%s\n" % msg)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
21 sys.exit(1)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
22
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
23 def run(cmd):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
24 return_code = os.system(cmd)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
25 if return_code:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
26 stop_err("Error %i from: %s" % (return_code, cmd))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
27
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
28 if "--version" in sys.argv[1:]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
29 #TODO - Capture version of BLAST+ binaries too?
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
30 print "BLAST RBH v0.1.0"
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
31 sys.exit(0)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
32
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
33 #Parse Command Line
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
34 #TODO - optparse
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
35 try:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
36 fasta_a, fasta_b, dbtype, blast_type, min_identity, min_coverage, out_file = sys.argv[1:]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
37 except:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
38 stop_err("Expect 7 arguments, got %i" % (len(sys.argv) - 1))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
39
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
40 if not os.path.isfile(fasta_a):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
41 stop_err("Missing input file for species A: %r" % fasta_a)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
42 if not os.path.isfile(fasta_b):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
43 stop_err("Missing input file for species B: %r" % fasta_b)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
44 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
45 #TODO - is this ever useful, e.g. positive control?
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
46 stop_err("Asked to compare the FASTA file to itself!")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
47
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
48 try:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
49 min_identity = float(min_identity)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
50 except ValueError:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
51 stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
52 if not (0 <= min_identity <= 100):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
53 stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
54 try:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
55 min_coverage = float(min_coverage)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
56 except ValueError:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
57 stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
58 if not (0 <= min_coverage <= 100):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
59 stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
60
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
61
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
62 if dbtype == "nucl":
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
63 if blast_type not in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
64 stop_err("Invalid BLAST type for BLASTN: %r" % blast_type)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
65 blast_exe = "blastn"
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
66 elif dbtype == "prot":
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
67 if blast_type not in ["blastp", "blastp-short"]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
68 stop_err("Invalid BLAST type for BLASTP: %r" % blast_type)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
69 blast_exe = "blastp"
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
70 else:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
71 stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
72 makeblastdb_exe = "makeblastdb"
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
73
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
74 def run(cmd):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
75 return_code = os.system(cmd)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
76 if return_code:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
77 stop_err("Error %i from: %s" % (return_code, cmd))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
78
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
79 def makeblastdb(fasta_file, dbtype, output_stem):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
80 cmd = "%s -dbtype %s -in %s -out %s" % (makeblastdb_exe, dbtype, fasta_file, output_stem)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
81 return run(cmd)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
82
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
83
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
84 base_path = tempfile.mkdtemp()
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
85 db_a = os.path.join(base_path, "SpeciesA")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
86 db_b = os.path.join(base_path, "SpeciesB")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
87 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
88 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
89 log = os.path.join(base_path, "blast.log")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
90
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
91 cols = "qseqid sseqid bitscore pident qcovhsp" #Or qcovs?
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
92 c_query = 0
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
93 c_match = 1
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
94 c_score = 2
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
95 c_identity = 3
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
96 c_coverage = 4
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
97
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
98 print("Starting...")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
99 #TODO - Report log in case of error?
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
100 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
101 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
102 print("BLAST databases prepared.")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
103 run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_a, db_b, a_vs_b, cols))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
104 print("BLAST species A vs species B done.")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
105 run('%s -task %s -query "%s" -db "%s" -out "%s" -outfmt "6 %s"' % (blast_exe, blast_type, fasta_b, db_a, b_vs_a, cols))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
106 print("BLAST species B vs species A done.")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
107
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
108 #TODO - Include identity and coverage filters...
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
109
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
110 best_a_vs_b = dict()
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
111 for line in open(a_vs_b):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
112 if line.startswith("#"): continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
113 parts = line.rstrip("\n").split("\t")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
114 a = parts[c_query]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
115 b = parts[c_match]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
116 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
117 continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
118 score = float(parts[c_score])
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
119 if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
120 best_a_vs_b[a] = (b, score, parts[c_score])
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
121 b_short_list = set(b for (b,score, score_str) in best_a_vs_b.values())
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
122
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
123 best_b_vs_a = dict()
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
124 for line in open(b_vs_a):
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
125 if line.startswith("#"): continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
126 parts = line.rstrip("\n").split("\t")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
127 b = parts[c_query]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
128 a = parts[c_match]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
129 if a not in best_a_vs_b:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
130 continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
131 #stop_err("The A-vs-B file does not have A-ID %r found in B-vs-A file" % a)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
132 if b not in b_short_list: continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
133 if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
134 continue
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
135 score = float(parts[c_score])
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
136 if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
137 best_b_vs_a[b] = (a, score, parts[c_score])
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
138 #TODO - Preserve order from A vs B?
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
139 a_short_list = sorted(set(a for (a,score,score_str) in best_b_vs_a.values()))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
140
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
141 count = 0
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
142 outfile = open(out_file, 'w')
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
143 outfile.write("#A_id\tB_id\tA_vs_B\tB_vs_A\n")
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
144 for a in a_short_list:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
145 b = best_a_vs_b[a][0]
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
146 if b in best_b_vs_a and a == best_b_vs_a[b][0]:
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
147 outfile.write("%s\t%s\t%s\t%s\n" % (a, b, best_a_vs_b[a][2], best_b_vs_a[b][2]))
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
148 count += 1
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
149 outfile.close()
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
150 print "Done, %i RBH found" % count
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
151
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
152
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
153 #Remove temp files...
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
154 shutil.rmtree(base_path)
a96608a125fb Uploaded v0.1.0, first release
peterjc
parents:
diff changeset
155