comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 2:fae4084a0bc0 draft

Uploaded v0.0.20, preview 5 Cope if cElementTree is missing in BLAST XML to tabular script.
author peterjc
date Thu, 02 May 2013 11:20:43 -0400
parents
children
comparison
equal deleted inserted replaced
1:2feaef06d388 2:fae4084a0bc0
1 #!/usr/bin/env python
2 """Convert a BLAST XML file to tabular output.
3
4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, output format (std for standard 12 columns, or ext for the
6 extended 24 columns offered in the BLAST+ wrappers).
7
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
10 mean:
11
12 ====== ========= ============================================
13 Column NCBI name Description
14 ------ --------- --------------------------------------------
15 1 qseqid Query Seq-id (ID of your sequence)
16 2 sseqid Subject Seq-id (ID of the database hit)
17 3 pident Percentage of identical matches
18 4 length Alignment length
19 5 mismatch Number of mismatches
20 6 gapopen Number of gap openings
21 7 qstart Start of alignment in query
22 8 qend End of alignment in query
23 9 sstart Start of alignment in subject (database hit)
24 10 send End of alignment in subject (database hit)
25 11 evalue Expectation value (E-value)
26 12 bitscore Bit score
27 ====== ========= ============================================
28
29 The additional columns offered in the Galaxy BLAST+ wrappers are:
30
31 ====== ============= ===========================================
32 Column NCBI name Description
33 ------ ------------- -------------------------------------------
34 13 sallseqid All subject Seq-id(s), separated by a ';'
35 14 score Raw score
36 15 nident Number of identical matches
37 16 positive Number of positive-scoring matches
38 17 gaps Total number of gaps
39 18 ppos Percentage of positive-scoring matches
40 19 qframe Query frame
41 20 sframe Subject frame
42 21 qseq Aligned part of query sequence
43 22 sseq Aligned part of subject sequence
44 23 qlen Query sequence length
45 24 slen Subject sequence length
46 ====== ============= ===========================================
47
48 Most of these fields are given explicitly in the XML file, others some like
49 the percentage identity and the number of gap openings must be calculated.
50
51 Be aware that the sequence in the extended tabular output or XML direct from
52 BLAST+ may or may not use XXXX masking on regions of low complexity. This
53 can throw the off the calculation of percentage identity and gap openings.
54 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
55 with these numbers changing depending on whether or not the low complexity
56 filter is used.]
57
58 This script attempts to produce identical output to what BLAST+ would have done.
59 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
60 space character (probably a bug).
61 """
62 import sys
63 import re
64
65 if "-v" in sys.argv or "--version" in sys.argv:
66 print "v0.0.12"
67 sys.exit(0)
68
69 if sys.version_info[:2] >= ( 2, 5 ):
70 try:
71 from xml.etree import cElementTree as ElementTree
72 except ImportError:
73 from xml.etree import ElementTree as ElementTree
74 else:
75 from galaxy import eggs
76 import pkg_resources; pkg_resources.require( "elementtree" )
77 from elementtree import ElementTree
78
79 def stop_err( msg ):
80 sys.stderr.write("%s\n" % msg)
81 sys.exit(1)
82
83 #Parse Command Line
84 try:
85 in_file, out_file, out_fmt = sys.argv[1:]
86 except:
87 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
88
89 if out_fmt == "std":
90 extended = False
91 elif out_fmt == "x22":
92 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
93 elif out_fmt == "ext":
94 extended = True
95 else:
96 stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
97
98
99 # get an iterable
100 try:
101 context = ElementTree.iterparse(in_file, events=("start", "end"))
102 except:
103 stop_err("Invalid data format.")
104 # turn it into an iterator
105 context = iter(context)
106 # get the root element
107 try:
108 event, root = context.next()
109 except:
110 stop_err( "Invalid data format." )
111
112
113 re_default_query_id = re.compile("^Query_\d+$")
114 assert re_default_query_id.match("Query_101")
115 assert not re_default_query_id.match("Query_101a")
116 assert not re_default_query_id.match("MyQuery_101")
117 re_default_subject_id = re.compile("^Subject_\d+$")
118 assert re_default_subject_id.match("Subject_1")
119 assert not re_default_subject_id.match("Subject_")
120 assert not re_default_subject_id.match("Subject_12a")
121 assert not re_default_subject_id.match("TheSubject_1")
122
123
124 outfile = open(out_file, 'w')
125 blast_program = None
126 for event, elem in context:
127 if event == "end" and elem.tag == "BlastOutput_program":
128 blast_program = elem.text
129 # for every <Iteration> tag
130 if event == "end" and elem.tag == "Iteration":
131 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
132 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
133 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
134 # <Iteration_query-len>406</Iteration_query-len>
135 # <Iteration_hits></Iteration_hits>
136 #
137 #Or, from BLAST 2.2.24+ run online
138 # <Iteration_query-ID>Query_1</Iteration_query-ID>
139 # <Iteration_query-def>Sample</Iteration_query-def>
140 # <Iteration_query-len>516</Iteration_query-len>
141 # <Iteration_hits>...
142 qseqid = elem.findtext("Iteration_query-ID")
143 if re_default_query_id.match(qseqid):
144 #Place holder ID, take the first word of the query definition
145 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
146 qlen = int(elem.findtext("Iteration_query-len"))
147
148 # for every <Hit> within <Iteration>
149 for hit in elem.findall("Iteration_hits/Hit"):
150 #Expecting either this,
151 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
152 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
153 # <Hit_accession>P56514</Hit_accession>
154 #or,
155 # <Hit_id>Subject_1</Hit_id>
156 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
157 # <Hit_accession>Subject_1</Hit_accession>
158 #
159 #apparently depending on the parse_deflines switch
160 sseqid = hit.findtext("Hit_id").split(None,1)[0]
161 hit_def = sseqid + " " + hit.findtext("Hit_def")
162 if re_default_subject_id.match(sseqid) \
163 and sseqid == hit.findtext("Hit_accession"):
164 #Place holder ID, take the first word of the subject definition
165 hit_def = hit.findtext("Hit_def")
166 sseqid = hit_def.split(None,1)[0]
167 # for every <Hsp> within <Hit>
168 for hsp in hit.findall("Hit_hsps/Hsp"):
169 nident = hsp.findtext("Hsp_identity")
170 length = hsp.findtext("Hsp_align-len")
171 pident = "%0.2f" % (100*float(nident)/float(length))
172
173 q_seq = hsp.findtext("Hsp_qseq")
174 h_seq = hsp.findtext("Hsp_hseq")
175 m_seq = hsp.findtext("Hsp_midline")
176 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
177 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
178 len(h_seq.replace('-', ' ').split())-1)
179
180 mismatch = m_seq.count(' ') + m_seq.count('+') \
181 - q_seq.count('-') - h_seq.count('-')
182 #TODO - Remove this alternative mismatch calculation and test
183 #once satisifed there are no problems
184 expected_mismatch = len(q_seq) \
185 - sum(1 for q,h in zip(q_seq, h_seq) \
186 if q == h or q == "-" or h == "-")
187 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
188 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
189 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
190 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
191 int(mismatch), expected_mismatch))
192
193 #TODO - Remove this alternative identity calculation and test
194 #once satisifed there are no problems
195 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
196 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
197 stop_err("%s vs %s identities, expected %i <= %i <= %i" \
198 % (qseqid, sseqid, expected_identity, int(nident),
199 expected_identity + q_seq.count("X")))
200
201
202 evalue = hsp.findtext("Hsp_evalue")
203 if evalue == "0":
204 evalue = "0.0"
205 else:
206 evalue = "%0.0e" % float(evalue)
207
208 bitscore = float(hsp.findtext("Hsp_bit-score"))
209 if bitscore < 100:
210 #Seems to show one decimal place for lower scores
211 bitscore = "%0.1f" % bitscore
212 else:
213 #Note BLAST does not round to nearest int, it truncates
214 bitscore = "%i" % bitscore
215
216 values = [qseqid,
217 sseqid,
218 pident,
219 length, #hsp.findtext("Hsp_align-len")
220 str(mismatch),
221 gapopen,
222 hsp.findtext("Hsp_query-from"), #qstart,
223 hsp.findtext("Hsp_query-to"), #qend,
224 hsp.findtext("Hsp_hit-from"), #sstart,
225 hsp.findtext("Hsp_hit-to"), #send,
226 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
227 bitscore, #hsp.findtext("Hsp_bit-score") rounded
228 ]
229
230 if extended:
231 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
232 #print hit_def, "-->", sallseqid
233 positive = hsp.findtext("Hsp_positive")
234 ppos = "%0.2f" % (100*float(positive)/float(length))
235 qframe = hsp.findtext("Hsp_query-frame")
236 sframe = hsp.findtext("Hsp_hit-frame")
237 if blast_program == "blastp":
238 #Probably a bug in BLASTP that they use 0 or 1 depending on format
239 if qframe == "0": qframe = "1"
240 if sframe == "0": sframe = "1"
241 slen = int(hit.findtext("Hit_len"))
242 values.extend([sallseqid,
243 hsp.findtext("Hsp_score"), #score,
244 nident,
245 positive,
246 hsp.findtext("Hsp_gaps"), #gaps,
247 ppos,
248 qframe,
249 sframe,
250 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
251 q_seq,
252 h_seq,
253 str(qlen),
254 str(slen),
255 ])
256 #print "\t".join(values)
257 outfile.write("\t".join(values) + "\n")
258 # prevents ElementTree from growing large datastructure
259 root.clear()
260 elem.clear()
261 outfile.close()