comparison parseSnpEffVcf.py @ 47:23d98125d20c

parse snpEff output
author jingchunzhu
date Thu, 13 Aug 2015 23:26:33 -0700
parents
children a38cc72edd75
comparison
equal deleted inserted replaced
46:ebf3bc09c383 47:23d98125d20c
1 #!/usr/bin/env python
2
3 import argparse
4 import re
5 import sys, os
6 import string
7 import math
8
9 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
10
11 class vcfRow(object):
12 """This object contains an individual row of VCF output"""
13 def __init__(self, row, columnLabels, EFFECT=1):
14 tokens = row[:-1].split("\t")
15 #chromosome notation
16 chrom = tokens[0]
17 if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr":
18 chrom="chr"+chrom[3:]
19 elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R":
20 chrom= "chr"+chrom[2:]
21 elif chrom in ["23","X","x"]:
22 chrom="chrX"
23 elif chrom in ["24","Y","y"]:
24 chrom="chrY"
25 elif chrom in ["25","M","m"]:
26 chrom="chrM"
27 else:
28 chrom="chr"+chrom
29
30 if chrom == "chr23":
31 chrom="chrX"
32 if chrom == "chr24":
33 chrom="chrY"
34
35 self.chr = chrom
36 self.start = int(tokens[1])
37 self.ID = tokens[2]
38 self.reference = tokens[3]
39 if (self.reference ==""):
40 self.reference="-"
41 self.alt = tokens[4]
42 if (self.alt ==""):
43 self.alt ="-"
44 self.end = self.start + len(self.reference) - 1
45 self.DNA_AF=""
46 self.RNA_AF=""
47 self.NORMAL_AF=""
48
49 effectsString = ""
50 for thisSubToken in tokens[7].split(";"):
51 if re.search("^EFF=", thisSubToken):
52 effectsString = thisSubToken
53
54 GT_code=None
55
56 if len(tokens)<=8:
57 pass
58 else:
59 # this could be all specific to RADIA output
60 format =tokens[8]
61 DNA_NORMAL = tokens[9]
62 DNA_TUMOR =tokens[10]
63 if len(tokens)>11:
64 RNA_TUMOR = tokens[11]
65 else:
66 RNA_TUMOR=''
67
68 #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable)
69 GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start)
70
71 if GT_code !=None:
72 self.alt = string.split(tokens[4],",")[GT_code-1]
73 # AF allel frequency # this is all specific to RADIA output
74 ID="AD"
75 val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code)
76 if val != None:
77 self.DNA_AD, self.RNA_AD= val
78 ID="DP"
79 val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR)
80 if val != None:
81 self.DNA_DP, self.RNA_DP= val
82 try:
83 self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP))
84 except:
85 self.DNA_AF = "NA"
86 try:
87 self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP))
88 except:
89 self.RNA_AF = "NA"
90 else:
91 self.DNA_AF, self.RNA_AF= "NA","NA"
92 else: # returned None
93 #print "AF error", row
94 self.DNA_AF, self.RNA_AF= "NA","NA"
95
96 #get info on normal sample
97 ID = "AD"
98 val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code)
99 if val != None:
100 if val =="NA":
101 val =0
102 self.NORMAL_AD = val
103 ID ="DP"
104 val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL)
105 if val != None:
106 self.NORMAL_DP = val
107 try:
108 self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP))
109 except:
110 self.NORMAL_AF = "NA"
111 else:
112 self.NORMAL_AF = "NA"
113 else:
114 self.NORMAL_AF = "NA"
115 else:
116 #print "GT error", row
117 self.alt = "NA"
118 self.DNA_AF, self.RNA_AF= "NA","NA"
119
120
121 if EFFECT:
122 self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code)
123 return
124
125 def get_DNAVAF(self):
126 return self.DNA_AF
127
128 def get_RNAVAF(self):
129 return self.RNA_AF
130
131 def get_NORMALVAF(self):
132 return self.NORMAL_AF
133
134 def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start):
135 pos=-1
136 data= string.split(format,":")
137 for i in range(0,len(data)):
138 if data[i]== "GT":
139 pos=i
140 break
141 if pos ==-1:
142 return None
143
144 DNA_AD=-1
145 RNA_AD=-1
146 DNA_GT_code =-1
147 RNA_GT_code =-1
148
149 if DNA_TUMOR not in ["","."]:
150 data= string.split(DNA_TUMOR,":")
151 if len(string.split(data[pos],'/'))>=2:
152 DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
153 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
154 #parse data to figure this out really stupid way to do it.
155 AF_pos=-1
156 data= string.split(format,":")
157 for i in range(0,len(data)):
158 if data[i]== "AD":
159 AF_pos= i
160 if AF_pos==-1 :
161 DNA_GT_code =-1
162 elif DNA_TUMOR not in ["","."]:
163 data= string.split(DNA_TUMOR,":")
164 data = string.split(data[AF_pos],",")
165 if len(data)<2:
166 DNA_GT_code =-1
167 elif len(data)==2: # ref, alt1
168 DNA_GT_code =1
169 DNA_AD = float(data[DNA_GT_code])
170 else:
171 DNA_GT_code =1
172 DNA_AD = float(data[DNA_GT_code])
173 for i in range (2, len(data)):
174 if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3:
175 DNA_GT_code = i
176 DNA_AD = float(data[DNA_GT_code])
177 else:
178 DNA_GT_code =-1
179 else:
180 DNA_GT_code =-1
181
182 if RNA_TUMOR not in ["","."]:
183 data= string.split(RNA_TUMOR,":")
184 if len(string.split(data[pos],'/'))>=2:
185 RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
186 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
187 #parse data to figure this out myself!
188 AF_pos=-1
189 data= string.split(format,":")
190 for i in range(0,len(data)):
191 if data[i]== "AD":
192 AF_pos=i
193 if AF_pos==-1 :
194 RNA_GT_code =-1
195 elif RNA_TUMOR not in ["","."]:
196 data= string.split(RNA_TUMOR,":")
197 data = string.split(data[AF_pos],",")
198 if len(data)<2:
199 RNA_GT_code =-1
200 elif len(data)==2: # ref, alt1
201 RNA_GT_code =1
202 RNA_AD = float(data[RNA_GT_code])
203 else:
204 RNA_GT_code =1
205 RNA_AD = float(data[RNA_GT_code])
206 for i in range (2, len(data)):
207 if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3:
208 RNA_GT_code = i
209 RNA_AD = float(data[RNA_GT_code])
210 else:
211 RNA_GT_code =-1
212 else:
213 RNA_GT_code =-1
214
215 if DNA_GT_code in [-1,0] and RNA_GT_code > 0:
216 return RNA_GT_code
217 if RNA_GT_code in [-1,0] and DNA_GT_code >0:
218 return DNA_GT_code
219 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code:
220 return DNA_GT_code
221 if DNA_GT_code < 0 and RNA_GT_code < 0:
222 return None
223 if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype
224 return 1 #essentially pick one of the alt
225 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]):
226 return None
227 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling
228 if RNA_AD > DNA_AD:
229 return RNA_GT_code
230 else:
231 return DNA_GT_code
232
233 def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code):
234 #get the "ID" column in VCF
235 pos=-1
236 data= string.split(format,":")
237 for i in range(0,len(data)):
238 if data[i]== ID:
239 pos=i
240 if pos==-1 :
241 return None
242
243 if DNA_NORMAL not in ["","."]:
244 data= string.split(DNA_NORMAL,":")
245 try:
246 DNA_ID_val = string.split(data[pos],",")[GT_code]
247 except:
248 DNA_ID_val="NA"
249 else:
250 DNA_ID_val="NA"
251
252 return DNA_ID_val
253
254
255 def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code):
256 #get the "ID" column in VCF
257 pos=-1
258 data= string.split(format,":")
259 for i in range(0,len(data)):
260 if data[i]== ID:
261 pos=i
262 if pos==-1 :
263 return None
264
265
266 if DNA_TUMOR not in ["","."]:
267 data= string.split(DNA_TUMOR,":")
268 try:
269 DNA_ID_val = string.split(data[pos],",")[GT_code]
270 except:
271 DNA_ID_val="NA"
272 else:
273 DNA_ID_val="NA"
274
275 if RNA_TUMOR not in ["","."]:
276 data= string.split(RNA_TUMOR,":")
277 try:
278 RNA_ID_val = string.split(data[pos],",")[GT_code]
279 except:
280 RNA_ID_val="NA"
281 else:
282 RNA_ID_val="NA"
283 return [DNA_ID_val,RNA_ID_val]
284
285 def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL):
286 #get the "ID" column in VCF
287 pos=-1
288 data= string.split(format,":")
289 for i in range(0,len(data)):
290 if data[i]== ID:
291 pos=i
292 if pos==-1 :
293 return None
294
295 if DNA_NORMAL not in ["","."]:
296 data= string.split(DNA_NORMAL,":")
297 DNA_ID_val = data[pos]
298 else:
299 DNA_ID_val="NA"
300
301 return DNA_ID_val
302
303 def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR):
304 #get the "ID" column in VCF
305 pos=-1
306 data= string.split(format,":")
307 for i in range(0,len(data)):
308 if data[i]== ID:
309 pos=i
310 if pos==-1 :
311 return None
312
313 if DNA_TUMOR not in ["","."]:
314 data= string.split(DNA_TUMOR,":")
315 DNA_ID_val = data[pos]
316 else:
317 DNA_ID_val="NA"
318
319 if RNA_TUMOR not in ["","."]:
320 data= string.split(RNA_TUMOR,":")
321 RNA_ID_val = data[pos]
322 else:
323 RNA_ID_val="NA"
324 return [DNA_ID_val,RNA_ID_val]
325
326 def _parseEffectsPerGene(self, effectString, columnLabels, GT_code):
327 if effectString =="":
328 return {}
329 effectPerGene = dict()
330 effects = re.sub("EFF=", "", effectString).split(",")
331 for thisEffect in effects:
332 effectType = thisEffect.split("(")[0]
333 #
334 # Given a string such as
335 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
336 # extract the stuff between the parens, divide it by |, and store
337 # it in a dictionary indexed by the column labels given as input
338 #
339 effectTokens = re.sub(".+\(", "",
340 re.sub("\)", "", thisEffect)).split("|")
341 effect = dict()
342 for ii in range(0,len(effectTokens)):
343 effect[columnLabels[ii]] = effectTokens[ii]
344
345 #match GT_code
346 if GT_code and GT_code != int(effect["Genotype_Number"]):
347 continue
348
349 effect["effect"] = effectType
350 #
351 # Parse through the list of effects. Extract the gene.
352 # Save one effect per gene, choosing an arbitrary effect
353 # from the most severe effect class.
354 #
355 thisGene = effect["Gene_Name"]
356 if not effectPerGene.has_key(thisGene):
357 effectPerGene[thisGene] = effect
358 else:
359 impactThisEffect = effect["Effect_Impact"]
360 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
361 if impact[impactThisEffect] > impact[worstImpactYet]:
362 effectPerGene[thisGene] = effect
363 elif impact[impactThisEffect] == impact[worstImpactYet]:
364 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
365 effectPerGene[thisGene] = effect
366 return(effectPerGene)
367
368
369
370 class vcf(object):
371 """This object contains the set of rows from a VCF file"""
372 def __init__(self, stream):
373 self._effectColumn = None
374 self._rows = list()
375 self._indexNextRow = 0
376 for row in stream:
377 if re.search("^##INFO=<ID=EFF", row):
378 """Given a line of format
379
380 ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted
381 effects for this variant.Format: 'Effect (
382 Effect_Impact | Functional_Class | Codon_Change |
383 Amino_Acid_Change| Amino_Acid_length | Gene_Name |
384 Transcript_BioType | Gene_Coding | Transcript_ID |
385 Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )'">
386
387 Parse out the ordered list of tokens as they will appear:
388 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
389 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
390 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
391 'ERRORS']"""
392 row = re.sub("[\[\] ]", "", row)
393 row = re.sub("ERRORS|WARNINGS", "ERRORS", row)
394 self._effectColumn = re.split("[(|)]", row)[1:-1]
395 elif not re.search("^#", row):
396 # This is a row of VCF data
397 newVcfRow = vcfRow(row, self._effectColumn)
398 self._rows.append(newVcfRow)
399
400 def read(self):
401 return self._rows
402
403 import math
404
405 def round_sigfigs(num, sig_figs):
406 """Round to specified number of sigfigs.
407
408 >>> round_sigfigs(0, sig_figs=4)
409 0
410 >>> int(round_sigfigs(12345, sig_figs=2))
411 12000
412 >>> int(round_sigfigs(-12345, sig_figs=2))
413 -12000
414 >>> int(round_sigfigs(1, sig_figs=2))
415 1
416 >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
417 '3.1'
418 >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
419 '-3.1'
420 >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
421 '0.00099'
422 >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
423 '0.000988'
424 """
425 if num != 0:
426 return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
427 else:
428 return 0 # Can't take the log of 0
429
430
431 def main():
432 parser = argparse.ArgumentParser()
433 parser.add_argument("ID", type=str, help="Entry for the ID column")
434 parser.add_argument("output", type=str, help="outputfile")
435 args = parser.parse_args()
436
437 myVcf = vcf(sys.stdin)
438
439 fout =open(args.ID,'w')
440
441 for row in myVcf.read():
442 #total =total+1
443 if row.alt =="NA":
444 continue ######## bad calls in the VCF
445 #good = good+1
446 if str(row.DNA_AF) not in ["NA",""]:
447 row.DNA_AF= round_sigfigs(float(row.DNA_AF),3)
448 else:
449 row.DNA_AF=""
450 if str(row.RNA_AF) not in ["NA",""]:
451 row.RNA_AF= round_sigfigs(float(row.RNA_AF),3)
452 else:
453 row.RNA_AF=""
454 if str(row.NORMAL_AF) not in ["NA",""]:
455 row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3)
456 else:
457 row.NORMAL_AF=""
458 if len(row.effectPerGene)!=0:
459 for gene in row.effectPerGene.keys():
460 AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"]
461 if AA_Change !="" and AA_Change[:2]!="p.":
462 AA_Change="p."+AA_Change
463 fout.write(string.join([args.ID, row.chr, str(row.start),
464 str(row.end), row.reference, row.alt,
465 gene,row.effectPerGene[gene]["effect"],
466 str(row.DNA_AF), str(row.RNA_AF),AA_Change
467 #, str(row.NORMAL_AF)
468 ],"\t")+"\n")
469 else:
470 gene =""
471 AA_Change=""
472 effect =""
473 fout.write(string.join([args.ID, row.chr, str(row.start),
474 str(row.end), row.reference, row.alt,
475 gene,effect,
476 str(row.DNA_AF), str(row.RNA_AF),AA_Change
477 #, str(row.NORMAL_AF)
478 ],"\t")+"\n")
479 fout.close()
480
481 os.system("cat "+args.ID+" >> "+args.output)
482 os.system("rm -f "+args.ID)
483
484 if __name__ == '__main__':
485 main()
486
487