Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison parseSnpEffVcf.py @ 47:23d98125d20c
parse snpEff output
author | jingchunzhu |
---|---|
date | Thu, 13 Aug 2015 23:26:33 -0700 |
parents | |
children | a38cc72edd75 |
comparison
equal
deleted
inserted
replaced
46:ebf3bc09c383 | 47:23d98125d20c |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import re | |
5 import sys, os | |
6 import string | |
7 import math | |
8 | |
9 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3} | |
10 | |
11 class vcfRow(object): | |
12 """This object contains an individual row of VCF output""" | |
13 def __init__(self, row, columnLabels, EFFECT=1): | |
14 tokens = row[:-1].split("\t") | |
15 #chromosome notation | |
16 chrom = tokens[0] | |
17 if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr": | |
18 chrom="chr"+chrom[3:] | |
19 elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R": | |
20 chrom= "chr"+chrom[2:] | |
21 elif chrom in ["23","X","x"]: | |
22 chrom="chrX" | |
23 elif chrom in ["24","Y","y"]: | |
24 chrom="chrY" | |
25 elif chrom in ["25","M","m"]: | |
26 chrom="chrM" | |
27 else: | |
28 chrom="chr"+chrom | |
29 | |
30 if chrom == "chr23": | |
31 chrom="chrX" | |
32 if chrom == "chr24": | |
33 chrom="chrY" | |
34 | |
35 self.chr = chrom | |
36 self.start = int(tokens[1]) | |
37 self.ID = tokens[2] | |
38 self.reference = tokens[3] | |
39 if (self.reference ==""): | |
40 self.reference="-" | |
41 self.alt = tokens[4] | |
42 if (self.alt ==""): | |
43 self.alt ="-" | |
44 self.end = self.start + len(self.reference) - 1 | |
45 self.DNA_AF="" | |
46 self.RNA_AF="" | |
47 self.NORMAL_AF="" | |
48 | |
49 effectsString = "" | |
50 for thisSubToken in tokens[7].split(";"): | |
51 if re.search("^EFF=", thisSubToken): | |
52 effectsString = thisSubToken | |
53 | |
54 GT_code=None | |
55 | |
56 if len(tokens)<=8: | |
57 pass | |
58 else: | |
59 # this could be all specific to RADIA output | |
60 format =tokens[8] | |
61 DNA_NORMAL = tokens[9] | |
62 DNA_TUMOR =tokens[10] | |
63 if len(tokens)>11: | |
64 RNA_TUMOR = tokens[11] | |
65 else: | |
66 RNA_TUMOR='' | |
67 | |
68 #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable) | |
69 GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start) | |
70 | |
71 if GT_code !=None: | |
72 self.alt = string.split(tokens[4],",")[GT_code-1] | |
73 # AF allel frequency # this is all specific to RADIA output | |
74 ID="AD" | |
75 val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code) | |
76 if val != None: | |
77 self.DNA_AD, self.RNA_AD= val | |
78 ID="DP" | |
79 val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR) | |
80 if val != None: | |
81 self.DNA_DP, self.RNA_DP= val | |
82 try: | |
83 self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP)) | |
84 except: | |
85 self.DNA_AF = "NA" | |
86 try: | |
87 self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP)) | |
88 except: | |
89 self.RNA_AF = "NA" | |
90 else: | |
91 self.DNA_AF, self.RNA_AF= "NA","NA" | |
92 else: # returned None | |
93 #print "AF error", row | |
94 self.DNA_AF, self.RNA_AF= "NA","NA" | |
95 | |
96 #get info on normal sample | |
97 ID = "AD" | |
98 val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code) | |
99 if val != None: | |
100 if val =="NA": | |
101 val =0 | |
102 self.NORMAL_AD = val | |
103 ID ="DP" | |
104 val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL) | |
105 if val != None: | |
106 self.NORMAL_DP = val | |
107 try: | |
108 self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP)) | |
109 except: | |
110 self.NORMAL_AF = "NA" | |
111 else: | |
112 self.NORMAL_AF = "NA" | |
113 else: | |
114 self.NORMAL_AF = "NA" | |
115 else: | |
116 #print "GT error", row | |
117 self.alt = "NA" | |
118 self.DNA_AF, self.RNA_AF= "NA","NA" | |
119 | |
120 | |
121 if EFFECT: | |
122 self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code) | |
123 return | |
124 | |
125 def get_DNAVAF(self): | |
126 return self.DNA_AF | |
127 | |
128 def get_RNAVAF(self): | |
129 return self.RNA_AF | |
130 | |
131 def get_NORMALVAF(self): | |
132 return self.NORMAL_AF | |
133 | |
134 def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start): | |
135 pos=-1 | |
136 data= string.split(format,":") | |
137 for i in range(0,len(data)): | |
138 if data[i]== "GT": | |
139 pos=i | |
140 break | |
141 if pos ==-1: | |
142 return None | |
143 | |
144 DNA_AD=-1 | |
145 RNA_AD=-1 | |
146 DNA_GT_code =-1 | |
147 RNA_GT_code =-1 | |
148 | |
149 if DNA_TUMOR not in ["","."]: | |
150 data= string.split(DNA_TUMOR,":") | |
151 if len(string.split(data[pos],'/'))>=2: | |
152 DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment | |
153 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this! | |
154 #parse data to figure this out really stupid way to do it. | |
155 AF_pos=-1 | |
156 data= string.split(format,":") | |
157 for i in range(0,len(data)): | |
158 if data[i]== "AD": | |
159 AF_pos= i | |
160 if AF_pos==-1 : | |
161 DNA_GT_code =-1 | |
162 elif DNA_TUMOR not in ["","."]: | |
163 data= string.split(DNA_TUMOR,":") | |
164 data = string.split(data[AF_pos],",") | |
165 if len(data)<2: | |
166 DNA_GT_code =-1 | |
167 elif len(data)==2: # ref, alt1 | |
168 DNA_GT_code =1 | |
169 DNA_AD = float(data[DNA_GT_code]) | |
170 else: | |
171 DNA_GT_code =1 | |
172 DNA_AD = float(data[DNA_GT_code]) | |
173 for i in range (2, len(data)): | |
174 if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3: | |
175 DNA_GT_code = i | |
176 DNA_AD = float(data[DNA_GT_code]) | |
177 else: | |
178 DNA_GT_code =-1 | |
179 else: | |
180 DNA_GT_code =-1 | |
181 | |
182 if RNA_TUMOR not in ["","."]: | |
183 data= string.split(RNA_TUMOR,":") | |
184 if len(string.split(data[pos],'/'))>=2: | |
185 RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment | |
186 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this! | |
187 #parse data to figure this out myself! | |
188 AF_pos=-1 | |
189 data= string.split(format,":") | |
190 for i in range(0,len(data)): | |
191 if data[i]== "AD": | |
192 AF_pos=i | |
193 if AF_pos==-1 : | |
194 RNA_GT_code =-1 | |
195 elif RNA_TUMOR not in ["","."]: | |
196 data= string.split(RNA_TUMOR,":") | |
197 data = string.split(data[AF_pos],",") | |
198 if len(data)<2: | |
199 RNA_GT_code =-1 | |
200 elif len(data)==2: # ref, alt1 | |
201 RNA_GT_code =1 | |
202 RNA_AD = float(data[RNA_GT_code]) | |
203 else: | |
204 RNA_GT_code =1 | |
205 RNA_AD = float(data[RNA_GT_code]) | |
206 for i in range (2, len(data)): | |
207 if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3: | |
208 RNA_GT_code = i | |
209 RNA_AD = float(data[RNA_GT_code]) | |
210 else: | |
211 RNA_GT_code =-1 | |
212 else: | |
213 RNA_GT_code =-1 | |
214 | |
215 if DNA_GT_code in [-1,0] and RNA_GT_code > 0: | |
216 return RNA_GT_code | |
217 if RNA_GT_code in [-1,0] and DNA_GT_code >0: | |
218 return DNA_GT_code | |
219 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code: | |
220 return DNA_GT_code | |
221 if DNA_GT_code < 0 and RNA_GT_code < 0: | |
222 return None | |
223 if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype | |
224 return 1 #essentially pick one of the alt | |
225 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]): | |
226 return None | |
227 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling | |
228 if RNA_AD > DNA_AD: | |
229 return RNA_GT_code | |
230 else: | |
231 return DNA_GT_code | |
232 | |
233 def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code): | |
234 #get the "ID" column in VCF | |
235 pos=-1 | |
236 data= string.split(format,":") | |
237 for i in range(0,len(data)): | |
238 if data[i]== ID: | |
239 pos=i | |
240 if pos==-1 : | |
241 return None | |
242 | |
243 if DNA_NORMAL not in ["","."]: | |
244 data= string.split(DNA_NORMAL,":") | |
245 try: | |
246 DNA_ID_val = string.split(data[pos],",")[GT_code] | |
247 except: | |
248 DNA_ID_val="NA" | |
249 else: | |
250 DNA_ID_val="NA" | |
251 | |
252 return DNA_ID_val | |
253 | |
254 | |
255 def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code): | |
256 #get the "ID" column in VCF | |
257 pos=-1 | |
258 data= string.split(format,":") | |
259 for i in range(0,len(data)): | |
260 if data[i]== ID: | |
261 pos=i | |
262 if pos==-1 : | |
263 return None | |
264 | |
265 | |
266 if DNA_TUMOR not in ["","."]: | |
267 data= string.split(DNA_TUMOR,":") | |
268 try: | |
269 DNA_ID_val = string.split(data[pos],",")[GT_code] | |
270 except: | |
271 DNA_ID_val="NA" | |
272 else: | |
273 DNA_ID_val="NA" | |
274 | |
275 if RNA_TUMOR not in ["","."]: | |
276 data= string.split(RNA_TUMOR,":") | |
277 try: | |
278 RNA_ID_val = string.split(data[pos],",")[GT_code] | |
279 except: | |
280 RNA_ID_val="NA" | |
281 else: | |
282 RNA_ID_val="NA" | |
283 return [DNA_ID_val,RNA_ID_val] | |
284 | |
285 def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL): | |
286 #get the "ID" column in VCF | |
287 pos=-1 | |
288 data= string.split(format,":") | |
289 for i in range(0,len(data)): | |
290 if data[i]== ID: | |
291 pos=i | |
292 if pos==-1 : | |
293 return None | |
294 | |
295 if DNA_NORMAL not in ["","."]: | |
296 data= string.split(DNA_NORMAL,":") | |
297 DNA_ID_val = data[pos] | |
298 else: | |
299 DNA_ID_val="NA" | |
300 | |
301 return DNA_ID_val | |
302 | |
303 def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR): | |
304 #get the "ID" column in VCF | |
305 pos=-1 | |
306 data= string.split(format,":") | |
307 for i in range(0,len(data)): | |
308 if data[i]== ID: | |
309 pos=i | |
310 if pos==-1 : | |
311 return None | |
312 | |
313 if DNA_TUMOR not in ["","."]: | |
314 data= string.split(DNA_TUMOR,":") | |
315 DNA_ID_val = data[pos] | |
316 else: | |
317 DNA_ID_val="NA" | |
318 | |
319 if RNA_TUMOR not in ["","."]: | |
320 data= string.split(RNA_TUMOR,":") | |
321 RNA_ID_val = data[pos] | |
322 else: | |
323 RNA_ID_val="NA" | |
324 return [DNA_ID_val,RNA_ID_val] | |
325 | |
326 def _parseEffectsPerGene(self, effectString, columnLabels, GT_code): | |
327 if effectString =="": | |
328 return {} | |
329 effectPerGene = dict() | |
330 effects = re.sub("EFF=", "", effectString).split(",") | |
331 for thisEffect in effects: | |
332 effectType = thisEffect.split("(")[0] | |
333 # | |
334 # Given a string such as | |
335 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1) | |
336 # extract the stuff between the parens, divide it by |, and store | |
337 # it in a dictionary indexed by the column labels given as input | |
338 # | |
339 effectTokens = re.sub(".+\(", "", | |
340 re.sub("\)", "", thisEffect)).split("|") | |
341 effect = dict() | |
342 for ii in range(0,len(effectTokens)): | |
343 effect[columnLabels[ii]] = effectTokens[ii] | |
344 | |
345 #match GT_code | |
346 if GT_code and GT_code != int(effect["Genotype_Number"]): | |
347 continue | |
348 | |
349 effect["effect"] = effectType | |
350 # | |
351 # Parse through the list of effects. Extract the gene. | |
352 # Save one effect per gene, choosing an arbitrary effect | |
353 # from the most severe effect class. | |
354 # | |
355 thisGene = effect["Gene_Name"] | |
356 if not effectPerGene.has_key(thisGene): | |
357 effectPerGene[thisGene] = effect | |
358 else: | |
359 impactThisEffect = effect["Effect_Impact"] | |
360 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"] | |
361 if impact[impactThisEffect] > impact[worstImpactYet]: | |
362 effectPerGene[thisGene] = effect | |
363 elif impact[impactThisEffect] == impact[worstImpactYet]: | |
364 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]: | |
365 effectPerGene[thisGene] = effect | |
366 return(effectPerGene) | |
367 | |
368 | |
369 | |
370 class vcf(object): | |
371 """This object contains the set of rows from a VCF file""" | |
372 def __init__(self, stream): | |
373 self._effectColumn = None | |
374 self._rows = list() | |
375 self._indexNextRow = 0 | |
376 for row in stream: | |
377 if re.search("^##INFO=<ID=EFF", row): | |
378 """Given a line of format | |
379 | |
380 ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted | |
381 effects for this variant.Format: 'Effect ( | |
382 Effect_Impact | Functional_Class | Codon_Change | | |
383 Amino_Acid_Change| Amino_Acid_length | Gene_Name | | |
384 Transcript_BioType | Gene_Coding | Transcript_ID | | |
385 Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )'"> | |
386 | |
387 Parse out the ordered list of tokens as they will appear: | |
388 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change', | |
389 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', | |
390 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number' | |
391 'ERRORS']""" | |
392 row = re.sub("[\[\] ]", "", row) | |
393 row = re.sub("ERRORS|WARNINGS", "ERRORS", row) | |
394 self._effectColumn = re.split("[(|)]", row)[1:-1] | |
395 elif not re.search("^#", row): | |
396 # This is a row of VCF data | |
397 newVcfRow = vcfRow(row, self._effectColumn) | |
398 self._rows.append(newVcfRow) | |
399 | |
400 def read(self): | |
401 return self._rows | |
402 | |
403 import math | |
404 | |
405 def round_sigfigs(num, sig_figs): | |
406 """Round to specified number of sigfigs. | |
407 | |
408 >>> round_sigfigs(0, sig_figs=4) | |
409 0 | |
410 >>> int(round_sigfigs(12345, sig_figs=2)) | |
411 12000 | |
412 >>> int(round_sigfigs(-12345, sig_figs=2)) | |
413 -12000 | |
414 >>> int(round_sigfigs(1, sig_figs=2)) | |
415 1 | |
416 >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2)) | |
417 '3.1' | |
418 >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2)) | |
419 '-3.1' | |
420 >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2)) | |
421 '0.00099' | |
422 >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3)) | |
423 '0.000988' | |
424 """ | |
425 if num != 0: | |
426 return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1))) | |
427 else: | |
428 return 0 # Can't take the log of 0 | |
429 | |
430 | |
431 def main(): | |
432 parser = argparse.ArgumentParser() | |
433 parser.add_argument("ID", type=str, help="Entry for the ID column") | |
434 parser.add_argument("output", type=str, help="outputfile") | |
435 args = parser.parse_args() | |
436 | |
437 myVcf = vcf(sys.stdin) | |
438 | |
439 fout =open(args.ID,'w') | |
440 | |
441 for row in myVcf.read(): | |
442 #total =total+1 | |
443 if row.alt =="NA": | |
444 continue ######## bad calls in the VCF | |
445 #good = good+1 | |
446 if str(row.DNA_AF) not in ["NA",""]: | |
447 row.DNA_AF= round_sigfigs(float(row.DNA_AF),3) | |
448 else: | |
449 row.DNA_AF="" | |
450 if str(row.RNA_AF) not in ["NA",""]: | |
451 row.RNA_AF= round_sigfigs(float(row.RNA_AF),3) | |
452 else: | |
453 row.RNA_AF="" | |
454 if str(row.NORMAL_AF) not in ["NA",""]: | |
455 row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3) | |
456 else: | |
457 row.NORMAL_AF="" | |
458 if len(row.effectPerGene)!=0: | |
459 for gene in row.effectPerGene.keys(): | |
460 AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"] | |
461 if AA_Change !="" and AA_Change[:2]!="p.": | |
462 AA_Change="p."+AA_Change | |
463 fout.write(string.join([args.ID, row.chr, str(row.start), | |
464 str(row.end), row.reference, row.alt, | |
465 gene,row.effectPerGene[gene]["effect"], | |
466 str(row.DNA_AF), str(row.RNA_AF),AA_Change | |
467 #, str(row.NORMAL_AF) | |
468 ],"\t")+"\n") | |
469 else: | |
470 gene ="" | |
471 AA_Change="" | |
472 effect ="" | |
473 fout.write(string.join([args.ID, row.chr, str(row.start), | |
474 str(row.end), row.reference, row.alt, | |
475 gene,effect, | |
476 str(row.DNA_AF), str(row.RNA_AF),AA_Change | |
477 #, str(row.NORMAL_AF) | |
478 ],"\t")+"\n") | |
479 fout.close() | |
480 | |
481 os.system("cat "+args.ID+" >> "+args.output) | |
482 os.system("rm -f "+args.ID) | |
483 | |
484 if __name__ == '__main__': | |
485 main() | |
486 | |
487 |