47
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4 import re
|
|
5 import sys, os
|
|
6 import string
|
|
7 import math
|
|
8
|
|
9 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
|
|
10
|
|
11 class vcfRow(object):
|
|
12 """This object contains an individual row of VCF output"""
|
|
13 def __init__(self, row, columnLabels, EFFECT=1):
|
|
14 tokens = row[:-1].split("\t")
|
|
15 #chromosome notation
|
|
16 chrom = tokens[0]
|
|
17 if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr":
|
|
18 chrom="chr"+chrom[3:]
|
|
19 elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R":
|
|
20 chrom= "chr"+chrom[2:]
|
|
21 elif chrom in ["23","X","x"]:
|
|
22 chrom="chrX"
|
|
23 elif chrom in ["24","Y","y"]:
|
|
24 chrom="chrY"
|
|
25 elif chrom in ["25","M","m"]:
|
|
26 chrom="chrM"
|
|
27 else:
|
|
28 chrom="chr"+chrom
|
|
29
|
|
30 if chrom == "chr23":
|
|
31 chrom="chrX"
|
|
32 if chrom == "chr24":
|
|
33 chrom="chrY"
|
|
34
|
|
35 self.chr = chrom
|
|
36 self.start = int(tokens[1])
|
|
37 self.ID = tokens[2]
|
|
38 self.reference = tokens[3]
|
|
39 if (self.reference ==""):
|
|
40 self.reference="-"
|
|
41 self.alt = tokens[4]
|
|
42 if (self.alt ==""):
|
|
43 self.alt ="-"
|
|
44 self.end = self.start + len(self.reference) - 1
|
|
45 self.DNA_AF=""
|
|
46 self.RNA_AF=""
|
|
47 self.NORMAL_AF=""
|
|
48
|
|
49 effectsString = ""
|
|
50 for thisSubToken in tokens[7].split(";"):
|
|
51 if re.search("^EFF=", thisSubToken):
|
|
52 effectsString = thisSubToken
|
|
53
|
|
54 GT_code=None
|
|
55
|
|
56 if len(tokens)<=8:
|
|
57 pass
|
|
58 else:
|
|
59 # this could be all specific to RADIA output
|
|
60 format =tokens[8]
|
|
61 DNA_NORMAL = tokens[9]
|
|
62 DNA_TUMOR =tokens[10]
|
|
63 if len(tokens)>11:
|
|
64 RNA_TUMOR = tokens[11]
|
|
65 else:
|
|
66 RNA_TUMOR=''
|
|
67
|
|
68 #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable)
|
|
69 GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start)
|
|
70
|
|
71 if GT_code !=None:
|
|
72 self.alt = string.split(tokens[4],",")[GT_code-1]
|
|
73 # AF allel frequency # this is all specific to RADIA output
|
|
74 ID="AD"
|
|
75 val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code)
|
|
76 if val != None:
|
|
77 self.DNA_AD, self.RNA_AD= val
|
|
78 ID="DP"
|
|
79 val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR)
|
|
80 if val != None:
|
|
81 self.DNA_DP, self.RNA_DP= val
|
|
82 try:
|
|
83 self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP))
|
|
84 except:
|
|
85 self.DNA_AF = "NA"
|
|
86 try:
|
|
87 self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP))
|
|
88 except:
|
|
89 self.RNA_AF = "NA"
|
|
90 else:
|
|
91 self.DNA_AF, self.RNA_AF= "NA","NA"
|
|
92 else: # returned None
|
|
93 #print "AF error", row
|
|
94 self.DNA_AF, self.RNA_AF= "NA","NA"
|
|
95
|
|
96 #get info on normal sample
|
|
97 ID = "AD"
|
|
98 val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code)
|
|
99 if val != None:
|
|
100 if val =="NA":
|
|
101 val =0
|
|
102 self.NORMAL_AD = val
|
|
103 ID ="DP"
|
|
104 val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL)
|
|
105 if val != None:
|
|
106 self.NORMAL_DP = val
|
|
107 try:
|
|
108 self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP))
|
|
109 except:
|
|
110 self.NORMAL_AF = "NA"
|
|
111 else:
|
|
112 self.NORMAL_AF = "NA"
|
|
113 else:
|
|
114 self.NORMAL_AF = "NA"
|
|
115 else:
|
|
116 #print "GT error", row
|
|
117 self.alt = "NA"
|
|
118 self.DNA_AF, self.RNA_AF= "NA","NA"
|
|
119
|
|
120
|
|
121 if EFFECT:
|
|
122 self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code)
|
|
123 return
|
|
124
|
|
125 def get_DNAVAF(self):
|
|
126 return self.DNA_AF
|
|
127
|
|
128 def get_RNAVAF(self):
|
|
129 return self.RNA_AF
|
|
130
|
|
131 def get_NORMALVAF(self):
|
|
132 return self.NORMAL_AF
|
|
133
|
|
134 def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start):
|
|
135 pos=-1
|
|
136 data= string.split(format,":")
|
|
137 for i in range(0,len(data)):
|
|
138 if data[i]== "GT":
|
|
139 pos=i
|
|
140 break
|
|
141 if pos ==-1:
|
|
142 return None
|
|
143
|
|
144 DNA_AD=-1
|
|
145 RNA_AD=-1
|
|
146 DNA_GT_code =-1
|
|
147 RNA_GT_code =-1
|
|
148
|
|
149 if DNA_TUMOR not in ["","."]:
|
|
150 data= string.split(DNA_TUMOR,":")
|
|
151 if len(string.split(data[pos],'/'))>=2:
|
|
152 DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
|
|
153 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
|
|
154 #parse data to figure this out really stupid way to do it.
|
|
155 AF_pos=-1
|
|
156 data= string.split(format,":")
|
|
157 for i in range(0,len(data)):
|
|
158 if data[i]== "AD":
|
|
159 AF_pos= i
|
|
160 if AF_pos==-1 :
|
|
161 DNA_GT_code =-1
|
|
162 elif DNA_TUMOR not in ["","."]:
|
|
163 data= string.split(DNA_TUMOR,":")
|
|
164 data = string.split(data[AF_pos],",")
|
|
165 if len(data)<2:
|
|
166 DNA_GT_code =-1
|
|
167 elif len(data)==2: # ref, alt1
|
|
168 DNA_GT_code =1
|
|
169 DNA_AD = float(data[DNA_GT_code])
|
|
170 else:
|
|
171 DNA_GT_code =1
|
|
172 DNA_AD = float(data[DNA_GT_code])
|
|
173 for i in range (2, len(data)):
|
|
174 if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3:
|
|
175 DNA_GT_code = i
|
|
176 DNA_AD = float(data[DNA_GT_code])
|
|
177 else:
|
|
178 DNA_GT_code =-1
|
|
179 else:
|
|
180 DNA_GT_code =-1
|
|
181
|
|
182 if RNA_TUMOR not in ["","."]:
|
|
183 data= string.split(RNA_TUMOR,":")
|
|
184 if len(string.split(data[pos],'/'))>=2:
|
|
185 RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
|
|
186 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
|
|
187 #parse data to figure this out myself!
|
|
188 AF_pos=-1
|
|
189 data= string.split(format,":")
|
|
190 for i in range(0,len(data)):
|
|
191 if data[i]== "AD":
|
|
192 AF_pos=i
|
|
193 if AF_pos==-1 :
|
|
194 RNA_GT_code =-1
|
|
195 elif RNA_TUMOR not in ["","."]:
|
|
196 data= string.split(RNA_TUMOR,":")
|
|
197 data = string.split(data[AF_pos],",")
|
|
198 if len(data)<2:
|
|
199 RNA_GT_code =-1
|
|
200 elif len(data)==2: # ref, alt1
|
|
201 RNA_GT_code =1
|
|
202 RNA_AD = float(data[RNA_GT_code])
|
|
203 else:
|
|
204 RNA_GT_code =1
|
|
205 RNA_AD = float(data[RNA_GT_code])
|
|
206 for i in range (2, len(data)):
|
|
207 if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3:
|
|
208 RNA_GT_code = i
|
|
209 RNA_AD = float(data[RNA_GT_code])
|
|
210 else:
|
|
211 RNA_GT_code =-1
|
|
212 else:
|
|
213 RNA_GT_code =-1
|
|
214
|
|
215 if DNA_GT_code in [-1,0] and RNA_GT_code > 0:
|
|
216 return RNA_GT_code
|
|
217 if RNA_GT_code in [-1,0] and DNA_GT_code >0:
|
|
218 return DNA_GT_code
|
|
219 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code:
|
|
220 return DNA_GT_code
|
|
221 if DNA_GT_code < 0 and RNA_GT_code < 0:
|
|
222 return None
|
|
223 if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype
|
|
224 return 1 #essentially pick one of the alt
|
|
225 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]):
|
|
226 return None
|
|
227 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling
|
|
228 if RNA_AD > DNA_AD:
|
|
229 return RNA_GT_code
|
|
230 else:
|
|
231 return DNA_GT_code
|
|
232
|
|
233 def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code):
|
|
234 #get the "ID" column in VCF
|
|
235 pos=-1
|
|
236 data= string.split(format,":")
|
|
237 for i in range(0,len(data)):
|
|
238 if data[i]== ID:
|
|
239 pos=i
|
|
240 if pos==-1 :
|
|
241 return None
|
|
242
|
|
243 if DNA_NORMAL not in ["","."]:
|
|
244 data= string.split(DNA_NORMAL,":")
|
|
245 try:
|
|
246 DNA_ID_val = string.split(data[pos],",")[GT_code]
|
|
247 except:
|
|
248 DNA_ID_val="NA"
|
|
249 else:
|
|
250 DNA_ID_val="NA"
|
|
251
|
|
252 return DNA_ID_val
|
|
253
|
|
254
|
|
255 def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code):
|
|
256 #get the "ID" column in VCF
|
|
257 pos=-1
|
|
258 data= string.split(format,":")
|
|
259 for i in range(0,len(data)):
|
|
260 if data[i]== ID:
|
|
261 pos=i
|
|
262 if pos==-1 :
|
|
263 return None
|
|
264
|
|
265
|
|
266 if DNA_TUMOR not in ["","."]:
|
|
267 data= string.split(DNA_TUMOR,":")
|
|
268 try:
|
|
269 DNA_ID_val = string.split(data[pos],",")[GT_code]
|
|
270 except:
|
|
271 DNA_ID_val="NA"
|
|
272 else:
|
|
273 DNA_ID_val="NA"
|
|
274
|
|
275 if RNA_TUMOR not in ["","."]:
|
|
276 data= string.split(RNA_TUMOR,":")
|
|
277 try:
|
|
278 RNA_ID_val = string.split(data[pos],",")[GT_code]
|
|
279 except:
|
|
280 RNA_ID_val="NA"
|
|
281 else:
|
|
282 RNA_ID_val="NA"
|
|
283 return [DNA_ID_val,RNA_ID_val]
|
|
284
|
|
285 def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL):
|
|
286 #get the "ID" column in VCF
|
|
287 pos=-1
|
|
288 data= string.split(format,":")
|
|
289 for i in range(0,len(data)):
|
|
290 if data[i]== ID:
|
|
291 pos=i
|
|
292 if pos==-1 :
|
|
293 return None
|
|
294
|
|
295 if DNA_NORMAL not in ["","."]:
|
|
296 data= string.split(DNA_NORMAL,":")
|
|
297 DNA_ID_val = data[pos]
|
|
298 else:
|
|
299 DNA_ID_val="NA"
|
|
300
|
|
301 return DNA_ID_val
|
|
302
|
|
303 def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR):
|
|
304 #get the "ID" column in VCF
|
|
305 pos=-1
|
|
306 data= string.split(format,":")
|
|
307 for i in range(0,len(data)):
|
|
308 if data[i]== ID:
|
|
309 pos=i
|
|
310 if pos==-1 :
|
|
311 return None
|
|
312
|
|
313 if DNA_TUMOR not in ["","."]:
|
|
314 data= string.split(DNA_TUMOR,":")
|
|
315 DNA_ID_val = data[pos]
|
|
316 else:
|
|
317 DNA_ID_val="NA"
|
|
318
|
|
319 if RNA_TUMOR not in ["","."]:
|
|
320 data= string.split(RNA_TUMOR,":")
|
|
321 RNA_ID_val = data[pos]
|
|
322 else:
|
|
323 RNA_ID_val="NA"
|
|
324 return [DNA_ID_val,RNA_ID_val]
|
|
325
|
|
326 def _parseEffectsPerGene(self, effectString, columnLabels, GT_code):
|
|
327 if effectString =="":
|
|
328 return {}
|
|
329 effectPerGene = dict()
|
|
330 effects = re.sub("EFF=", "", effectString).split(",")
|
|
331 for thisEffect in effects:
|
|
332 effectType = thisEffect.split("(")[0]
|
|
333 #
|
|
334 # Given a string such as
|
|
335 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
|
|
336 # extract the stuff between the parens, divide it by |, and store
|
|
337 # it in a dictionary indexed by the column labels given as input
|
|
338 #
|
|
339 effectTokens = re.sub(".+\(", "",
|
|
340 re.sub("\)", "", thisEffect)).split("|")
|
|
341 effect = dict()
|
|
342 for ii in range(0,len(effectTokens)):
|
|
343 effect[columnLabels[ii]] = effectTokens[ii]
|
|
344
|
|
345 #match GT_code
|
|
346 if GT_code and GT_code != int(effect["Genotype_Number"]):
|
|
347 continue
|
|
348
|
|
349 effect["effect"] = effectType
|
|
350 #
|
|
351 # Parse through the list of effects. Extract the gene.
|
|
352 # Save one effect per gene, choosing an arbitrary effect
|
|
353 # from the most severe effect class.
|
|
354 #
|
|
355 thisGene = effect["Gene_Name"]
|
|
356 if not effectPerGene.has_key(thisGene):
|
|
357 effectPerGene[thisGene] = effect
|
|
358 else:
|
|
359 impactThisEffect = effect["Effect_Impact"]
|
|
360 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
|
|
361 if impact[impactThisEffect] > impact[worstImpactYet]:
|
|
362 effectPerGene[thisGene] = effect
|
|
363 elif impact[impactThisEffect] == impact[worstImpactYet]:
|
|
364 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
|
|
365 effectPerGene[thisGene] = effect
|
|
366 return(effectPerGene)
|
|
367
|
|
368
|
|
369
|
|
370 class vcf(object):
|
|
371 """This object contains the set of rows from a VCF file"""
|
|
372 def __init__(self, stream):
|
|
373 self._effectColumn = None
|
|
374 self._rows = list()
|
|
375 self._indexNextRow = 0
|
|
376 for row in stream:
|
|
377 if re.search("^##INFO=<ID=EFF", row):
|
|
378 """Given a line of format
|
|
379
|
|
380 ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted
|
|
381 effects for this variant.Format: 'Effect (
|
|
382 Effect_Impact | Functional_Class | Codon_Change |
|
|
383 Amino_Acid_Change| Amino_Acid_length | Gene_Name |
|
|
384 Transcript_BioType | Gene_Coding | Transcript_ID |
|
|
385 Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )'">
|
|
386
|
|
387 Parse out the ordered list of tokens as they will appear:
|
|
388 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
|
|
389 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
|
|
390 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
|
|
391 'ERRORS']"""
|
|
392 row = re.sub("[\[\] ]", "", row)
|
|
393 row = re.sub("ERRORS|WARNINGS", "ERRORS", row)
|
|
394 self._effectColumn = re.split("[(|)]", row)[1:-1]
|
|
395 elif not re.search("^#", row):
|
|
396 # This is a row of VCF data
|
|
397 newVcfRow = vcfRow(row, self._effectColumn)
|
|
398 self._rows.append(newVcfRow)
|
|
399
|
|
400 def read(self):
|
|
401 return self._rows
|
|
402
|
|
403 import math
|
|
404
|
|
405 def round_sigfigs(num, sig_figs):
|
|
406 """Round to specified number of sigfigs.
|
|
407
|
|
408 >>> round_sigfigs(0, sig_figs=4)
|
|
409 0
|
|
410 >>> int(round_sigfigs(12345, sig_figs=2))
|
|
411 12000
|
|
412 >>> int(round_sigfigs(-12345, sig_figs=2))
|
|
413 -12000
|
|
414 >>> int(round_sigfigs(1, sig_figs=2))
|
|
415 1
|
|
416 >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
|
|
417 '3.1'
|
|
418 >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
|
|
419 '-3.1'
|
|
420 >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
|
|
421 '0.00099'
|
|
422 >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
|
|
423 '0.000988'
|
|
424 """
|
|
425 if num != 0:
|
|
426 return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
|
|
427 else:
|
|
428 return 0 # Can't take the log of 0
|
|
429
|
|
430
|
|
431 def main():
|
|
432 parser = argparse.ArgumentParser()
|
|
433 parser.add_argument("ID", type=str, help="Entry for the ID column")
|
|
434 parser.add_argument("output", type=str, help="outputfile")
|
|
435 args = parser.parse_args()
|
|
436
|
|
437 myVcf = vcf(sys.stdin)
|
|
438
|
48
|
439 tmpOutput = "file_"+args.ID
|
|
440 fout =open(tmpOutput, 'w')
|
|
441
|
|
442 if args.ID[-4:]==".vcf":
|
|
443 sampleID = args.ID[:-4]
|
|
444 else:
|
|
445 sampleID = args.ID
|
47
|
446
|
|
447 for row in myVcf.read():
|
|
448 #total =total+1
|
|
449 if row.alt =="NA":
|
|
450 continue ######## bad calls in the VCF
|
|
451 #good = good+1
|
|
452 if str(row.DNA_AF) not in ["NA",""]:
|
|
453 row.DNA_AF= round_sigfigs(float(row.DNA_AF),3)
|
|
454 else:
|
|
455 row.DNA_AF=""
|
|
456 if str(row.RNA_AF) not in ["NA",""]:
|
|
457 row.RNA_AF= round_sigfigs(float(row.RNA_AF),3)
|
|
458 else:
|
|
459 row.RNA_AF=""
|
|
460 if str(row.NORMAL_AF) not in ["NA",""]:
|
|
461 row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3)
|
|
462 else:
|
|
463 row.NORMAL_AF=""
|
|
464 if len(row.effectPerGene)!=0:
|
|
465 for gene in row.effectPerGene.keys():
|
|
466 AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"]
|
|
467 if AA_Change !="" and AA_Change[:2]!="p.":
|
|
468 AA_Change="p."+AA_Change
|
48
|
469 fout.write(string.join([sampleID, row.chr, str(row.start),
|
47
|
470 str(row.end), row.reference, row.alt,
|
|
471 gene,row.effectPerGene[gene]["effect"],
|
|
472 str(row.DNA_AF), str(row.RNA_AF),AA_Change
|
|
473 #, str(row.NORMAL_AF)
|
|
474 ],"\t")+"\n")
|
|
475 else:
|
|
476 gene =""
|
|
477 AA_Change=""
|
|
478 effect =""
|
48
|
479 fout.write(string.join([sampleID, row.chr, str(row.start),
|
47
|
480 str(row.end), row.reference, row.alt,
|
|
481 gene,effect,
|
|
482 str(row.DNA_AF), str(row.RNA_AF),AA_Change
|
|
483 #, str(row.NORMAL_AF)
|
|
484 ],"\t")+"\n")
|
|
485 fout.close()
|
|
486
|
48
|
487 os.system("cat "+tmpOutput+" >> "+args.output)
|
|
488 os.system("rm -f "+tmpOutput)
|
47
|
489
|
|
490 if __name__ == '__main__':
|
48
|
491 main()
|
47
|
492
|
|
493
|