annotate parseSnpEffVcf.py @ 55:1093078e7976

merge mutation data conform to new mutationVector data standard
author jingchunzhu
date Fri, 18 Sep 2015 10:24:39 -0700
parents a38cc72edd75
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
1 #!/usr/bin/env python
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
2
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
3 import argparse
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
4 import re
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
5 import sys, os
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
6 import string
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
7 import math
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
8
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
9 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
10
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
11 class vcfRow(object):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
12 """This object contains an individual row of VCF output"""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
13 def __init__(self, row, columnLabels, EFFECT=1):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
14 tokens = row[:-1].split("\t")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
15 #chromosome notation
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
16 chrom = tokens[0]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
17 if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
18 chrom="chr"+chrom[3:]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
19 elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
20 chrom= "chr"+chrom[2:]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
21 elif chrom in ["23","X","x"]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
22 chrom="chrX"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
23 elif chrom in ["24","Y","y"]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
24 chrom="chrY"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
25 elif chrom in ["25","M","m"]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
26 chrom="chrM"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
27 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
28 chrom="chr"+chrom
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
29
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
30 if chrom == "chr23":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
31 chrom="chrX"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
32 if chrom == "chr24":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
33 chrom="chrY"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
34
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
35 self.chr = chrom
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
36 self.start = int(tokens[1])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
37 self.ID = tokens[2]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
38 self.reference = tokens[3]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
39 if (self.reference ==""):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
40 self.reference="-"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
41 self.alt = tokens[4]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
42 if (self.alt ==""):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
43 self.alt ="-"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
44 self.end = self.start + len(self.reference) - 1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
45 self.DNA_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
46 self.RNA_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
47 self.NORMAL_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
48
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
49 effectsString = ""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
50 for thisSubToken in tokens[7].split(";"):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
51 if re.search("^EFF=", thisSubToken):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
52 effectsString = thisSubToken
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
53
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
54 GT_code=None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
55
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
56 if len(tokens)<=8:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
57 pass
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
58 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
59 # this could be all specific to RADIA output
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
60 format =tokens[8]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
61 DNA_NORMAL = tokens[9]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
62 DNA_TUMOR =tokens[10]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
63 if len(tokens)>11:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
64 RNA_TUMOR = tokens[11]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
65 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
66 RNA_TUMOR=''
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
67
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
68 #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
69 GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
70
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
71 if GT_code !=None:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
72 self.alt = string.split(tokens[4],",")[GT_code-1]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
73 # AF allel frequency # this is all specific to RADIA output
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
74 ID="AD"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
75 val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
76 if val != None:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
77 self.DNA_AD, self.RNA_AD= val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
78 ID="DP"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
79 val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
80 if val != None:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
81 self.DNA_DP, self.RNA_DP= val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
82 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
83 self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
84 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
85 self.DNA_AF = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
86 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
87 self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
88 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
89 self.RNA_AF = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
90 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
91 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
92 else: # returned None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
93 #print "AF error", row
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
94 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
95
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
96 #get info on normal sample
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
97 ID = "AD"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
98 val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
99 if val != None:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
100 if val =="NA":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
101 val =0
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
102 self.NORMAL_AD = val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
103 ID ="DP"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
104 val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
105 if val != None:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
106 self.NORMAL_DP = val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
107 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
108 self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
109 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
110 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
111 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
112 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
113 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
114 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
115 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
116 #print "GT error", row
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
117 self.alt = "NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
118 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
119
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
120
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
121 if EFFECT:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
122 self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
123 return
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
124
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
125 def get_DNAVAF(self):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
126 return self.DNA_AF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
127
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
128 def get_RNAVAF(self):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
129 return self.RNA_AF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
130
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
131 def get_NORMALVAF(self):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
132 return self.NORMAL_AF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
133
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
134 def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
135 pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
136 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
137 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
138 if data[i]== "GT":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
139 pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
140 break
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
141 if pos ==-1:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
142 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
143
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
144 DNA_AD=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
145 RNA_AD=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
146 DNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
147 RNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
148
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
149 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
150 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
151 if len(string.split(data[pos],'/'))>=2:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
152 DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
153 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
154 #parse data to figure this out really stupid way to do it.
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
155 AF_pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
156 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
157 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
158 if data[i]== "AD":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
159 AF_pos= i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
160 if AF_pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
161 DNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
162 elif DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
163 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
164 data = string.split(data[AF_pos],",")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
165 if len(data)<2:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
166 DNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
167 elif len(data)==2: # ref, alt1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
168 DNA_GT_code =1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
169 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
170 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
171 DNA_GT_code =1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
172 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
173 for i in range (2, len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
174 if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
175 DNA_GT_code = i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
176 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
177 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
178 DNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
179 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
180 DNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
181
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
182 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
183 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
184 if len(string.split(data[pos],'/'))>=2:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
185 RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
186 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
187 #parse data to figure this out myself!
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
188 AF_pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
189 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
190 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
191 if data[i]== "AD":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
192 AF_pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
193 if AF_pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
194 RNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
195 elif RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
196 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
197 data = string.split(data[AF_pos],",")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
198 if len(data)<2:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
199 RNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
200 elif len(data)==2: # ref, alt1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
201 RNA_GT_code =1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
202 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
203 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
204 RNA_GT_code =1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
205 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
206 for i in range (2, len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
207 if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
208 RNA_GT_code = i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
209 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
210 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
211 RNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
212 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
213 RNA_GT_code =-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
214
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
215 if DNA_GT_code in [-1,0] and RNA_GT_code > 0:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
216 return RNA_GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
217 if RNA_GT_code in [-1,0] and DNA_GT_code >0:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
218 return DNA_GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
219 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
220 return DNA_GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
221 if DNA_GT_code < 0 and RNA_GT_code < 0:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
222 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
223 if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
224 return 1 #essentially pick one of the alt
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
225 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
226 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
227 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
228 if RNA_AD > DNA_AD:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
229 return RNA_GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
230 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
231 return DNA_GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
232
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
233 def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
234 #get the "ID" column in VCF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
235 pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
236 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
237 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
238 if data[i]== ID:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
239 pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
240 if pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
241 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
242
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
243 if DNA_NORMAL not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
244 data= string.split(DNA_NORMAL,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
245 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
246 DNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
247 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
248 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
249 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
250 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
251
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
252 return DNA_ID_val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
253
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
254
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
255 def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
256 #get the "ID" column in VCF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
257 pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
258 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
259 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
260 if data[i]== ID:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
261 pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
262 if pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
263 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
264
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
265
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
266 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
267 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
268 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
269 DNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
270 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
271 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
272 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
273 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
274
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
275 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
276 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
277 try:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
278 RNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
279 except:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
280 RNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
281 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
282 RNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
283 return [DNA_ID_val,RNA_ID_val]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
284
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
285 def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
286 #get the "ID" column in VCF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
287 pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
288 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
289 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
290 if data[i]== ID:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
291 pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
292 if pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
293 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
294
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
295 if DNA_NORMAL not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
296 data= string.split(DNA_NORMAL,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
297 DNA_ID_val = data[pos]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
298 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
299 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
300
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
301 return DNA_ID_val
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
302
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
303 def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
304 #get the "ID" column in VCF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
305 pos=-1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
306 data= string.split(format,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
307 for i in range(0,len(data)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
308 if data[i]== ID:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
309 pos=i
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
310 if pos==-1 :
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
311 return None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
312
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
313 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
314 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
315 DNA_ID_val = data[pos]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
316 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
317 DNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
318
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
319 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
320 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
321 RNA_ID_val = data[pos]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
322 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
323 RNA_ID_val="NA"
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
324 return [DNA_ID_val,RNA_ID_val]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
325
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
326 def _parseEffectsPerGene(self, effectString, columnLabels, GT_code):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
327 if effectString =="":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
328 return {}
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
329 effectPerGene = dict()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
330 effects = re.sub("EFF=", "", effectString).split(",")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
331 for thisEffect in effects:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
332 effectType = thisEffect.split("(")[0]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
333 #
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
334 # Given a string such as
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
335 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
336 # extract the stuff between the parens, divide it by |, and store
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
337 # it in a dictionary indexed by the column labels given as input
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
338 #
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
339 effectTokens = re.sub(".+\(", "",
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
340 re.sub("\)", "", thisEffect)).split("|")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
341 effect = dict()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
342 for ii in range(0,len(effectTokens)):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
343 effect[columnLabels[ii]] = effectTokens[ii]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
344
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
345 #match GT_code
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
346 if GT_code and GT_code != int(effect["Genotype_Number"]):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
347 continue
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
348
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
349 effect["effect"] = effectType
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
350 #
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
351 # Parse through the list of effects. Extract the gene.
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
352 # Save one effect per gene, choosing an arbitrary effect
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
353 # from the most severe effect class.
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
354 #
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
355 thisGene = effect["Gene_Name"]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
356 if not effectPerGene.has_key(thisGene):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
357 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
358 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
359 impactThisEffect = effect["Effect_Impact"]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
360 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
361 if impact[impactThisEffect] > impact[worstImpactYet]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
362 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
363 elif impact[impactThisEffect] == impact[worstImpactYet]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
364 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
365 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
366 return(effectPerGene)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
367
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
368
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
369
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
370 class vcf(object):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
371 """This object contains the set of rows from a VCF file"""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
372 def __init__(self, stream):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
373 self._effectColumn = None
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
374 self._rows = list()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
375 self._indexNextRow = 0
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
376 for row in stream:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
377 if re.search("^##INFO=<ID=EFF", row):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
378 """Given a line of format
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
379
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
380 ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
381 effects for this variant.Format: 'Effect (
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
382 Effect_Impact | Functional_Class | Codon_Change |
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
383 Amino_Acid_Change| Amino_Acid_length | Gene_Name |
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
384 Transcript_BioType | Gene_Coding | Transcript_ID |
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
385 Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )'">
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
386
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
387 Parse out the ordered list of tokens as they will appear:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
388 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
389 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
390 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
391 'ERRORS']"""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
392 row = re.sub("[\[\] ]", "", row)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
393 row = re.sub("ERRORS|WARNINGS", "ERRORS", row)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
394 self._effectColumn = re.split("[(|)]", row)[1:-1]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
395 elif not re.search("^#", row):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
396 # This is a row of VCF data
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
397 newVcfRow = vcfRow(row, self._effectColumn)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
398 self._rows.append(newVcfRow)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
399
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
400 def read(self):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
401 return self._rows
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
402
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
403 import math
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
404
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
405 def round_sigfigs(num, sig_figs):
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
406 """Round to specified number of sigfigs.
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
407
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
408 >>> round_sigfigs(0, sig_figs=4)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
409 0
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
410 >>> int(round_sigfigs(12345, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
411 12000
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
412 >>> int(round_sigfigs(-12345, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
413 -12000
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
414 >>> int(round_sigfigs(1, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
415 1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
416 >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
417 '3.1'
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
418 >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
419 '-3.1'
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
420 >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
421 '0.00099'
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
422 >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
423 '0.000988'
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
424 """
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
425 if num != 0:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
426 return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
427 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
428 return 0 # Can't take the log of 0
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
429
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
430
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
431 def main():
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
432 parser = argparse.ArgumentParser()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
433 parser.add_argument("ID", type=str, help="Entry for the ID column")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
434 parser.add_argument("output", type=str, help="outputfile")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
435 args = parser.parse_args()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
436
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
437 myVcf = vcf(sys.stdin)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
438
48
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
439 tmpOutput = "file_"+args.ID
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
440 fout =open(tmpOutput, 'w')
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
441
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
442 if args.ID[-4:]==".vcf":
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
443 sampleID = args.ID[:-4]
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
444 else:
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
445 sampleID = args.ID
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
446
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
447 for row in myVcf.read():
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
448 #total =total+1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
449 if row.alt =="NA":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
450 continue ######## bad calls in the VCF
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
451 #good = good+1
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
452 if str(row.DNA_AF) not in ["NA",""]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
453 row.DNA_AF= round_sigfigs(float(row.DNA_AF),3)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
454 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
455 row.DNA_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
456 if str(row.RNA_AF) not in ["NA",""]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
457 row.RNA_AF= round_sigfigs(float(row.RNA_AF),3)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
458 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
459 row.RNA_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
460 if str(row.NORMAL_AF) not in ["NA",""]:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
461 row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
462 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
463 row.NORMAL_AF=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
464 if len(row.effectPerGene)!=0:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
465 for gene in row.effectPerGene.keys():
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
466 AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"]
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
467 if AA_Change !="" and AA_Change[:2]!="p.":
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
468 AA_Change="p."+AA_Change
48
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
469 fout.write(string.join([sampleID, row.chr, str(row.start),
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
470 str(row.end), row.reference, row.alt,
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
471 gene,row.effectPerGene[gene]["effect"],
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
472 str(row.DNA_AF), str(row.RNA_AF),AA_Change
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
473 #, str(row.NORMAL_AF)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
474 ],"\t")+"\n")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
475 else:
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
476 gene =""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
477 AA_Change=""
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
478 effect =""
48
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
479 fout.write(string.join([sampleID, row.chr, str(row.start),
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
480 str(row.end), row.reference, row.alt,
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
481 gene,effect,
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
482 str(row.DNA_AF), str(row.RNA_AF),AA_Change
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
483 #, str(row.NORMAL_AF)
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
484 ],"\t")+"\n")
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
485 fout.close()
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
486
48
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
487 os.system("cat "+tmpOutput+" >> "+args.output)
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
488 os.system("rm -f "+tmpOutput)
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
489
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
490 if __name__ == '__main__':
48
a38cc72edd75 xena header
jingchunzhu
parents: 47
diff changeset
491 main()
47
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
492
23d98125d20c parse snpEff output
jingchunzhu
parents:
diff changeset
493