ucsc_cancer_utilities: parseSnpEffVcf.py annotate

annotate parseSnpEffVcf.py @ 60:bf57076e27b9 default tip

change genomicSegment input data

author	jingchunzhu@gmail.com
date	Tue, 27 Oct 2015 16:07:09 -0700
parents	a38cc72edd75
children

rev	line source
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	1 #!/usr/bin/env python
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	2
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	3 import argparse
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	4 import re
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	5 import sys, os
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	6 import string
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	7 import math
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	8
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	9 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	10
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	11 class vcfRow(object):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	12 """This object contains an individual row of VCF output"""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	13 def __init__(self, row, columnLabels, EFFECT=1):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	14 tokens = row[:-1].split("\t")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	15 #chromosome notation
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	16 chrom = tokens[0]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	17 if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	18 chrom="chr"+chrom[3:]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	19 elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	20 chrom= "chr"+chrom[2:]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	21 elif chrom in ["23","X","x"]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	22 chrom="chrX"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	23 elif chrom in ["24","Y","y"]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	24 chrom="chrY"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	25 elif chrom in ["25","M","m"]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	26 chrom="chrM"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	27 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	28 chrom="chr"+chrom
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	29
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	30 if chrom == "chr23":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	31 chrom="chrX"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	32 if chrom == "chr24":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	33 chrom="chrY"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	34
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	35 self.chr = chrom
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	36 self.start = int(tokens[1])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	37 self.ID = tokens[2]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	38 self.reference = tokens[3]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	39 if (self.reference ==""):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	40 self.reference="-"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	41 self.alt = tokens[4]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	42 if (self.alt ==""):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	43 self.alt ="-"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	44 self.end = self.start + len(self.reference) - 1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	45 self.DNA_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	46 self.RNA_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	47 self.NORMAL_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	48
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	49 effectsString = ""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	50 for thisSubToken in tokens[7].split(";"):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	51 if re.search("^EFF=", thisSubToken):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	52 effectsString = thisSubToken
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	53
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	54 GT_code=None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	55
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	56 if len(tokens)<=8:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	57 pass
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	58 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	59 # this could be all specific to RADIA output
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	60 format =tokens[8]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	61 DNA_NORMAL = tokens[9]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	62 DNA_TUMOR =tokens[10]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	63 if len(tokens)>11:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	64 RNA_TUMOR = tokens[11]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	65 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	66 RNA_TUMOR=''
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	67
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	68 #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	69 GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	70
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	71 if GT_code !=None:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	72 self.alt = string.split(tokens[4],",")[GT_code-1]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	73 # AF allel frequency # this is all specific to RADIA output
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	74 ID="AD"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	75 val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	76 if val != None:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	77 self.DNA_AD, self.RNA_AD= val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	78 ID="DP"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	79 val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	80 if val != None:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	81 self.DNA_DP, self.RNA_DP= val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	82 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	83 self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	84 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	85 self.DNA_AF = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	86 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	87 self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	88 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	89 self.RNA_AF = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	90 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	91 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	92 else: # returned None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	93 #print "AF error", row
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	94 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	95
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	96 #get info on normal sample
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	97 ID = "AD"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	98 val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	99 if val != None:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	100 if val =="NA":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	101 val =0
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	102 self.NORMAL_AD = val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	103 ID ="DP"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	104 val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	105 if val != None:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	106 self.NORMAL_DP = val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	107 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	108 self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	109 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	110 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	111 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	112 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	113 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	114 self.NORMAL_AF = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	115 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	116 #print "GT error", row
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	117 self.alt = "NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	118 self.DNA_AF, self.RNA_AF= "NA","NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	119
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	120
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	121 if EFFECT:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	122 self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	123 return
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	124
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	125 def get_DNAVAF(self):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	126 return self.DNA_AF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	127
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	128 def get_RNAVAF(self):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	129 return self.RNA_AF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	130
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	131 def get_NORMALVAF(self):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	132 return self.NORMAL_AF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	133
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	134 def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	135 pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	136 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	137 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	138 if data[i]== "GT":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	139 pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	140 break
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	141 if pos ==-1:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	142 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	143
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	144 DNA_AD=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	145 RNA_AD=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	146 DNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	147 RNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	148
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	149 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	150 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	151 if len(string.split(data[pos],'/'))>=2:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	152 DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	153 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	154 #parse data to figure this out really stupid way to do it.
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	155 AF_pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	156 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	157 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	158 if data[i]== "AD":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	159 AF_pos= i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	160 if AF_pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	161 DNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	162 elif DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	163 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	164 data = string.split(data[AF_pos],",")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	165 if len(data)<2:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	166 DNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	167 elif len(data)==2: # ref, alt1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	168 DNA_GT_code =1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	169 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	170 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	171 DNA_GT_code =1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	172 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	173 for i in range (2, len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	174 if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	175 DNA_GT_code = i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	176 DNA_AD = float(data[DNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	177 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	178 DNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	179 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	180 DNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	181
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	182 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	183 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	184 if len(string.split(data[pos],'/'))>=2:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	185 RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	186 if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	187 #parse data to figure this out myself!
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	188 AF_pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	189 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	190 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	191 if data[i]== "AD":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	192 AF_pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	193 if AF_pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	194 RNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	195 elif RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	196 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	197 data = string.split(data[AF_pos],",")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	198 if len(data)<2:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	199 RNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	200 elif len(data)==2: # ref, alt1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	201 RNA_GT_code =1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	202 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	203 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	204 RNA_GT_code =1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	205 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	206 for i in range (2, len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	207 if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	208 RNA_GT_code = i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	209 RNA_AD = float(data[RNA_GT_code])
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	210 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	211 RNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	212 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	213 RNA_GT_code =-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	214
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	215 if DNA_GT_code in [-1,0] and RNA_GT_code > 0:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	216 return RNA_GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	217 if RNA_GT_code in [-1,0] and DNA_GT_code >0:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	218 return DNA_GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	219 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	220 return DNA_GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	221 if DNA_GT_code < 0 and RNA_GT_code < 0:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	222 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	223 if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	224 return 1 #essentially pick one of the alt
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	225 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	226 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	227 if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	228 if RNA_AD > DNA_AD:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	229 return RNA_GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	230 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	231 return DNA_GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	232
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	233 def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	234 #get the "ID" column in VCF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	235 pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	236 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	237 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	238 if data[i]== ID:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	239 pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	240 if pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	241 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	242
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	243 if DNA_NORMAL not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	244 data= string.split(DNA_NORMAL,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	245 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	246 DNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	247 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	248 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	249 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	250 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	251
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	252 return DNA_ID_val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	253
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	254
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	255 def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	256 #get the "ID" column in VCF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	257 pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	258 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	259 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	260 if data[i]== ID:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	261 pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	262 if pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	263 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	264
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	265
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	266 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	267 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	268 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	269 DNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	270 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	271 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	272 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	273 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	274
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	275 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	276 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	277 try:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	278 RNA_ID_val = string.split(data[pos],",")[GT_code]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	279 except:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	280 RNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	281 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	282 RNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	283 return [DNA_ID_val,RNA_ID_val]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	284
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	285 def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	286 #get the "ID" column in VCF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	287 pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	288 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	289 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	290 if data[i]== ID:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	291 pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	292 if pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	293 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	294
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	295 if DNA_NORMAL not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	296 data= string.split(DNA_NORMAL,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	297 DNA_ID_val = data[pos]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	298 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	299 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	300
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	301 return DNA_ID_val
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	302
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	303 def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	304 #get the "ID" column in VCF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	305 pos=-1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	306 data= string.split(format,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	307 for i in range(0,len(data)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	308 if data[i]== ID:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	309 pos=i
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	310 if pos==-1 :
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	311 return None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	312
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	313 if DNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	314 data= string.split(DNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	315 DNA_ID_val = data[pos]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	316 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	317 DNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	318
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	319 if RNA_TUMOR not in ["","."]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	320 data= string.split(RNA_TUMOR,":")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	321 RNA_ID_val = data[pos]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	322 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	323 RNA_ID_val="NA"
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	324 return [DNA_ID_val,RNA_ID_val]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	325
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	326 def _parseEffectsPerGene(self, effectString, columnLabels, GT_code):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	327 if effectString =="":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	328 return {}
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	329 effectPerGene = dict()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	330 effects = re.sub("EFF=", "", effectString).split(",")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	331 for thisEffect in effects:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	332 effectType = thisEffect.split("(")[0]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	333 #
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	334 # Given a string such as
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	335 # downstream_gene_variant(MODIFIER\|\|3956\|\|459\|CA9\|\|\|NM_001216.2\|\|1)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	336 # extract the stuff between the parens, divide it by \|, and store
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	337 # it in a dictionary indexed by the column labels given as input
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	338 #
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	339 effectTokens = re.sub(".+\(", "",
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	340 re.sub("\)", "", thisEffect)).split("\|")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	341 effect = dict()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	342 for ii in range(0,len(effectTokens)):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	343 effect[columnLabels[ii]] = effectTokens[ii]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	344
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	345 #match GT_code
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	346 if GT_code and GT_code != int(effect["Genotype_Number"]):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	347 continue
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	348
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	349 effect["effect"] = effectType
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	350 #
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	351 # Parse through the list of effects. Extract the gene.
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	352 # Save one effect per gene, choosing an arbitrary effect
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	353 # from the most severe effect class.
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	354 #
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	355 thisGene = effect["Gene_Name"]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	356 if not effectPerGene.has_key(thisGene):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	357 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	358 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	359 impactThisEffect = effect["Effect_Impact"]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	360 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	361 if impact[impactThisEffect] > impact[worstImpactYet]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	362 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	363 elif impact[impactThisEffect] == impact[worstImpactYet]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	364 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	365 effectPerGene[thisGene] = effect
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	366 return(effectPerGene)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	367
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	368
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	369
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	370 class vcf(object):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	371 """This object contains the set of rows from a VCF file"""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	372 def __init__(self, stream):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	373 self._effectColumn = None
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	374 self._rows = list()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	375 self._indexNextRow = 0
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	376 for row in stream:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	377 if re.search("^##INFO=<ID=EFF", row):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	378 """Given a line of format
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	379
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	380 ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	381 effects for this variant.Format: 'Effect (
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	382 Effect_Impact \| Functional_Class \| Codon_Change \|
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	383 Amino_Acid_Change\| Amino_Acid_length \| Gene_Name \|
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	384 Transcript_BioType \| Gene_Coding \| Transcript_ID \|
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	385 Exon_Rank \| Genotype_Number [ \| ERRORS \| WARNINGS ] )'">
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	386
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	387 Parse out the ordered list of tokens as they will appear:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	388 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	389 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	390 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	391 'ERRORS']"""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	392 row = re.sub("[\[\] ]", "", row)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	393 row = re.sub("ERRORS\|WARNINGS", "ERRORS", row)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	394 self._effectColumn = re.split("[(\|)]", row)[1:-1]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	395 elif not re.search("^#", row):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	396 # This is a row of VCF data
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	397 newVcfRow = vcfRow(row, self._effectColumn)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	398 self._rows.append(newVcfRow)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	399
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	400 def read(self):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	401 return self._rows
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	402
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	403 import math
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	404
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	405 def round_sigfigs(num, sig_figs):
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	406 """Round to specified number of sigfigs.
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	407
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	408 >>> round_sigfigs(0, sig_figs=4)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	409 0
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	410 >>> int(round_sigfigs(12345, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	411 12000
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	412 >>> int(round_sigfigs(-12345, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	413 -12000
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	414 >>> int(round_sigfigs(1, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	415 1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	416 >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	417 '3.1'
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	418 >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	419 '-3.1'
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	420 >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	421 '0.00099'
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	422 >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	423 '0.000988'
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	424 """
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	425 if num != 0:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	426 return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	427 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	428 return 0 # Can't take the log of 0
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	429
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	430
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	431 def main():
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	432 parser = argparse.ArgumentParser()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	433 parser.add_argument("ID", type=str, help="Entry for the ID column")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	434 parser.add_argument("output", type=str, help="outputfile")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	435 args = parser.parse_args()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	436
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	437 myVcf = vcf(sys.stdin)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	438
48 a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	439 tmpOutput = "file_"+args.ID
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	440 fout =open(tmpOutput, 'w')
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	441
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	442 if args.ID[-4:]==".vcf":
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	443 sampleID = args.ID[:-4]
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	444 else:
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	445 sampleID = args.ID
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	446
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	447 for row in myVcf.read():
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	448 #total =total+1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	449 if row.alt =="NA":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	450 continue ######## bad calls in the VCF
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	451 #good = good+1
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	452 if str(row.DNA_AF) not in ["NA",""]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	453 row.DNA_AF= round_sigfigs(float(row.DNA_AF),3)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	454 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	455 row.DNA_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	456 if str(row.RNA_AF) not in ["NA",""]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	457 row.RNA_AF= round_sigfigs(float(row.RNA_AF),3)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	458 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	459 row.RNA_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	460 if str(row.NORMAL_AF) not in ["NA",""]:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	461 row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	462 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	463 row.NORMAL_AF=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	464 if len(row.effectPerGene)!=0:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	465 for gene in row.effectPerGene.keys():
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	466 AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"]
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	467 if AA_Change !="" and AA_Change[:2]!="p.":
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	468 AA_Change="p."+AA_Change
48 a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	469 fout.write(string.join([sampleID, row.chr, str(row.start),
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	470 str(row.end), row.reference, row.alt,
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	471 gene,row.effectPerGene[gene]["effect"],
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	472 str(row.DNA_AF), str(row.RNA_AF),AA_Change
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	473 #, str(row.NORMAL_AF)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	474 ],"\t")+"\n")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	475 else:
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	476 gene =""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	477 AA_Change=""
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	478 effect =""
48 a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	479 fout.write(string.join([sampleID, row.chr, str(row.start),
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	480 str(row.end), row.reference, row.alt,
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	481 gene,effect,
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	482 str(row.DNA_AF), str(row.RNA_AF),AA_Change
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	483 #, str(row.NORMAL_AF)
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	484 ],"\t")+"\n")
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	485 fout.close()
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	486
48 a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	487 os.system("cat "+tmpOutput+" >> "+args.output)
a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	488 os.system("rm -f "+tmpOutput)
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	489
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	490 if __name__ == '__main__':
48 a38cc72edd75 xena header jingchunzhu parents: 47 diff changeset	491 main()
47 23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	492
23d98125d20c parse snpEff output jingchunzhu parents: diff changeset	493

Mercurial > repos > melissacline > ucsc_cancer_utilities

annotate parseSnpEffVcf.py @ 60:bf57076e27b9 default tip