# HG changeset patch # User jingchunzhu # Date 1439533593 25200 # Node ID 23d98125d20c11d477f3192f15e07c36b46b4c4f # Parent ebf3bc09c383757bf98db58652b99e7402e3a8f0 parse snpEff output diff -r ebf3bc09c383 -r 23d98125d20c parseSnpEffVcf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parseSnpEffVcf.py Thu Aug 13 23:26:33 2015 -0700 @@ -0,0 +1,487 @@ +#!/usr/bin/env python + +import argparse +import re +import sys, os +import string +import math + +impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3} + +class vcfRow(object): + """This object contains an individual row of VCF output""" + def __init__(self, row, columnLabels, EFFECT=1): + tokens = row[:-1].split("\t") + #chromosome notation + chrom = tokens[0] + if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr": + chrom="chr"+chrom[3:] + elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R": + chrom= "chr"+chrom[2:] + elif chrom in ["23","X","x"]: + chrom="chrX" + elif chrom in ["24","Y","y"]: + chrom="chrY" + elif chrom in ["25","M","m"]: + chrom="chrM" + else: + chrom="chr"+chrom + + if chrom == "chr23": + chrom="chrX" + if chrom == "chr24": + chrom="chrY" + + self.chr = chrom + self.start = int(tokens[1]) + self.ID = tokens[2] + self.reference = tokens[3] + if (self.reference ==""): + self.reference="-" + self.alt = tokens[4] + if (self.alt ==""): + self.alt ="-" + self.end = self.start + len(self.reference) - 1 + self.DNA_AF="" + self.RNA_AF="" + self.NORMAL_AF="" + + effectsString = "" + for thisSubToken in tokens[7].split(";"): + if re.search("^EFF=", thisSubToken): + effectsString = thisSubToken + + GT_code=None + + if len(tokens)<=8: + pass + else: + # this could be all specific to RADIA output + format =tokens[8] + DNA_NORMAL = tokens[9] + DNA_TUMOR =tokens[10] + if len(tokens)>11: + RNA_TUMOR = tokens[11] + else: + RNA_TUMOR='' + + #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable) + GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start) + + if GT_code !=None: + self.alt = string.split(tokens[4],",")[GT_code-1] + # AF allel frequency # this is all specific to RADIA output + ID="AD" + val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code) + if val != None: + self.DNA_AD, self.RNA_AD= val + ID="DP" + val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR) + if val != None: + self.DNA_DP, self.RNA_DP= val + try: + self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP)) + except: + self.DNA_AF = "NA" + try: + self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP)) + except: + self.RNA_AF = "NA" + else: + self.DNA_AF, self.RNA_AF= "NA","NA" + else: # returned None + #print "AF error", row + self.DNA_AF, self.RNA_AF= "NA","NA" + + #get info on normal sample + ID = "AD" + val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code) + if val != None: + if val =="NA": + val =0 + self.NORMAL_AD = val + ID ="DP" + val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL) + if val != None: + self.NORMAL_DP = val + try: + self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP)) + except: + self.NORMAL_AF = "NA" + else: + self.NORMAL_AF = "NA" + else: + self.NORMAL_AF = "NA" + else: + #print "GT error", row + self.alt = "NA" + self.DNA_AF, self.RNA_AF= "NA","NA" + + + if EFFECT: + self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code) + return + + def get_DNAVAF(self): + return self.DNA_AF + + def get_RNAVAF(self): + return self.RNA_AF + + def get_NORMALVAF(self): + return self.NORMAL_AF + + def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start): + pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== "GT": + pos=i + break + if pos ==-1: + return None + + DNA_AD=-1 + RNA_AD=-1 + DNA_GT_code =-1 + RNA_GT_code =-1 + + if DNA_TUMOR not in ["","."]: + data= string.split(DNA_TUMOR,":") + if len(string.split(data[pos],'/'))>=2: + DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment + if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this! + #parse data to figure this out really stupid way to do it. + AF_pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== "AD": + AF_pos= i + if AF_pos==-1 : + DNA_GT_code =-1 + elif DNA_TUMOR not in ["","."]: + data= string.split(DNA_TUMOR,":") + data = string.split(data[AF_pos],",") + if len(data)<2: + DNA_GT_code =-1 + elif len(data)==2: # ref, alt1 + DNA_GT_code =1 + DNA_AD = float(data[DNA_GT_code]) + else: + DNA_GT_code =1 + DNA_AD = float(data[DNA_GT_code]) + for i in range (2, len(data)): + if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3: + DNA_GT_code = i + DNA_AD = float(data[DNA_GT_code]) + else: + DNA_GT_code =-1 + else: + DNA_GT_code =-1 + + if RNA_TUMOR not in ["","."]: + data= string.split(RNA_TUMOR,":") + if len(string.split(data[pos],'/'))>=2: + RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment + if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this! + #parse data to figure this out myself! + AF_pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== "AD": + AF_pos=i + if AF_pos==-1 : + RNA_GT_code =-1 + elif RNA_TUMOR not in ["","."]: + data= string.split(RNA_TUMOR,":") + data = string.split(data[AF_pos],",") + if len(data)<2: + RNA_GT_code =-1 + elif len(data)==2: # ref, alt1 + RNA_GT_code =1 + RNA_AD = float(data[RNA_GT_code]) + else: + RNA_GT_code =1 + RNA_AD = float(data[RNA_GT_code]) + for i in range (2, len(data)): + if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3: + RNA_GT_code = i + RNA_AD = float(data[RNA_GT_code]) + else: + RNA_GT_code =-1 + else: + RNA_GT_code =-1 + + if DNA_GT_code in [-1,0] and RNA_GT_code > 0: + return RNA_GT_code + if RNA_GT_code in [-1,0] and DNA_GT_code >0: + return DNA_GT_code + if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code: + return DNA_GT_code + if DNA_GT_code < 0 and RNA_GT_code < 0: + return None + if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype + return 1 #essentially pick one of the alt + if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]): + return None + if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]): # really stupid RADIA chrX and Y handling + if RNA_AD > DNA_AD: + return RNA_GT_code + else: + return DNA_GT_code + + def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code): + #get the "ID" column in VCF + pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== ID: + pos=i + if pos==-1 : + return None + + if DNA_NORMAL not in ["","."]: + data= string.split(DNA_NORMAL,":") + try: + DNA_ID_val = string.split(data[pos],",")[GT_code] + except: + DNA_ID_val="NA" + else: + DNA_ID_val="NA" + + return DNA_ID_val + + + def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code): + #get the "ID" column in VCF + pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== ID: + pos=i + if pos==-1 : + return None + + + if DNA_TUMOR not in ["","."]: + data= string.split(DNA_TUMOR,":") + try: + DNA_ID_val = string.split(data[pos],",")[GT_code] + except: + DNA_ID_val="NA" + else: + DNA_ID_val="NA" + + if RNA_TUMOR not in ["","."]: + data= string.split(RNA_TUMOR,":") + try: + RNA_ID_val = string.split(data[pos],",")[GT_code] + except: + RNA_ID_val="NA" + else: + RNA_ID_val="NA" + return [DNA_ID_val,RNA_ID_val] + + def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL): + #get the "ID" column in VCF + pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== ID: + pos=i + if pos==-1 : + return None + + if DNA_NORMAL not in ["","."]: + data= string.split(DNA_NORMAL,":") + DNA_ID_val = data[pos] + else: + DNA_ID_val="NA" + + return DNA_ID_val + + def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR): + #get the "ID" column in VCF + pos=-1 + data= string.split(format,":") + for i in range(0,len(data)): + if data[i]== ID: + pos=i + if pos==-1 : + return None + + if DNA_TUMOR not in ["","."]: + data= string.split(DNA_TUMOR,":") + DNA_ID_val = data[pos] + else: + DNA_ID_val="NA" + + if RNA_TUMOR not in ["","."]: + data= string.split(RNA_TUMOR,":") + RNA_ID_val = data[pos] + else: + RNA_ID_val="NA" + return [DNA_ID_val,RNA_ID_val] + + def _parseEffectsPerGene(self, effectString, columnLabels, GT_code): + if effectString =="": + return {} + effectPerGene = dict() + effects = re.sub("EFF=", "", effectString).split(",") + for thisEffect in effects: + effectType = thisEffect.split("(")[0] + # + # Given a string such as + # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1) + # extract the stuff between the parens, divide it by |, and store + # it in a dictionary indexed by the column labels given as input + # + effectTokens = re.sub(".+\(", "", + re.sub("\)", "", thisEffect)).split("|") + effect = dict() + for ii in range(0,len(effectTokens)): + effect[columnLabels[ii]] = effectTokens[ii] + + #match GT_code + if GT_code and GT_code != int(effect["Genotype_Number"]): + continue + + effect["effect"] = effectType + # + # Parse through the list of effects. Extract the gene. + # Save one effect per gene, choosing an arbitrary effect + # from the most severe effect class. + # + thisGene = effect["Gene_Name"] + if not effectPerGene.has_key(thisGene): + effectPerGene[thisGene] = effect + else: + impactThisEffect = effect["Effect_Impact"] + worstImpactYet = effectPerGene[thisGene]["Effect_Impact"] + if impact[impactThisEffect] > impact[worstImpactYet]: + effectPerGene[thisGene] = effect + elif impact[impactThisEffect] == impact[worstImpactYet]: + if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]: + effectPerGene[thisGene] = effect + return(effectPerGene) + + + +class vcf(object): + """This object contains the set of rows from a VCF file""" + def __init__(self, stream): + self._effectColumn = None + self._rows = list() + self._indexNextRow = 0 + for row in stream: + if re.search("^##INFO= + + Parse out the ordered list of tokens as they will appear: + ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change', + 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType', + 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number' + 'ERRORS']""" + row = re.sub("[\[\] ]", "", row) + row = re.sub("ERRORS|WARNINGS", "ERRORS", row) + self._effectColumn = re.split("[(|)]", row)[1:-1] + elif not re.search("^#", row): + # This is a row of VCF data + newVcfRow = vcfRow(row, self._effectColumn) + self._rows.append(newVcfRow) + + def read(self): + return self._rows + +import math + +def round_sigfigs(num, sig_figs): + """Round to specified number of sigfigs. + + >>> round_sigfigs(0, sig_figs=4) + 0 + >>> int(round_sigfigs(12345, sig_figs=2)) + 12000 + >>> int(round_sigfigs(-12345, sig_figs=2)) + -12000 + >>> int(round_sigfigs(1, sig_figs=2)) + 1 + >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2)) + '3.1' + >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2)) + '-3.1' + >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2)) + '0.00099' + >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3)) + '0.000988' + """ + if num != 0: + return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1))) + else: + return 0 # Can't take the log of 0 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("ID", type=str, help="Entry for the ID column") + parser.add_argument("output", type=str, help="outputfile") + args = parser.parse_args() + + myVcf = vcf(sys.stdin) + + fout =open(args.ID,'w') + + for row in myVcf.read(): + #total =total+1 + if row.alt =="NA": + continue ######## bad calls in the VCF + #good = good+1 + if str(row.DNA_AF) not in ["NA",""]: + row.DNA_AF= round_sigfigs(float(row.DNA_AF),3) + else: + row.DNA_AF="" + if str(row.RNA_AF) not in ["NA",""]: + row.RNA_AF= round_sigfigs(float(row.RNA_AF),3) + else: + row.RNA_AF="" + if str(row.NORMAL_AF) not in ["NA",""]: + row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3) + else: + row.NORMAL_AF="" + if len(row.effectPerGene)!=0: + for gene in row.effectPerGene.keys(): + AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"] + if AA_Change !="" and AA_Change[:2]!="p.": + AA_Change="p."+AA_Change + fout.write(string.join([args.ID, row.chr, str(row.start), + str(row.end), row.reference, row.alt, + gene,row.effectPerGene[gene]["effect"], + str(row.DNA_AF), str(row.RNA_AF),AA_Change + #, str(row.NORMAL_AF) + ],"\t")+"\n") + else: + gene ="" + AA_Change="" + effect ="" + fout.write(string.join([args.ID, row.chr, str(row.start), + str(row.end), row.reference, row.alt, + gene,effect, + str(row.DNA_AF), str(row.RNA_AF),AA_Change + #, str(row.NORMAL_AF) + ],"\t")+"\n") + fout.close() + + os.system("cat "+args.ID+" >> "+args.output) + os.system("rm -f "+args.ID) + +if __name__ == '__main__': + main() + + diff -r ebf3bc09c383 -r 23d98125d20c vcfToXena.xml --- a/vcfToXena.xml Thu Aug 13 21:49:03 2015 -0700 +++ b/vcfToXena.xml Thu Aug 13 23:26:33 2015 -0700 @@ -1,9 +1,10 @@ - Vcf To Xena positional mutation data + Convert vcf To Xena ready mutation data java -Xmx4G -jar $__tool_directory__/snpEff/snpEff.jar -c $__tool_directory__/snpEff/snpEff.config -i vcf -upDownStreamLen 5000 $genome - $input > $snpeff_output + $input > $__tool_directory__/tmp ; + cat $__tool_directory__/tmp | python $__tool_directory__/parseSnpEffVcf.py $input.name $snpeff_output @@ -13,7 +14,7 @@ - + This tool convert vcf files to xena ready positional mutation data files.