changeset 47:23d98125d20c

parse snpEff output
author jingchunzhu
date Thu, 13 Aug 2015 23:26:33 -0700
parents ebf3bc09c383
children a38cc72edd75
files parseSnpEffVcf.py vcfToXena.xml
diffstat 2 files changed, 491 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parseSnpEffVcf.py	Thu Aug 13 23:26:33 2015 -0700
@@ -0,0 +1,487 @@
+#!/usr/bin/env python
+
+import argparse
+import re
+import sys, os
+import string
+import math
+
+impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
+
+class vcfRow(object):
+    """This object contains an individual row of VCF output"""
+    def __init__(self, row, columnLabels, EFFECT=1):
+        tokens = row[:-1].split("\t")
+        #chromosome notation
+        chrom = tokens[0]
+        if string.upper(chrom[0:3])== "CHR" and chrom[0:3]!="chr":
+            chrom="chr"+chrom[3:]
+        elif string.upper(chrom[0:2])== "CH" and string.upper(chrom[2])!="R":
+            chrom= "chr"+chrom[2:]
+        elif chrom in ["23","X","x"]:
+            chrom="chrX"
+        elif chrom in ["24","Y","y"]:
+            chrom="chrY"
+        elif chrom in ["25","M","m"]:
+            chrom="chrM"
+        else:
+            chrom="chr"+chrom
+
+        if chrom == "chr23":
+            chrom="chrX"
+        if chrom == "chr24":
+            chrom="chrY"
+
+        self.chr = chrom
+        self.start = int(tokens[1])
+        self.ID = tokens[2]
+        self.reference = tokens[3]
+        if (self.reference ==""):
+            self.reference="-"
+        self.alt = tokens[4]
+        if (self.alt ==""):
+            self.alt ="-"
+        self.end = self.start + len(self.reference) - 1
+        self.DNA_AF=""
+        self.RNA_AF=""
+        self.NORMAL_AF=""
+
+        effectsString = ""
+        for thisSubToken in tokens[7].split(";"):
+            if re.search("^EFF=", thisSubToken):
+                effectsString = thisSubToken
+
+        GT_code=None
+
+        if len(tokens)<=8:
+            pass
+        else:
+            # this could be all specific to RADIA output
+            format =tokens[8]
+            DNA_NORMAL = tokens[9]
+            DNA_TUMOR =tokens[10]
+            if len(tokens)>11:
+                RNA_TUMOR = tokens[11]
+            else:
+                RNA_TUMOR=''
+
+            #get the ALT base code (in VCF: GT=0,1,2?, 1 and 2 are acceptable)
+            GT_code = self._findGTCode(self.chr, format, DNA_TUMOR, RNA_TUMOR, self.start)
+
+            if GT_code !=None:
+                self.alt = string.split(tokens[4],",")[GT_code-1]
+                # AF allel frequency # this is all specific to RADIA output
+                ID="AD"
+                val =self._parse_TUMOR_ALT_ID (ID,format,DNA_TUMOR, RNA_TUMOR, GT_code)
+                if val != None:
+                    self.DNA_AD, self.RNA_AD= val
+                    ID="DP"
+                    val = self._parse_TUMOR_SINGLE_ID(ID,format,DNA_TUMOR, RNA_TUMOR)
+                    if val != None:
+                        self.DNA_DP, self.RNA_DP= val
+                        try:
+                            self.DNA_AF = str(float(self.DNA_AD) /float(self.DNA_DP))
+                        except:
+                            self.DNA_AF = "NA"
+                        try:
+                            self.RNA_AF = str(float(self.RNA_AD) /float(self.RNA_DP))
+                        except:
+                            self.RNA_AF = "NA"
+                    else:
+                        self.DNA_AF, self.RNA_AF= "NA","NA"
+                else: # returned None
+                    #print "AF error", row
+                    self.DNA_AF, self.RNA_AF= "NA","NA"
+
+                #get info on normal sample
+                ID = "AD"
+                val = self._parse_NORMAL_ALT_ID (ID,format,DNA_NORMAL, GT_code)
+                if val != None:
+                    if val =="NA":
+                        val =0
+                    self.NORMAL_AD = val
+                    ID ="DP"
+                    val = self._parse_NORMAL_SINGLE_ID(ID,format,DNA_NORMAL)
+                    if val != None:
+                        self.NORMAL_DP = val
+                        try:
+                            self.NORMAL_AF = str(float(self.NORMAL_AD) /float(self.NORMAL_DP))
+                        except:
+                            self.NORMAL_AF = "NA"
+                    else:
+                        self.NORMAL_AF = "NA"
+                else:
+                    self.NORMAL_AF = "NA"
+            else:
+                #print "GT error", row
+                self.alt = "NA"
+                self.DNA_AF, self.RNA_AF= "NA","NA"
+
+
+        if EFFECT:
+            self.effectPerGene = self._parseEffectsPerGene(effectsString, columnLabels, GT_code)
+        return
+
+    def get_DNAVAF(self):
+        return self.DNA_AF
+
+    def get_RNAVAF(self):
+        return self.RNA_AF
+
+    def get_NORMALVAF(self):
+        return self.NORMAL_AF
+
+    def _findGTCode(self, chrom, format, DNA_TUMOR, RNA_TUMOR, start):
+        pos=-1
+        data= string.split(format,":")
+        for i in range(0,len(data)):
+            if data[i]== "GT":
+                pos=i
+                break
+        if pos ==-1:
+            return None
+
+        DNA_AD=-1
+        RNA_AD=-1
+        DNA_GT_code =-1
+        RNA_GT_code =-1
+
+        if DNA_TUMOR not in ["","."]:
+            data= string.split(DNA_TUMOR,":")
+            if len(string.split(data[pos],'/'))>=2:
+                DNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
+            if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
+                #parse data to figure this out really stupid way to do it.
+                AF_pos=-1
+                data= string.split(format,":")
+                for i in range(0,len(data)):
+                    if data[i]== "AD":
+                        AF_pos= i
+                if AF_pos==-1 :
+                    DNA_GT_code =-1
+                elif DNA_TUMOR not in ["","."]:
+                    data= string.split(DNA_TUMOR,":")
+                    data = string.split(data[AF_pos],",")
+                    if len(data)<2:
+                        DNA_GT_code =-1
+                    elif len(data)==2: # ref, alt1
+                        DNA_GT_code =1
+                        DNA_AD = float(data[DNA_GT_code])
+                    else:
+                        DNA_GT_code =1
+                        DNA_AD = float(data[DNA_GT_code])
+                        for i in range (2, len(data)):
+                            if float(data[i]) > float(data[DNA_GT_code]):#ref, alt1, alt2, alt3:
+                                DNA_GT_code = i
+                                DNA_AD = float(data[DNA_GT_code])
+                else:
+                    DNA_GT_code =-1
+        else:
+            DNA_GT_code =-1
+
+        if RNA_TUMOR not in ["","."]:
+            data= string.split(RNA_TUMOR,":")
+            if len(string.split(data[pos],'/'))>=2:
+                RNA_GT_code = int(string.split(data[pos],'/')[-1]) # the last segment
+            if chrom in ["chrX","chrY"]: ##### the really stupid thing RADIA does to set GT=0 reference on chrX and Y even when there is clear evidence of altnative allele. can't believe this!
+                #parse data to figure this out myself!
+                AF_pos=-1
+                data= string.split(format,":")
+                for i in range(0,len(data)):
+                    if data[i]== "AD":
+                        AF_pos=i
+                if AF_pos==-1 :
+                    RNA_GT_code =-1
+                elif RNA_TUMOR not in ["","."]:
+                    data= string.split(RNA_TUMOR,":")
+                    data = string.split(data[AF_pos],",")
+                    if len(data)<2:
+                        RNA_GT_code =-1
+                    elif len(data)==2: # ref, alt1
+                        RNA_GT_code =1
+                        RNA_AD = float(data[RNA_GT_code])
+                    else:
+                        RNA_GT_code =1
+                        RNA_AD = float(data[RNA_GT_code])
+                        for i in range (2, len(data)):
+                            if float(data[i]) > float(data[RNA_GT_code]):#ref, alt1, alt2, alt3:
+                                RNA_GT_code = i
+                                RNA_AD = float(data[RNA_GT_code])
+                else:
+                    RNA_GT_code =-1
+        else:
+            RNA_GT_code =-1
+
+        if DNA_GT_code in [-1,0] and RNA_GT_code > 0:
+            return RNA_GT_code
+        if RNA_GT_code in [-1,0] and DNA_GT_code >0:
+            return DNA_GT_code
+        if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code == RNA_GT_code:
+            return DNA_GT_code
+        if DNA_GT_code < 0 and RNA_GT_code < 0:
+            return None
+        if DNA_GT_code == 0 or RNA_GT_code == 0: #algorithms thinks the alt is just noise, but that the alt genotype
+            return 1 #essentially pick one of the alt
+        if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom not in ["chrX","chrY"]):
+            return None
+        if DNA_GT_code > 0 and RNA_GT_code > 0 and DNA_GT_code != RNA_GT_code and (chrom in ["chrX","chrY"]):  # really stupid RADIA chrX and Y handling
+            if RNA_AD > DNA_AD:
+               return RNA_GT_code
+            else:
+               return DNA_GT_code
+
+    def _parse_NORMAL_ALT_ID (self, ID, format, DNA_NORMAL, GT_code):
+        #get the "ID" column in VCF
+        pos=-1
+        data= string.split(format,":")
+        for i in range(0,len(data)):
+            if data[i]== ID:
+                pos=i
+        if pos==-1 :
+            return None
+
+        if DNA_NORMAL not in ["","."]:
+            data= string.split(DNA_NORMAL,":")
+            try:
+                DNA_ID_val = string.split(data[pos],",")[GT_code]
+            except:
+                DNA_ID_val="NA"
+        else:
+            DNA_ID_val="NA"
+
+        return DNA_ID_val
+
+
+    def _parse_TUMOR_ALT_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR, GT_code):
+        #get the "ID" column in VCF
+        pos=-1
+        data= string.split(format,":")
+        for i in range(0,len(data)):
+            if data[i]== ID:
+                pos=i
+        if pos==-1 :
+            return None
+
+
+        if DNA_TUMOR not in ["","."]:
+            data= string.split(DNA_TUMOR,":")
+            try:
+                DNA_ID_val = string.split(data[pos],",")[GT_code]
+            except:
+                DNA_ID_val="NA"
+        else:
+            DNA_ID_val="NA"
+
+        if RNA_TUMOR not in ["","."]:
+            data= string.split(RNA_TUMOR,":")
+            try:
+                RNA_ID_val = string.split(data[pos],",")[GT_code]
+            except:
+                RNA_ID_val="NA"
+        else:
+            RNA_ID_val="NA"
+        return [DNA_ID_val,RNA_ID_val]
+
+    def _parse_NORMAL_SINGLE_ID (self, ID,format,DNA_NORMAL):
+        #get the "ID" column in VCF
+        pos=-1
+        data= string.split(format,":")
+        for i in range(0,len(data)):
+            if data[i]== ID:
+                pos=i
+        if pos==-1 :
+            return None
+
+        if DNA_NORMAL not in ["","."]:
+            data= string.split(DNA_NORMAL,":")
+            DNA_ID_val = data[pos]
+        else:
+            DNA_ID_val="NA"
+
+        return DNA_ID_val
+
+    def _parse_TUMOR_SINGLE_ID (self, ID,format,DNA_TUMOR,RNA_TUMOR):
+        #get the "ID" column in VCF
+        pos=-1
+        data= string.split(format,":")
+        for i in range(0,len(data)):
+            if data[i]== ID:
+                pos=i
+        if pos==-1 :
+            return None
+
+        if DNA_TUMOR not in ["","."]:
+            data= string.split(DNA_TUMOR,":")
+            DNA_ID_val = data[pos]
+        else:
+            DNA_ID_val="NA"
+
+        if RNA_TUMOR not in ["","."]:
+            data= string.split(RNA_TUMOR,":")
+            RNA_ID_val = data[pos]
+        else:
+            RNA_ID_val="NA"
+        return [DNA_ID_val,RNA_ID_val]
+
+    def _parseEffectsPerGene(self, effectString, columnLabels, GT_code):
+        if effectString =="":
+            return {}
+        effectPerGene = dict()
+        effects = re.sub("EFF=", "", effectString).split(",")
+        for thisEffect in effects:
+            effectType = thisEffect.split("(")[0]
+            #
+            # Given a string such as
+            # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
+            # extract the stuff between the parens, divide it by |, and store
+            # it in a dictionary indexed by the column labels given as input
+            #
+            effectTokens = re.sub(".+\(", "",
+                                  re.sub("\)", "", thisEffect)).split("|")
+            effect = dict()
+            for ii in range(0,len(effectTokens)):
+                effect[columnLabels[ii]] = effectTokens[ii]
+
+            #match GT_code
+            if GT_code and GT_code != int(effect["Genotype_Number"]):
+                continue
+
+            effect["effect"] = effectType
+            #
+            # Parse through the list of effects.  Extract the gene.
+            # Save one effect per gene, choosing an arbitrary effect
+            # from the most severe effect class.
+            #
+            thisGene = effect["Gene_Name"]
+            if not effectPerGene.has_key(thisGene):
+                effectPerGene[thisGene] = effect
+            else:
+                impactThisEffect = effect["Effect_Impact"]
+                worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
+                if impact[impactThisEffect] > impact[worstImpactYet]:
+                    effectPerGene[thisGene] = effect
+                elif impact[impactThisEffect] == impact[worstImpactYet]:
+                    if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
+                        effectPerGene[thisGene] = effect
+        return(effectPerGene)
+
+
+
+class vcf(object):
+    """This object contains the set of rows from a VCF file"""
+    def __init__(self, stream):
+        self._effectColumn = None
+        self._rows = list()
+        self._indexNextRow = 0
+        for row in stream:
+            if re.search("^##INFO=<ID=EFF", row):
+                """Given a line of format
+
+                ##INFO=<ID=EFF,Number=.,Type=String,Description="Pred<icted
+                effects for this variant.Format: 'Effect (
+                Effect_Impact | Functional_Class | Codon_Change |
+                Amino_Acid_Change| Amino_Acid_length | Gene_Name |
+                Transcript_BioType | Gene_Coding | Transcript_ID |
+                Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )'">
+
+                Parse out the ordered list of tokens as they will appear:
+                ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
+                'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
+                'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
+                'ERRORS']"""
+                row = re.sub("[\[\] ]", "", row)
+                row = re.sub("ERRORS|WARNINGS", "ERRORS", row)
+                self._effectColumn = re.split("[(|)]", row)[1:-1]
+            elif not re.search("^#", row):
+                # This is a row of VCF data
+                newVcfRow = vcfRow(row, self._effectColumn)
+                self._rows.append(newVcfRow)
+
+    def read(self):
+        return self._rows
+
+import math
+
+def round_sigfigs(num, sig_figs):
+    """Round to specified number of sigfigs.
+
+    >>> round_sigfigs(0, sig_figs=4)
+    0
+    >>> int(round_sigfigs(12345, sig_figs=2))
+    12000
+    >>> int(round_sigfigs(-12345, sig_figs=2))
+    -12000
+    >>> int(round_sigfigs(1, sig_figs=2))
+    1
+    >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
+    '3.1'
+    >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
+    '-3.1'
+    >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
+    '0.00099'
+    >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
+    '0.000988'
+    """
+    if num != 0:
+        return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
+    else:
+        return 0  # Can't take the log of 0
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("ID", type=str, help="Entry for the ID column")
+    parser.add_argument("output", type=str, help="outputfile")
+    args = parser.parse_args()
+
+    myVcf = vcf(sys.stdin)
+
+    fout =open(args.ID,'w')
+
+    for row in myVcf.read():
+        #total =total+1
+        if row.alt =="NA":
+            continue ######## bad calls in the VCF
+        #good = good+1
+        if str(row.DNA_AF) not in ["NA",""]:
+            row.DNA_AF= round_sigfigs(float(row.DNA_AF),3)
+        else:
+            row.DNA_AF=""
+        if str(row.RNA_AF) not in ["NA",""]:
+            row.RNA_AF= round_sigfigs(float(row.RNA_AF),3)
+        else:
+            row.RNA_AF=""
+        if str(row.NORMAL_AF) not in ["NA",""]:
+            row.NORMAL_AF= round_sigfigs(float(row.NORMAL_AF),3)
+        else:
+            row.NORMAL_AF=""
+        if len(row.effectPerGene)!=0:
+            for gene in row.effectPerGene.keys():
+                AA_Change = row.effectPerGene[gene]["Amino_Acid_Change"]
+                if AA_Change !="" and AA_Change[:2]!="p.":
+                    AA_Change="p."+AA_Change
+                fout.write(string.join([args.ID, row.chr, str(row.start),
+                                        str(row.end), row.reference, row.alt,
+                                        gene,row.effectPerGene[gene]["effect"],
+                                        str(row.DNA_AF), str(row.RNA_AF),AA_Change
+                                        #, str(row.NORMAL_AF)
+                                        ],"\t")+"\n")
+        else:
+            gene =""
+            AA_Change=""
+            effect =""
+            fout.write(string.join([args.ID, row.chr, str(row.start),
+                                    str(row.end), row.reference, row.alt,
+                                    gene,effect,
+                                    str(row.DNA_AF), str(row.RNA_AF),AA_Change
+                                    #, str(row.NORMAL_AF)
+                                    ],"\t")+"\n")
+    fout.close()
+
+    os.system("cat "+args.ID+" >> "+args.output)
+    os.system("rm -f "+args.ID)
+
+if __name__ == '__main__':
+        main()
+
+
--- a/vcfToXena.xml	Thu Aug 13 21:49:03 2015 -0700
+++ b/vcfToXena.xml	Thu Aug 13 23:26:33 2015 -0700
@@ -1,9 +1,10 @@
 <tool id="vcfToXena" name="vcfToXena" version="0.1">
-    <description>Vcf To Xena positional mutation data</description>
+    <description>Convert vcf To Xena ready mutation data</description>
     <command>
       java -Xmx4G -jar $__tool_directory__/snpEff/snpEff.jar  -c $__tool_directory__/snpEff/snpEff.config 
       -i vcf -upDownStreamLen 5000 $genome
-      $input > $snpeff_output 
+      $input > $__tool_directory__/tmp ;
+      cat $__tool_directory__/tmp | python $__tool_directory__/parseSnpEffVcf.py  $input.name $snpeff_output
     </command>
     <inputs>
         <param format="vcf" name="input" type="data" label="Input VCF file"/>
@@ -13,7 +14,7 @@
 
     </inputs>
     <outputs>
-        <data format="vcf" name="snpeff_output" />
+        <data format="tabular" name="snpeff_output" label="${input.name}.mutationVector" />
     </outputs>
     <help>
       This tool convert vcf files to xena ready positional mutation data files.