comparison vcfToMutationVector.py @ 0:60efb9214eaa

Uploaded
author melissacline
date Wed, 14 Jan 2015 13:54:03 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:60efb9214eaa
1 #!/usr/bin/env python
2
3 """A mutationVector generator for the UCSC Xena platform
4
5 This script takes a VCF file generated by snpEff.
6 Its output is a mutationVector object that
7 represents the data in the language of the Xena platform.
8
9 Required Arguments:
10 snpEff output VCF filename: the output of snpEFF for the same VCF file
11
12 Optional Arguments:
13 - trinity: TRUE if this is a trinity-formatted VCF. This concerns how to
14 get the sample name.
15 """
16
17 import argparse
18 import re
19 import string
20 import vcf
21
22 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
23
24
25 def _parseSnpEffColumnLabels(vcfReader):
26 """Given a line of format
27 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted
28 effects for this variant.Format: 'Effect ( Effect_Impact |
29 Functional_Class | Codon_Change | Amino_Acid_Change|
30 Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding |
31 Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS
32 ] )'">
33
34 which is translated by the VCF reader to the following:
35
36 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon | GenotypeNum [ | ERRORS | WARNINGS ] )'
37
38 Parse out the ordered list of tokens as they will appear:
39 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
40 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
41 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
42 'ERRORS']"""
43 snpEffHeaderLabel = vcfReader.infos["EFF"].desc.split("\'")[1]
44 snpEffHeaderLabel = re.sub("[\[\] ]", "", snpEffHeaderLabel)
45 snpEffHeaderLabel = re.sub("ERRORS[|]WARNINGS", "ERRORS",
46 snpEffHeaderLabel)
47 snpEffLabelTokens = re.split("[(|)]", snpEffHeaderLabel)[1:-1]
48 return(snpEffLabelTokens)
49
50
51 #def _parseEffectsPerGene(effectString, columnLabels, GT_code):
52 def parseEffectsPerGene(effectString, columnLabels):
53 if effectString =="":
54 return {}
55 effectPerGene = dict()
56 effects = effectString.split(",")
57 for thisEffect in effects:
58 effectType = thisEffect.split("(")[0]
59 #
60 # Given a string such as
61 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
62 # extract the stuff between the parens, divide it by |, and store
63 # it in a dictionary indexed by the column labels given as input
64 #
65 effectTokens = re.sub(".+\(", "",
66 re.sub("\)", "", thisEffect)).split("|")
67 effect = dict()
68 for ii in range(0,len(effectTokens)):
69 effect[columnLabels[ii]] = effectTokens[ii]
70 #match GT_code
71 #if GT_code and GT_code != int(effect["Genotype_Number"]):
72 # continue
73 effect["effect"] = effectType
74 #
75 # Parse through the list of effects. Extract the gene.
76 # Save one effect per gene, choosing an arbitrary effect
77 # from the most severe effect class.
78 #
79 thisGene = effect["Gene_Name"]
80 if not effectPerGene.has_key(thisGene):
81 effectPerGene[thisGene] = effect
82 else:
83 impactThisEffect = effect["Effect_Impact"]
84 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
85 if impact[impactThisEffect] > impact[worstImpactYet]:
86 effectPerGene[thisGene] = effect
87 elif impact[impactThisEffect] == impact[worstImpactYet]:
88 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
89 effectPerGene[thisGene] = effect
90 return(effectPerGene)
91
92
93
94
95 def main():
96 parser = argparse.ArgumentParser()
97 parser.add_argument("snpEffVcf", type=str, help="VCF file from snpEff")
98 parser.add_argument("--trinity", type=int, default=False,
99 help="""True if this VCF represents Trinity output,
100 and if the sample name is then stored in
101 the info field under samples[name]""")
102 args = parser.parse_args()
103 print string.join(["#sample", "chr", "start", "end", "reference",
104 "alt", "gene", "effect", "DNA_VAF", "RNA_VAF",
105 "Amino_Acid_Change"], "\t")
106 vcfReader = vcf.VCFReader(open(args.snpEffVcf, "r"))
107 snpEffHeaderColumnLabels = _parseSnpEffColumnLabels(vcfReader)
108 for record in vcfReader:
109 if record.ALT =="NA":
110 continue ######## bad calls in the VCF
111 sampleName = "Unknown"
112 if args.trinity:
113 sampleName = record.samples[0]['name']
114 strand = record.ID
115 start = record.POS
116 end = int(record.POS) + len(record.REF) - 1
117 snpEffEffectsString = record.INFO["EFF"]
118 effectPerGene = parseEffectsPerGene(snpEffEffectsString,
119 snpEffHeaderColumnLabels)
120 if len(effectPerGene)!=0:
121 for gene in effectPerGene.keys():
122 aaChange = effectPerGene[gene]["Amino_Acid_change"]
123 if aaChange !="":
124 aaChange="p."+aaChange
125 print string.join([sampleName, record.CHROM, str(start),
126 str(end),
127 record.REF, record.ALT[0],
128 gene, effectPerGene[gene]['effect'],
129 "0.0", str(record.INFO["AF"][0]), aaChange], "\t")
130 else:
131 print string.join([sampleName, record.CHROM, str(start), str(end),
132 record.REF, record.ALT[0], "", "",
133 "0.0", str(record.INFO["AF"][0]), ""], "\t")
134
135
136
137
138 if __name__ == '__main__':
139 main()
140
141
142