annotate vcfToMutationVector.py @ 20:914bc8ee6222

Debugged the merge mutation data tool
author melissacline
date Fri, 20 Mar 2015 15:50:22 -0700
parents 60efb9214eaa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
2
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
3 """A mutationVector generator for the UCSC Xena platform
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
4
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
5 This script takes a VCF file generated by snpEff.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
6 Its output is a mutationVector object that
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
7 represents the data in the language of the Xena platform.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
8
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
9 Required Arguments:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
10 snpEff output VCF filename: the output of snpEFF for the same VCF file
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
11
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
12 Optional Arguments:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
13 - trinity: TRUE if this is a trinity-formatted VCF. This concerns how to
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
14 get the sample name.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
15 """
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
16
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
17 import argparse
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
18 import re
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
19 import string
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
20 import vcf
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
21
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
22 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
23
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
24
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
25 def _parseSnpEffColumnLabels(vcfReader):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
26 """Given a line of format
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
27 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
28 effects for this variant.Format: 'Effect ( Effect_Impact |
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
29 Functional_Class | Codon_Change | Amino_Acid_Change|
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
30 Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding |
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
31 Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
32 ] )'">
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
33
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
34 which is translated by the VCF reader to the following:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
35
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
36 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon | GenotypeNum [ | ERRORS | WARNINGS ] )'
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
37
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
38 Parse out the ordered list of tokens as they will appear:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
39 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
40 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
41 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
42 'ERRORS']"""
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
43 snpEffHeaderLabel = vcfReader.infos["EFF"].desc.split("\'")[1]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
44 snpEffHeaderLabel = re.sub("[\[\] ]", "", snpEffHeaderLabel)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
45 snpEffHeaderLabel = re.sub("ERRORS[|]WARNINGS", "ERRORS",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
46 snpEffHeaderLabel)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
47 snpEffLabelTokens = re.split("[(|)]", snpEffHeaderLabel)[1:-1]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
48 return(snpEffLabelTokens)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
49
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
50
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
51 #def _parseEffectsPerGene(effectString, columnLabels, GT_code):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
52 def parseEffectsPerGene(effectString, columnLabels):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
53 if effectString =="":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
54 return {}
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
55 effectPerGene = dict()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
56 effects = effectString.split(",")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
57 for thisEffect in effects:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
58 effectType = thisEffect.split("(")[0]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
59 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
60 # Given a string such as
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
61 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
62 # extract the stuff between the parens, divide it by |, and store
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
63 # it in a dictionary indexed by the column labels given as input
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
64 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
65 effectTokens = re.sub(".+\(", "",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
66 re.sub("\)", "", thisEffect)).split("|")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
67 effect = dict()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
68 for ii in range(0,len(effectTokens)):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
69 effect[columnLabels[ii]] = effectTokens[ii]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
70 #match GT_code
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
71 #if GT_code and GT_code != int(effect["Genotype_Number"]):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
72 # continue
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
73 effect["effect"] = effectType
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
74 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
75 # Parse through the list of effects. Extract the gene.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
76 # Save one effect per gene, choosing an arbitrary effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
77 # from the most severe effect class.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
78 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
79 thisGene = effect["Gene_Name"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
80 if not effectPerGene.has_key(thisGene):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
81 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
82 else:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
83 impactThisEffect = effect["Effect_Impact"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
84 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
85 if impact[impactThisEffect] > impact[worstImpactYet]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
86 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
87 elif impact[impactThisEffect] == impact[worstImpactYet]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
88 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
89 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
90 return(effectPerGene)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
91
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
92
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
93
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
94
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
95 def main():
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
96 parser = argparse.ArgumentParser()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
97 parser.add_argument("snpEffVcf", type=str, help="VCF file from snpEff")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
98 parser.add_argument("--trinity", type=int, default=False,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
99 help="""True if this VCF represents Trinity output,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
100 and if the sample name is then stored in
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
101 the info field under samples[name]""")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
102 args = parser.parse_args()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
103 print string.join(["#sample", "chr", "start", "end", "reference",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
104 "alt", "gene", "effect", "DNA_VAF", "RNA_VAF",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
105 "Amino_Acid_Change"], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
106 vcfReader = vcf.VCFReader(open(args.snpEffVcf, "r"))
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
107 snpEffHeaderColumnLabels = _parseSnpEffColumnLabels(vcfReader)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
108 for record in vcfReader:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
109 if record.ALT =="NA":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
110 continue ######## bad calls in the VCF
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
111 sampleName = "Unknown"
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
112 if args.trinity:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
113 sampleName = record.samples[0]['name']
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
114 strand = record.ID
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
115 start = record.POS
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
116 end = int(record.POS) + len(record.REF) - 1
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
117 snpEffEffectsString = record.INFO["EFF"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
118 effectPerGene = parseEffectsPerGene(snpEffEffectsString,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
119 snpEffHeaderColumnLabels)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
120 if len(effectPerGene)!=0:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
121 for gene in effectPerGene.keys():
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
122 aaChange = effectPerGene[gene]["Amino_Acid_change"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
123 if aaChange !="":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
124 aaChange="p."+aaChange
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
125 print string.join([sampleName, record.CHROM, str(start),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
126 str(end),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
127 record.REF, record.ALT[0],
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
128 gene, effectPerGene[gene]['effect'],
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
129 "0.0", str(record.INFO["AF"][0]), aaChange], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
130 else:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
131 print string.join([sampleName, record.CHROM, str(start), str(end),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
132 record.REF, record.ALT[0], "", "",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
133 "0.0", str(record.INFO["AF"][0]), ""], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
134
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
135
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
136
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
137
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
138 if __name__ == '__main__':
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
139 main()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
140
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
141
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
142