annotate vcfToMutationVector.py @ 37:e81019e3ac99

Updated synapseGetDataset to look at the filename rather than the (no longer existant) content type field to determine if the data is in zip format
author melissacline
date Mon, 27 Jul 2015 16:29:24 -0700
parents 60efb9214eaa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
2
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
3 """A mutationVector generator for the UCSC Xena platform
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
4
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
5 This script takes a VCF file generated by snpEff.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
6 Its output is a mutationVector object that
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
7 represents the data in the language of the Xena platform.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
8
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
9 Required Arguments:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
10 snpEff output VCF filename: the output of snpEFF for the same VCF file
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
11
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
12 Optional Arguments:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
13 - trinity: TRUE if this is a trinity-formatted VCF. This concerns how to
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
14 get the sample name.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
15 """
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
16
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
17 import argparse
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
18 import re
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
19 import string
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
20 import vcf
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
21
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
22 impact = {"MODIFIER":0, "LOW":1, "MODERATE":2, "HIGH":3}
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
23
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
24
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
25 def _parseSnpEffColumnLabels(vcfReader):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
26 """Given a line of format
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
27 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
28 effects for this variant.Format: 'Effect ( Effect_Impact |
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
29 Functional_Class | Codon_Change | Amino_Acid_Change|
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
30 Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding |
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
31 Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
32 ] )'">
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
33
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
34 which is translated by the VCF reader to the following:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
35
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
36 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon | GenotypeNum [ | ERRORS | WARNINGS ] )'
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
37
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
38 Parse out the ordered list of tokens as they will appear:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
39 ['Functional_Class', 'Codon_Change', 'Amino_Acid_Change',
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
40 'Amino_Acid_length', 'Gene_Name', 'Transcript_BioType',
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
41 'Gene_Coding', 'Transcript_ID', 'Exon_Rank', 'Genotype_Number'
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
42 'ERRORS']"""
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
43 snpEffHeaderLabel = vcfReader.infos["EFF"].desc.split("\'")[1]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
44 snpEffHeaderLabel = re.sub("[\[\] ]", "", snpEffHeaderLabel)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
45 snpEffHeaderLabel = re.sub("ERRORS[|]WARNINGS", "ERRORS",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
46 snpEffHeaderLabel)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
47 snpEffLabelTokens = re.split("[(|)]", snpEffHeaderLabel)[1:-1]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
48 return(snpEffLabelTokens)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
49
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
50
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
51 #def _parseEffectsPerGene(effectString, columnLabels, GT_code):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
52 def parseEffectsPerGene(effectString, columnLabels):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
53 if effectString =="":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
54 return {}
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
55 effectPerGene = dict()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
56 effects = effectString.split(",")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
57 for thisEffect in effects:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
58 effectType = thisEffect.split("(")[0]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
59 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
60 # Given a string such as
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
61 # downstream_gene_variant(MODIFIER||3956||459|CA9|||NM_001216.2||1)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
62 # extract the stuff between the parens, divide it by |, and store
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
63 # it in a dictionary indexed by the column labels given as input
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
64 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
65 effectTokens = re.sub(".+\(", "",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
66 re.sub("\)", "", thisEffect)).split("|")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
67 effect = dict()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
68 for ii in range(0,len(effectTokens)):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
69 effect[columnLabels[ii]] = effectTokens[ii]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
70 #match GT_code
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
71 #if GT_code and GT_code != int(effect["Genotype_Number"]):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
72 # continue
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
73 effect["effect"] = effectType
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
74 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
75 # Parse through the list of effects. Extract the gene.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
76 # Save one effect per gene, choosing an arbitrary effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
77 # from the most severe effect class.
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
78 #
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
79 thisGene = effect["Gene_Name"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
80 if not effectPerGene.has_key(thisGene):
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
81 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
82 else:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
83 impactThisEffect = effect["Effect_Impact"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
84 worstImpactYet = effectPerGene[thisGene]["Effect_Impact"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
85 if impact[impactThisEffect] > impact[worstImpactYet]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
86 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
87 elif impact[impactThisEffect] == impact[worstImpactYet]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
88 if effect["Amino_Acid_length"] > effectPerGene[thisGene]["Amino_Acid_length"]:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
89 effectPerGene[thisGene] = effect
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
90 return(effectPerGene)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
91
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
92
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
93
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
94
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
95 def main():
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
96 parser = argparse.ArgumentParser()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
97 parser.add_argument("snpEffVcf", type=str, help="VCF file from snpEff")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
98 parser.add_argument("--trinity", type=int, default=False,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
99 help="""True if this VCF represents Trinity output,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
100 and if the sample name is then stored in
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
101 the info field under samples[name]""")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
102 args = parser.parse_args()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
103 print string.join(["#sample", "chr", "start", "end", "reference",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
104 "alt", "gene", "effect", "DNA_VAF", "RNA_VAF",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
105 "Amino_Acid_Change"], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
106 vcfReader = vcf.VCFReader(open(args.snpEffVcf, "r"))
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
107 snpEffHeaderColumnLabels = _parseSnpEffColumnLabels(vcfReader)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
108 for record in vcfReader:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
109 if record.ALT =="NA":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
110 continue ######## bad calls in the VCF
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
111 sampleName = "Unknown"
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
112 if args.trinity:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
113 sampleName = record.samples[0]['name']
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
114 strand = record.ID
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
115 start = record.POS
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
116 end = int(record.POS) + len(record.REF) - 1
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
117 snpEffEffectsString = record.INFO["EFF"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
118 effectPerGene = parseEffectsPerGene(snpEffEffectsString,
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
119 snpEffHeaderColumnLabels)
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
120 if len(effectPerGene)!=0:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
121 for gene in effectPerGene.keys():
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
122 aaChange = effectPerGene[gene]["Amino_Acid_change"]
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
123 if aaChange !="":
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
124 aaChange="p."+aaChange
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
125 print string.join([sampleName, record.CHROM, str(start),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
126 str(end),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
127 record.REF, record.ALT[0],
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
128 gene, effectPerGene[gene]['effect'],
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
129 "0.0", str(record.INFO["AF"][0]), aaChange], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
130 else:
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
131 print string.join([sampleName, record.CHROM, str(start), str(end),
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
132 record.REF, record.ALT[0], "", "",
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
133 "0.0", str(record.INFO["AF"][0]), ""], "\t")
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
134
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
135
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
136
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
137
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
138 if __name__ == '__main__':
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
139 main()
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
140
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
141
60efb9214eaa Uploaded
melissacline
parents:
diff changeset
142