annotate annotate.py @ 0:3eca7b2537aa draft default tip

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:47 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/python
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
2
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
3 import os.path
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
4 import sys
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
5 import optparse
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
6
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
7 import vcfClass
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
8 from vcfClass import *
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
9
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
10 import tools
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
11 from tools import *
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
12
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
13 if __name__ == "__main__":
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
14 main()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
15
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
16 # Check that the reference and alternate in the dbsnp vcf file match those
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
17 # from the input vcf file.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation):
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
21
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
22 allelesAgree = True
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
23 if vcfRef.lower() != dbsnpRef.lower():
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
24 if vcfRef.lower() != dbsnpAlt.lower():
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
25 #print >> sys.stderr, text
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
26 allelesAgree = False
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
27 else:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
28 if vcfAlt.lower() != dbsnpAlt.lower():
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
29 #print >> sys.stderr, text
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
30 allelesAgree = False
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
31
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
32 return allelesAgree
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
33
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
34 # Intersect two vcf files. It is assumed that the two files are
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
35 # sorted by genomic coordinates and the reference sequences are
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
36 # in the same order.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
37 def annotateVcf(v, d, outputFile, annotation):
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
38 success1 = v.getRecord()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
39 success2 = d.getRecord()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
40 currentReferenceSequence = v.referenceSequence
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
41
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
42 # Finish when the end of the first file has been reached.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
43 while success1:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
44
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
45 # If the end of the dbsnp vcf file is reached, write out the
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
46 # remaining records from the vcf file.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
47 if not success2:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
48 outputFile.write(v.record)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
49 success1 = v.getRecord()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
50
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
52 if v.position == d.position:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
55 elif annotation == "hapmap":
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
56 if allelesAgree: v.info += ";HM3"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
57 else: v.info += ";HM3A"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
58 record = v.buildRecord(False)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
59 outputFile.write(record)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
60
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
61 success1 = v.getRecord()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
62 success2 = d.getRecord()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
65 else:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
68
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
69 # If the last record for a reference sequence is the same for both vcf
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
70 # files, they will both have referenceSequences different from the
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
71 # current reference sequence. Change the reference sequence to reflect
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
72 # this and proceed.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
73 else:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
74 if v.referenceSequence != d.referenceSequence:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
78 exit(1)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
79 currentReferenceSequence = v.referenceSequence
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
80
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
81 def main():
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
82
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
83 # Parse the command line options
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
84 usage = "Usage: vcfPytools.py annotate [options]"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
85 parser = optparse.OptionParser(usage = usage)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
86 parser.add_option("-i", "--in",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
87 action="store", type="string",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
88 dest="vcfFile", help="input vcf files")
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
89 parser.add_option("-d", "--dbsnp",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
90 action="store", type="string",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
91 dest="dbsnpFile", help="input dbsnp vcf file")
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
92 parser.add_option("-m", "--hapmap",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
93 action="store", type="string",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
94 dest="hapmapFile", help="input hapmap vcf file")
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
95 parser.add_option("-o", "--out",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
96 action="store", type="string",
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
97 dest="output", help="output vcf file")
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
98
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
99 (options, args) = parser.parse_args()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
100
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
101 # Check that a single vcf file is given.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
102 if options.vcfFile == None:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
103 parser.print_help()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation."
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
105 exit(1)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
106
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
107 # Check that either a hapmap or a dbsnp vcf file is included.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
108 if options.dbsnpFile == None and options.hapmapFile == None:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
109 parser.print_help()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)."
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
111 exit(1)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
112 elif options.dbsnpFile != None and options.hapmapFile != None:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
113 parser.print_help()
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)."
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
115 exit(1)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
116
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
117 # Set the output file to stdout if no output file was specified.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
118 outputFile, writeOut = setOutput(options.output) # tools.py
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
119
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
120 v = vcf() # Define vcf object.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
121 d = vcf() # Define dbsnp/hapmap vcf object.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
122 if options.dbsnpFile:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
123 d.dbsnpVcf = True
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
124 annotationFile = options.dbsnpFile
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
125 annotation = "dbsnp"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
126 elif options.hapmapFile:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
127 d.hapmapVcf = True
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
128 annotationFile = options.hapmapFile
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
129 annotation = "hapmap"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
130
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
131 # Open the vcf files.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
132 v.openVcf(options.vcfFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
133 d.openVcf(annotationFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
134
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
135 # Read in the header information.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
136 v.parseHeader(options.vcfFile, writeOut)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
137 d.parseHeader(annotationFile, writeOut)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
138
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
139 # Add an extra line to the vcf header to indicate the file used for
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
140 # performing dbsnp annotation.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
141 taskDescriptor = "##vcfPytools=annotated vcf file with "
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
143 elif options.hapmapFile:
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
144 taskDescriptor += "hapmap file " + options.hapmapFile
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
146 options.hapmapFile + "\">"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
148 ", determined from file " + options.hapmapFile + "\">"
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
150
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
151 # Annotate the vcf file.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
152 annotateVcf(v, d, outputFile, annotation)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
153
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
154 # Check that the input files had the same list of reference sequences.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
155 # If not, it is possible that there were some problems.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
157
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
158 # Close the vcf files.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
159 v.closeVcf(options.vcfFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
160 d.closeVcf(annotationFile)
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
161
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
162 # End the program.
3eca7b2537aa Imported from capsule None
devteam
parents:
diff changeset
163 return 0