Mercurial > repos > devteam > vcf_annotate
comparison annotate.py @ 0:3eca7b2537aa draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:47 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3eca7b2537aa |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import os.path | |
4 import sys | |
5 import optparse | |
6 | |
7 import vcfClass | |
8 from vcfClass import * | |
9 | |
10 import tools | |
11 from tools import * | |
12 | |
13 if __name__ == "__main__": | |
14 main() | |
15 | |
16 # Check that the reference and alternate in the dbsnp vcf file match those | |
17 # from the input vcf file. | |
18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation): | |
19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \ | |
20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt | |
21 | |
22 allelesAgree = True | |
23 if vcfRef.lower() != dbsnpRef.lower(): | |
24 if vcfRef.lower() != dbsnpAlt.lower(): | |
25 #print >> sys.stderr, text | |
26 allelesAgree = False | |
27 else: | |
28 if vcfAlt.lower() != dbsnpAlt.lower(): | |
29 #print >> sys.stderr, text | |
30 allelesAgree = False | |
31 | |
32 return allelesAgree | |
33 | |
34 # Intersect two vcf files. It is assumed that the two files are | |
35 # sorted by genomic coordinates and the reference sequences are | |
36 # in the same order. | |
37 def annotateVcf(v, d, outputFile, annotation): | |
38 success1 = v.getRecord() | |
39 success2 = d.getRecord() | |
40 currentReferenceSequence = v.referenceSequence | |
41 | |
42 # Finish when the end of the first file has been reached. | |
43 while success1: | |
44 | |
45 # If the end of the dbsnp vcf file is reached, write out the | |
46 # remaining records from the vcf file. | |
47 if not success2: | |
48 outputFile.write(v.record) | |
49 success1 = v.getRecord() | |
50 | |
51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence: | |
52 if v.position == d.position: | |
53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation) | |
54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo() | |
55 elif annotation == "hapmap": | |
56 if allelesAgree: v.info += ";HM3" | |
57 else: v.info += ";HM3A" | |
58 record = v.buildRecord(False) | |
59 outputFile.write(record) | |
60 | |
61 success1 = v.getRecord() | |
62 success2 = d.getRecord() | |
63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | |
64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | |
65 else: | |
66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) | |
67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) | |
68 | |
69 # If the last record for a reference sequence is the same for both vcf | |
70 # files, they will both have referenceSequences different from the | |
71 # current reference sequence. Change the reference sequence to reflect | |
72 # this and proceed. | |
73 else: | |
74 if v.referenceSequence != d.referenceSequence: | |
75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." | |
76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:" | |
77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence | |
78 exit(1) | |
79 currentReferenceSequence = v.referenceSequence | |
80 | |
81 def main(): | |
82 | |
83 # Parse the command line options | |
84 usage = "Usage: vcfPytools.py annotate [options]" | |
85 parser = optparse.OptionParser(usage = usage) | |
86 parser.add_option("-i", "--in", | |
87 action="store", type="string", | |
88 dest="vcfFile", help="input vcf files") | |
89 parser.add_option("-d", "--dbsnp", | |
90 action="store", type="string", | |
91 dest="dbsnpFile", help="input dbsnp vcf file") | |
92 parser.add_option("-m", "--hapmap", | |
93 action="store", type="string", | |
94 dest="hapmapFile", help="input hapmap vcf file") | |
95 parser.add_option("-o", "--out", | |
96 action="store", type="string", | |
97 dest="output", help="output vcf file") | |
98 | |
99 (options, args) = parser.parse_args() | |
100 | |
101 # Check that a single vcf file is given. | |
102 if options.vcfFile == None: | |
103 parser.print_help() | |
104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation." | |
105 exit(1) | |
106 | |
107 # Check that either a hapmap or a dbsnp vcf file is included. | |
108 if options.dbsnpFile == None and options.hapmapFile == None: | |
109 parser.print_help() | |
110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)." | |
111 exit(1) | |
112 elif options.dbsnpFile != None and options.hapmapFile != None: | |
113 parser.print_help() | |
114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)." | |
115 exit(1) | |
116 | |
117 # Set the output file to stdout if no output file was specified. | |
118 outputFile, writeOut = setOutput(options.output) # tools.py | |
119 | |
120 v = vcf() # Define vcf object. | |
121 d = vcf() # Define dbsnp/hapmap vcf object. | |
122 if options.dbsnpFile: | |
123 d.dbsnpVcf = True | |
124 annotationFile = options.dbsnpFile | |
125 annotation = "dbsnp" | |
126 elif options.hapmapFile: | |
127 d.hapmapVcf = True | |
128 annotationFile = options.hapmapFile | |
129 annotation = "hapmap" | |
130 | |
131 # Open the vcf files. | |
132 v.openVcf(options.vcfFile) | |
133 d.openVcf(annotationFile) | |
134 | |
135 # Read in the header information. | |
136 v.parseHeader(options.vcfFile, writeOut) | |
137 d.parseHeader(annotationFile, writeOut) | |
138 | |
139 # Add an extra line to the vcf header to indicate the file used for | |
140 # performing dbsnp annotation. | |
141 taskDescriptor = "##vcfPytools=annotated vcf file with " | |
142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile | |
143 elif options.hapmapFile: | |
144 taskDescriptor += "hapmap file " + options.hapmapFile | |
145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \ | |
146 options.hapmapFile + "\">" | |
147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \ | |
148 ", determined from file " + options.hapmapFile + "\">" | |
149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py | |
150 | |
151 # Annotate the vcf file. | |
152 annotateVcf(v, d, outputFile, annotation) | |
153 | |
154 # Check that the input files had the same list of reference sequences. | |
155 # If not, it is possible that there were some problems. | |
156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py | |
157 | |
158 # Close the vcf files. | |
159 v.closeVcf(options.vcfFile) | |
160 d.closeVcf(annotationFile) | |
161 | |
162 # End the program. | |
163 return 0 |