Mercurial > repos > devteam > vcf_extract
comparison vcfClass.py @ 0:c6fb674dfda3 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:12 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6fb674dfda3 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import os.path | |
4 import sys | |
5 import re | |
6 | |
7 class vcf: | |
8 def __init__(self): | |
9 | |
10 # Header info. | |
11 self.filename = "" | |
12 self.hasHeader = True | |
13 self.headerText = "" | |
14 self.headerTitles = "" | |
15 self.vcfFormat = "" | |
16 #self.headerInfoText = "" | |
17 #self.headerFormatText = "" | |
18 | |
19 # Store the info and format tags as well as the lines that describe | |
20 # them in a dictionary. | |
21 self.numberDataSets = 0 | |
22 self.includedDataSets = {} | |
23 self.infoHeaderTags = {} | |
24 self.infoHeaderString = {} | |
25 self.formatHeaderTags = {} | |
26 self.formatHeaderString = {} | |
27 | |
28 # Genotype information. | |
29 self.genotypes = False | |
30 self.infoField = {} | |
31 | |
32 # Reference sequence information. | |
33 self.referenceSequences = {} | |
34 self.referenceSequenceList = [] | |
35 self.referenceSequence = "" | |
36 | |
37 # Record information. | |
38 self.position = -1 | |
39 self.samplesList = [] | |
40 | |
41 # Determine which fields to process. | |
42 self.processInfo = False | |
43 self.processGenotypes = False | |
44 self.dbsnpVcf = False | |
45 self.hapmapVcf = False | |
46 | |
47 # Open a vcf file. | |
48 def openVcf(self, filename): | |
49 if filename == "stdin": | |
50 self.filehandle = sys.stdin | |
51 self.filename = "stdin" | |
52 else: | |
53 try: self.filehandle = open(filename,"r") | |
54 except IOError: | |
55 print >> sys.stderr, "Failed to find file: ",filename | |
56 exit(1) | |
57 self.filename = os.path.abspath(filename) | |
58 | |
59 # Parse the vcf header. | |
60 def parseHeader(self, filename, writeOut): | |
61 while self.getHeaderLine(filename, writeOut): | |
62 continue | |
63 | |
64 # Determine the type of information in the header line. | |
65 def getHeaderLine(self, filename, writeOut): | |
66 self.headerLine = self.filehandle.readline().rstrip("\n") | |
67 if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat() | |
68 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info") | |
69 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format") | |
70 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut) | |
71 elif self.headerLine.startswith("##"): success = self.headerAdditional() | |
72 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut) | |
73 else: success = self.noHeader(filename, writeOut) | |
74 | |
75 return success | |
76 | |
77 # Read VCF format | |
78 def getvcfFormat(self): | |
79 try: | |
80 self.vcfFormat = self.headerLine.split("=",1)[1] | |
81 self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string | |
82 except IndexError: | |
83 print >> sys.stderr, "\nError parsing the fileformat" | |
84 print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine | |
85 exit(1) | |
86 return True | |
87 | |
88 | |
89 # Read information on an info field from the header line. | |
90 def headerInfo(self, writeOut, lineType): | |
91 tag = self.headerLine.split("=",1) | |
92 tagID = (tag[1].split("ID=",1))[1].split(",",1) | |
93 | |
94 # Check if this info field has already been defined. | |
95 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])): | |
96 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header." | |
97 exit(1) | |
98 | |
99 # Determine the number of entries, entry type and description. | |
100 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1) | |
101 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1) | |
102 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0] | |
103 except IndexError: tagDescription = "" | |
104 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0] | |
105 | |
106 # Check that the number of fields associated with the tag is either | |
107 # an integer or a '.' to indicate variable number of entries. | |
108 if tagNumber == ".": tagNumber = "variable" | |
109 else: | |
110 if self.vcfFormat<4.1: | |
111 | |
112 try: | |
113 tagNumber = int(tagNumber) | |
114 | |
115 except ValueError: | |
116 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID | |
117 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'" | |
118 exit(1) | |
119 | |
120 if lineType == "info": | |
121 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription | |
122 self.infoHeaderString[tagID] = self.headerLine | |
123 if lineType == "format": | |
124 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription | |
125 self.formatHeaderString[tagID] = self.headerLine | |
126 | |
127 return True | |
128 | |
129 # Check to see if the records contain information from multiple different | |
130 # sources. If vcfPytools has been used to find the intersection or union | |
131 # of two vcf files, the records may have been merged to keep all the | |
132 # information available. If this is the case, there will be a ##FILE line | |
133 # for each set of information in the file. The order of these files needs | |
134 # to be maintained. | |
135 def headerFiles(self, writeOut): | |
136 fileID = (self.headerLine.split("ID=",1))[1].split(",",1) | |
137 filename = fileID[1].split("\"",2)[1] | |
138 try: fileID = int(fileID[0]) | |
139 except ValueError: | |
140 print >> sys.stderr, "File ID in ##FILE entry must be an integer." | |
141 print >> sys.stderr, self.headerLine | |
142 exit(1) | |
143 if self.includedDataSets.has_key(fileID): | |
144 print >> sys.stderr, "\nERROR: file " + self.filename | |
145 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values." | |
146 exit(1) | |
147 self.includedDataSets[fileID] = filename | |
148 | |
149 # Set the number of files with information in this vcf file. | |
150 if fileID > self.numberDataSets: self.numberDataSets = fileID | |
151 | |
152 return True | |
153 | |
154 # Read additional information contained in the header. | |
155 def headerAdditional(self): | |
156 self.headerText += self.headerLine + "\n" | |
157 | |
158 return True | |
159 | |
160 # Read in the column titles to check that all standard fields | |
161 # are present and read in all the samples. | |
162 def headerTitleString(self, filename, writeOut): | |
163 self.headerTitles = self.headerLine + "\n" | |
164 | |
165 # Strip the end of line character from the last infoFields entry. | |
166 infoFields = self.headerLine.split("\t") | |
167 if len(infoFields) > 8: | |
168 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename | |
169 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename | |
170 self.samplesList = infoFields[9:] | |
171 self.genotypes = True | |
172 elif len(infoFields) == 8: | |
173 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available." | |
174 else: | |
175 print self.headerLine, len(infoFields) | |
176 print >> sys.stderr, "Not all vcf standard fields are available." | |
177 exit(1) | |
178 | |
179 return False | |
180 | |
181 # If there is no header in the vcf file, close and reopen the | |
182 # file so that the first line is avaiable for parsing as a | |
183 # vcf record. | |
184 def noHeader(self, filename, writeOut): | |
185 if writeOut: print >> sys.stdout, "No header lines present in", filename | |
186 self.hasHeader = False | |
187 self.closeVcf(filename) | |
188 self.openVcf(filename) | |
189 | |
190 return False | |
191 | |
192 # Check that info fields exist. | |
193 def checkInfoFields(self, tag): | |
194 if self.infoHeaderTags.has_key(tag) == False: | |
195 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header." | |
196 exit(1) | |
197 | |
198 # Get the next line of information from the vcf file. | |
199 def getRecord(self): | |
200 self.record = self.filehandle.readline() | |
201 if not self.record: return False | |
202 | |
203 # Set up and execute a regular expression match. | |
204 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$") | |
205 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$") | |
206 recordMatch = recordRe.match(self.record) | |
207 if recordMatch == None: | |
208 print >> sys.stderr, "Unable to resolve vcf record.\n" | |
209 print >> sys.stderr, self.record | |
210 exit(1) | |
211 | |
212 self.referenceSequence = recordMatch.group(1) | |
213 try: self.position = int(recordMatch.group(2)) | |
214 except ValueError: | |
215 text = "variant position is not an integer" | |
216 self.generalError(text, "", None) | |
217 self.rsid = recordMatch.group(3) | |
218 self.ref = recordMatch.group(4) | |
219 self.alt = recordMatch.group(5) | |
220 self.quality = recordMatch.group(6) | |
221 self.filters = recordMatch.group(7) | |
222 self.info = recordMatch.group(8) | |
223 self.genotypeString = recordMatch.group(9) | |
224 self.infoTags = {} | |
225 | |
226 # Check that the quality is an integer or a float. If not, set the quality | |
227 # to zero. | |
228 try: self.quality = float(self.quality) | |
229 except ValueError: self.quality = float(0.) | |
230 | |
231 # If recordMatch.group(9) is not the end of line character, there is | |
232 # genotype information with this record. | |
233 if self.genotypeString != "\n": self.hasGenotypes = True | |
234 else: self.hasGenotypes = False | |
235 | |
236 # Add the reference sequence to the dictionary. If it didn't previously | |
237 # exist append the reference sequence to the end of the list as well. | |
238 # This ensures that the order in which the reference sequences appeared | |
239 # in the header can be preserved. | |
240 if self.referenceSequence not in self.referenceSequences: | |
241 self.referenceSequences[self.referenceSequence] = True | |
242 self.referenceSequenceList.append(self.referenceSequence) | |
243 | |
244 # Check for multiple alternate alleles. | |
245 self.alternateAlleles = self.alt.split(",") | |
246 self.numberAlternateAlleles = len(self.alternateAlleles) | |
247 | |
248 # If required, process the info and genotypes. | |
249 if self.processInfo: self.processInfoFields() | |
250 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields() | |
251 | |
252 return True | |
253 | |
254 # Process the info string. | |
255 def processInfoFields(self): | |
256 | |
257 # First break the info string into its constituent elements. | |
258 infoEntries = self.info.split(";") | |
259 | |
260 # As long as some info fields exist, place them into a dictionary. | |
261 for entry in infoEntries: | |
262 infoEntry = entry.split("=") | |
263 | |
264 # If the entry is a flag, there will be no equals and the length of | |
265 # infoEntry will be 1. In this case, set the dictionary entry to the | |
266 # whole entry. If the vcf file has undergone a union or intersection | |
267 # operation and contains the information from multiple files, this may | |
268 # be a '/' seperate list of flags and so cannot be set to a Boolean value | |
269 # yet. | |
270 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0] | |
271 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1] | |
272 | |
273 # Process the genotype formats and values. | |
274 def processGenotypeFields(self): | |
275 genotypeEntries = self.genotypeString.split("\t") | |
276 self.genotypeFormatString = genotypeEntries[1] | |
277 self.genotypes = list(genotypeEntries[2:]) | |
278 self.genotypeFormats = {} | |
279 self.genotypeFields = {} | |
280 self.genotypeFormats = self.genotypeFormatString.split(":") | |
281 | |
282 # Check that the number of genotype fields is equal to the number of samples | |
283 if len(self.samplesList) != len(self.genotypes): | |
284 text = "The number of genotypes is different to the number of samples" | |
285 self.generalError(text, "", "") | |
286 | |
287 # Add the genotype information to a dictionary. | |
288 for i in range( len(self.samplesList) ): | |
289 genotypeInfo = self.genotypes[i].split(":") | |
290 self.genotypeFields[ self.samplesList[i] ] = {} | |
291 | |
292 # Check that there are as many fields as in the format field. If not, this must | |
293 # be because the information is not known. In this case, it is permitted that | |
294 # the genotype information is either . or ./. | |
295 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo): | |
296 self.genotypeFields[ self.samplesList[i] ] = "." | |
297 else: | |
298 if len(self.genotypeFormats) != len(genotypeInfo): | |
299 text = "The number of genotype fields is different to the number specified in the format string" | |
300 self.generalError(text, "sample", self.samplesList[i]) | |
301 | |
302 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j] | |
303 | |
304 # Parse through the vcf file until the correct reference sequence is | |
305 # encountered and the position is greater than or equal to that requested. | |
306 def parseVcf(self, referenceSequence, position, writeOut, outputFile): | |
307 success = True | |
308 if self.referenceSequence != referenceSequence: | |
309 while self.referenceSequence != referenceSequence and success: | |
310 if writeOut: outputFile.write(self.record) | |
311 success = self.getRecord() | |
312 | |
313 while self.referenceSequence == referenceSequence and self.position < position and success: | |
314 if writeOut: outputFile.write(self.record) | |
315 success = self.getRecord() | |
316 | |
317 return success | |
318 | |
319 # Get the information for a specific info tag. Also check that it contains | |
320 # the correct number and type of entries. | |
321 def getInfo(self, tag): | |
322 result = [] | |
323 | |
324 # Check if the tag exists in the header information. If so, | |
325 # determine the number and type of entries asscoiated with this | |
326 # tag. | |
327 if self.infoHeaderTags.has_key(tag): | |
328 infoNumber = self.infoHeaderTags[tag][0] | |
329 infoType = self.infoHeaderTags[tag][1] | |
330 numberValues = infoNumber | |
331 | |
332 # First check that the tag exists in the information string. Then split | |
333 # the entry on commas. For flag entries, do not perform the split. | |
334 if self.infoTags.has_key(tag): | |
335 if numberValues == 0 and infoType == "Flag": result = True | |
336 elif numberValues != 0 and infoType == "Flag": | |
337 print >> sys.stderr, "ERROR" | |
338 exit(1) | |
339 else: | |
340 fields = self.infoTags[tag].split(",") | |
341 if len(fields) != numberValues: | |
342 text = "Unexpected number of entries" | |
343 self.generalError(text, "information tag", tag) | |
344 | |
345 for i in range(infoNumber): | |
346 try: result.append(fields[i]) | |
347 except IndexError: | |
348 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0] | |
349 self.generalError(text, "tag:", tag) | |
350 else: numberValues = 0 | |
351 | |
352 else: | |
353 text = "information field does not have a definition in the header" | |
354 self.generalError(text, "tag", tag) | |
355 | |
356 return numberValues, infoType, result | |
357 | |
358 # Get the genotype information. | |
359 def getGenotypeInfo(self, sample, tag): | |
360 result = [] | |
361 if self.formatHeaderTags.has_key(tag): | |
362 infoNumber = self.formatHeaderTags[tag][0] | |
363 infoType = self.formatHeaderTags[tag][1] | |
364 numberValues = infoNumber | |
365 | |
366 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1: | |
367 numberValues = 0 | |
368 result = "." | |
369 else: | |
370 if self.genotypeFields[sample].has_key(tag): | |
371 if tag == "GT": | |
372 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1: | |
373 text = "Unexected number of characters in genotype (GT) field" | |
374 self.generalError(text, "sample", sample) | |
375 | |
376 # If a diploid call, check whether or not the genotype is phased. | |
377 elif len(self.genotypeFields[sample][tag]) == 3: | |
378 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False | |
379 result.append( self.genotypeFields[sample][tag][0] ) | |
380 result.append( self.genotypeFields[sample][tag][2] ) | |
381 elif len(self.genotypeFields[sample][tag]) == 3: | |
382 result.append( self.genotypeFields[sample][tag][0] ) | |
383 else: | |
384 fields = self.genotypeFields[sample][tag].split(",") | |
385 if len(fields) != numberValues: | |
386 text = "Unexpected number of characters in " + tag + " field" | |
387 self.generalError(text, "sample", sample) | |
388 | |
389 for i in range(infoNumber): result.append(fields[i]) | |
390 else: | |
391 text = "genotype field does not have a definition in the header" | |
392 self.generalError(text, "tag", tag) | |
393 | |
394 return numberValues, result | |
395 | |
396 # Parse the dbsnp entry. If the entry conforms to the required variant type, | |
397 # return the dbsnp rsid value, otherwise ".". | |
398 def getDbsnpInfo(self): | |
399 | |
400 # First check that the variant class (VC) is listed as SNP. | |
401 vc = self.info.split("VC=",1) | |
402 if vc[1].find(";") != -1: snp = vc[1].split(";",1) | |
403 else: | |
404 snp = [] | |
405 snp.append(vc[1]) | |
406 | |
407 if snp[0].lower() == "snp": rsid = self.rsid | |
408 else: rsid = "." | |
409 | |
410 return rsid | |
411 | |
412 # Build a new vcf record. | |
413 def buildRecord(self, removeGenotypes): | |
414 record = self.referenceSequence + "\t" + \ | |
415 str(self.position) + "\t" + \ | |
416 self.rsid + "\t" + \ | |
417 self.ref + "\t" + \ | |
418 self.alt + "\t" + \ | |
419 str(self.quality) + "\t" + \ | |
420 self.filters + "\t" + \ | |
421 self.info | |
422 | |
423 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString | |
424 | |
425 record += "\n" | |
426 | |
427 return record | |
428 | |
429 # Close the vcf file. | |
430 def closeVcf(self, filename): | |
431 self.filehandle.close() | |
432 | |
433 # Define error messages for different handled errors. | |
434 def generalError(self, text, field, fieldValue): | |
435 print >> sys.stderr, "\nError encountered when attempting to read:" | |
436 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence | |
437 print >> sys.stderr, "\tposition :\t\t", self.position | |
438 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue | |
439 print >> sys.stderr, "\n", text | |
440 exit(1) |