comparison vcfClass.py @ 0:4502baf7ac42 draft default tip

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:32:12 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4502baf7ac42
1 #!/usr/bin/python
2
3 import os.path
4 import sys
5 import re
6
7 class vcf:
8 def __init__(self):
9
10 # Header info.
11 self.filename = ""
12 self.hasHeader = True
13 self.headerText = ""
14 self.headerTitles = ""
15 self.vcfFormat = ""
16 #self.headerInfoText = ""
17 #self.headerFormatText = ""
18
19 # Store the info and format tags as well as the lines that describe
20 # them in a dictionary.
21 self.numberDataSets = 0
22 self.includedDataSets = {}
23 self.infoHeaderTags = {}
24 self.infoHeaderString = {}
25 self.formatHeaderTags = {}
26 self.formatHeaderString = {}
27
28 # Genotype information.
29 self.genotypes = False
30 self.infoField = {}
31
32 # Reference sequence information.
33 self.referenceSequences = {}
34 self.referenceSequenceList = []
35 self.referenceSequence = ""
36
37 # Record information.
38 self.position = -1
39 self.samplesList = []
40
41 # Determine which fields to process.
42 self.processInfo = False
43 self.processGenotypes = False
44 self.dbsnpVcf = False
45 self.hapmapVcf = False
46
47 # Open a vcf file.
48 def openVcf(self, filename):
49 if filename == "stdin":
50 self.filehandle = sys.stdin
51 self.filename = "stdin"
52 else:
53 try: self.filehandle = open(filename,"r")
54 except IOError:
55 print >> sys.stderr, "Failed to find file: ",filename
56 exit(1)
57 self.filename = os.path.abspath(filename)
58
59 # Parse the vcf header.
60 def parseHeader(self, filename, writeOut):
61 while self.getHeaderLine(filename, writeOut):
62 continue
63
64 # Determine the type of information in the header line.
65 def getHeaderLine(self, filename, writeOut):
66 self.headerLine = self.filehandle.readline().rstrip("\n")
67 if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()
68 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
69 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
70 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
71 elif self.headerLine.startswith("##"): success = self.headerAdditional()
72 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
73 else: success = self.noHeader(filename, writeOut)
74
75 return success
76
77 # Read VCF format
78 def getvcfFormat(self):
79 try:
80 self.vcfFormat = self.headerLine.split("=",1)[1]
81 self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string
82 except IndexError:
83 print >> sys.stderr, "\nError parsing the fileformat"
84 print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine
85 exit(1)
86 return True
87
88
89 # Read information on an info field from the header line.
90 def headerInfo(self, writeOut, lineType):
91 tag = self.headerLine.split("=",1)
92 tagID = (tag[1].split("ID=",1))[1].split(",",1)
93
94 # Check if this info field has already been defined.
95 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
96 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
97 exit(1)
98
99 # Determine the number of entries, entry type and description.
100 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
101 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
102 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
103 except IndexError: tagDescription = ""
104 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
105
106 # Check that the number of fields associated with the tag is either
107 # an integer or a '.' to indicate variable number of entries.
108 if tagNumber == ".": tagNumber = "variable"
109 else:
110 if self.vcfFormat<4.1:
111
112 try:
113 tagNumber = int(tagNumber)
114
115 except ValueError:
116 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID
117 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
118 exit(1)
119
120 if lineType == "info":
121 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
122 self.infoHeaderString[tagID] = self.headerLine
123 if lineType == "format":
124 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
125 self.formatHeaderString[tagID] = self.headerLine
126
127 return True
128
129 # Check to see if the records contain information from multiple different
130 # sources. If vcfPytools has been used to find the intersection or union
131 # of two vcf files, the records may have been merged to keep all the
132 # information available. If this is the case, there will be a ##FILE line
133 # for each set of information in the file. The order of these files needs
134 # to be maintained.
135 def headerFiles(self, writeOut):
136 fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
137 filename = fileID[1].split("\"",2)[1]
138 try: fileID = int(fileID[0])
139 except ValueError:
140 print >> sys.stderr, "File ID in ##FILE entry must be an integer."
141 print >> sys.stderr, self.headerLine
142 exit(1)
143 if self.includedDataSets.has_key(fileID):
144 print >> sys.stderr, "\nERROR: file " + self.filename
145 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
146 exit(1)
147 self.includedDataSets[fileID] = filename
148
149 # Set the number of files with information in this vcf file.
150 if fileID > self.numberDataSets: self.numberDataSets = fileID
151
152 return True
153
154 # Read additional information contained in the header.
155 def headerAdditional(self):
156 self.headerText += self.headerLine + "\n"
157
158 return True
159
160 # Read in the column titles to check that all standard fields
161 # are present and read in all the samples.
162 def headerTitleString(self, filename, writeOut):
163 self.headerTitles = self.headerLine + "\n"
164
165 # Strip the end of line character from the last infoFields entry.
166 infoFields = self.headerLine.split("\t")
167 if len(infoFields) > 8:
168 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
169 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
170 self.samplesList = infoFields[9:]
171 self.genotypes = True
172 elif len(infoFields) == 8:
173 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available."
174 else:
175 print self.headerLine, len(infoFields)
176 print >> sys.stderr, "Not all vcf standard fields are available."
177 exit(1)
178
179 return False
180
181 # If there is no header in the vcf file, close and reopen the
182 # file so that the first line is avaiable for parsing as a
183 # vcf record.
184 def noHeader(self, filename, writeOut):
185 if writeOut: print >> sys.stdout, "No header lines present in", filename
186 self.hasHeader = False
187 self.closeVcf(filename)
188 self.openVcf(filename)
189
190 return False
191
192 # Check that info fields exist.
193 def checkInfoFields(self, tag):
194 if self.infoHeaderTags.has_key(tag) == False:
195 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
196 exit(1)
197
198 # Get the next line of information from the vcf file.
199 def getRecord(self):
200 self.record = self.filehandle.readline()
201 if not self.record: return False
202
203 # Set up and execute a regular expression match.
204 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
205 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
206 recordMatch = recordRe.match(self.record)
207 if recordMatch == None:
208 print >> sys.stderr, "Unable to resolve vcf record.\n"
209 print >> sys.stderr, self.record
210 exit(1)
211
212 self.referenceSequence = recordMatch.group(1)
213 try: self.position = int(recordMatch.group(2))
214 except ValueError:
215 text = "variant position is not an integer"
216 self.generalError(text, "", None)
217 self.rsid = recordMatch.group(3)
218 self.ref = recordMatch.group(4)
219 self.alt = recordMatch.group(5)
220 self.quality = recordMatch.group(6)
221 self.filters = recordMatch.group(7)
222 self.info = recordMatch.group(8)
223 self.genotypeString = recordMatch.group(9)
224 self.infoTags = {}
225
226 # Check that the quality is an integer or a float. If not, set the quality
227 # to zero.
228 try: self.quality = float(self.quality)
229 except ValueError: self.quality = float(0.)
230
231 # If recordMatch.group(9) is not the end of line character, there is
232 # genotype information with this record.
233 if self.genotypeString != "\n": self.hasGenotypes = True
234 else: self.hasGenotypes = False
235
236 # Add the reference sequence to the dictionary. If it didn't previously
237 # exist append the reference sequence to the end of the list as well.
238 # This ensures that the order in which the reference sequences appeared
239 # in the header can be preserved.
240 if self.referenceSequence not in self.referenceSequences:
241 self.referenceSequences[self.referenceSequence] = True
242 self.referenceSequenceList.append(self.referenceSequence)
243
244 # Check for multiple alternate alleles.
245 self.alternateAlleles = self.alt.split(",")
246 self.numberAlternateAlleles = len(self.alternateAlleles)
247
248 # If required, process the info and genotypes.
249 if self.processInfo: self.processInfoFields()
250 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
251
252 return True
253
254 # Process the info string.
255 def processInfoFields(self):
256
257 # First break the info string into its constituent elements.
258 infoEntries = self.info.split(";")
259
260 # As long as some info fields exist, place them into a dictionary.
261 for entry in infoEntries:
262 infoEntry = entry.split("=")
263
264 # If the entry is a flag, there will be no equals and the length of
265 # infoEntry will be 1. In this case, set the dictionary entry to the
266 # whole entry. If the vcf file has undergone a union or intersection
267 # operation and contains the information from multiple files, this may
268 # be a '/' seperate list of flags and so cannot be set to a Boolean value
269 # yet.
270 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
271 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
272
273 # Process the genotype formats and values.
274 def processGenotypeFields(self):
275 genotypeEntries = self.genotypeString.split("\t")
276 self.genotypeFormatString = genotypeEntries[1]
277 self.genotypes = list(genotypeEntries[2:])
278 self.genotypeFormats = {}
279 self.genotypeFields = {}
280 self.genotypeFormats = self.genotypeFormatString.split(":")
281
282 # Check that the number of genotype fields is equal to the number of samples
283 if len(self.samplesList) != len(self.genotypes):
284 text = "The number of genotypes is different to the number of samples"
285 self.generalError(text, "", "")
286
287 # Add the genotype information to a dictionary.
288 for i in range( len(self.samplesList) ):
289 genotypeInfo = self.genotypes[i].split(":")
290 self.genotypeFields[ self.samplesList[i] ] = {}
291
292 # Check that there are as many fields as in the format field. If not, this must
293 # be because the information is not known. In this case, it is permitted that
294 # the genotype information is either . or ./.
295 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
296 self.genotypeFields[ self.samplesList[i] ] = "."
297 else:
298 if len(self.genotypeFormats) != len(genotypeInfo):
299 text = "The number of genotype fields is different to the number specified in the format string"
300 self.generalError(text, "sample", self.samplesList[i])
301
302 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
303
304 # Parse through the vcf file until the correct reference sequence is
305 # encountered and the position is greater than or equal to that requested.
306 def parseVcf(self, referenceSequence, position, writeOut, outputFile):
307 success = True
308 if self.referenceSequence != referenceSequence:
309 while self.referenceSequence != referenceSequence and success:
310 if writeOut: outputFile.write(self.record)
311 success = self.getRecord()
312
313 while self.referenceSequence == referenceSequence and self.position < position and success:
314 if writeOut: outputFile.write(self.record)
315 success = self.getRecord()
316
317 return success
318
319 # Get the information for a specific info tag. Also check that it contains
320 # the correct number and type of entries.
321 def getInfo(self, tag):
322 result = []
323
324 # Check if the tag exists in the header information. If so,
325 # determine the number and type of entries asscoiated with this
326 # tag.
327 if self.infoHeaderTags.has_key(tag):
328 infoNumber = self.infoHeaderTags[tag][0]
329 infoType = self.infoHeaderTags[tag][1]
330 numberValues = infoNumber
331
332 # First check that the tag exists in the information string. Then split
333 # the entry on commas. For flag entries, do not perform the split.
334 if self.infoTags.has_key(tag):
335 if numberValues == 0 and infoType == "Flag": result = True
336 elif numberValues != 0 and infoType == "Flag":
337 print >> sys.stderr, "ERROR"
338 exit(1)
339 else:
340 fields = self.infoTags[tag].split(",")
341 if len(fields) != numberValues:
342 text = "Unexpected number of entries"
343 self.generalError(text, "information tag", tag)
344
345 for i in range(infoNumber):
346 try: result.append(fields[i])
347 except IndexError:
348 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
349 self.generalError(text, "tag:", tag)
350 else: numberValues = 0
351
352 else:
353 text = "information field does not have a definition in the header"
354 self.generalError(text, "tag", tag)
355
356 return numberValues, infoType, result
357
358 # Get the genotype information.
359 def getGenotypeInfo(self, sample, tag):
360 result = []
361 if self.formatHeaderTags.has_key(tag):
362 infoNumber = self.formatHeaderTags[tag][0]
363 infoType = self.formatHeaderTags[tag][1]
364 numberValues = infoNumber
365
366 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
367 numberValues = 0
368 result = "."
369 else:
370 if self.genotypeFields[sample].has_key(tag):
371 if tag == "GT":
372 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
373 text = "Unexected number of characters in genotype (GT) field"
374 self.generalError(text, "sample", sample)
375
376 # If a diploid call, check whether or not the genotype is phased.
377 elif len(self.genotypeFields[sample][tag]) == 3:
378 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
379 result.append( self.genotypeFields[sample][tag][0] )
380 result.append( self.genotypeFields[sample][tag][2] )
381 elif len(self.genotypeFields[sample][tag]) == 3:
382 result.append( self.genotypeFields[sample][tag][0] )
383 else:
384 fields = self.genotypeFields[sample][tag].split(",")
385 if len(fields) != numberValues:
386 text = "Unexpected number of characters in " + tag + " field"
387 self.generalError(text, "sample", sample)
388
389 for i in range(infoNumber): result.append(fields[i])
390 else:
391 text = "genotype field does not have a definition in the header"
392 self.generalError(text, "tag", tag)
393
394 return numberValues, result
395
396 # Parse the dbsnp entry. If the entry conforms to the required variant type,
397 # return the dbsnp rsid value, otherwise ".".
398 def getDbsnpInfo(self):
399
400 # First check that the variant class (VC) is listed as SNP.
401 vc = self.info.split("VC=",1)
402 if vc[1].find(";") != -1: snp = vc[1].split(";",1)
403 else:
404 snp = []
405 snp.append(vc[1])
406
407 if snp[0].lower() == "snp": rsid = self.rsid
408 else: rsid = "."
409
410 return rsid
411
412 # Build a new vcf record.
413 def buildRecord(self, removeGenotypes):
414 record = self.referenceSequence + "\t" + \
415 str(self.position) + "\t" + \
416 self.rsid + "\t" + \
417 self.ref + "\t" + \
418 self.alt + "\t" + \
419 str(self.quality) + "\t" + \
420 self.filters + "\t" + \
421 self.info
422
423 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
424
425 record += "\n"
426
427 return record
428
429 # Close the vcf file.
430 def closeVcf(self, filename):
431 self.filehandle.close()
432
433 # Define error messages for different handled errors.
434 def generalError(self, text, field, fieldValue):
435 print >> sys.stderr, "\nError encountered when attempting to read:"
436 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
437 print >> sys.stderr, "\tposition :\t\t", self.position
438 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
439 print >> sys.stderr, "\n", text
440 exit(1)