0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os.path
|
|
4 import sys
|
|
5 import re
|
|
6
|
|
7 class vcf:
|
|
8 def __init__(self):
|
|
9
|
|
10 # Header info.
|
|
11 self.filename = ""
|
|
12 self.hasHeader = True
|
|
13 self.headerText = ""
|
|
14 self.headerTitles = ""
|
|
15 self.vcfFormat = ""
|
|
16 #self.headerInfoText = ""
|
|
17 #self.headerFormatText = ""
|
|
18
|
|
19 # Store the info and format tags as well as the lines that describe
|
|
20 # them in a dictionary.
|
|
21 self.numberDataSets = 0
|
|
22 self.includedDataSets = {}
|
|
23 self.infoHeaderTags = {}
|
|
24 self.infoHeaderString = {}
|
|
25 self.formatHeaderTags = {}
|
|
26 self.formatHeaderString = {}
|
|
27
|
|
28 # Genotype information.
|
|
29 self.genotypes = False
|
|
30 self.infoField = {}
|
|
31
|
|
32 # Reference sequence information.
|
|
33 self.referenceSequences = {}
|
|
34 self.referenceSequenceList = []
|
|
35 self.referenceSequence = ""
|
|
36
|
|
37 # Record information.
|
|
38 self.position = -1
|
|
39 self.samplesList = []
|
|
40
|
|
41 # Determine which fields to process.
|
|
42 self.processInfo = False
|
|
43 self.processGenotypes = False
|
|
44 self.dbsnpVcf = False
|
|
45 self.hapmapVcf = False
|
|
46
|
|
47 # Open a vcf file.
|
|
48 def openVcf(self, filename):
|
|
49 if filename == "stdin":
|
|
50 self.filehandle = sys.stdin
|
|
51 self.filename = "stdin"
|
|
52 else:
|
|
53 try: self.filehandle = open(filename,"r")
|
|
54 except IOError:
|
|
55 print >> sys.stderr, "Failed to find file: ",filename
|
|
56 exit(1)
|
|
57 self.filename = os.path.abspath(filename)
|
|
58
|
|
59 # Parse the vcf header.
|
|
60 def parseHeader(self, filename, writeOut):
|
|
61 while self.getHeaderLine(filename, writeOut):
|
|
62 continue
|
|
63
|
|
64 # Determine the type of information in the header line.
|
|
65 def getHeaderLine(self, filename, writeOut):
|
|
66 self.headerLine = self.filehandle.readline().rstrip("\n")
|
|
67 if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()
|
|
68 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
|
|
69 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
|
|
70 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
|
|
71 elif self.headerLine.startswith("##"): success = self.headerAdditional()
|
|
72 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
|
|
73 else: success = self.noHeader(filename, writeOut)
|
|
74
|
|
75 return success
|
|
76
|
|
77 # Read VCF format
|
|
78 def getvcfFormat(self):
|
|
79 try:
|
|
80 self.vcfFormat = self.headerLine.split("=",1)[1]
|
|
81 self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string
|
|
82 except IndexError:
|
|
83 print >> sys.stderr, "\nError parsing the fileformat"
|
|
84 print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine
|
|
85 exit(1)
|
|
86 return True
|
|
87
|
|
88
|
|
89 # Read information on an info field from the header line.
|
|
90 def headerInfo(self, writeOut, lineType):
|
|
91 tag = self.headerLine.split("=",1)
|
|
92 tagID = (tag[1].split("ID=",1))[1].split(",",1)
|
|
93
|
|
94 # Check if this info field has already been defined.
|
|
95 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
|
|
96 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
|
|
97 exit(1)
|
|
98
|
|
99 # Determine the number of entries, entry type and description.
|
|
100 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
|
|
101 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
|
|
102 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
|
|
103 except IndexError: tagDescription = ""
|
|
104 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
|
|
105
|
|
106 # Check that the number of fields associated with the tag is either
|
|
107 # an integer or a '.' to indicate variable number of entries.
|
|
108 if tagNumber == ".": tagNumber = "variable"
|
|
109 else:
|
|
110 if self.vcfFormat<4.1:
|
|
111
|
|
112 try:
|
|
113 tagNumber = int(tagNumber)
|
|
114
|
|
115 except ValueError:
|
|
116 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID
|
|
117 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
|
|
118 exit(1)
|
|
119
|
|
120 if lineType == "info":
|
|
121 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
|
|
122 self.infoHeaderString[tagID] = self.headerLine
|
|
123 if lineType == "format":
|
|
124 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
|
|
125 self.formatHeaderString[tagID] = self.headerLine
|
|
126
|
|
127 return True
|
|
128
|
|
129 # Check to see if the records contain information from multiple different
|
|
130 # sources. If vcfPytools has been used to find the intersection or union
|
|
131 # of two vcf files, the records may have been merged to keep all the
|
|
132 # information available. If this is the case, there will be a ##FILE line
|
|
133 # for each set of information in the file. The order of these files needs
|
|
134 # to be maintained.
|
|
135 def headerFiles(self, writeOut):
|
|
136 fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
|
|
137 filename = fileID[1].split("\"",2)[1]
|
|
138 try: fileID = int(fileID[0])
|
|
139 except ValueError:
|
|
140 print >> sys.stderr, "File ID in ##FILE entry must be an integer."
|
|
141 print >> sys.stderr, self.headerLine
|
|
142 exit(1)
|
|
143 if self.includedDataSets.has_key(fileID):
|
|
144 print >> sys.stderr, "\nERROR: file " + self.filename
|
|
145 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
|
|
146 exit(1)
|
|
147 self.includedDataSets[fileID] = filename
|
|
148
|
|
149 # Set the number of files with information in this vcf file.
|
|
150 if fileID > self.numberDataSets: self.numberDataSets = fileID
|
|
151
|
|
152 return True
|
|
153
|
|
154 # Read additional information contained in the header.
|
|
155 def headerAdditional(self):
|
|
156 self.headerText += self.headerLine + "\n"
|
|
157
|
|
158 return True
|
|
159
|
|
160 # Read in the column titles to check that all standard fields
|
|
161 # are present and read in all the samples.
|
|
162 def headerTitleString(self, filename, writeOut):
|
|
163 self.headerTitles = self.headerLine + "\n"
|
|
164
|
|
165 # Strip the end of line character from the last infoFields entry.
|
|
166 infoFields = self.headerLine.split("\t")
|
|
167 if len(infoFields) > 8:
|
|
168 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
|
|
169 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
|
|
170 self.samplesList = infoFields[9:]
|
|
171 self.genotypes = True
|
|
172 elif len(infoFields) == 8:
|
|
173 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available."
|
|
174 else:
|
|
175 print self.headerLine, len(infoFields)
|
|
176 print >> sys.stderr, "Not all vcf standard fields are available."
|
|
177 exit(1)
|
|
178
|
|
179 return False
|
|
180
|
|
181 # If there is no header in the vcf file, close and reopen the
|
|
182 # file so that the first line is avaiable for parsing as a
|
|
183 # vcf record.
|
|
184 def noHeader(self, filename, writeOut):
|
|
185 if writeOut: print >> sys.stdout, "No header lines present in", filename
|
|
186 self.hasHeader = False
|
|
187 self.closeVcf(filename)
|
|
188 self.openVcf(filename)
|
|
189
|
|
190 return False
|
|
191
|
|
192 # Check that info fields exist.
|
|
193 def checkInfoFields(self, tag):
|
|
194 if self.infoHeaderTags.has_key(tag) == False:
|
|
195 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
|
|
196 exit(1)
|
|
197
|
|
198 # Get the next line of information from the vcf file.
|
|
199 def getRecord(self):
|
|
200 self.record = self.filehandle.readline()
|
|
201 if not self.record: return False
|
|
202
|
|
203 # Set up and execute a regular expression match.
|
|
204 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
|
|
205 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
|
|
206 recordMatch = recordRe.match(self.record)
|
|
207 if recordMatch == None:
|
|
208 print >> sys.stderr, "Unable to resolve vcf record.\n"
|
|
209 print >> sys.stderr, self.record
|
|
210 exit(1)
|
|
211
|
|
212 self.referenceSequence = recordMatch.group(1)
|
|
213 try: self.position = int(recordMatch.group(2))
|
|
214 except ValueError:
|
|
215 text = "variant position is not an integer"
|
|
216 self.generalError(text, "", None)
|
|
217 self.rsid = recordMatch.group(3)
|
|
218 self.ref = recordMatch.group(4)
|
|
219 self.alt = recordMatch.group(5)
|
|
220 self.quality = recordMatch.group(6)
|
|
221 self.filters = recordMatch.group(7)
|
|
222 self.info = recordMatch.group(8)
|
|
223 self.genotypeString = recordMatch.group(9)
|
|
224 self.infoTags = {}
|
|
225
|
|
226 # Check that the quality is an integer or a float. If not, set the quality
|
|
227 # to zero.
|
|
228 try: self.quality = float(self.quality)
|
|
229 except ValueError: self.quality = float(0.)
|
|
230
|
|
231 # If recordMatch.group(9) is not the end of line character, there is
|
|
232 # genotype information with this record.
|
|
233 if self.genotypeString != "\n": self.hasGenotypes = True
|
|
234 else: self.hasGenotypes = False
|
|
235
|
|
236 # Add the reference sequence to the dictionary. If it didn't previously
|
|
237 # exist append the reference sequence to the end of the list as well.
|
|
238 # This ensures that the order in which the reference sequences appeared
|
|
239 # in the header can be preserved.
|
|
240 if self.referenceSequence not in self.referenceSequences:
|
|
241 self.referenceSequences[self.referenceSequence] = True
|
|
242 self.referenceSequenceList.append(self.referenceSequence)
|
|
243
|
|
244 # Check for multiple alternate alleles.
|
|
245 self.alternateAlleles = self.alt.split(",")
|
|
246 self.numberAlternateAlleles = len(self.alternateAlleles)
|
|
247
|
|
248 # If required, process the info and genotypes.
|
|
249 if self.processInfo: self.processInfoFields()
|
|
250 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
|
|
251
|
|
252 return True
|
|
253
|
|
254 # Process the info string.
|
|
255 def processInfoFields(self):
|
|
256
|
|
257 # First break the info string into its constituent elements.
|
|
258 infoEntries = self.info.split(";")
|
|
259
|
|
260 # As long as some info fields exist, place them into a dictionary.
|
|
261 for entry in infoEntries:
|
|
262 infoEntry = entry.split("=")
|
|
263
|
|
264 # If the entry is a flag, there will be no equals and the length of
|
|
265 # infoEntry will be 1. In this case, set the dictionary entry to the
|
|
266 # whole entry. If the vcf file has undergone a union or intersection
|
|
267 # operation and contains the information from multiple files, this may
|
|
268 # be a '/' seperate list of flags and so cannot be set to a Boolean value
|
|
269 # yet.
|
|
270 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
|
|
271 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
|
|
272
|
|
273 # Process the genotype formats and values.
|
|
274 def processGenotypeFields(self):
|
|
275 genotypeEntries = self.genotypeString.split("\t")
|
|
276 self.genotypeFormatString = genotypeEntries[1]
|
|
277 self.genotypes = list(genotypeEntries[2:])
|
|
278 self.genotypeFormats = {}
|
|
279 self.genotypeFields = {}
|
|
280 self.genotypeFormats = self.genotypeFormatString.split(":")
|
|
281
|
|
282 # Check that the number of genotype fields is equal to the number of samples
|
|
283 if len(self.samplesList) != len(self.genotypes):
|
|
284 text = "The number of genotypes is different to the number of samples"
|
|
285 self.generalError(text, "", "")
|
|
286
|
|
287 # Add the genotype information to a dictionary.
|
|
288 for i in range( len(self.samplesList) ):
|
|
289 genotypeInfo = self.genotypes[i].split(":")
|
|
290 self.genotypeFields[ self.samplesList[i] ] = {}
|
|
291
|
|
292 # Check that there are as many fields as in the format field. If not, this must
|
|
293 # be because the information is not known. In this case, it is permitted that
|
|
294 # the genotype information is either . or ./.
|
|
295 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
|
|
296 self.genotypeFields[ self.samplesList[i] ] = "."
|
|
297 else:
|
|
298 if len(self.genotypeFormats) != len(genotypeInfo):
|
|
299 text = "The number of genotype fields is different to the number specified in the format string"
|
|
300 self.generalError(text, "sample", self.samplesList[i])
|
|
301
|
|
302 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
|
|
303
|
|
304 # Parse through the vcf file until the correct reference sequence is
|
|
305 # encountered and the position is greater than or equal to that requested.
|
|
306 def parseVcf(self, referenceSequence, position, writeOut, outputFile):
|
|
307 success = True
|
|
308 if self.referenceSequence != referenceSequence:
|
|
309 while self.referenceSequence != referenceSequence and success:
|
|
310 if writeOut: outputFile.write(self.record)
|
|
311 success = self.getRecord()
|
|
312
|
|
313 while self.referenceSequence == referenceSequence and self.position < position and success:
|
|
314 if writeOut: outputFile.write(self.record)
|
|
315 success = self.getRecord()
|
|
316
|
|
317 return success
|
|
318
|
|
319 # Get the information for a specific info tag. Also check that it contains
|
|
320 # the correct number and type of entries.
|
|
321 def getInfo(self, tag):
|
|
322 result = []
|
|
323
|
|
324 # Check if the tag exists in the header information. If so,
|
|
325 # determine the number and type of entries asscoiated with this
|
|
326 # tag.
|
|
327 if self.infoHeaderTags.has_key(tag):
|
|
328 infoNumber = self.infoHeaderTags[tag][0]
|
|
329 infoType = self.infoHeaderTags[tag][1]
|
|
330 numberValues = infoNumber
|
|
331
|
|
332 # First check that the tag exists in the information string. Then split
|
|
333 # the entry on commas. For flag entries, do not perform the split.
|
|
334 if self.infoTags.has_key(tag):
|
|
335 if numberValues == 0 and infoType == "Flag": result = True
|
|
336 elif numberValues != 0 and infoType == "Flag":
|
|
337 print >> sys.stderr, "ERROR"
|
|
338 exit(1)
|
|
339 else:
|
|
340 fields = self.infoTags[tag].split(",")
|
|
341 if len(fields) != numberValues:
|
|
342 text = "Unexpected number of entries"
|
|
343 self.generalError(text, "information tag", tag)
|
|
344
|
|
345 for i in range(infoNumber):
|
|
346 try: result.append(fields[i])
|
|
347 except IndexError:
|
|
348 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
|
|
349 self.generalError(text, "tag:", tag)
|
|
350 else: numberValues = 0
|
|
351
|
|
352 else:
|
|
353 text = "information field does not have a definition in the header"
|
|
354 self.generalError(text, "tag", tag)
|
|
355
|
|
356 return numberValues, infoType, result
|
|
357
|
|
358 # Get the genotype information.
|
|
359 def getGenotypeInfo(self, sample, tag):
|
|
360 result = []
|
|
361 if self.formatHeaderTags.has_key(tag):
|
|
362 infoNumber = self.formatHeaderTags[tag][0]
|
|
363 infoType = self.formatHeaderTags[tag][1]
|
|
364 numberValues = infoNumber
|
|
365
|
|
366 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
|
|
367 numberValues = 0
|
|
368 result = "."
|
|
369 else:
|
|
370 if self.genotypeFields[sample].has_key(tag):
|
|
371 if tag == "GT":
|
|
372 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
|
|
373 text = "Unexected number of characters in genotype (GT) field"
|
|
374 self.generalError(text, "sample", sample)
|
|
375
|
|
376 # If a diploid call, check whether or not the genotype is phased.
|
|
377 elif len(self.genotypeFields[sample][tag]) == 3:
|
|
378 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
|
|
379 result.append( self.genotypeFields[sample][tag][0] )
|
|
380 result.append( self.genotypeFields[sample][tag][2] )
|
|
381 elif len(self.genotypeFields[sample][tag]) == 3:
|
|
382 result.append( self.genotypeFields[sample][tag][0] )
|
|
383 else:
|
|
384 fields = self.genotypeFields[sample][tag].split(",")
|
|
385 if len(fields) != numberValues:
|
|
386 text = "Unexpected number of characters in " + tag + " field"
|
|
387 self.generalError(text, "sample", sample)
|
|
388
|
|
389 for i in range(infoNumber): result.append(fields[i])
|
|
390 else:
|
|
391 text = "genotype field does not have a definition in the header"
|
|
392 self.generalError(text, "tag", tag)
|
|
393
|
|
394 return numberValues, result
|
|
395
|
|
396 # Parse the dbsnp entry. If the entry conforms to the required variant type,
|
|
397 # return the dbsnp rsid value, otherwise ".".
|
|
398 def getDbsnpInfo(self):
|
|
399
|
|
400 # First check that the variant class (VC) is listed as SNP.
|
|
401 vc = self.info.split("VC=",1)
|
|
402 if vc[1].find(";") != -1: snp = vc[1].split(";",1)
|
|
403 else:
|
|
404 snp = []
|
|
405 snp.append(vc[1])
|
|
406
|
|
407 if snp[0].lower() == "snp": rsid = self.rsid
|
|
408 else: rsid = "."
|
|
409
|
|
410 return rsid
|
|
411
|
|
412 # Build a new vcf record.
|
|
413 def buildRecord(self, removeGenotypes):
|
|
414 record = self.referenceSequence + "\t" + \
|
|
415 str(self.position) + "\t" + \
|
|
416 self.rsid + "\t" + \
|
|
417 self.ref + "\t" + \
|
|
418 self.alt + "\t" + \
|
|
419 str(self.quality) + "\t" + \
|
|
420 self.filters + "\t" + \
|
|
421 self.info
|
|
422
|
|
423 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
|
|
424
|
|
425 record += "\n"
|
|
426
|
|
427 return record
|
|
428
|
|
429 # Close the vcf file.
|
|
430 def closeVcf(self, filename):
|
|
431 self.filehandle.close()
|
|
432
|
|
433 # Define error messages for different handled errors.
|
|
434 def generalError(self, text, field, fieldValue):
|
|
435 print >> sys.stderr, "\nError encountered when attempting to read:"
|
|
436 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
|
|
437 print >> sys.stderr, "\tposition :\t\t", self.position
|
|
438 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
|
|
439 print >> sys.stderr, "\n", text
|
|
440 exit(1)
|