Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison vcf.py @ 0:60efb9214eaa
Uploaded
author | melissacline |
---|---|
date | Wed, 14 Jan 2015 13:54:03 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:60efb9214eaa |
---|---|
1 #!/usr/bin/env python | |
2 '''A VCFv4.0 parser for Python. | |
3 | |
4 The intent of this module is to mimic the ``csv`` module in the Python stdlib, | |
5 as opposed to more flexible serialization formats like JSON or YAML. ``vcf`` | |
6 will attempt to parse the content of each record based on the data types | |
7 specified in the meta-information lines -- specifically the ##INFO and | |
8 ##FORMAT lines. If these lines are missing or incomplete, it will check | |
9 against the reserved types mentioned in the spec. Failing that, it will just | |
10 return strings. | |
11 | |
12 There is currently one piece of interface: ``VCFReader``. It takes a file-like | |
13 object and acts as a reader:: | |
14 | |
15 >>> import vcf | |
16 >>> vcf_reader = vcf.VCFReader(open('example.vcf', 'rb')) | |
17 >>> for record in vcf_reader: | |
18 ... print record | |
19 Record(CHROM='20', POS=14370, ID='rs6054257', REF='G', ALT=['A'], QUAL=29, | |
20 FILTER='PASS', INFO={'H2': True, 'NS': 3, 'DB': True, 'DP': 14, 'AF': [0.5] | |
21 }, FORMAT='GT:GQ:DP:HQ', samples=[{'GT': '0', 'HQ': [58, 50], 'DP': 3, 'GQ' | |
22 : 49, 'name': 'NA00001'}, {'GT': '0', 'HQ': [65, 3], 'DP': 5, 'GQ': 3, 'nam | |
23 e' : 'NA00002'}, {'GT': '0', 'DP': 3, 'GQ': 41, 'name': 'NA00003'}]) | |
24 | |
25 This produces a great deal of information, but it is conveniently accessed. | |
26 The attributes of a Record are the 8 fixed fields from the VCF spec plus two | |
27 more. That is: | |
28 | |
29 * ``Record.CHROM`` | |
30 * ``Record.POS`` | |
31 * ``Record.ID`` | |
32 * ``Record.REF`` | |
33 * ``Record.ALT`` | |
34 * ``Record.QUAL`` | |
35 * ``Record.FILTER`` | |
36 * ``Record.INFO`` | |
37 | |
38 plus two more attributes to handle genotype information: | |
39 | |
40 * ``Record.FORMAT`` | |
41 * ``Record.samples`` | |
42 | |
43 ``samples``, not being the title of any column, is left lowercase. The format | |
44 of the fixed fields is from the spec. Comma-separated lists in the VCF are | |
45 converted to lists. In particular, one-entry VCF lists are converted to | |
46 one-entry Python lists (see, e.g., ``Record.ALT``). Semicolon-delimited lists | |
47 of key=value pairs are converted to Python dictionaries, with flags being given | |
48 a ``True`` value. Integers and floats are handled exactly as you'd expect:: | |
49 | |
50 >>> record = vcf_reader.next() | |
51 >>> print record.POS | |
52 17330 | |
53 >>> print record.ALT | |
54 ['A'] | |
55 >>> print record.INFO['AF'] | |
56 [0.017] | |
57 | |
58 ``record.FORMAT`` will be a string specifying the format of the genotype | |
59 fields. In case the FORMAT column does not exist, ``record.FORMAT`` is | |
60 ``None``. Finally, ``record.samples`` is a list of dictionaries containing the | |
61 parsed sample column:: | |
62 | |
63 >>> record = vcf_reader.next() | |
64 >>> for sample in record.samples: | |
65 ... print sample['GT'] | |
66 '1|2' | |
67 '2|1' | |
68 '2/2' | |
69 | |
70 Metadata regarding the VCF file itself can be investigated through the | |
71 following attributes: | |
72 | |
73 * ``VCFReader.metadata`` | |
74 * ``VCFReader.infos`` | |
75 * ``VCFReader.filters`` | |
76 * ``VCFReader.formats`` | |
77 * ``VCFReader.samples`` | |
78 | |
79 For example:: | |
80 | |
81 >>> vcf_reader.metadata['fileDate'] | |
82 20090805 | |
83 >>> vcf_reader.samples | |
84 ['NA00001', 'NA00002', 'NA00003'] | |
85 >>> vcf_reader.filters | |
86 {'q10': Filter(id='q10', desc='Quality below 10'), | |
87 's50': Filter(id='s50', desc='Less than 50% of samples have data')} | |
88 >>> vcf_reader.infos['AA'].desc | |
89 Ancestral Allele | |
90 | |
91 ''' | |
92 import collections | |
93 import re | |
94 | |
95 | |
96 # Metadata parsers/constants | |
97 RESERVED_INFO = { | |
98 'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer', | |
99 'BQ': 'Float', 'CIGAR': 'String', 'DB': 'Flag', 'DP': 'Integer', | |
100 'END': 'Integer', 'H2': 'Flag', 'MQ': 'Float', 'MQ0': 'Integer', | |
101 'NS': 'Integer', 'SB': 'String', 'SOMATIC': 'Flag', 'VALIDATED': 'Flag' | |
102 } | |
103 | |
104 RESERVED_FORMAT = { | |
105 'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float', | |
106 'GQ': 'Float', 'HQ': 'Float' | |
107 } | |
108 | |
109 | |
110 _Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc']) | |
111 _Filter = collections.namedtuple('Filter', ['id', 'desc']) | |
112 _Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) | |
113 | |
114 | |
115 class _vcf_metadata_parser(object): | |
116 '''Parse the metadat in the header of a VCF file.''' | |
117 def __init__(self, aggressive=False): | |
118 super(_vcf_metadata_parser, self).__init__() | |
119 self.aggro = aggressive | |
120 self.info_pattern = re.compile(r'''\#\#INFO=< | |
121 ID=(?P<id>[^,]+), | |
122 Number=(?P<number>\d+|\.|[AG]), | |
123 Type=(?P<type>Integer|Float|Flag|Character|String), | |
124 Description="(?P<desc>[^"]*)" | |
125 >''', re.VERBOSE) | |
126 self.filter_pattern = re.compile(r'''\#\#FILTER=< | |
127 ID=(?P<id>[^,]+), | |
128 Description="(?P<desc>[^"]*)" | |
129 >''', re.VERBOSE) | |
130 self.format_pattern = re.compile(r'''\#\#FORMAT=< | |
131 ID=(?P<id>.+), | |
132 Number=(?P<number>\d+|\.|[AG]), | |
133 Type=(?P<type>.+), | |
134 Description="(?P<desc>.*)" | |
135 >''', re.VERBOSE) | |
136 self.meta_pattern = re.compile(r'''##(?P<key>.+)=(?P<val>.+)''') | |
137 | |
138 def read_info(self, info_string): | |
139 '''Read a meta-information INFO line.''' | |
140 match = self.info_pattern.match(info_string) | |
141 if not match: | |
142 raise SyntaxError( | |
143 "One of the INFO lines is malformed: {}".format(info_string)) | |
144 | |
145 try: | |
146 num = int(match.group('number')) | |
147 except ValueError: | |
148 num = None if self.aggro else '.' | |
149 | |
150 info = _Info(match.group('id'), num, | |
151 match.group('type'), match.group('desc')) | |
152 | |
153 return (match.group('id'), info) | |
154 | |
155 def read_filter(self, filter_string): | |
156 '''Read a meta-information FILTER line.''' | |
157 match = self.filter_pattern.match(filter_string) | |
158 if not match: | |
159 raise SyntaxError( | |
160 "One of the FILTER lines is malformed: {}".format( | |
161 filter_string)) | |
162 | |
163 filt = _Filter(match.group('id'), match.group('desc')) | |
164 | |
165 return (match.group('id'), filt) | |
166 | |
167 def read_format(self, format_string): | |
168 '''Read a meta-information FORMAT line.''' | |
169 match = self.format_pattern.match(format_string) | |
170 if not match: | |
171 raise SyntaxError( | |
172 "One of the FORMAT lines is malformed: {}".format( | |
173 format_string)) | |
174 | |
175 try: | |
176 num = int(match.group('number')) | |
177 except ValueError: | |
178 num = None if self.aggro else '.' | |
179 | |
180 form = _Format(match.group('id'), num, | |
181 match.group('type'), match.group('desc')) | |
182 | |
183 return (match.group('id'), form) | |
184 | |
185 def read_meta(self, meta_string): | |
186 match = self.meta_pattern.match(meta_string) | |
187 return match.group('key'), match.group('val') | |
188 | |
189 | |
190 # Reader class | |
191 class _meta_info(object): | |
192 '''Decorator for a property stored in the header info.''' | |
193 def __init__(self, func): | |
194 self.func = func | |
195 | |
196 def __call__(self, fself): | |
197 if getattr(fself, "_%s" % self.func.__name__) is None: | |
198 fself._parse_metainfo() | |
199 | |
200 return self.func(fself) | |
201 | |
202 def __repr__(self): | |
203 '''Return the function's docstring.''' | |
204 return self.func.__doc__ | |
205 | |
206 def __doc__(self): | |
207 '''Return the function's docstring.''' | |
208 return self.func.__doc__ | |
209 | |
210 _Record = collections.namedtuple('Record', [ | |
211 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', | |
212 'samples' | |
213 ]) | |
214 | |
215 | |
216 class VCFReader(object): | |
217 '''Read and parse a VCF v 4.0 file''' | |
218 def __init__(self, fsock, aggressive=False): | |
219 super(VCFReader, self).__init__() | |
220 self.aggro = aggressive | |
221 self._metadata = None | |
222 self._infos = None | |
223 self._filters = None | |
224 self._formats = None | |
225 self._samples = None | |
226 self.reader = fsock | |
227 if aggressive: | |
228 self._mapper = self._none_map | |
229 else: | |
230 self._mapper = self._pass_map | |
231 | |
232 def __iter__(self): | |
233 return self | |
234 | |
235 @property | |
236 @_meta_info | |
237 def metadata(self): | |
238 '''Return the information from lines starting "##"''' | |
239 return self._metadata | |
240 | |
241 @property | |
242 @_meta_info | |
243 def infos(self): | |
244 '''Return the information from lines starting "##INFO"''' | |
245 return self._infos | |
246 | |
247 @property | |
248 @_meta_info | |
249 def filters(self): | |
250 '''Return the information from lines starting "##FILTER"''' | |
251 return self._filters | |
252 | |
253 @property | |
254 @_meta_info | |
255 def formats(self): | |
256 '''Return the information from lines starting "##FORMAT"''' | |
257 return self._formats | |
258 | |
259 @property | |
260 @_meta_info | |
261 def samples(self): | |
262 '''Return the names of the genotype fields.''' | |
263 return self._samples | |
264 | |
265 def _parse_metainfo(self): | |
266 '''Parse the information stored in the metainfo of the VCF. | |
267 | |
268 The end user shouldn't have to use this. She can access the metainfo | |
269 directly with ``self.metadata``.''' | |
270 for attr in ('_metadata', '_infos', '_filters', '_formats'): | |
271 setattr(self, attr, {}) | |
272 | |
273 parser = _vcf_metadata_parser() | |
274 | |
275 line = self.reader.next() | |
276 while line.startswith('##'): | |
277 line = line.strip() | |
278 if line.startswith('##INFO'): | |
279 key, val = parser.read_info(line) | |
280 self._infos[key] = val | |
281 | |
282 elif line.startswith('##FILTER'): | |
283 key, val = parser.read_filter(line) | |
284 self._filters[key] = val | |
285 | |
286 elif line.startswith('##FORMAT'): | |
287 key, val = parser.read_format(line) | |
288 self._formats[key] = val | |
289 | |
290 else: | |
291 key, val = parser.read_meta(line.strip()) | |
292 self._metadata[key] = val | |
293 | |
294 line = self.reader.next() | |
295 | |
296 fields = line.split() | |
297 self._samples = fields[9:] | |
298 | |
299 def _none_map(self, func, iterable, bad='.'): | |
300 '''``map``, but make bad values None.''' | |
301 return [func(x) if x != bad else None | |
302 for x in iterable] | |
303 | |
304 def _pass_map(self, func, iterable, bad='.'): | |
305 '''``map``, but make bad values None.''' | |
306 return [func(x) if x != bad else bad | |
307 for x in iterable] | |
308 | |
309 def _parse_info(self, info_str): | |
310 '''Parse the INFO field of a VCF entry into a dictionary of Python | |
311 types. | |
312 | |
313 ''' | |
314 entries = info_str.split(';') | |
315 retdict = {} | |
316 for entry in entries: | |
317 entry = entry.split('=') | |
318 ID = entry[0] | |
319 try: | |
320 entry_type = self.infos[ID].type | |
321 except KeyError: | |
322 try: | |
323 entry_type = RESERVED_INFO[ID] | |
324 except KeyError: | |
325 if entry[1:]: | |
326 entry_type = 'String' | |
327 else: | |
328 entry_type = 'Flag' | |
329 | |
330 if entry_type == 'Integer': | |
331 vals = entry[1].split(',') | |
332 val = self._mapper(int, vals) | |
333 elif entry_type == 'Float': | |
334 vals = entry[1].split(',') | |
335 val = self._mapper(float, vals) | |
336 elif entry_type == 'Flag': | |
337 val = True | |
338 elif entry_type == 'String': | |
339 val = entry[1] | |
340 | |
341 try: | |
342 if self.infos[ID].num == 1: | |
343 val = val[0] | |
344 except KeyError: | |
345 pass | |
346 | |
347 retdict[ID] = val | |
348 | |
349 return retdict | |
350 | |
351 def _parse_samples(self, samples, samp_fmt): | |
352 '''Parse a sample entry according to the format specified in the FORMAT | |
353 column.''' | |
354 samp_data = [] | |
355 samp_fmt = samp_fmt.split(':') | |
356 for sample in samples: | |
357 sampdict = dict(zip(samp_fmt, sample.split(':'))) | |
358 for fmt in sampdict: | |
359 vals = sampdict[fmt].split(',') | |
360 try: | |
361 entry_type = self.formats[fmt].type | |
362 except KeyError: | |
363 try: | |
364 entry_type = RESERVED_FORMAT[fmt] | |
365 except KeyError: | |
366 entry_type = 'String' | |
367 | |
368 if entry_type == 'Integer': | |
369 sampdict[fmt] = self._mapper(int, vals) | |
370 elif entry_type == 'Float' or entry_type == 'Numeric': | |
371 sampdict[fmt] = self._mapper(float, vals) | |
372 elif sampdict[fmt] == './.' and self.aggro: | |
373 sampdict[fmt] = None | |
374 | |
375 samp_data.append(sampdict) | |
376 | |
377 for name, data in zip(self.samples, samp_data): | |
378 data['name'] = name | |
379 | |
380 return samp_data | |
381 | |
382 def next(self): | |
383 '''Return the next record in the file.''' | |
384 if self._samples is None: | |
385 self._parse_metainfo() | |
386 row = self.reader.next().split() | |
387 chrom = row[0] | |
388 pos = int(row[1]) | |
389 | |
390 if row[2] != '.': | |
391 ID = row[2] | |
392 else: | |
393 ID = None if self.aggro else row[2] | |
394 | |
395 ref = row[3] | |
396 alt = self._mapper(str, row[4].split(',')) | |
397 qual = float(row[5]) if '.' in row[5] else int(row[5]) | |
398 filt = row[6].split(';') if ';' in row[6] else row[6] | |
399 if filt == 'PASS' and self.aggro: | |
400 filt = None | |
401 info = self._parse_info(row[7]) | |
402 | |
403 try: | |
404 fmt = row[8] | |
405 except IndexError: | |
406 fmt = None | |
407 samples = None | |
408 else: | |
409 samples = self._parse_samples(row[9:], fmt) | |
410 | |
411 record = _Record(chrom, pos, ID, ref, alt, qual, filt, info, fmt, | |
412 samples) | |
413 return record | |
414 | |
415 | |
416 def main(): | |
417 '''Parse the example VCF file from the specification and print every | |
418 record.''' | |
419 import contextlib | |
420 import StringIO | |
421 import textwrap | |
422 buff = '''\ | |
423 ##fileformat=VCFv4.0 | |
424 ##fileDate=20090805 | |
425 ##source=myImputationProgramV3.1 | |
426 ##reference=1000GenomesPilot-NCBI36 | |
427 ##phasing=partial | |
428 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> | |
429 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> | |
430 ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency"> | |
431 ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele"> | |
432 ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129"> | |
433 ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership"> | |
434 ##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes"> | |
435 ##FILTER=<ID=q10,Description="Quality below 10"> | |
436 ##FILTER=<ID=s50,Description="Less than 50% of samples have data"> | |
437 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
438 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | |
439 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> | |
440 ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality"> | |
441 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA00001\tNA00002\tNA00003 | |
442 20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,. | |
443 20\t17330\t.\tT\tA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3 | |
444 20\t1110696\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4 | |
445 20\t1230237\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2 | |
446 20\t1234567\tmicrosat1\tGTCT\tG,GTACT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3 | |
447 ''' | |
448 with contextlib.closing(StringIO.StringIO(textwrap.dedent(buff))) as sock: | |
449 vcf_file = VCFReader(sock, aggressive=True) | |
450 for record in vcf_file: | |
451 print record | |
452 | |
453 | |
454 if __name__ == '__main__': | |
455 main() |