annotate breakdancer2vcf.py @ 7:bf9475fceac3 draft

Uploaded
author jeremie
date Thu, 26 Jun 2014 09:25:35 -0400
parents fd58a69ed896
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
1 #!/usr/bin/env python
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
2
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
3 __copyright__ = """
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
4 Copyright (C) 2013 - Tim te Beek
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
5 Copyright (C) 2013 - Wai Yi Leung
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
6 Copyright (C) 2013 AllBio (see AUTHORS file)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
7 """
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
8
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
9 __desc__ = """Convert breakdancer output to pseudo .vcf file format."""
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
10 __created__ = "Mar 18, 2013"
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
11 __author__ = "tbeek"
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
12
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
13 import argparse
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
14 import csv
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
15 import os.path
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
16 import sys
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
17
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
18
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
19 def main(tsvfile, vcffile):
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
20 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
21 :param tsvfile: filename of input file.tsv
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
22 :type tsvfile: string
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
23 :param vcffile: filename of output file.vcf
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
24 :type vcffile: string
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
25 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
26 with open(tsvfile) as reader:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
27 # Parse file
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
28 dictreader = _parse_tsvfile(reader)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
29 print dictreader.fieldnames
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
30
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
31 # Write out file
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
32 _format_vcffile(dictreader, vcffile)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
33
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
34 # Quick output
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
35 with open(vcffile) as reader:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
36 print reader.read(1000)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
37
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
38
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
39 def _parse_tsvfile(readable):
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
40 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
41 Read readable using csv.Sniffer and csv.DictReader
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
42 :param readable: open file.tsv handle to read with csv.DictReader
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
43 :type readable: file
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
44 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
45 prev, curr = 0, 0
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
46 while True:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
47 line = readable.readline()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
48 if not line.startswith('#'):
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
49 # lets start from prev # line, without the hash sign
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
50 readable.seek(prev + 1)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
51 break
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
52 else:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
53 prev = curr
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
54 curr = readable.tell()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
55
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
56 # Determine dialect
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
57 curr = readable.tell()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
58 #dialect = csv.Sniffer().sniff(readable.read(3000))
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
59 dialect = 'excel-tab'
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
60 readable.seek(curr)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
61
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
62 # Read file
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
63 dictreader = csv.DictReader(readable, dialect=dialect)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
64 return dictreader
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
65
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
66
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
67 _tsv_fields = ('Chr1', 'Pos1', 'Orientation1',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
68 'Chr2', 'Pos2', 'Orientation2',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
69 'Type', 'Size', 'Score',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
70 'num_Reads', 'num_Reads_lib',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
71 'ERR031544.sort.bam')
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
72 # 'Chr1': '1',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
73 # 'Pos1': '269907',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
74 # 'Orientation1': '39+39-',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
75 # 'Chr2': '1',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
76 # 'Pos2': '270919',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
77 # 'Orientation2': '39+39-',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
78 # 'Type': 'DEL',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
79 # 'Size': '99',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
80 # 'Score': '99',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
81 # 'num_Reads': '38',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
82 # 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38',
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
83 # 'ERR031544.sort.bam': 'NA'
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
84
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
85 _vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO')
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
86
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
87
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
88 def _format_vcffile(dictreader, vcffile):
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
89 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
90 Create a pseudo .vcf file based on values read from DictReader instance.
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
91 :param dictreader: DictReader instance to read data from
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
92 :type dictreader: csv.DictRedaer
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
93 :param vcffile: output file.vcf filename
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
94 :type vcffile: string
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
95 '''
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
96 with open(vcffile, mode='w') as writer:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
97 writer.write('#{}\n'.format('\t'.join(_vcf_fields)))
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
98 output_vcf = []
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
99 for line in dictreader:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
100 CHROM = line['Chr1']
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
101 # TODO Figure out whether we have zero or one based positioning
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
102 POS = int(line['Pos1'])
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
103 SVEND = int(line['Pos2'])
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
104 INFO = 'PROGRAM=breakdancer;SVTYPE={};SVLEN={}'.format(line['Type'],
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
105 0 - int(line['Size']))
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
106 if line['Type'] not in ['CTX']:
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
107 INFO += ";SVEND={}".format(SVEND)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
108
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
109 # Create record
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
110 output_vcf.append([CHROM, POS, '.', '.', '.', '.', 'PASS', INFO])
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
111
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
112 # Sort all results
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
113 output_vcf.sort()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
114 output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf])
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
115 # Write record
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
116 writer.write(output)
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
117
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
118
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
119 if __name__ == '__main__':
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
120 parser = argparse.ArgumentParser()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
121 parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str,
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
122 help='Breakdancer TSV outputfile')
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
123 parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str,
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
124 help='Output vcf to')
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
125
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
126 args = parser.parse_args()
fd58a69ed896 Uploaded
jeremie
parents:
diff changeset
127 main(args.breakdancertsv, args.outputvcf)