15
|
1 #!/usr/bin/env python
|
|
2
|
|
3 __copyright__ = """
|
|
4 Copyright (C) 2013 - Tim te Beek
|
|
5 Copyright (C) 2013 - Wai Yi Leung
|
|
6 Copyright (C) 2013 AllBio (see AUTHORS file)
|
|
7 """
|
|
8
|
|
9 __desc__ = """Convert breakdancer output to pseudo .vcf file format."""
|
|
10 __created__ = "Mar 18, 2013"
|
|
11 __author__ = "tbeek"
|
|
12
|
|
13 import argparse
|
|
14 import csv
|
|
15 import os.path
|
|
16 import sys
|
|
17
|
|
18
|
|
def main(tsvfile, vcffile):
    '''
    Convert a breakdancer .tsv file into a pseudo .vcf file.

    The column names found in the input and the first 1000 characters of the
    written output are printed to stdout as a quick check.

    :param tsvfile: filename of input file.tsv
    :type tsvfile: string
    :param vcffile: filename of output file.vcf
    :type vcffile: string
    '''
    with open(tsvfile) as reader:
        # Parse file
        dictreader = _parse_tsvfile(reader)
        # print() with a single argument behaves the same on Python 2 and 3;
        # the print statement is a syntax error on Python 3.
        print(dictreader.fieldnames)

        # Write out file while the input handle is still open: the
        # DictReader pulls its rows from that handle as they are requested.
        _format_vcffile(dictreader, vcffile)

    # Quick output
    with open(vcffile) as reader:
        print(reader.read(1000))
|
|
37
|
|
38
|
|
def _parse_tsvfile(readable):
    '''
    Read readable with csv.DictReader, skipping the leading '#' lines.

    The last of the leading lines that start with '#' supplies the column
    names (with the hash sign removed). When the input has no such line, its
    first line is used as the header unchanged.

    :param readable: open file.tsv handle to read with csv.DictReader
    :type readable: file
    :return: reader that pulls the remaining rows from readable on demand
    :rtype: csv.DictReader
    '''
    # Local import: keeps this function independent of the module's import list.
    import itertools

    header = None
    line = readable.readline()
    while line.startswith('#'):
        header = line
        line = readable.readline()

    # The lines consumed above are replayed in front of the remaining input
    # rather than seeking back to "offset of last '#' line + 1": arithmetic on
    # tell() values is not reliable for text-mode handles, and with no '#'
    # line at all it skipped the first character of the input.
    replay = []
    if header is not None:
        # Header without the hash sign
        replay.append(header[1:])
    if line:
        # First non-'#' line; empty only when the end of input was reached
        replay.append(line)

    # Input is read as tab separated values
    dialect = 'excel-tab'

    # Read file
    return csv.DictReader(itertools.chain(replay, readable), dialect=dialect)
|
|
65
|
|
66
|
|
# Column names of the breakdancer .tsv input, in order. The trailing comment
# on each entry is the value of that column in one sample record.
# NOTE(review): the last entry looks like a per-sample BAM file column rather
# than a fixed name - confirm before relying on it.
_tsv_fields = (
    'Chr1',                # '1'
    'Pos1',                # '269907'
    'Orientation1',        # '39+39-'
    'Chr2',                # '1'
    'Pos2',                # '270919'
    'Orientation2',        # '39+39-'
    'Type',                # 'DEL'
    'Size',                # '99'
    'Score',               # '99'
    'num_Reads',           # '38'
    'num_Reads_lib',       # '/home/allbio/ERR031544.sort.bam|38'
    'ERR031544.sort.bam',  # 'NA'
)

# Column names written, tab separated and prefixed with '#', as the first line
# of the pseudo .vcf output.
_vcf_fields = (
    'CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
)
|
|
86
|
|
87
|
|
def _format_vcffile(dictreader, vcffile):
    '''
    Create a pseudo .vcf file based on values read from DictReader instance.

    One record is written per input row, after a header line built from
    _vcf_fields. Records are sorted with list comparison, i.e. by CHROM as a
    string, then by POS as an integer, then by the remaining columns. Every
    line of the output, including the last record, ends with a newline.

    :param dictreader: DictReader instance to read data from
    :type dictreader: csv.DictReader
    :param vcffile: output file.vcf filename
    :type vcffile: string
    :raises ValueError: when Pos1, Pos2 or Size of a row is not an integer
    '''
    with open(vcffile, mode='w') as writer:
        writer.write('#{}\n'.format('\t'.join(_vcf_fields)))

        records = []
        for line in dictreader:
            chrom = line['Chr1']
            sv_type = line['Type']
            # TODO Figure out whether we have zero or one based positioning
            pos = int(line['Pos1'])
            sv_end = int(line['Pos2'])
            # SVLEN is reported as the negated Size column
            info = 'PROGRAM=breakdancer;SVTYPE={};SVLEN={}'.format(
                sv_type, -int(line['Size']))
            # CTX records do not get an SVEND entry
            if sv_type != 'CTX':
                info += ';SVEND={}'.format(sv_end)

            # Create record
            records.append([chrom, pos, '.', '.', '.', '.', 'PASS', info])

        # Sort all results
        records.sort()

        # Write records, each one terminated by a newline so the file does
        # not end in an unterminated line.
        writer.writelines(
            '\t'.join(map(str, record)) + '\n' for record in records)
|
|
117
|
|
118
|
|
if __name__ == '__main__':
    # Command line entry point. Both file names are required, so a missing
    # option is reported by argparse as a usage error instead of main()
    # being called with None.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv',
                        type=str, required=True,
                        help='Breakdancer TSV outputfile')
    parser.add_argument('-o', '--outputvcf', dest='outputvcf',
                        type=str, required=True,
                        help='Output vcf to')

    args = parser.parse_args()
    main(args.breakdancertsv, args.outputvcf)
|