Mercurial > repos > jeremie > package_breakdancer_1_2_6
comparison breakdancer2vcf.py @ 4:fd58a69ed896 draft
Uploaded
| author | jeremie |
|---|---|
| date | Thu, 26 Jun 2014 09:12:49 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:8dd3f7144428 | 4:fd58a69ed896 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 __copyright__ = """ | |
| 4 Copyright (C) 2013 - Tim te Beek | |
| 5 Copyright (C) 2013 - Wai Yi Leung | |
| 6 Copyright (C) 2013 AllBio (see AUTHORS file) | |
| 7 """ | |
| 8 | |
| 9 __desc__ = """Convert breakdancer output to pseudo .vcf file format.""" | |
| 10 __created__ = "Mar 18, 2013" | |
| 11 __author__ = "tbeek" | |
| 12 | |
| 13 import argparse | |
| 14 import csv | |
| 15 import os.path | |
| 16 import sys | |
| 17 | |
| 18 | |
def main(tsvfile, vcffile):
    '''
    Convert a breakdancer .tsv file into a pseudo .vcf file.

    Prints the column names that were parsed, writes the converted records
    to vcffile and finally prints the first 1000 characters of the result
    as a quick preview.

    :param tsvfile: filename of input file.tsv
    :type tsvfile: string
    :param vcffile: filename of output file.vcf
    :type vcffile: string
    '''
    with open(tsvfile) as reader:
        # Parse file
        dictreader = _parse_tsvfile(reader)
        # print() with a single argument behaves identically on Python 2
        # and Python 3, unlike the bare print statement.
        print(dictreader.fieldnames)

        # Write out file; must happen while the input handle is still open
        # because the DictReader pulls rows from it lazily.
        _format_vcffile(dictreader, vcffile)

    # Quick output
    with open(vcffile) as reader:
        print(reader.read(1000))
| 37 | |
| 38 | |
def _parse_tsvfile(readable):
    '''
    Wrap readable in a csv.DictReader using the tab separated dialect.

    Leading lines that start with '#' are skipped, except for the last one:
    that line, minus its leading hash sign, is used as the column header.
    When the input has no leading '#' line at all, the first line is used
    as the header unchanged.

    :param readable: open file.tsv handle to read with csv.DictReader;
                     must support readline(), tell() and seek()
    :type readable: file
    :return: reader positioned on the header line
    :rtype: csv.DictReader
    '''
    header_start = None          # offset of the most recent '#' line
    line_start = readable.tell()  # offset of the line about to be read
    first_line_start = line_start

    while True:
        line = readable.readline()
        # An empty string (end of file) also ends the scan.
        if not line.startswith('#'):
            break
        header_start = line_start
        line_start = readable.tell()

    if header_start is None:
        # No commented header: rewind so the first line is not truncated.
        readable.seek(first_line_start)
    else:
        # Start from the last '#' line and consume only the hash sign.
        # Reading one character avoids doing arithmetic on tell() values.
        readable.seek(header_start)
        readable.read(1)

    return csv.DictReader(readable, dialect='excel-tab')
| 65 | |
| 66 | |
# Column names of a breakdancer .tsv, each annotated with an example value.
# NOTE(review): the last column name looks sample specific - confirm.
_tsv_fields = (
    'Chr1',                # '1'
    'Pos1',                # '269907'
    'Orientation1',        # '39+39-'
    'Chr2',                # '1'
    'Pos2',                # '270919'
    'Orientation2',        # '39+39-'
    'Type',                # 'DEL'
    'Size',                # '99'
    'Score',               # '99'
    'num_Reads',           # '38'
    'num_Reads_lib',       # '/home/allbio/ERR031544.sort.bam|38'
    'ERR031544.sort.bam',  # 'NA'
)
| 84 | |
# Column names written as the '#'-prefixed header line of the output file.
_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO')


def _format_vcffile(dictreader, vcffile):
    '''
    Create a pseudo .vcf file based on values read from DictReader instance.

    All records are collected, sorted by CHROM (compared as a string) and
    then POS (compared as an integer), and written tab separated below a
    header line. Every line, including the last one, ends with a newline.

    :param dictreader: DictReader instance to read data from; each row must
                       provide the keys Chr1, Pos1, Pos2, Type and Size
    :type dictreader: csv.DictReader
    :param vcffile: output file.vcf filename
    :type vcffile: string
    :raises KeyError: when a row lacks one of the required keys
    :raises ValueError: when Pos1, Pos2 or Size is not an integer
    '''
    with open(vcffile, mode='w') as writer:
        writer.write('#{}\n'.format('\t'.join(_vcf_fields)))

        output_vcf = []
        for line in dictreader:
            CHROM = line['Chr1']
            # TODO Figure out whether we have zero or one based positioning
            POS = int(line['Pos1'])
            SVEND = int(line['Pos2'])
            # SVLEN is reported as the negated breakdancer Size.
            INFO = 'PROGRAM=breakdancer;SVTYPE={};SVLEN={}'.format(
                line['Type'], 0 - int(line['Size']))
            # CTX records get no SVEND entry.
            if line['Type'] != 'CTX':
                INFO += ";SVEND={}".format(SVEND)

            # Create record
            output_vcf.append([CHROM, POS, '.', '.', '.', '.', 'PASS', INFO])

        # Sort all results
        output_vcf.sort()

        # Write records, terminating each one (also the last) with a newline
        # so the file is a well-formed text file.
        writer.writelines(
            '\t'.join(map(str, vcf_row)) + '\n' for vcf_row in output_vcf)
| 117 | |
| 118 | |
if __name__ == '__main__':
    # Command line entry point: both file names are mandatory, so argparse
    # reports a usage error instead of main() failing on open(None).
    parser = argparse.ArgumentParser(description=__desc__)
    parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv',
                        type=str, required=True,
                        help='Breakdancer TSV outputfile')
    parser.add_argument('-o', '--outputvcf', dest='outputvcf',
                        type=str, required=True,
                        help='Output vcf to')

    args = parser.parse_args()
    main(args.breakdancertsv, args.outputvcf)
