comparison defuse_results_to_vcf.py @ 27:d57fcac025e2

Add more info fields to defuse_results_to_vcf.py
author Jim Johnson <jj@umn.edu>
date Wed, 14 Aug 2013 16:44:18 -0500
parents 2ecf82136986
children f51a95bdc38e
comparison
equal deleted inserted replaced
26:8f0775c43739 27:d57fcac025e2
102 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion"> 102 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion">
103 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> 103 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
104 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice"> 104 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice">
105 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend"> 105 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend">
106 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend"> 106 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend">
107 ##INFO=<ID=GENELOC,Number=2,Type=String,Description="location of breakpoint releative to genes">
108 ##INFO=<ID=EXPR,Number=2,Type=Integer,Description="expression of genes as number of concordant pairs aligned to exons">
107 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame"> 109 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame">
108 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries"> 110 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries">
111 ##INFO=<ID=INTERCHROM,Number=0,Type=Flag,Description="fusion produced by an interchromosomal translocation">
112 ##INFO=<ID=READTHROUGH,Number=0,Type=Flag,Description="fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement">
113 ##INFO=<ID=ADJACENT,Number=0,Type=Flag,Description="fusion between adjacent genes">
114 ##INFO=<ID=ALTSPLICE,Number=0,Type=Flag,Description="fusion likely the product of alternative splicing between adjacent genes">
115 ##INFO=<ID=DELETION,Number=0,Type=Flag,Description="fusion produced by a genomic deletion">
116 ##INFO=<ID=EVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic eversion">
117 ##INFO=<ID=INVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic inversion">
109 #CHROM POS ID REF ALT QUAL FILTER INFO\ 118 #CHROM POS ID REF ALT QUAL FILTER INFO\
110 """ 119 """
111 120
112 def cmp_alphanumeric(s1,s2): 121 def cmp_alphanumeric(s1,s2):
113 if s1 == s2: 122 if s1 == s2:
191 gene2 = fields[columns.index('gene2')] 200 gene2 = fields[columns.index('gene2')]
192 gene_info = 'GENEID=%s,%s' % (gene1,gene2) 201 gene_info = 'GENEID=%s,%s' % (gene1,gene2)
193 gene_name1 = fields[columns.index('gene_name1')] 202 gene_name1 = fields[columns.index('gene_name1')]
194 gene_name2 = fields[columns.index('gene_name2')] 203 gene_name2 = fields[columns.index('gene_name2')]
195 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2) 204 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2)
205 gene_location1 = fields[columns.index('gene_location1')]
206 gene_location2 = fields[columns.index('gene_location2')]
207 gene_loc = 'GENELOC=%s,%s' % (gene_location1,gene_location2)
208 expression1 = int(fields[columns.index('expression1')])
209 expression2 = int(fields[columns.index('expression2')])
210 expr = 'EXPR=%d,%d' % (expression1,expression2)
196 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')]) 211 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')])
197 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')]) 212 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')])
198 breakpoint_homology = int(fields[columns.index('breakpoint_homology')]) 213 breakpoint_homology = int(fields[columns.index('breakpoint_homology')])
199 homlen = 'HOMLEN=%s' % breakpoint_homology 214 homlen = 'HOMLEN=%s' % breakpoint_homology
200 orf = fields[columns.index('orf')] == 'Y' 215 orf = fields[columns.index('orf')] == 'Y'
201 exonboundaries = fields[columns.index('exonboundaries')] == 'Y' 216 exonboundaries = fields[columns.index('exonboundaries')] == 'Y'
202 read_through = fields[columns.index('read_through')] == 'Y' 217 read_through = fields[columns.index('read_through')] == 'Y'
218 interchromosomal = fields[columns.index('interchromosomal')] == 'Y'
219 adjacent = fields[columns.index('adjacent')] == 'Y'
220 altsplice = fields[columns.index('altsplice')] == 'Y'
221 deletion = fields[columns.index('deletion')] == 'Y'
222 eversion = fields[columns.index('eversion')] == 'Y'
223 inversion = fields[columns.index('inversion')] == 'Y'
203 span_count = int(fields[columns.index('span_count')]) 224 span_count = int(fields[columns.index('span_count')])
204 splitr_count = int(fields[columns.index('splitr_count')]) 225 splitr_count = int(fields[columns.index('splitr_count')])
205 splice_score = int(fields[columns.index('splice_score')]) 226 splice_score = int(fields[columns.index('splice_score')])
206 probability = fields[columns.index('probability')] if columns.index('probability') else '.' 227 probability = fields[columns.index('probability')] if columns.index('probability') else '.'
207 splitr_sequence = fields[columns.index('splitr_sequence')] 228 splitr_sequence = fields[columns.index('splitr_sequence')]
213 b1 = '[' if genomic_strand1 == '+' else ']' 234 b1 = '[' if genomic_strand1 == '+' else ']'
214 b2 = '[' if genomic_strand2 == '+' else ']' 235 b2 = '[' if genomic_strand2 == '+' else ']'
215 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2) 236 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2)
216 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2) 237 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2)
217 #TODO evaluate what should be included in the INFO field 238 #TODO evaluate what should be included in the INFO field
218 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,homlen,'SPLICESCORE=%d' % splice_score] 239 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,gene_loc,expr,homlen,'SPLICESCORE=%d' % splice_score]
219 if orf: 240 if orf:
220 info.append('ORF') 241 info.append('ORF')
221 if exonboundaries: 242 if exonboundaries:
222 info.append('EXONBND') 243 info.append('EXONBND')
244 if interchromosomal:
245 info.append('INTERCHROM')
246 if read_through:
247 info.append('READTHROUGH')
248 if adjacent:
249 info.append('ADJACENT')
250 if altsplice:
251 info.append('ALTSPLICE')
252 if deletion:
253 info.append('DELETION')
254 if eversion:
255 info.append('EVERSION')
256 if inversion:
257 info.append('INVERSION')
223 info1 = [svtype,'MATEID=%s' % mate_id2] + info 258 info1 = [svtype,'MATEID=%s' % mate_id2] + info
224 info2 = [svtype,'MATEID=%s' % mate_id1] + info 259 info2 = [svtype,'MATEID=%s' % mate_id1] + info
225 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.' 260 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.'
226 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) ) 261 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) )
227 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) ) 262 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) )