Mercurial > repos > jjohnson > defuse
comparison defuse_results_to_vcf.py @ 27:d57fcac025e2
Add more info fields to defuse_results_to_vcf.py
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 14 Aug 2013 16:44:18 -0500 |
parents | 2ecf82136986 |
children | f51a95bdc38e |
comparison
equal
deleted
inserted
replaced
26:8f0775c43739 | 27:d57fcac025e2 |
---|---|
102 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion"> | 102 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion"> |
103 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> | 103 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> |
104 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice"> | 104 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice"> |
105 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend"> | 105 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend"> |
106 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend"> | 106 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend"> |
107 ##INFO=<ID=GENELOC,Number=2,Type=String,Description="location of breakpoint releative to genes"> | |
108 ##INFO=<ID=EXPR,Number=2,Type=Integer,Description="expression of genes as number of concordant pairs aligned to exons"> | |
107 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame"> | 109 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame"> |
108 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries"> | 110 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries"> |
111 ##INFO=<ID=INTERCHROM,Number=0,Type=Flag,Description="fusion produced by an interchromosomal translocation"> | |
112 ##INFO=<ID=READTHROUGH,Number=0,Type=Flag,Description="fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement"> | |
113 ##INFO=<ID=ADJACENT,Number=0,Type=Flag,Description="fusion between adjacent genes"> | |
114 ##INFO=<ID=ALTSPLICE,Number=0,Type=Flag,Description="fusion likely the product of alternative splicing between adjacent genes"> | |
115 ##INFO=<ID=DELETION,Number=0,Type=Flag,Description="fusion produced by a genomic deletion"> | |
116 ##INFO=<ID=EVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic eversion"> | |
117 ##INFO=<ID=INVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic inversion"> | |
109 #CHROM POS ID REF ALT QUAL FILTER INFO\ | 118 #CHROM POS ID REF ALT QUAL FILTER INFO\ |
110 """ | 119 """ |
111 | 120 |
112 def cmp_alphanumeric(s1,s2): | 121 def cmp_alphanumeric(s1,s2): |
113 if s1 == s2: | 122 if s1 == s2: |
191 gene2 = fields[columns.index('gene2')] | 200 gene2 = fields[columns.index('gene2')] |
192 gene_info = 'GENEID=%s,%s' % (gene1,gene2) | 201 gene_info = 'GENEID=%s,%s' % (gene1,gene2) |
193 gene_name1 = fields[columns.index('gene_name1')] | 202 gene_name1 = fields[columns.index('gene_name1')] |
194 gene_name2 = fields[columns.index('gene_name2')] | 203 gene_name2 = fields[columns.index('gene_name2')] |
195 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2) | 204 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2) |
205 gene_location1 = fields[columns.index('gene_location1')] | |
206 gene_location2 = fields[columns.index('gene_location2')] | |
207 gene_loc = 'GENELOC=%s,%s' % (gene_location1,gene_location2) | |
208 expression1 = int(fields[columns.index('expression1')]) | |
209 expression2 = int(fields[columns.index('expression2')]) | |
210 expr = 'EXPR=%d,%d' % (expression1,expression2) | |
196 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')]) | 211 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')]) |
197 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')]) | 212 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')]) |
198 breakpoint_homology = int(fields[columns.index('breakpoint_homology')]) | 213 breakpoint_homology = int(fields[columns.index('breakpoint_homology')]) |
199 homlen = 'HOMLEN=%s' % breakpoint_homology | 214 homlen = 'HOMLEN=%s' % breakpoint_homology |
200 orf = fields[columns.index('orf')] == 'Y' | 215 orf = fields[columns.index('orf')] == 'Y' |
201 exonboundaries = fields[columns.index('exonboundaries')] == 'Y' | 216 exonboundaries = fields[columns.index('exonboundaries')] == 'Y' |
202 read_through = fields[columns.index('read_through')] == 'Y' | 217 read_through = fields[columns.index('read_through')] == 'Y' |
218 interchromosomal = fields[columns.index('interchromosomal')] == 'Y' | |
219 adjacent = fields[columns.index('adjacent')] == 'Y' | |
220 altsplice = fields[columns.index('altsplice')] == 'Y' | |
221 deletion = fields[columns.index('deletion')] == 'Y' | |
222 eversion = fields[columns.index('eversion')] == 'Y' | |
223 inversion = fields[columns.index('inversion')] == 'Y' | |
203 span_count = int(fields[columns.index('span_count')]) | 224 span_count = int(fields[columns.index('span_count')]) |
204 splitr_count = int(fields[columns.index('splitr_count')]) | 225 splitr_count = int(fields[columns.index('splitr_count')]) |
205 splice_score = int(fields[columns.index('splice_score')]) | 226 splice_score = int(fields[columns.index('splice_score')]) |
206 probability = fields[columns.index('probability')] if columns.index('probability') else '.' | 227 probability = fields[columns.index('probability')] if columns.index('probability') else '.' |
207 splitr_sequence = fields[columns.index('splitr_sequence')] | 228 splitr_sequence = fields[columns.index('splitr_sequence')] |
213 b1 = '[' if genomic_strand1 == '+' else ']' | 234 b1 = '[' if genomic_strand1 == '+' else ']' |
214 b2 = '[' if genomic_strand2 == '+' else ']' | 235 b2 = '[' if genomic_strand2 == '+' else ']' |
215 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2) | 236 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2) |
216 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2) | 237 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2) |
217 #TODO evaluate what should be included in the INFO field | 238 #TODO evaluate what should be included in the INFO field |
218 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,homlen,'SPLICESCORE=%d' % splice_score] | 239 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,gene_loc,expr,homlen,'SPLICESCORE=%d' % splice_score] |
219 if orf: | 240 if orf: |
220 info.append('ORF') | 241 info.append('ORF') |
221 if exonboundaries: | 242 if exonboundaries: |
222 info.append('EXONBND') | 243 info.append('EXONBND') |
244 if interchromosomal: | |
245 info.append('INTERCHROM') | |
246 if read_through: | |
247 info.append('READTHROUGH') | |
248 if adjacent: | |
249 info.append('ADJACENT') | |
250 if altsplice: | |
251 info.append('ALTSPLICE') | |
252 if deletion: | |
253 info.append('DELETION') | |
254 if eversion: | |
255 info.append('EVERSION') | |
256 if inversion: | |
257 info.append('INVERSION') | |
223 info1 = [svtype,'MATEID=%s' % mate_id2] + info | 258 info1 = [svtype,'MATEID=%s' % mate_id2] + info |
224 info2 = [svtype,'MATEID=%s' % mate_id1] + info | 259 info2 = [svtype,'MATEID=%s' % mate_id1] + info |
225 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.' | 260 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.' |
226 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) ) | 261 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) ) |
227 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) ) | 262 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) ) |