comparison ensembl_variant_report.py @ 5:9e83cc05d384 draft

Uploaded
author jjohnson
date Mon, 06 Feb 2017 09:25:43 -0500
parents c3a9e63e8c51
children d9fad18ffdb1
comparison
equal deleted inserted replaced
4:908c68e3c73f 5:9e83cc05d384
90 print >> sys.stderr, "%d: %s\n" % (linenum,line) 90 print >> sys.stderr, "%d: %s\n" % (linenum,line)
91 continue 91 continue
92 chrom = fields[ci] 92 chrom = fields[ci]
93 pos = int(fields[pi]) 93 pos = int(fields[pi])
94 ref = fields[ri] 94 ref = fields[ri]
95 alt = fields[ai] 95 alts = fields[ai]
96 dp = int(fields[di]) 96 dp = int(fields[di])
97 dpr = [int(x) for x in fields[fi].split(',')] 97 dpr = [int(x) for x in fields[fi].split(',')]
98 yield (transcript,pos,ref,alt,dp,dpr) 98 for i,alt in enumerate(alts.split(',')):
99 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
100 yield (transcript,pos,ref,alt,dp,freq)
99 101
100 def parse_snpeff_vcf(): 102 def parse_snpeff_vcf():
101 for linenum,line in enumerate(inputFile): 103 for linenum,line in enumerate(inputFile):
102 if line.startswith('##'): 104 if line.startswith('##'):
103 if line.find('SnpEffVersion=') > 0: 105 if line.find('SnpEffVersion=') > 0:
106 pass 108 pass
107 else: 109 else:
108 fields = line.strip('\r\n').split('\t') 110 fields = line.strip('\r\n').split('\t')
109 if options.debug: print >> sys.stderr, "\n%s" % (fields) 111 if options.debug: print >> sys.stderr, "\n%s" % (fields)
110 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] 112 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8]
113 alt_list = alts.split(',')
111 pos = int(pos) 114 pos = int(pos)
112 qual = float(qual) 115 qual = float(qual)
113 for info_item in info.split(';'): 116 for info_item in info.split(';'):
114 if info_item.find('=') < 0: continue 117 if info_item.find('=') < 0: continue
115 (key, val) = info_item.split('=', 1) 118 (key, val) = info_item.split('=', 1)
123 if key == 'ANN': 126 if key == 'ANN':
124 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') 127 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|')
125 elif key == 'EFF': 128 elif key == 'EFF':
126 (eff, effs) = effect.rstrip(')').split('(') 129 (eff, effs) = effect.rstrip(')').split('(')
127 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] 130 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11]
128 yield (transcript,pos,ref,alt,dp,dpr) 131 i = alt_list.index(alt) if alt in alt_list else 0
132 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
133 yield (transcript,pos,ref,alt,dp,freq)
129 134
130 135
131 #Process gene model 136 #Process gene model
132 ens_ref = None 137 ens_ref = None
133 if options.gene_model != None: 138 if options.gene_model != None:
142 except Exception, e: 147 except Exception, e:
143 print >> sys.stderr, "Parsing gene model failed: %s" % e 148 print >> sys.stderr, "Parsing gene model failed: %s" % e
144 exit(2) 149 exit(2)
145 try: 150 try:
146 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf 151 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
147 for tid,pos1,ref,alt,dp,dpr in parse_input(): 152 for tid,pos1,ref,alt,dp,freq in parse_input():
148 freq = float(dpr[1])/float(sum(dpr)) if dpr else None 153 if not tid:
154 continue
149 if options.min_depth and dp is not None and dp < options.min_depth: 155 if options.min_depth and dp is not None and dp < options.min_depth:
150 continue 156 continue
151 if options.min_freq and freq is not None and freq < options.min_freq: 157 if options.min_freq and freq is not None and freq < options.min_freq:
152 continue 158 continue
153 ## transcript_id, pos, ref, alt, dp, dpr 159 ## transcript_id, pos, ref, alt, dp, dpr
154 tx = ens_ref.get_gtf_transcript(tid) 160 tx = ens_ref.get_gtf_transcript(tid)
161 if not tx:
162 continue
155 coding = ens_ref.transcript_is_coding(tid) 163 coding = ens_ref.transcript_is_coding(tid)
156 if not coding: 164 if not coding:
157 continue 165 continue
158 frame_shift = len(ref) != len(alt) 166 frame_shift = len(ref) != len(alt)
159 cds = ens_ref.get_cds(tid) 167 cds = ens_ref.get_cds(tid)
160 pos0 = pos1 - 1 # zero based position 168 pos0 = pos1 - 1 # zero based position
161 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 169 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
162 alt_seq = alt if tx.gene.strand else reverse_complement(alt) 170 alt_seq = alt if tx.gene.strand else reverse_complement(alt)
171 ref_seq = ref if tx.gene.strand else reverse_complement(ref)
163 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) 172 cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
164 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] 173 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else ''
165 offset = 0 174 offset = 0
166 if tx.gene.strand: 175 if tx.gene.strand:
167 for i in range(min(len(ref),len(alt))): 176 for i in range(min(len(ref),len(alt))):
168 if ref[i] == alt[i]: 177 if ref[i] == alt[i]:
169 offset = i 178 offset = i
178 refpep = translate(cds[:len(cds)/3*3]) 187 refpep = translate(cds[:len(cds)/3*3])
179 pep = translate(alt_cds[:len(alt_cds)/3*3]) 188 pep = translate(alt_cds[:len(alt_cds)/3*3])
180 peplen = len(pep) 189 peplen = len(pep)
181 aa_pos = (cds_pos + offset) / 3 190 aa_pos = (cds_pos + offset) / 3
182 if aa_pos >= len(pep): 191 if aa_pos >= len(pep):
183 print >> sys.stderr, "%d: %s\n" % (linenum,line) 192 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
184 continue 193 continue
185 if frame_shift: 194 if frame_shift:
186 #find stop_codons 195 #find stop_codons
187 nstops = 0 196 nstops = 0
188 stop_codons = [] 197 stop_codons = []
210 alt_codon = alt_cds[cs_pos:cs_pos+3] 219 alt_codon = alt_cds[cs_pos:cs_pos+3]
211 alt_aa = translate(alt_codon) 220 alt_aa = translate(alt_codon)
212 gene = tx.gene.names[0] 221 gene = tx.gene.names[0]
213 report_fields = [tx.gene.names[0], 222 report_fields = [tx.gene.names[0],
214 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), 223 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
215 ref, 224 ref_seq,
216 alt, 225 alt_seq,
217 "%1.2f" % freq if freq is not None else '', 226 "%1.2f" % freq if freq is not None else '',
218 str(dp), 227 str(dp),
219 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), 228 "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
220 "%d" % (aa_pos + 1), 229 "%d" % (aa_pos + 1),
221 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), 230 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),