Mercurial > repos > jjohnson > ensembl_variant_report
comparison ensembl_variant_report.py @ 5:9e83cc05d384 draft
Uploaded
author | jjohnson |
---|---|
date | Mon, 06 Feb 2017 09:25:43 -0500 |
parents | c3a9e63e8c51 |
children | d9fad18ffdb1 |
comparison
equal
deleted
inserted
replaced
4:908c68e3c73f | 5:9e83cc05d384 |
---|---|
90 print >> sys.stderr, "%d: %s\n" % (linenum,line) | 90 print >> sys.stderr, "%d: %s\n" % (linenum,line) |
91 continue | 91 continue |
92 chrom = fields[ci] | 92 chrom = fields[ci] |
93 pos = int(fields[pi]) | 93 pos = int(fields[pi]) |
94 ref = fields[ri] | 94 ref = fields[ri] |
95 alt = fields[ai] | 95 alts = fields[ai] |
96 dp = int(fields[di]) | 96 dp = int(fields[di]) |
97 dpr = [int(x) for x in fields[fi].split(',')] | 97 dpr = [int(x) for x in fields[fi].split(',')] |
98 yield (transcript,pos,ref,alt,dp,dpr) | 98 for i,alt in enumerate(alts.split(',')): |
99 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None | |
100 yield (transcript,pos,ref,alt,dp,freq) | |
99 | 101 |
100 def parse_snpeff_vcf(): | 102 def parse_snpeff_vcf(): |
101 for linenum,line in enumerate(inputFile): | 103 for linenum,line in enumerate(inputFile): |
102 if line.startswith('##'): | 104 if line.startswith('##'): |
103 if line.find('SnpEffVersion=') > 0: | 105 if line.find('SnpEffVersion=') > 0: |
106 pass | 108 pass |
107 else: | 109 else: |
108 fields = line.strip('\r\n').split('\t') | 110 fields = line.strip('\r\n').split('\t') |
109 if options.debug: print >> sys.stderr, "\n%s" % (fields) | 111 if options.debug: print >> sys.stderr, "\n%s" % (fields) |
110 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] | 112 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] |
113 alt_list = alts.split(',') | |
111 pos = int(pos) | 114 pos = int(pos) |
112 qual = float(qual) | 115 qual = float(qual) |
113 for info_item in info.split(';'): | 116 for info_item in info.split(';'): |
114 if info_item.find('=') < 0: continue | 117 if info_item.find('=') < 0: continue |
115 (key, val) = info_item.split('=', 1) | 118 (key, val) = info_item.split('=', 1) |
123 if key == 'ANN': | 126 if key == 'ANN': |
124 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') | 127 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') |
125 elif key == 'EFF': | 128 elif key == 'EFF': |
126 (eff, effs) = effect.rstrip(')').split('(') | 129 (eff, effs) = effect.rstrip(')').split('(') |
127 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] | 130 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] |
128 yield (transcript,pos,ref,alt,dp,dpr) | 131 i = alt_list.index(alt) if alt in alt_list else 0 |
132 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None | |
133 yield (transcript,pos,ref,alt,dp,freq) | |
129 | 134 |
130 | 135 |
131 #Process gene model | 136 #Process gene model |
132 ens_ref = None | 137 ens_ref = None |
133 if options.gene_model != None: | 138 if options.gene_model != None: |
142 except Exception, e: | 147 except Exception, e: |
143 print >> sys.stderr, "Parsing gene model failed: %s" % e | 148 print >> sys.stderr, "Parsing gene model failed: %s" % e |
144 exit(2) | 149 exit(2) |
145 try: | 150 try: |
146 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf | 151 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf |
147 for tid,pos1,ref,alt,dp,dpr in parse_input(): | 152 for tid,pos1,ref,alt,dp,freq in parse_input(): |
148 freq = float(dpr[1])/float(sum(dpr)) if dpr else None | 153 if not tid: |
154 continue | |
149 if options.min_depth and dp is not None and dp < options.min_depth: | 155 if options.min_depth and dp is not None and dp < options.min_depth: |
150 continue | 156 continue |
151 if options.min_freq and freq is not None and freq < options.min_freq: | 157 if options.min_freq and freq is not None and freq < options.min_freq: |
152 continue | 158 continue |
153 ## transcript_id, pos, ref, alt, dp, dpr | 159 ## transcript_id, pos, ref, alt, dp, dpr |
154 tx = ens_ref.get_gtf_transcript(tid) | 160 tx = ens_ref.get_gtf_transcript(tid) |
161 if not tx: | |
162 continue | |
155 coding = ens_ref.transcript_is_coding(tid) | 163 coding = ens_ref.transcript_is_coding(tid) |
156 if not coding: | 164 if not coding: |
157 continue | 165 continue |
158 frame_shift = len(ref) != len(alt) | 166 frame_shift = len(ref) != len(alt) |
159 cds = ens_ref.get_cds(tid) | 167 cds = ens_ref.get_cds(tid) |
160 pos0 = pos1 - 1 # zero based position | 168 pos0 = pos1 - 1 # zero based position |
161 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 | 169 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 |
162 alt_seq = alt if tx.gene.strand else reverse_complement(alt) | 170 alt_seq = alt if tx.gene.strand else reverse_complement(alt) |
171 ref_seq = ref if tx.gene.strand else reverse_complement(ref) | |
163 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) | 172 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) |
164 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] | 173 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' |
165 offset = 0 | 174 offset = 0 |
166 if tx.gene.strand: | 175 if tx.gene.strand: |
167 for i in range(min(len(ref),len(alt))): | 176 for i in range(min(len(ref),len(alt))): |
168 if ref[i] == alt[i]: | 177 if ref[i] == alt[i]: |
169 offset = i | 178 offset = i |
178 refpep = translate(cds[:len(cds)/3*3]) | 187 refpep = translate(cds[:len(cds)/3*3]) |
179 pep = translate(alt_cds[:len(alt_cds)/3*3]) | 188 pep = translate(alt_cds[:len(alt_cds)/3*3]) |
180 peplen = len(pep) | 189 peplen = len(pep) |
181 aa_pos = (cds_pos + offset) / 3 | 190 aa_pos = (cds_pos + offset) / 3 |
182 if aa_pos >= len(pep): | 191 if aa_pos >= len(pep): |
183 print >> sys.stderr, "%d: %s\n" % (linenum,line) | 192 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) |
184 continue | 193 continue |
185 if frame_shift: | 194 if frame_shift: |
186 #find stop_codons | 195 #find stop_codons |
187 nstops = 0 | 196 nstops = 0 |
188 stop_codons = [] | 197 stop_codons = [] |
210 alt_codon = alt_cds[cs_pos:cs_pos+3] | 219 alt_codon = alt_cds[cs_pos:cs_pos+3] |
211 alt_aa = translate(alt_codon) | 220 alt_aa = translate(alt_codon) |
212 gene = tx.gene.names[0] | 221 gene = tx.gene.names[0] |
213 report_fields = [tx.gene.names[0], | 222 report_fields = [tx.gene.names[0], |
214 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | 223 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), |
215 ref, | 224 ref_seq, |
216 alt, | 225 alt_seq, |
217 "%1.2f" % freq if freq is not None else '', | 226 "%1.2f" % freq if freq is not None else '', |
218 str(dp), | 227 str(dp), |
219 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | 228 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), |
220 "%d" % (aa_pos + 1), | 229 "%d" % (aa_pos + 1), |
221 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | 230 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), |