Mercurial > repos > jjohnson > ensembl_variant_report
comparison ensembl_variant_report.py @ 7:d5cb252c68da draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit b62e927469f96a857e880c77530c50c885ad92fc-dirty
author | jjohnson |
---|---|
date | Thu, 14 Jun 2018 18:07:10 -0400 |
parents | d9fad18ffdb1 |
children | fd612f8119a2 |
comparison
equal
deleted
inserted
replaced
6:d9fad18ffdb1 | 7:d5cb252c68da |
---|---|
150 print >> sys.stderr, "Parsing gene model failed: %s" % e | 150 print >> sys.stderr, "Parsing gene model failed: %s" % e |
151 exit(2) | 151 exit(2) |
152 try: | 152 try: |
153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf | 153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf |
154 for tid,pos1,ref,alt,dp,freq in parse_input(): | 154 for tid,pos1,ref,alt,dp,freq in parse_input(): |
155 if options.debug: print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f" % (tid,pos1,ref,alt,dp,freq) | |
155 if not tid: | 156 if not tid: |
156 continue | 157 continue |
157 if options.min_depth and dp is not None and dp < options.min_depth: | 158 if options.min_depth and dp is not None and dp < options.min_depth: |
158 continue | 159 continue |
159 if options.min_freq and freq is not None and freq < options.min_freq: | 160 if options.min_freq and freq is not None and freq < options.min_freq: |
160 continue | 161 continue |
161 ## transcript_id, pos, ref, alt, dp, dpr | 162 try: |
162 tx = ens_ref.get_gtf_transcript(tid) | 163 ## transcript_id, pos, ref, alt, dp, dpr |
163 if not tx: | 164 tx = ens_ref.get_gtf_transcript(tid) |
164 continue | 165 if not tx and tid.find('.') > 0: |
165 coding = ens_ref.transcript_is_coding(tid) | 166 tid = tid.split('.')[0] |
166 if not coding: | 167 tx = ens_ref.get_gtf_transcript(tid) |
167 continue | 168 if not tx: |
168 frame_shift = len(ref) != len(alt) | 169 continue |
169 cds = ens_ref.get_cds(tid) | 170 if options.debug: print >> sys.stderr, "%s" % (tx) |
170 pos0 = pos1 - 1 # zero based position | 171 coding = ens_ref.transcript_is_coding(tid) |
171 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 | 172 if not coding: |
172 alt_seq = alt if tx.gene.strand else reverse_complement(alt) | 173 continue |
173 ref_seq = ref if tx.gene.strand else reverse_complement(ref) | 174 frame_shift = len(ref) != len(alt) |
174 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) | 175 cds = ens_ref.get_cds(tid) |
175 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' | 176 if options.debug or not cds: print >> sys.stderr, "cds:\n%s" % (cds) |
176 offset = 0 | 177 pos0 = pos1 - 1 # zero based position |
177 if tx.gene.strand: | 178 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 |
178 for i in range(min(len(ref),len(alt))): | 179 alt_seq = alt if tx.gene.strand else reverse_complement(alt) |
179 if ref[i] == alt[i]: | 180 ref_seq = ref if tx.gene.strand else reverse_complement(ref) |
180 offset = i | 181 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) |
181 else: | 182 if options.debug: print >> sys.stderr, "cds_pos: %s" % (str(cds_pos)) |
182 break | 183 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' |
183 else: | 184 offset = 0 |
184 for i in range(-1,-min(len(ref),len(alt)) -1,-1): | 185 if tx.gene.strand: |
185 if ref[i] == alt[i]: | 186 for i in range(min(len(ref),len(alt))): |
186 offset = i | 187 if ref[i] == alt[i]: |
187 else: | 188 offset = i |
188 break | 189 else: |
189 refpep = translate(cds[:len(cds)/3*3]) | |
190 pep = translate(alt_cds[:len(alt_cds)/3*3]) | |
191 peplen = len(pep) | |
192 aa_pos = (cds_pos + offset) / 3 | |
193 if aa_pos >= len(pep): | |
194 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) | |
195 continue | |
196 if frame_shift: | |
197 #find stop_codons | |
198 nstops = 0 | |
199 stop_codons = [] | |
200 for i in range(aa_pos,peplen): | |
201 if refpep[i] != pep[i]: | |
202 aa_pos = i | |
203 break | |
204 for i in range(aa_pos,peplen): | |
205 if pep[i] == '*': | |
206 nstops += 1 | |
207 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else '')) | |
208 if nstops > options.readthrough: | |
209 reported_peptide = pep[aa_pos:i+1] | |
210 reported_stop_codon = ','.join(stop_codons) | |
211 break | 190 break |
212 else: | 191 else: |
213 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) | 192 for i in range(-1,-min(len(ref),len(alt)) -1,-1): |
214 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], | 193 if ref[i] == alt[i]: |
215 pep[aa_pos], | 194 offset = i |
216 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) | 195 else: |
217 cs_pos = aa_pos * 3 | 196 break |
218 aa_pos = (cds_pos + offset) / 3 | 197 refpep = translate(cds[:len(cds)/3*3]) |
219 ref_codon = cds[cs_pos:cs_pos+3] | 198 pep = translate(alt_cds[:len(alt_cds)/3*3]) |
220 ref_aa = translate(ref_codon) | 199 peplen = len(pep) |
221 alt_codon = alt_cds[cs_pos:cs_pos+3] | 200 aa_pos = (cds_pos + offset) / 3 |
222 alt_aa = translate(alt_codon) | 201 reported_stop_codon = '' |
223 gene = tx.gene.names[0] | 202 if aa_pos >= len(pep): |
224 report_fields = [tx.gene.names[0], | 203 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s" % (aa_pos,len(pep),tid,pos1,ref,alt) |
225 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | 204 continue |
226 ref_seq, | 205 if frame_shift: |
227 alt_seq, | 206 #find stop_codons |
228 "%1.2f" % freq if freq is not None else '', | 207 nstops = 0 |
229 str(dp), | 208 stop_codons = [] |
230 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | 209 for i in range(aa_pos,peplen): |
231 "%d" % (aa_pos + 1), | 210 if refpep[i] != pep[i]: |
232 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | 211 aa_pos = i |
233 "%d" % len(pep), | 212 break |
234 reported_stop_codon, | 213 reported_peptide = pep[aa_pos:] |
235 reported_peptide | 214 for i in range(aa_pos,peplen): |
236 ] | 215 if pep[i] == '*': |
237 if options.debug: | 216 nstops += 1 |
238 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) | 217 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else '')) |
239 outputFile.write('\t'.join(report_fields)) | 218 if nstops > options.readthrough: |
240 if options.debug: | 219 reported_peptide = pep[aa_pos:i+1] |
241 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( | 220 reported_stop_codon = ','.join(stop_codons) |
242 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], | 221 break |
243 translate(cds[cs_pos-6:cs_pos+15]), | 222 else: |
244 translate(alt_cds[cs_pos-6:cs_pos+15]), | 223 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) |
245 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) | 224 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], |
246 outputFile.write('\n') | 225 pep[aa_pos], |
226 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) | |
227 cs_pos = aa_pos * 3 | |
228 aa_pos = (cds_pos + offset) / 3 | |
229 ref_codon = cds[cs_pos:cs_pos+3] | |
230 ref_aa = translate(ref_codon) | |
231 alt_codon = alt_cds[cs_pos:cs_pos+3] | |
232 alt_aa = translate(alt_codon) | |
233 gene = tx.gene.names[0] | |
234 report_fields = [tx.gene.names[0], | |
235 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | |
236 ref_seq, | |
237 alt_seq, | |
238 "%1.2f" % freq if freq is not None else '', | |
239 str(dp), | |
240 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | |
241 "%d" % (aa_pos + 1), | |
242 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | |
243 "%d" % len(pep), | |
244 reported_stop_codon, | |
245 reported_peptide, | |
246 tx.get_transcript_type_name() | |
247 ] | |
248 if options.debug: | |
249 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) | |
250 outputFile.write('\t'.join(report_fields)) | |
251 if options.debug: | |
252 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( | |
253 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], | |
254 translate(cds[cs_pos-6:cs_pos+15]), | |
255 translate(alt_cds[cs_pos-6:cs_pos+15]), | |
256 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) | |
257 outputFile.write('\n') | |
258 except Exception, e: | |
259 print >> sys.stderr, "failed: %s" % e | |
260 print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f\t%s" % (tid,pos1,ref,alt,dp,freq,e) | |
247 except Exception, e: | 261 except Exception, e: |
248 print >> sys.stderr, "failed: %s" % e | 262 print >> sys.stderr, "failed: %s" % e |
249 exit(1) | 263 exit(1) |
250 | 264 |
251 | 265 |