ensembl_variant_report: ensembl_variant

comparison ensembl_variant_report.py @ 5:9e83cc05d384 draft

Uploaded

author	jjohnson
date	Mon, 06 Feb 2017 09:25:43 -0500
parents	c3a9e63e8c51
children	d9fad18ffdb1

comparison

equal deleted inserted replaced

-:908c68e3c73f
+:9e83cc05d384
 print >> sys.stderr, "%d: %s\n" % (linenum,line)
 continue
 chrom = fields[ci]
 pos = int(fields[pi])
 ref = fields[ri]
-alt = fields[ai]
+alts = fields[ai]
 dp = int(fields[di])
 dpr = [int(x) for x in fields[fi].split(',')]
-yield (transcript,pos,ref,alt,dp,dpr)
+for i,alt in enumerate(alts.split(',')):
+freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+yield (transcript,pos,ref,alt,dp,freq)
 def parse_snpeff_vcf():
 for linenum,line in enumerate(inputFile):
 if line.startswith('##'):
 if line.find('SnpEffVersion=') > 0:
 pass
 else:
 fields = line.strip('\r\n').split('\t')
 if options.debug: print >> sys.stderr, "\n%s" % (fields)
 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8]
+alt_list = alts.split(',')
 pos = int(pos)
 qual = float(qual)
 for info_item in info.split(';'):
 if info_item.find('=') < 0: continue
 (key, val) = info_item.split('=', 1)
 if key == 'ANN':
 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|')
 elif key == 'EFF':
 (eff, effs) = effect.rstrip(')').split('(')
 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11]
-yield (transcript,pos,ref,alt,dp,dpr)
+i = alt_list.index(alt) if alt in alt_list else 0
+freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+yield (transcript,pos,ref,alt,dp,freq)
 #Process gene model
 ens_ref = None
 if options.gene_model != None:
 except Exception, e:
 print >> sys.stderr, "Parsing gene model failed: %s" % e
 exit(2)
 try:
 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
-for tid,pos1,ref,alt,dp,dpr in parse_input():
+for tid,pos1,ref,alt,dp,freq in parse_input():
-freq = float(dpr[1])/float(sum(dpr)) if dpr else None
+if not tid:
+continue
 if options.min_depth and dp is not None and dp < options.min_depth:
 continue
 if options.min_freq and freq is not None and freq < options.min_freq:
 continue
 ## transcript_id, pos, ref, alt, dp, dpr
 tx = ens_ref.get_gtf_transcript(tid)
+if not tx:
+continue
 coding = ens_ref.transcript_is_coding(tid)
 if not coding:
 continue
 frame_shift = len(ref) != len(alt)
 cds = ens_ref.get_cds(tid)
 pos0 = pos1 - 1 # zero based position
 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
 alt_seq = alt if tx.gene.strand else reverse_complement(alt)
+ref_seq = ref if tx.gene.strand else reverse_complement(ref)
 cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
-alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):]
+alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else ''
 offset = 0
 if tx.gene.strand:
 for i in range(min(len(ref),len(alt))):
 if ref[i] == alt[i]:
 offset = i
 refpep = translate(cds[:len(cds)/3*3])
 pep = translate(alt_cds[:len(alt_cds)/3*3])
 peplen = len(pep)
 aa_pos = (cds_pos + offset) / 3
 if aa_pos >= len(pep):
-print >> sys.stderr, "%d: %s\n" % (linenum,line)
+print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
 continue
 if frame_shift:
 #find stop_codons
 nstops = 0
 stop_codons = []
 alt_codon = alt_cds[cs_pos:cs_pos+3]
 alt_aa = translate(alt_codon)
 gene = tx.gene.names[0]
 report_fields = [tx.gene.names[0],
 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
-ref,
+ref_seq,
-alt,
+alt_seq,
 "%1.2f" % freq if freq is not None else '',
 str(dp),
 "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
 "%d" % (aa_pos + 1),
 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),

Mercurial > repos > jjohnson > ensembl_variant_report

comparison ensembl_variant_report.py @ 5:9e83cc05d384 draft