diff ensembl_variant_report.py @ 5:9e83cc05d384 draft

Uploaded
author jjohnson
date Mon, 06 Feb 2017 09:25:43 -0500
parents c3a9e63e8c51
children d9fad18ffdb1
line wrap: on
line diff
--- a/ensembl_variant_report.py	Fri Jan 06 17:43:29 2017 -0500
+++ b/ensembl_variant_report.py	Mon Feb 06 09:25:43 2017 -0500
@@ -92,10 +92,12 @@
             chrom = fields[ci]
             pos = int(fields[pi])
             ref = fields[ri]
-            alt = fields[ai]
+            alts = fields[ai]
             dp = int(fields[di])
             dpr = [int(x) for x in fields[fi].split(',')]
-            yield (transcript,pos,ref,alt,dp,dpr)
+            for i,alt in enumerate(alts.split(',')):
+                freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+                yield (transcript,pos,ref,alt,dp,freq)
 
     def parse_snpeff_vcf():
         for linenum,line in enumerate(inputFile):
@@ -108,6 +110,7 @@
                 fields = line.strip('\r\n').split('\t')
                 if options.debug: print >> sys.stderr, "\n%s" % (fields)
                 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8]
+                alt_list = alts.split(',')
                 pos = int(pos)
                 qual = float(qual)
                 for info_item in info.split(';'):
@@ -125,7 +128,9 @@
                             elif key == 'EFF':
                                 (eff, effs) = effect.rstrip(')').split('(')
                                 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11]
-                            yield (transcript,pos,ref,alt,dp,dpr)
+                            i = alt_list.index(alt) if alt in alt_list else 0
+                            freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+                            yield (transcript,pos,ref,alt,dp,freq)
 
 
     #Process gene model
@@ -144,14 +149,17 @@
             exit(2)
     try:
         parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
-        for tid,pos1,ref,alt,dp,dpr in parse_input():
-            freq = float(dpr[1])/float(sum(dpr)) if dpr else None
+        for tid,pos1,ref,alt,dp,freq in parse_input():
+            if not tid:
+                continue
             if options.min_depth and dp is not None and dp < options.min_depth:
                 continue
             if options.min_freq and freq is not None and freq < options.min_freq:
                 continue
             ## transcript_id, pos, ref, alt, dp, dpr
             tx = ens_ref.get_gtf_transcript(tid)
+            if not tx:
+                continue
             coding = ens_ref.transcript_is_coding(tid)
             if not coding:
                 continue
@@ -160,8 +168,9 @@
             pos0 = pos1 - 1 # zero based position
             spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
             alt_seq = alt if tx.gene.strand else reverse_complement(alt)
+            ref_seq = ref if tx.gene.strand else reverse_complement(ref)
             cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
-            alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):]
+            alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' 
             offset = 0
             if tx.gene.strand:
                 for i in range(min(len(ref),len(alt))):
@@ -180,7 +189,7 @@
             peplen = len(pep)
             aa_pos = (cds_pos + offset) / 3
             if aa_pos >= len(pep):
-                print >> sys.stderr, "%d: %s\n" % (linenum,line)
+                print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
                 continue
             if frame_shift:
                 #find stop_codons
@@ -212,8 +221,8 @@
             gene = tx.gene.names[0]
             report_fields = [tx.gene.names[0],
              '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
-             ref,
-             alt,
+             ref_seq,
+             alt_seq,
              "%1.2f" % freq if freq is not None else '',
              str(dp),
              "%s|%s" % (tx.gene.gene_id,tx.cdna_id),