Mercurial > repos > jjohnson > ensembl_variant_report
diff ensembl_variant_report.py @ 5:9e83cc05d384 draft
Uploaded
author | jjohnson |
---|---|
date | Mon, 06 Feb 2017 09:25:43 -0500 |
parents | c3a9e63e8c51 |
children | d9fad18ffdb1 |
line wrap: on
line diff
--- a/ensembl_variant_report.py Fri Jan 06 17:43:29 2017 -0500 +++ b/ensembl_variant_report.py Mon Feb 06 09:25:43 2017 -0500 @@ -92,10 +92,12 @@ chrom = fields[ci] pos = int(fields[pi]) ref = fields[ri] - alt = fields[ai] + alts = fields[ai] dp = int(fields[di]) dpr = [int(x) for x in fields[fi].split(',')] - yield (transcript,pos,ref,alt,dp,dpr) + for i,alt in enumerate(alts.split(',')): + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) def parse_snpeff_vcf(): for linenum,line in enumerate(inputFile): @@ -108,6 +110,7 @@ fields = line.strip('\r\n').split('\t') if options.debug: print >> sys.stderr, "\n%s" % (fields) (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] + alt_list = alts.split(',') pos = int(pos) qual = float(qual) for info_item in info.split(';'): @@ -125,7 +128,9 @@ elif key == 'EFF': (eff, effs) = effect.rstrip(')').split('(') (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] - yield (transcript,pos,ref,alt,dp,dpr) + i = alt_list.index(alt) if alt in alt_list else 0 + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) #Process gene model @@ -144,14 +149,17 @@ exit(2) try: parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf - for tid,pos1,ref,alt,dp,dpr in parse_input(): - freq = float(dpr[1])/float(sum(dpr)) if dpr else None + for tid,pos1,ref,alt,dp,freq in parse_input(): + if not tid: + continue if options.min_depth and dp is not None and dp < options.min_depth: continue if options.min_freq and freq is not None and freq < options.min_freq: continue ## transcript_id, pos, ref, alt, dp, dpr tx = ens_ref.get_gtf_transcript(tid) + if not tx: + continue coding = ens_ref.transcript_is_coding(tid) if not coding: continue @@ -160,8 +168,9 @@ pos0 = pos1 - 1 # zero based position spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 alt_seq = alt if tx.gene.strand else reverse_complement(alt) + ref_seq = ref if tx.gene.strand else reverse_complement(ref) cds_pos = ens_ref.genome_to_cds_pos(tid, spos) - alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] + alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' offset = 0 if tx.gene.strand: for i in range(min(len(ref),len(alt))): @@ -180,7 +189,7 @@ peplen = len(pep) aa_pos = (cds_pos + offset) / 3 if aa_pos >= len(pep): - print >> sys.stderr, "%d: %s\n" % (linenum,line) + print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) continue if frame_shift: #find stop_codons @@ -212,8 +221,8 @@ gene = tx.gene.names[0] report_fields = [tx.gene.names[0], '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), - ref, - alt, + ref_seq, + alt_seq, "%1.2f" % freq if freq is not None else '', str(dp), "%s|%s" % (tx.gene.gene_id,tx.cdna_id),