# HG changeset patch # User jjohnson # Date 1486391143 18000 # Node ID 9e83cc05d384b73ed5c2f19660dde78969fa4a54 # Parent 908c68e3c73f6cc9825af3756383df25385db717 Uploaded diff -r 908c68e3c73f -r 9e83cc05d384 ._ensembl_variant_report.py Binary file ._ensembl_variant_report.py has changed diff -r 908c68e3c73f -r 9e83cc05d384 ._ensembl_variant_report.xml Binary file ._ensembl_variant_report.xml has changed diff -r 908c68e3c73f -r 9e83cc05d384 ._ensemblref.py Binary file ._ensemblref.py has changed diff -r 908c68e3c73f -r 9e83cc05d384 ensembl_variant_report.py --- a/ensembl_variant_report.py Fri Jan 06 17:43:29 2017 -0500 +++ b/ensembl_variant_report.py Mon Feb 06 09:25:43 2017 -0500 @@ -92,10 +92,12 @@ chrom = fields[ci] pos = int(fields[pi]) ref = fields[ri] - alt = fields[ai] + alts = fields[ai] dp = int(fields[di]) dpr = [int(x) for x in fields[fi].split(',')] - yield (transcript,pos,ref,alt,dp,dpr) + for i,alt in enumerate(alts.split(',')): + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) def parse_snpeff_vcf(): for linenum,line in enumerate(inputFile): @@ -108,6 +110,7 @@ fields = line.strip('\r\n').split('\t') if options.debug: print >> sys.stderr, "\n%s" % (fields) (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] + alt_list = alts.split(',') pos = int(pos) qual = float(qual) for info_item in info.split(';'): @@ -125,7 +128,9 @@ elif key == 'EFF': (eff, effs) = effect.rstrip(')').split('(') (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] - yield (transcript,pos,ref,alt,dp,dpr) + i = alt_list.index(alt) if alt in alt_list else 0 + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) #Process gene model @@ -144,14 +149,17 @@ exit(2) try: parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf - for tid,pos1,ref,alt,dp,dpr in parse_input(): - freq = float(dpr[1])/float(sum(dpr)) if dpr else None + for tid,pos1,ref,alt,dp,freq in parse_input(): + if not tid: + continue if options.min_depth and dp is not None and dp < options.min_depth: continue if options.min_freq and freq is not None and freq < options.min_freq: continue ## transcript_id, pos, ref, alt, dp, dpr tx = ens_ref.get_gtf_transcript(tid) + if not tx: + continue coding = ens_ref.transcript_is_coding(tid) if not coding: continue @@ -160,8 +168,9 @@ pos0 = pos1 - 1 # zero based position spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 alt_seq = alt if tx.gene.strand else reverse_complement(alt) + ref_seq = ref if tx.gene.strand else reverse_complement(ref) cds_pos = ens_ref.genome_to_cds_pos(tid, spos) - alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] + alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' offset = 0 if tx.gene.strand: for i in range(min(len(ref),len(alt))): @@ -180,7 +189,7 @@ peplen = len(pep) aa_pos = (cds_pos + offset) / 3 if aa_pos >= len(pep): - print >> sys.stderr, "%d: %s\n" % (linenum,line) + print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) continue if frame_shift: #find stop_codons @@ -212,8 +221,8 @@ gene = tx.gene.names[0] report_fields = [tx.gene.names[0], '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), - ref, - alt, + ref_seq, + alt_seq, "%1.2f" % freq if freq is not None else '', str(dp), "%s|%s" % (tx.gene.gene_id,tx.cdna_id), diff -r 908c68e3c73f -r 9e83cc05d384 ensembl_variant_report.xml --- a/ensembl_variant_report.xml Fri Jan 06 17:43:29 2017 -0500 +++ b/ensembl_variant_report.xml Mon Feb 06 09:25:43 2017 -0500 @@ -20,12 +20,12 @@ --format=snpeff #else --input="$variant.input_tsv" - --pos_column=$pos_column - --ref_column=$ref_column - --alt_column=$alt_column - --transcript_column=$transcript_column - --dp_column=$dp_column - --dpr_column=$dpr_column + --pos_column=$variant.pos_column + --ref_column=$variant.ref_column + --alt_column=$variant.alt_column + --transcript_column=$variant.transcript_column + --dp_column=$variant.dp_column + --dpr_column=$variant.dpr_column #end if #if str($filter.min_depth) != '': --min_depth=$filter.min_depth @@ -46,7 +46,7 @@ ]]> - + @@ -54,7 +54,7 @@ - + @@ -69,7 +69,7 @@ - + diff -r 908c68e3c73f -r 9e83cc05d384 ensemblref.py --- a/ensemblref.py Fri Jan 06 17:43:29 2017 -0500 +++ b/ensemblref.py Mon Feb 06 09:25:43 2017 -0500 @@ -47,9 +47,15 @@ return self.name_idx - def get_gtf_transcript(self,transcript_id): + def get_gtf_transcript(self,name): idx = self.get_transcript_idx() - return idx[transcript_id] if transcript_id in idx else None + if name in idx: + return idx[name] + else: + nidx = self.get_name_idx() + if name in nidx: + return nidx[name] + return None def transcript_is_coding(self,transcript_id): @@ -62,34 +68,15 @@ return tx.start_codons[0] if len(tx.start_codons) > 0 else None - def get_bed_line(self,transcript_id,coding=False): + def get_bed_line(self,transcript_id,score=0,itemRgb='0,0,0',coding=False): tx = self.get_transcript_idx()[transcript_id] chrom = tx.gene.contig chromStart = tx.coding_beg if coding else tx.beg chromEnd = tx.coding_end if coding else tx.end name = transcript_id - score = 0 strand = '+' if tx.gene.strand else '-' thickStart = tx.coding_beg if tx.coding_beg else chromStart thickEnd = tx.coding_end if tx.coding_end else chromEnd - itemRgb = '0,0,0' - exons = tx.get_coding_exons() if coding else tx.get_exons() - blockCount = len(exons) - if tx.gene.strand: - strand = '+' - blockSizes = [abs(e-s) for s,e in exons] - - def get_bed_line(self,transcript_id,coding=False): - tx = self.get_transcript_idx()[transcript_id] - chrom = tx.gene.contig - chromStart = tx.coding_beg if coding else tx.beg - chromEnd = tx.coding_end if coding else tx.end - name = transcript_id - score = 0 - strand = '+' if tx.gene.strand else '-' - thickStart = tx.coding_beg if tx.coding_beg else chromStart - thickEnd = tx.coding_end if tx.coding_end else chromEnd - itemRgb = '0,0,0' exons = tx.get_coding_exons() if coding else tx.get_exons() blockCount = len(exons) if tx.gene.strand: diff -r 908c68e3c73f -r 9e83cc05d384 test-data/._.DS_Store Binary file test-data/._.DS_Store has changed