Mercurial > repos > jjohnson > ensembl_variant_report
changeset 5:9e83cc05d384 draft
Uploaded
author | jjohnson |
---|---|
date | Mon, 06 Feb 2017 09:25:43 -0500 |
parents | 908c68e3c73f |
children | d9fad18ffdb1 |
files | ._ensembl_variant_report.py ._ensembl_variant_report.xml ._ensemblref.py ensembl_variant_report.py ensembl_variant_report.xml ensemblref.py test-data/._.DS_Store |
diffstat | 7 files changed, 36 insertions(+), 40 deletions(-) [+] |
line wrap: on
line diff
--- a/ensembl_variant_report.py Fri Jan 06 17:43:29 2017 -0500 +++ b/ensembl_variant_report.py Mon Feb 06 09:25:43 2017 -0500 @@ -92,10 +92,12 @@ chrom = fields[ci] pos = int(fields[pi]) ref = fields[ri] - alt = fields[ai] + alts = fields[ai] dp = int(fields[di]) dpr = [int(x) for x in fields[fi].split(',')] - yield (transcript,pos,ref,alt,dp,dpr) + for i,alt in enumerate(alts.split(',')): + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) def parse_snpeff_vcf(): for linenum,line in enumerate(inputFile): @@ -108,6 +110,7 @@ fields = line.strip('\r\n').split('\t') if options.debug: print >> sys.stderr, "\n%s" % (fields) (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] + alt_list = alts.split(',') pos = int(pos) qual = float(qual) for info_item in info.split(';'): @@ -125,7 +128,9 @@ elif key == 'EFF': (eff, effs) = effect.rstrip(')').split('(') (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] - yield (transcript,pos,ref,alt,dp,dpr) + i = alt_list.index(alt) if alt in alt_list else 0 + freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None + yield (transcript,pos,ref,alt,dp,freq) #Process gene model @@ -144,14 +149,17 @@ exit(2) try: parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf - for tid,pos1,ref,alt,dp,dpr in parse_input(): - freq = float(dpr[1])/float(sum(dpr)) if dpr else None + for tid,pos1,ref,alt,dp,freq in parse_input(): + if not tid: + continue if options.min_depth and dp is not None and dp < options.min_depth: continue if options.min_freq and freq is not None and freq < options.min_freq: continue ## transcript_id, pos, ref, alt, dp, dpr tx = ens_ref.get_gtf_transcript(tid) + if not tx: + continue coding = ens_ref.transcript_is_coding(tid) if not coding: continue @@ -160,8 +168,9 @@ pos0 = pos1 - 1 # zero based position spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 alt_seq = alt if tx.gene.strand else reverse_complement(alt) + ref_seq = ref if tx.gene.strand else reverse_complement(ref) cds_pos = ens_ref.genome_to_cds_pos(tid, spos) - alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] + alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' offset = 0 if tx.gene.strand: for i in range(min(len(ref),len(alt))): @@ -180,7 +189,7 @@ peplen = len(pep) aa_pos = (cds_pos + offset) / 3 if aa_pos >= len(pep): - print >> sys.stderr, "%d: %s\n" % (linenum,line) + print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) continue if frame_shift: #find stop_codons @@ -212,8 +221,8 @@ gene = tx.gene.names[0] report_fields = [tx.gene.names[0], '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), - ref, - alt, + ref_seq, + alt_seq, "%1.2f" % freq if freq is not None else '', str(dp), "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
--- a/ensembl_variant_report.xml Fri Jan 06 17:43:29 2017 -0500 +++ b/ensembl_variant_report.xml Mon Feb 06 09:25:43 2017 -0500 @@ -20,12 +20,12 @@ --format=snpeff #else --input="$variant.input_tsv" - --pos_column=$pos_column - --ref_column=$ref_column - --alt_column=$alt_column - --transcript_column=$transcript_column - --dp_column=$dp_column - --dpr_column=$dpr_column + --pos_column=$variant.pos_column + --ref_column=$variant.ref_column + --alt_column=$variant.alt_column + --transcript_column=$variant.transcript_column + --dp_column=$variant.dp_column + --dpr_column=$variant.dpr_column #end if #if str($filter.min_depth) != '': --min_depth=$filter.min_depth @@ -46,7 +46,7 @@ ]]></command> <inputs> <conditional name="variant"> - <param name="fmt" type="select" label=""> + <param name="fmt" type="select" label="Input format for variants"> <option value="vcf">snpEff vcf</option> <option value="tsv">tabular from snpsift extract</option> </param> @@ -54,7 +54,7 @@ <param name="input_vcf" type="data" format="vcf" label="snpEff VCF file"/> </when> <when value="tsv"> - <param name="input_tsv" type="data" format="vcf" label="tabular file"/> + <param name="input_tsv" type="data" format="tabular" label="tabular file"/> <param name="pos_column" type="data_column" data_ref="input_tsv" label="POS column"/> <param name="ref_column" type="data_column" data_ref="input_tsv" label="REF column"/> <param name="alt_column" type="data_column" data_ref="input_tsv" label="ALT column"/> @@ -69,7 +69,7 @@ <option value="history">History</option> </param> <when value="cached"> - <param name="ref_loc" type="data" format="twobit" label="Select reference 2bit file"> + <param name="ref_loc" type="select" label="Select reference 2bit file"> <options from_data_table="twobit" /> </param> </when>
--- a/ensemblref.py Fri Jan 06 17:43:29 2017 -0500 +++ b/ensemblref.py Mon Feb 06 09:25:43 2017 -0500 @@ -47,9 +47,15 @@ return self.name_idx - def get_gtf_transcript(self,transcript_id): + def get_gtf_transcript(self,name): idx = self.get_transcript_idx() - return idx[transcript_id] if transcript_id in idx else None + if name in idx: + return idx[name] + else: + nidx = self.get_name_idx() + if name in nidx: + return nidx[name] + return None def transcript_is_coding(self,transcript_id): @@ -62,34 +68,15 @@ return tx.start_codons[0] if len(tx.start_codons) > 0 else None - def get_bed_line(self,transcript_id,coding=False): + def get_bed_line(self,transcript_id,score=0,itemRgb='0,0,0',coding=False): tx = self.get_transcript_idx()[transcript_id] chrom = tx.gene.contig chromStart = tx.coding_beg if coding else tx.beg chromEnd = tx.coding_end if coding else tx.end name = transcript_id - score = 0 strand = '+' if tx.gene.strand else '-' thickStart = tx.coding_beg if tx.coding_beg else chromStart thickEnd = tx.coding_end if tx.coding_end else chromEnd - itemRgb = '0,0,0' - exons = tx.get_coding_exons() if coding else tx.get_exons() - blockCount = len(exons) - if tx.gene.strand: - strand = '+' - blockSizes = [abs(e-s) for s,e in exons] - - def get_bed_line(self,transcript_id,coding=False): - tx = self.get_transcript_idx()[transcript_id] - chrom = tx.gene.contig - chromStart = tx.coding_beg if coding else tx.beg - chromEnd = tx.coding_end if coding else tx.end - name = transcript_id - score = 0 - strand = '+' if tx.gene.strand else '-' - thickStart = tx.coding_beg if tx.coding_beg else chromStart - thickEnd = tx.coding_end if tx.coding_end else chromEnd - itemRgb = '0,0,0' exons = tx.get_coding_exons() if coding else tx.get_exons() blockCount = len(exons) if tx.gene.strand: