changeset 5:9e83cc05d384 draft

Uploaded
author jjohnson
date Mon, 06 Feb 2017 09:25:43 -0500
parents 908c68e3c73f
children d9fad18ffdb1
files ._ensembl_variant_report.py ._ensembl_variant_report.xml ._ensemblref.py ensembl_variant_report.py ensembl_variant_report.xml ensemblref.py test-data/._.DS_Store
diffstat 7 files changed, 36 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
Binary file ._ensembl_variant_report.py has changed
Binary file ._ensembl_variant_report.xml has changed
Binary file ._ensemblref.py has changed
--- a/ensembl_variant_report.py	Fri Jan 06 17:43:29 2017 -0500
+++ b/ensembl_variant_report.py	Mon Feb 06 09:25:43 2017 -0500
@@ -92,10 +92,12 @@
             chrom = fields[ci]
             pos = int(fields[pi])
             ref = fields[ri]
-            alt = fields[ai]
+            alts = fields[ai]
             dp = int(fields[di])
             dpr = [int(x) for x in fields[fi].split(',')]
-            yield (transcript,pos,ref,alt,dp,dpr)
+            for i,alt in enumerate(alts.split(',')):
+                freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+                yield (transcript,pos,ref,alt,dp,freq)
 
     def parse_snpeff_vcf():
         for linenum,line in enumerate(inputFile):
@@ -108,6 +110,7 @@
                 fields = line.strip('\r\n').split('\t')
                 if options.debug: print >> sys.stderr, "\n%s" % (fields)
                 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8]
+                alt_list = alts.split(',')
                 pos = int(pos)
                 qual = float(qual)
                 for info_item in info.split(';'):
@@ -125,7 +128,9 @@
                             elif key == 'EFF':
                                 (eff, effs) = effect.rstrip(')').split('(')
                                 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11]
-                            yield (transcript,pos,ref,alt,dp,dpr)
+                            i = alt_list.index(alt) if alt in alt_list else 0
+                            freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None
+                            yield (transcript,pos,ref,alt,dp,freq)
 
 
     #Process gene model
@@ -144,14 +149,17 @@
             exit(2)
     try:
         parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
-        for tid,pos1,ref,alt,dp,dpr in parse_input():
-            freq = float(dpr[1])/float(sum(dpr)) if dpr else None
+        for tid,pos1,ref,alt,dp,freq in parse_input():
+            if not tid:
+                continue
             if options.min_depth and dp is not None and dp < options.min_depth:
                 continue
             if options.min_freq and freq is not None and freq < options.min_freq:
                 continue
             ## transcript_id, pos, ref, alt, dp, dpr
             tx = ens_ref.get_gtf_transcript(tid)
+            if not tx:
+                continue
             coding = ens_ref.transcript_is_coding(tid)
             if not coding:
                 continue
@@ -160,8 +168,9 @@
             pos0 = pos1 - 1 # zero based position
             spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
             alt_seq = alt if tx.gene.strand else reverse_complement(alt)
+            ref_seq = ref if tx.gene.strand else reverse_complement(ref)
             cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
-            alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):]
+            alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' 
             offset = 0
             if tx.gene.strand:
                 for i in range(min(len(ref),len(alt))):
@@ -180,7 +189,7 @@
             peplen = len(pep)
             aa_pos = (cds_pos + offset) / 3
             if aa_pos >= len(pep):
-                print >> sys.stderr, "%d: %s\n" % (linenum,line)
+                print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
                 continue
             if frame_shift:
                 #find stop_codons
@@ -212,8 +221,8 @@
             gene = tx.gene.names[0]
             report_fields = [tx.gene.names[0],
              '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
-             ref,
-             alt,
+             ref_seq,
+             alt_seq,
              "%1.2f" % freq if freq is not None else '',
              str(dp),
              "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
--- a/ensembl_variant_report.xml	Fri Jan 06 17:43:29 2017 -0500
+++ b/ensembl_variant_report.xml	Mon Feb 06 09:25:43 2017 -0500
@@ -20,12 +20,12 @@
              --format=snpeff
          #else
              --input="$variant.input_tsv"
-            --pos_column=$pos_column
-            --ref_column=$ref_column
-            --alt_column=$alt_column
-            --transcript_column=$transcript_column
-            --dp_column=$dp_column
-            --dpr_column=$dpr_column
+            --pos_column=$variant.pos_column
+            --ref_column=$variant.ref_column
+            --alt_column=$variant.alt_column
+            --transcript_column=$variant.transcript_column
+            --dp_column=$variant.dp_column
+            --dpr_column=$variant.dpr_column
          #end if
          #if str($filter.min_depth) != '':
              --min_depth=$filter.min_depth
@@ -46,7 +46,7 @@
     ]]></command>
     <inputs>
         <conditional name="variant">
-            <param name="fmt" type="select" label="">
+            <param name="fmt" type="select" label="Input format for variants">
                 <option value="vcf">snpEff vcf</option>
                 <option value="tsv">tabular from snpsift extract</option>
             </param>
@@ -54,7 +54,7 @@
                 <param name="input_vcf" type="data" format="vcf" label="snpEff VCF file"/>
             </when>
             <when value="tsv">
-                <param name="input_tsv" type="data" format="vcf" label="tabular file"/>
+                <param name="input_tsv" type="data" format="tabular" label="tabular file"/>
                 <param name="pos_column" type="data_column" data_ref="input_tsv" label="POS column"/>
                 <param name="ref_column" type="data_column" data_ref="input_tsv" label="REF column"/>
                 <param name="alt_column" type="data_column" data_ref="input_tsv" label="ALT column"/>
@@ -69,7 +69,7 @@
                 <option value="history">History</option>
             </param>
             <when value="cached">
-                <param name="ref_loc" type="data" format="twobit" label="Select reference 2bit file">
+                <param name="ref_loc" type="select" label="Select reference 2bit file">
                     <options from_data_table="twobit" />
                 </param>
             </when>
--- a/ensemblref.py	Fri Jan 06 17:43:29 2017 -0500
+++ b/ensemblref.py	Mon Feb 06 09:25:43 2017 -0500
@@ -47,9 +47,15 @@
         return self.name_idx
 
 
-    def get_gtf_transcript(self,transcript_id):
+    def get_gtf_transcript(self,name):
         idx = self.get_transcript_idx()
-        return idx[transcript_id] if transcript_id in idx else None
+        if name in idx:
+            return idx[name]
+        else:
+            nidx = self.get_name_idx()
+            if name in nidx:
+                return nidx[name]
+        return None
 
 
     def transcript_is_coding(self,transcript_id):
@@ -62,34 +68,15 @@
         return tx.start_codons[0] if len(tx.start_codons) > 0 else None
 
 
-    def get_bed_line(self,transcript_id,coding=False):
+    def get_bed_line(self,transcript_id,score=0,itemRgb='0,0,0',coding=False):
         tx = self.get_transcript_idx()[transcript_id]
         chrom = tx.gene.contig
         chromStart = tx.coding_beg if coding else tx.beg 
         chromEnd = tx.coding_end if coding else tx.end
         name = transcript_id
-        score = 0
         strand = '+' if tx.gene.strand else '-'
         thickStart = tx.coding_beg if tx.coding_beg else chromStart
         thickEnd = tx.coding_end if tx.coding_end else chromEnd
-        itemRgb = '0,0,0'
-        exons = tx.get_coding_exons() if coding else tx.get_exons()
-        blockCount = len(exons)
-        if tx.gene.strand:
-            strand = '+'
-            blockSizes = [abs(e-s) for s,e in exons]
-
-    def get_bed_line(self,transcript_id,coding=False):
-        tx = self.get_transcript_idx()[transcript_id]
-        chrom = tx.gene.contig
-        chromStart = tx.coding_beg if coding else tx.beg 
-        chromEnd = tx.coding_end if coding else tx.end
-        name = transcript_id
-        score = 0
-        strand = '+' if tx.gene.strand else '-'
-        thickStart = tx.coding_beg if tx.coding_beg else chromStart
-        thickEnd = tx.coding_end if tx.coding_end else chromEnd
-        itemRgb = '0,0,0'
         exons = tx.get_coding_exons() if coding else tx.get_exons()
         blockCount = len(exons)
         if tx.gene.strand:
Binary file test-data/._.DS_Store has changed