comparison ensembl_variant_report.py @ 0:c3a9e63e8c51 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit 239c1ee096e5fc3e2e929f7bf2d4afba5c677d4b-dirty
author jjohnson
date Fri, 06 Jan 2017 16:19:40 -0500
parents
children 9e83cc05d384
comparison
equal deleted inserted replaced
-1:000000000000 0:c3a9e63e8c51
1 #!/usr/bin/env python
2 """
3 Report variants to Ensembl Transcripts
4
5 FrameShift report line
6 # Gene Variant Position Reference Variation Prevalence Sequencing Depth Transcript AA Position AA change AA Length Stop Codon Stop Region AA Variation
7 TIGD6 chr5:149374879 AAG AG 1.00 13 ENSG00000164296|ENST00000296736 345 Q344 522 A-TGA-T KRWTSSRPST*
8
9 MissSense report line
10 # Gene Variant Position Reference Variation Prevalence Sequencing Depth Transcript AA Position AA change AA Length Stop Codon Stop Region AA Variation
11 FN1 chr2:216235089 G A 1.00 7394 ENSG00000115414|ENST00000354785 2261 V2261I 2478 G-TAA TGLTRGATYN_I_IVEALKDQQR
12 """
13 import sys
14 import os.path
15 import re
16 import time
17 import optparse
18 from ensemblref import EnsemblRef
19 from Bio.Seq import reverse_complement, translate
20
21
22 def __main__():
23 #Parse Command Line
24 parser = optparse.OptionParser()
25 #I/O
26 parser.add_option( '-i', '--input', dest='input', default=None, help='Tabular file with peptide_sequence column' )
27 parser.add_option( '-s', '--format', dest='format', default='tabular', choices=['tabular','snpeff'], help='Tabular file with peptide_sequence column' )
28 #Columns for tabular input
29 parser.add_option( '-C', '--chrom_column', type='int', dest='chrom_column', default=1, help='column ordinal with Ensembl transctip ID' )
30 parser.add_option( '-P', '--pos_column', type='int', dest='pos_column', default=2, help='column ordinal with Ensembl transctip ID' )
31 parser.add_option( '-R', '--ref_column', type='int', dest='ref_column', default=3, help='column ordinal with Ensembl transctip ID' )
32 parser.add_option( '-A', '--alt_column', type='int', dest='alt_column', default=4, help='column ordinal with Ensembl transctip ID' )
33 parser.add_option( '-T', '--transcript_column', type='int', dest='transcript_column', default=1, help='column ordinal with Ensembl transctip ID' )
34 parser.add_option( '-F', '--dpr_column', type='int', dest='dpr_column', default=1, help='column with VCF: DPR or AD' )
35 parser.add_option( '-D', '--dp_column', type='int', dest='dp_column', default=1, help='column with VCF: DP' )
36 parser.add_option( '-g', '--gene_model', dest='gene_model', default=None, help='GTF gene model file. Used to annotate NSJ peptide entries.')
37 parser.add_option( '-2', '--twobit', dest='twobit', default=None, help='Reference genome in UCSC twobit format')
38 #Output file
39 parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' )
40 #filters
41 parser.add_option( '-d', '--min_depth', type='int', dest='min_depth', default=None, help='Minimum read depth to report' )
42 parser.add_option( '-f', '--min_freq', type='float', dest='min_freq', default=None, help='Minimum variant frequency to report' )
43 #peptide options
44 parser.add_option( '-l', '--leading_aa', type='int', dest='leading_aa', default=10, help='Number AAs before missense variant' )
45 parser.add_option( '-t', '--trailing_aa', type='int', dest='trailing_aa', default=10, help='Number AAs after missense variant' )
46 parser.add_option( '-r', '--readthrough', type='int', dest='readthrough', default=0, help='' )
47 #
48 parser.add_option('--debug', dest='debug', action='store_true', default=False, help='Print debugging messages')
49 (options, args) = parser.parse_args()
50
51 ##INPUTS##
52 if options.input != None:
53 try:
54 inputPath = os.path.abspath(options.input)
55 inputFile = open(inputPath, 'r')
56 except Exception, e:
57 print >> sys.stderr, "failed: %s" % e
58 exit(2)
59 else:
60 inputFile = sys.stdin
61
62 if options.output != None:
63 try:
64 outputPath = os.path.abspath(options.output)
65 outputFile = open(outputPath, 'w')
66 except Exception, e:
67 print >> sys.stderr, "failed: %s" % e
68 exit(3)
69 else:
70 outputFile = sys.stdout
71
72 def parse_tabular():
73 ci = options.chrom_column - 1
74 pi = options.pos_column - 1
75 ri = options.ref_column - 1
76 ai = options.alt_column - 1
77 ti = options.transcript_column - 1
78 di = options.dp_column - 1
79 fi = options.dpr_column - 1
80 for linenum,line in enumerate(inputFile):
81 if options.debug:
82 print >> sys.stderr, "%d: %s\n" % (linenum,line)
83 if line.startswith('#'):
84 continue
85 if line.strip() == '':
86 continue
87 fields = line.rstrip('\r\n').split('\t')
88 transcript = fields[ti]
89 if not transcript:
90 print >> sys.stderr, "%d: %s\n" % (linenum,line)
91 continue
92 chrom = fields[ci]
93 pos = int(fields[pi])
94 ref = fields[ri]
95 alt = fields[ai]
96 dp = int(fields[di])
97 dpr = [int(x) for x in fields[fi].split(',')]
98 yield (transcript,pos,ref,alt,dp,dpr)
99
100 def parse_snpeff_vcf():
101 for linenum,line in enumerate(inputFile):
102 if line.startswith('##'):
103 if line.find('SnpEffVersion=') > 0:
104 SnpEffVersion = re.search('SnpEffVersion="?(\d+\.\d+)',line).groups()[0]
105 elif line.startswith('#CHROM'):
106 pass
107 else:
108 fields = line.strip('\r\n').split('\t')
109 if options.debug: print >> sys.stderr, "\n%s" % (fields)
110 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8]
111 pos = int(pos)
112 qual = float(qual)
113 for info_item in info.split(';'):
114 if info_item.find('=') < 0: continue
115 (key, val) = info_item.split('=', 1)
116 if key == 'DP':
117 dp = int(val)
118 if key == 'DPR':
119 dpr = dpr = [int(x) for x in val.split(',')]
120 if key in ['EFF','ANN']:
121 for effect in val.split(','):
122 if options.debug: print >> sys.stderr, "\n%s" % (effect.split('|'))
123 if key == 'ANN':
124 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|')
125 elif key == 'EFF':
126 (eff, effs) = effect.rstrip(')').split('(')
127 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11]
128 yield (transcript,pos,ref,alt,dp,dpr)
129
130
131 #Process gene model
132 ens_ref = None
133 if options.gene_model != None:
134 try:
135 geneModelFile = os.path.abspath(options.gene_model)
136 twoBitFile = os.path.abspath(options.twobit)
137 print >> sys.stderr, "Parsing ensembl ref: %s %s" % (options.gene_model,options.twobit)
138 time1 = time.time()
139 ens_ref = EnsemblRef(geneModelFile,twoBitFile)
140 time2 = time.time()
141 print >> sys.stderr, "Parsing ensembl ref: %d seconds" % (int(time2-time1))
142 except Exception, e:
143 print >> sys.stderr, "Parsing gene model failed: %s" % e
144 exit(2)
145 try:
146 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
147 for tid,pos1,ref,alt,dp,dpr in parse_input():
148 freq = float(dpr[1])/float(sum(dpr)) if dpr else None
149 if options.min_depth and dp is not None and dp < options.min_depth:
150 continue
151 if options.min_freq and freq is not None and freq < options.min_freq:
152 continue
153 ## transcript_id, pos, ref, alt, dp, dpr
154 tx = ens_ref.get_gtf_transcript(tid)
155 coding = ens_ref.transcript_is_coding(tid)
156 if not coding:
157 continue
158 frame_shift = len(ref) != len(alt)
159 cds = ens_ref.get_cds(tid)
160 pos0 = pos1 - 1 # zero based position
161 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
162 alt_seq = alt if tx.gene.strand else reverse_complement(alt)
163 cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
164 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):]
165 offset = 0
166 if tx.gene.strand:
167 for i in range(min(len(ref),len(alt))):
168 if ref[i] == alt[i]:
169 offset = i
170 else:
171 break
172 else:
173 for i in range(-1,-min(len(ref),len(alt)) -1,-1):
174 if ref[i] == alt[i]:
175 offset = i
176 else:
177 break
178 refpep = translate(cds[:len(cds)/3*3])
179 pep = translate(alt_cds[:len(alt_cds)/3*3])
180 peplen = len(pep)
181 aa_pos = (cds_pos + offset) / 3
182 if aa_pos >= len(pep):
183 print >> sys.stderr, "%d: %s\n" % (linenum,line)
184 continue
185 if frame_shift:
186 #find stop_codons
187 nstops = 0
188 stop_codons = []
189 for i in range(aa_pos,peplen):
190 if refpep[i] != pep[i]:
191 aa_pos = i
192 break
193 for i in range(aa_pos,peplen):
194 if pep[i] == '*':
195 nstops += 1
196 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else ''))
197 if nstops > options.readthrough:
198 reported_peptide = pep[aa_pos:i+1]
199 reported_stop_codon = ','.join(stop_codons)
200 break
201 else:
202 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3])
203 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos],
204 pep[aa_pos],
205 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))])
206 cs_pos = aa_pos * 3
207 aa_pos = (cds_pos + offset) / 3
208 ref_codon = cds[cs_pos:cs_pos+3]
209 ref_aa = translate(ref_codon)
210 alt_codon = alt_cds[cs_pos:cs_pos+3]
211 alt_aa = translate(alt_codon)
212 gene = tx.gene.names[0]
213 report_fields = [tx.gene.names[0],
214 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
215 ref,
216 alt,
217 "%1.2f" % freq if freq is not None else '',
218 str(dp),
219 "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
220 "%d" % (aa_pos + 1),
221 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),
222 "%d" % len(pep),
223 reported_stop_codon,
224 reported_peptide
225 ]
226 if options.debug:
227 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon))
228 outputFile.write('\t'.join(report_fields))
229 if options.debug:
230 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % (
231 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15],
232 translate(cds[cs_pos-6:cs_pos+15]),
233 translate(alt_cds[cs_pos-6:cs_pos+15]),
234 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15])
235 outputFile.write('\n')
236 except Exception, e:
237 print >> sys.stderr, "failed: %s" % e
238 exit(1)
239
240
241 if __name__ == "__main__" : __main__()
242