Mercurial > repos > jjohnson > translate_bed_sequences
annotate translate_bed_sequences.py @ 5:c626a939eef7 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Tue, 12 Jan 2016 14:38:03 -0500 |
parents | 3b526a780849 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2014, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 """ | |
16 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna | |
17 Output: Fasta of 3-frame translations of the spliced sequence | |
18 | |
19 """ | |
20 | |
21 import sys,re,os.path | |
5 | 22 import tempfile |
0 | 23 import optparse |
24 from optparse import OptionParser | |
25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | |
26 | |
class BedEntry(object):
    """One 12-column BED record, optionally carrying a 13th sequence column.

    The record is parsed from a single tab-separated line.  ``blockSizes``
    and ``blockStarts`` are kept as lists of ints; ``seq`` is the 13th field
    when present, otherwise ``None``.  The code is written to run unchanged
    on Python 2.6+ and Python 3 (floor division, no ``print`` statements).
    """

    def __init__(self, line):
        """Parse ``line`` into the BED attributes.

        On any parse problem an error is written to stderr and the process
        exits with status 1 (``SystemExit``).
        """
        self.line = line
        try:
            fields = line.rstrip('\r\n').split('\t')
            (chrom, chromStart, chromEnd, name, score, strand,
             thickStart, thickEnd, itemRgb, blockCount,
             blockSizes, blockStarts) = fields[0:12]
            seq = fields[12] if len(fields) > 12 else None
            self.chrom = chrom
            self.chromStart = int(chromStart)
            self.chromEnd = int(chromEnd)
            self.name = name
            self.score = int(score)
            self.strand = strand
            self.thickStart = int(thickStart)
            self.thickEnd = int(thickEnd)
            self.itemRgb = itemRgb
            self.blockCount = int(blockCount)
            # BED block lists may end with a trailing comma ("10,20,");
            # skip the empty item instead of failing on int('').
            self.blockSizes = [int(x) for x in blockSizes.split(',') if x.strip()]
            self.blockStarts = [int(x) for x in blockStarts.split(',') if x.strip()]
            self.seq = seq
        except Exception as e:
            # The message previously had no %s placeholder, so formatting it
            # raised TypeError and the exit below was never reached.
            sys.stderr.write("Unable to read Bed entry: %s\n" % e)
            sys.exit(1)

    def __str__(self):
        """Return the record as a tab-separated BED line (no newline).

        The sequence is appended as a 13th column only when it is non-empty.
        """
        return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % (
            self.chrom, self.chromStart, self.chromEnd, self.name, self.score,
            self.strand, self.thickStart, self.thickEnd, self.itemRgb,
            self.blockCount,
            ','.join([str(x) for x in self.blockSizes]),
            ','.join([str(x) for x in self.blockStarts]),
            '\t%s' % self.seq if self.seq else '')

    def get_splice_junctions(self):
        """Return ``"chrom:start_end"`` strings for each gap between blocks.

        ``start`` is the end of block ``i`` and ``end`` is the start of block
        ``i + 1``, both as chromosome coordinates.
        """
        splice_juncs = []
        for i in range(self.blockCount - 1):
            # The end of block i is blockStarts[i] + blockSizes[i]; leaving
            # out blockStarts[i] is only right for the first block.
            junc_start = self.chromStart + self.blockStarts[i] + self.blockSizes[i]
            junc_end = self.chromStart + self.blockStarts[i + 1]
            splice_juncs.append("%s:%d_%d" % (self.chrom, junc_start, junc_end))
        return splice_juncs

    def get_exon_seqs(self):
        """Return the per-block slices of ``seq`` in transcript order.

        For the '-' strand the block order is reversed and each slice is
        reverse-complemented.  Requires ``self.seq`` to be set.
        """
        exons = []
        for i in range(self.blockCount):
            start = self.blockStarts[i]
            exons.append(self.seq[start:start + self.blockSizes[i]])
        if self.strand == '-':  # reverse complement
            exons.reverse()
            exons = [reverse_complement(s) for s in exons]
        return exons

    def get_spliced_seq(self):
        """Return the exon sequences concatenated into one string."""
        return ''.join(self.get_exon_seqs())

    def get_translation(self, sequence=None):
        """Translate ``sequence`` (default: the spliced sequence).

        Only the leading whole codons are translated.  Returns ``None`` when
        there is no sequence or it is shorter than one codon.
        """
        translation = None
        seq = sequence if sequence else self.get_spliced_seq()
        if seq:
            # Floor division keeps this an int on Python 3 as well.
            seqlen = len(seq) // 3 * 3
            if seqlen >= 3:
                translation = translate(seq[:seqlen])
        return translation

    def get_translations(self):
        """Return the non-empty translations of frames 0, 1 and 2."""
        translations = []
        seq = self.get_spliced_seq()
        if seq:
            for i in range(3):
                translation = self.get_translation(sequence=seq[i:])
                if translation:
                    translations.append(translation)
        return translations

    ## (start,end)
    def get_subrange(self, tstart, tstop):
        """Map offsets within the spliced sequence to ``(chromStart, chromEnd)``.

        Blocks are walked in transcript order (reversed for '-').  An offset
        that falls in no block leaves the corresponding entry boundary
        unchanged.
        NOTE(review): the '-' strand arithmetic is kept exactly as written;
        it is not symmetric between tstart and tstop - confirm intent.
        """
        chromStart = self.chromStart
        chromEnd = self.chromEnd
        # list(...) so that reverse() also works on Python 3 range objects.
        r = list(range(self.blockCount))
        if self.strand == '-':
            r.reverse()
        bStart = 0
        for x in r:
            bEnd = bStart + self.blockSizes[x]
            if bStart <= tstart < bEnd:
                if self.strand == '+':
                    chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart)
                else:
                    chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart)
            if bStart <= tstop < bEnd:
                if self.strand == '+':
                    chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart)
                else:
                    chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart)
            bStart += self.blockSizes[x]
        return (chromStart, chromEnd)

    #get the blocks for sub range
    def get_blocks(self, chromStart, chromEnd):
        """Clip this entry's blocks to ``chromStart..chromEnd``.

        Returns ``(blockCount, blockSizes, blockStarts)`` with the starts
        relative to the given ``chromStart``.
        """
        tblockCount = 0
        tblockSizes = []
        tblockStarts = []
        for x in range(self.blockCount):
            bStart = self.chromStart + self.blockStarts[x]
            bEnd = bStart + self.blockSizes[x]
            if bStart > chromEnd:
                break
            if bEnd < chromStart:
                continue
            cStart = max(chromStart, bStart)
            tblockStarts.append(cStart - chromStart)
            tblockSizes.append(min(chromEnd, bEnd) - cStart)
            tblockCount += 1
        return (tblockCount, tblockSizes, tblockStarts)

    ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)]
    ## filter: ignore translation if stop codon in first exon after ignore_left_bp
    def get_filterd_translations(self, untrimmed=False, filtering=True, ignore_left_bp=0, ignore_right_bp=0, debug=False):
        """Translate the three frames, trimming and filtering around the first splice site.

        untrimmed       -- keep the whole translation instead of trimming to
                           the stop codons on either side of the first splice
                           site.
        filtering       -- skip a frame whose trimmed start lies beyond the
                           ignored leading codons.
        ignore_left_bp  -- bases ignored for the filter on the '+' strand.
        ignore_right_bp -- bases ignored for the filter on the '-' strand.
        debug           -- write intermediate values to stderr.

        Returns a 6-item list.  Slots 0-2 hold, per frame, either ``None`` or
        ``(chromStart, chromEnd, peptide, blockCount, blockSizes,
        blockStarts)``; slots 3-5 are always ``None``.
        """
        translations = [None, None, None, None, None, None]
        seq = self.get_spliced_seq()
        # Codon counts must stay integers on Python 3, hence //.
        ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) // 3
        block_sum = sum(self.blockSizes)
        exon_sizes = list(self.blockSizes)
        if self.strand == '-':
            exon_sizes.reverse()
        splice_sites = [sum(exon_sizes[:x]) // 3 for x in range(1, len(exon_sizes))]
        if debug:
            sys.stderr.write("splice_sites: %s\n" % splice_sites)
        junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
        if seq:
            for i in range(3):
                translation = self.get_translation(sequence=seq[i:])
                if not translation:
                    continue
                tstart = 0
                tstop = len(translation)
                # Bases left over after the last whole codon of this frame.
                offset = (block_sum - i) % 3
                if debug:
                    sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, translation))
                if not untrimmed:
                    # Start after the last stop before the junction and end
                    # at the first stop at/after it (stop codon excluded).
                    tstart = translation.rfind('*', 0, junc) + 1
                    stop = translation.find('*', junc)
                    tstop = stop if stop >= 0 else len(translation)
                trimmed = translation[tstart:tstop]
                if debug:
                    sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, trimmed))
                if filtering and tstart > ignore:
                    continue
                #get genomic locations for start and end
                if self.strand == '+':
                    chromStart = self.chromStart + i + (tstart * 3)
                    chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
                else:
                    chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
                    chromEnd = self.chromEnd - i - (tstart * 3)
                #get the blocks for this translation
                (tblockCount, tblockSizes, tblockStarts) = self.get_blocks(chromStart, chromEnd)
                translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts)
                if debug:
                    sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount, tblockStarts, tblockSizes))
        return translations

    def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
        """Build an Ensembl-style fasta ID: ``NAME[_FRAME] SEQTYPE LOCATION``.

        When ``frame`` (0-based) is given, ``_<frame+1>`` is appended to the
        name and the location is narrowed to the codons of that frame.
        """
        ## Ensembl fasta ID format
        # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
        # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
        frame_name = ''
        chromStart = self.chromStart
        chromEnd = self.chromEnd
        strand = 1 if self.strand == '+' else -1
        if frame is not None:
            block_sum = sum(self.blockSizes)
            offset = (block_sum - frame) % 3
            frame_name = '_' + str(frame + 1)
            if self.strand == '+':
                chromStart += frame
                chromEnd -= offset
            else:
                chromStart += offset
                chromEnd -= frame
        location = "chromosome:%s:%s:%s:%s:%s" % (reference, self.chrom, chromStart, chromEnd, strand)
        seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location)
        return seq_id

    def get_line(self, start_offset=0, end_offset=0):
        """Return the BED line, optionally widened at either end.

        With no offsets the original input line is returned unchanged.
        Otherwise the first block is extended ``start_offset`` bases to the
        left (clamped so the start is never negative) and the last block
        ``end_offset`` bases to the right; the result is a 12-column line
        ending in a newline.  The entry itself is not modified.
        """
        if start_offset or end_offset:
            s_offset = start_offset if start_offset else 0
            e_offset = end_offset if end_offset else 0
            if s_offset > self.chromStart:
                s_offset = self.chromStart
            chrStart = self.chromStart - s_offset
            chrEnd = self.chromEnd + e_offset
            # Work on copies: adjusting self.blockSizes / self.blockStarts in
            # place corrupted the entry and compounded on repeated calls.
            blkSizes = list(self.blockSizes)
            blkSizes[0] += s_offset
            blkSizes[-1] += e_offset
            blkStarts = list(self.blockStarts)
            for i in range(1, self.blockCount):
                blkStarts[i] += s_offset
            items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, ','.join([str(x) for x in blkSizes]), ','.join([str(x) for x in blkStarts])]]
            return '\t'.join(items) + '\n'
        return self.line
217 | |
def __main__():
    """Command-line entry point: translate spliced BED sequences.

    Reads BED lines (12 columns plus a 13th sequence column) from --input or
    stdin.  With --bed_format each input line is written back with the
    three-frame translations appended.  Otherwise the filtered/trimmed
    translations are written as fasta to --output or stdout and, when
    requested, also to a BED file (--bed) and a GFF3 file (--gff) that ends
    with a ##FASTA section.

    Exit status: 2 input cannot be opened, 3 output cannot be opened,
    5 negative leading/trailing bp, 1 any error while processing.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' )
    parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence')
    parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' )
    parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' )
    parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' )
    parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' )
    parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' )
    parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' )
    parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' )
    parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' )
    parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' )
    parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' )
    parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' )
    parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' )
    parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' )
    parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' )
    parser.add_option( '-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)' )
    parser.add_option( '-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons' )
    parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
    (options, args) = parser.parse_args()

    # Input files
    if options.input is not None:
        try:
            inputPath = os.path.abspath(options.input)
            inputFile = open(inputPath, 'r')
        except Exception as e:
            sys.stderr.write("failed: %s\n" % e)
            sys.exit(2)
    else:
        inputFile = sys.stdin

    # Output files
    bed_fh = None
    gff_fh = None
    gff_fa_file = None   # path of the fasta that gets appended to the GFF3
    gff_fa = None        # open temp fasta handle; only used when writing to stdout
    outFile = None
    if options.output is None:
        #write to stdout
        outFile = sys.stdout
        if options.gff:
            # stdout cannot be re-read, so keep a copy of the fasta in a
            # temp file.  delete=False keeps it on disk until it is removed
            # explicitly after being appended to the GFF3 file.
            gff_fa = tempfile.NamedTemporaryFile(mode='w', prefix='gff_fasta_', suffix=".fa", dir=os.getcwd(), delete=False)
            gff_fa_file = gff_fa.name
    else:
        try:
            outPath = os.path.abspath(options.output)
            outFile = open(outPath, 'w')
        except Exception as e:
            sys.stderr.write("failed: %s\n" % e)
            sys.exit(3)
        if options.gff:
            gff_fa_file = outPath
    if options.bed:
        bed_fh = open(options.bed, 'w')
        bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations', 'test'))
    if options.gff:
        gff_fh = open(options.gff, 'w')
        gff_fh.write("##gff-version 3.2.1\n")
        if options.reference:
            gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference))

    leading_bp = 0
    trailing_bp = 0
    if options.leading_bp:
        if options.leading_bp < 0:
            sys.stderr.write("failed: leading_bp must be positive\n")
            sys.exit(5)
        leading_bp = options.leading_bp
    if options.trailing_bp:
        if options.trailing_bp < 0:
            sys.stderr.write("failed: trailing_bp must be positive\n")
            sys.exit(5)
        trailing_bp = options.trailing_bp

    # Scan bed file
    try:
        for line in inputFile:
            if line.startswith('track'):
                if outFile and options.bed_format:
                    outFile.write(line)
                continue
            entry = BedEntry(line)
            strand = 1 if entry.strand == '+' else -1
            translations = entry.get_translations()
            if options.debug:
                exon_seqs = entry.get_exon_seqs()
                exon_sizes = [len(seq) for seq in exon_seqs]
                splice_sites = [sum(exon_sizes[:x]) // 3 for x in range(1, len(exon_sizes))]
                sys.stderr.write("%s\n" % entry.name)
                sys.stderr.write("%s\n" % line.rstrip('\r\n'))
                sys.stderr.write("exons: %s\n" % exon_seqs)
                sys.stderr.write("%s\n" % splice_sites)
                for frame, translation in enumerate(translations):
                    sys.stderr.write("frame %d: %s\n" % (frame + 1, translation))
                    marks = ['^' if (((j * 3) + frame) // 3) in splice_sites else '-' for j in range(len(translation))]
                    sys.stderr.write("splice: %s\n" % ''.join(marks))
                sys.stderr.write("\n")
            if options.bed_format:
                tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'), '\t'.join(translations))
                outFile.write(tx_entry)
                continue
            translations = entry.get_filterd_translations(untrimmed=options.untrimmed, filtering=options.filtering, ignore_left_bp=leading_bp, ignore_right_bp=trailing_bp, debug=options.debug)
            for frame, tx in enumerate(translations):
                if not tx:
                    continue
                (chromStart, chromEnd, translation, blockCount, blockSizes, blockStarts) = tx
                if options.min_length is not None and len(translation) < options.min_length:
                    continue
                if options.max_stop_codons is not None and translation.count('*') > options.max_stop_codons:
                    continue
                frame_name = '_%s' % (frame + 1)
                pep_id = "%s%s%s" % (options.id_prefix, entry.name, frame_name)
                sizes_str = ','.join([str(x) for x in blockSizes])
                starts_str = ','.join([str(x) for x in blockStarts])
                if bed_fh:
                    bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom), chromStart, chromEnd, pep_id, entry.score, entry.strand, chromStart, chromEnd, entry.itemRgb, blockCount, sizes_str, starts_str, translation))
                location = "chromosome:%s:%s:%s:%s:%s" % (options.reference, entry.chrom, chromStart, chromEnd, strand)
                if blockCount:
                    location += " blockCount:%d blockSizes:%s blockStarts:%s" % (blockCount, sizes_str, starts_str)
                score = " %s:%s" % (options.score_name, entry.score) if options.score_name else ''
                seq_description = "%s %s%s" % (options.seqtype, location, score)
                seq_id = "%s " % pep_id
                if options.fa_db:
                    seq_id = "%s%s%s%s" % (options.fa_db, options.fa_sep, pep_id, options.fa_sep)
                fa_id = "%s%s" % (seq_id, seq_description)
                fa_entry = ">%s\n%s\n" % (fa_id, translation)
                outFile.write(fa_entry)
                if gff_fh:
                    if gff_fa:
                        gff_fa.write(fa_entry)
                    gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom, chromStart + 1, chromEnd - 1))
                    gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom, 'splice_junc', 'gene', chromStart + 1, chromEnd - 1, entry.score, entry.strand, 0, pep_id))
                    for x in range(blockCount):
                        start = chromStart + blockStarts[x] + 1
                        end = start + blockSizes[x] - 1
                        phase = (3 - sum(blockSizes[:x]) % 3) % 3
                        gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom, 'splice_junc', 'CDS', start, end, entry.score, entry.strand, phase, pep_id, pep_id, x))
                # Example of GFF3 output in this style:
                # ##gff-version 3
                # ##sequence-region 19 1 287484
                # 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ
                # 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
                # 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
                # 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
        if bed_fh:
            bed_fh.close()
        # Close a real output file before it is (possibly) re-read below.
        if outFile is not sys.stdout:
            outFile.close()
        if gff_fh:
            if gff_fa:
                gff_fa.close()
            gff_fh.write("##FASTA\n")
            with open(gff_fa_file, 'r') as fasta_in:
                for fasta_line in fasta_in:
                    gff_fh.write(fasta_line)
            gff_fh.close()
            if gff_fa:
                # Only the temp copy is removed; the --output fasta is kept.
                os.remove(gff_fa_file)
    except Exception as e:
        sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
        # Report the failure to the caller instead of exiting with status 0.
        sys.exit(1)


if __name__ == "__main__":
    __main__()
377 |