comparison translate_bed_sequences.py @ 5:c626a939eef7 draft default tip

Uploaded
author jjohnson
date Tue, 12 Jan 2016 14:38:03 -0500
parents 3b526a780849
children
comparison
equal deleted inserted replaced
4:aa93f7910259 5:c626a939eef7
17 Output: Fasta of 3-frame translations of the spliced sequence 17 Output: Fasta of 3-frame translations of the spliced sequence
18 18
19 """ 19 """
20 20
21 import sys,re,os.path 21 import sys,re,os.path
22 import tempfile
22 import optparse 23 import optparse
23 from optparse import OptionParser 24 from optparse import OptionParser
24 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate 25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate
25 26
26 class BedEntry( object ): 27 class BedEntry( object ):
27 def __init__(self, line): 28 def __init__(self, line):
28 self.line = line 29 self.line = line
29 try: 30 try:
30 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,seq) = line.split('\t')[0:13] 31 fields = line.rstrip('\r\n').split('\t')
32 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12]
33 seq = fields[12] if len(fields) > 12 else None
31 self.chrom = chrom 34 self.chrom = chrom
32 self.chromStart = int(chromStart) 35 self.chromStart = int(chromStart)
33 self.chromEnd = int(chromEnd) 36 self.chromEnd = int(chromEnd)
34 self.name = name 37 self.name = name
35 self.score = int(score) 38 self.score = int(score)
42 self.blockStarts = [int(x) for x in blockStarts.split(',')] 45 self.blockStarts = [int(x) for x in blockStarts.split(',')]
43 self.seq = seq 46 self.seq = seq
44 except Exception, e: 47 except Exception, e:
45 print >> sys.stderr, "Unable to read Bed entry" % e 48 print >> sys.stderr, "Unable to read Bed entry" % e
46 exit(1) 49 exit(1)
50 def __str__(self):
51 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % (
52 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount,
53 ','.join([str(x) for x in self.blockSizes]),
54 ','.join([str(x) for x in self.blockStarts]),
55 '\t%s' % self.seq if self.seq else '')
47 def get_splice_junctions(self): 56 def get_splice_junctions(self):
48 splice_juncs = [] 57 splice_juncs = []
49 for i in range(self.blockCount - 1): 58 for i in range(self.blockCount - 1):
50 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) 59 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
51 splice_juncs.append(splice_junc) 60 splice_juncs.append(splice_junc)
78 for i in range(3): 87 for i in range(3):
79 translation = self.get_translation(sequence=seq[i:]) 88 translation = self.get_translation(sequence=seq[i:])
80 if translation: 89 if translation:
81 translations.append(translation) 90 translations.append(translation)
82 return translations 91 return translations
83 ## [[start,end,seq],[start,end,seq],[start,end,seq]] 92 ## (start,end)
93 def get_subrange(self,tstart,tstop):
94 chromStart = self.chromStart
95 chromEnd = self.chromEnd
96 r = range(self.blockCount)
97 if self.strand == '-':
98 r.reverse()
99 bStart = 0
100 for x in r:
101 bEnd = bStart + self.blockSizes[x]
102 if bStart <= tstart < bEnd:
103 if self.strand == '+':
104 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart)
105 else:
106 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart)
107 if bStart <= tstop < bEnd:
108 if self.strand == '+':
109 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart)
110 else:
111 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart)
112 bStart += self.blockSizes[x]
113 return(chromStart,chromEnd)
114 #get the blocks for sub range
115 def get_blocks(self,chromStart,chromEnd):
116 tblockCount = 0
117 tblockSizes = []
118 tblockStarts = []
119 for x in range(self.blockCount):
120 bStart = self.chromStart + self.blockStarts[x]
121 bEnd = bStart + self.blockSizes[x]
122 if bStart > chromEnd:
123 break
124 if bEnd < chromStart:
125 continue
126 cStart = max(chromStart,bStart)
127 tblockStarts.append(cStart - chromStart)
128 tblockSizes.append(min(chromEnd,bEnd) - cStart)
129 tblockCount += 1
130 ## print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes)
131 return (tblockCount,tblockSizes,tblockStarts)
132 ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)]
84 ## filter: ignore translation if stop codon in first exon after ignore_left_bp 133 ## filter: ignore translation if stop codon in first exon after ignore_left_bp
85 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0): 134 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0,debug=False):
86 translations = [None,None,None] 135 translations = [None,None,None,None,None,None]
87 seq = self.get_spliced_seq() 136 seq = self.get_spliced_seq()
88 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 137 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3
89 block_sum = sum(self.blockSizes) 138 block_sum = sum(self.blockSizes)
90 exon_sizes = self.blockSizes 139 exon_sizes = [x for x in self.blockSizes]
91 if self.strand == '-': 140 if self.strand == '-':
92 exon_sizes.reverse() 141 exon_sizes.reverse()
93 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] 142 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))]
143 if debug:
144 print >> sys.stderr, "splice_sites: %s" % splice_sites
94 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] 145 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
95 if seq: 146 if seq:
96 for i in range(3): 147 for i in range(3):
97 translation = self.get_translation(sequence=seq[i:]) 148 translation = self.get_translation(sequence=seq[i:])
98 if translation: 149 if translation:
99 tstart = 0 150 tstart = 0
100 tstop = len(translation) 151 tstop = len(translation)
152 offset = (block_sum - i) % 3
153 if debug:
154 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,translation)
101 if not untrimmed: 155 if not untrimmed:
102 tstart = translation.rfind('*',0,junc) + 1 156 tstart = translation.rfind('*',0,junc) + 1
103 stop = translation.find('*',junc) 157 stop = translation.find('*',junc)
104 tstop = stop if stop >= 0 else len(translation) 158 tstop = stop if stop >= 0 else len(translation)
159 offset = (block_sum - i) % 3
160 trimmed = translation[tstart:tstop]
161 if debug:
162 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,trimmed)
105 if filtering and tstart > ignore: 163 if filtering and tstart > ignore:
106 continue 164 continue
107 trimmed = translation[tstart:tstop]
108 #get genomic locations for start and end 165 #get genomic locations for start and end
109 offset = (block_sum - i) % 3
110 if self.strand == '+': 166 if self.strand == '+':
111 chromStart = self.chromStart + i + (tstart * 3) 167 chromStart = self.chromStart + i + (tstart * 3)
112 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 168 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
113 else: 169 else:
114 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 170 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
115 chromEnd = self.chromEnd - i - (tstart * 3) 171 chromEnd = self.chromEnd - i - (tstart * 3)
116 translations[i] = [chromStart,chromEnd,trimmed] 172 #get the blocks for this translation
173 (tblockCount,tblockSizes,tblockStarts) = self.get_blocks(chromStart,chromEnd)
174 translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts)
175 if debug:
176 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes)
177 # translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts)
117 return translations 178 return translations
118 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): 179 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None):
119 ## Ensembl fasta ID format 180 ## Ensembl fasta ID format
120 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT 181 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
121 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding 182 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
158 #Parse Command Line 219 #Parse Command Line
159 parser = optparse.OptionParser() 220 parser = optparse.OptionParser()
160 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) 221 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' )
161 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') 222 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence')
162 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) 223 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' )
224 parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' )
225 parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' )
226 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' )
227 parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' )
163 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) 228 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' )
229 parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' )
164 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) 230 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' )
231 parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' )
165 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) 232 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' )
166 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) 233 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' )
167 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) 234 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' )
168 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) 235 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' )
169 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) 236 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' )
180 print >> sys.stderr, "failed: %s" % e 247 print >> sys.stderr, "failed: %s" % e
181 exit(2) 248 exit(2)
182 else: 249 else:
183 inputFile = sys.stdin 250 inputFile = sys.stdin
184 # Output files 251 # Output files
252 bed_fh = None
253 gff_fh = None
254 gff_fa_file = None
255 gff_fa = None
185 outFile = None 256 outFile = None
186 if options.output == None: 257 if options.output == None:
187 #write to stdout 258 #write to stdout
188 outFile = sys.stdout 259 outFile = sys.stdout
260 if options.gff:
261 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_',suffix=".fa",dir=os.getcwd()).name
262 gff_fa = open(gff_fa_file,'w')
189 else: 263 else:
190 try: 264 try:
191 outPath = os.path.abspath(options.output) 265 outPath = os.path.abspath(options.output)
192 outFile = open(outPath, 'w') 266 outFile = open(outPath, 'w')
193 except Exception, e: 267 except Exception, e:
194 print >> sys.stderr, "failed: %s" % e 268 print >> sys.stderr, "failed: %s" % e
195 exit(3) 269 exit(3)
270 if options.gff:
271 gff_fa_file = outPath
272 if options.bed:
273 bed_fh = open(options.bed,'w')
274 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations','test'))
275 if options.gff:
276 gff_fh = open(options.gff,'w')
277 gff_fh.write("##gff-version 3.2.1\n")
278 if options.reference:
279 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference))
196 leading_bp = 0 280 leading_bp = 0
197 trailing_bp = 0 281 trailing_bp = 0
198 if options.leading_bp: 282 if options.leading_bp:
199 if options.leading_bp >= 0: 283 if options.leading_bp >= 0:
200 leading_bp = options.leading_bp 284 leading_bp = options.leading_bp
231 print >> sys.stderr, "" 315 print >> sys.stderr, ""
232 if options.bed_format: 316 if options.bed_format:
233 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) 317 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations))
234 outFile.write(tx_entry) 318 outFile.write(tx_entry)
235 else: 319 else:
236 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp) 320 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp,debug=options.debug)
237 for i,tx in enumerate(translations): 321 for i,tx in enumerate(translations):
238 if tx: 322 if tx:
239 (chromStart,chromEnd,translation) = tx 323 (chromStart,chromEnd,translation,blockCount,blockSizes,blockStarts) = tx
240 if options.min_length != None and len(translation) < options.min_length: 324 if options.min_length != None and len(translation) < options.min_length:
241 continue 325 continue
242 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: 326 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons:
243 continue 327 continue
244 frame_name = '_%s' % (i + 1) 328 frame_name = '_%s' % (i + 1)
329 pep_id = "%s%s%s" % (options.id_prefix,entry.name,frame_name)
330 if bed_fh:
331 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom),chromStart,chromEnd,pep_id,entry.score,entry.strand,chromStart,chromEnd,entry.itemRgb,blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]),translation))
245 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) 332 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand)
333 if blockCount:
334 location += " blockCount:%d blockSizes:%s blockStarts:%s" % (blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]))
246 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' 335 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else ''
247 seq_id = "%s%s %s %s%s" % (entry.name,frame_name,options.seqtype,location, score) 336 seq_description = "%s %s%s" % (options.seqtype, location, score)
248 outFile.write(">%s\n" % seq_id) 337 seq_id = "%s " % pep_id
249 outFile.write(translation) 338 if options.fa_db:
250 outFile.write('\n') 339 seq_id = "%s%s%s%s" % (options.fa_db,options.fa_sep,pep_id,options.fa_sep)
340 fa_id = "%s%s" % (seq_id,seq_description)
341 fa_entry = ">%s\n%s\n" % (fa_id,translation)
342 outFile.write(fa_entry)
343 if gff_fh:
344 if gff_fa:
345 gff_fa.write(fa_entry)
346 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom,chromStart + 1,chromEnd - 1))
347 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom,'splice_junc','gene',chromStart + 1,chromEnd - 1,entry.score,entry.strand,0,pep_id))
348 for x in range(blockCount):
349 start = chromStart+blockStarts[x] + 1
350 end = start + blockSizes[x] - 1
351 phase = (3 - sum(blockSizes[:x]) % 3) % 3
352 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom,'splice_junc','CDS',start,end,entry.score,entry.strand,phase,pep_id,pep_id,x))
353 """
354 ##gff-version 3
355 ##sequence-region 19 1 287484
356 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ
357 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
358 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
359 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812
360 """
361 if bed_fh:
362 bed_fh.close()
363 if gff_fh:
364 if gff_fa:
365 gff_fa.close()
366 else:
367 outFile.close()
368 gff_fa = open(gff_fa_file,'r')
369 gff_fh.write("##FASTA\n")
370 for i, line in enumerate(gff_fa):
371 gff_fh.write(line)
372 gff_fh.close()
251 except Exception, e: 373 except Exception, e:
252 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) 374 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)
253 375
254 if __name__ == "__main__" : __main__() 376 if __name__ == "__main__" : __main__()
255 377