Mercurial > repos > jjohnson > translate_bed_sequences
comparison translate_bed_sequences.py @ 5:c626a939eef7 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Tue, 12 Jan 2016 14:38:03 -0500 |
parents | 3b526a780849 |
children |
comparison
equal
deleted
inserted
replaced
4:aa93f7910259 | 5:c626a939eef7 |
---|---|
17 Output: Fasta of 3-frame translations of the spliced sequence | 17 Output: Fasta of 3-frame translations of the spliced sequence |
18 | 18 |
19 """ | 19 """ |
20 | 20 |
21 import sys,re,os.path | 21 import sys,re,os.path |
22 import tempfile | |
22 import optparse | 23 import optparse |
23 from optparse import OptionParser | 24 from optparse import OptionParser |
24 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | 25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate |
25 | 26 |
26 class BedEntry( object ): | 27 class BedEntry( object ): |
27 def __init__(self, line): | 28 def __init__(self, line): |
28 self.line = line | 29 self.line = line |
29 try: | 30 try: |
30 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,seq) = line.split('\t')[0:13] | 31 fields = line.rstrip('\r\n').split('\t') |
32 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12] | |
33 seq = fields[12] if len(fields) > 12 else None | |
31 self.chrom = chrom | 34 self.chrom = chrom |
32 self.chromStart = int(chromStart) | 35 self.chromStart = int(chromStart) |
33 self.chromEnd = int(chromEnd) | 36 self.chromEnd = int(chromEnd) |
34 self.name = name | 37 self.name = name |
35 self.score = int(score) | 38 self.score = int(score) |
42 self.blockStarts = [int(x) for x in blockStarts.split(',')] | 45 self.blockStarts = [int(x) for x in blockStarts.split(',')] |
43 self.seq = seq | 46 self.seq = seq |
44 except Exception, e: | 47 except Exception, e: |
45 print >> sys.stderr, "Unable to read Bed entry" % e | 48 print >> sys.stderr, "Unable to read Bed entry" % e |
46 exit(1) | 49 exit(1) |
def __str__(self):
    """Format the entry as one tab-separated BED line without a newline.

    Twelve columns are always written; a 13th column holding ``self.seq``
    is added only when ``self.seq`` is truthy.
    """
    # Scalar columns keep their original %-conversions (%d for the integers).
    head = '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d' % (
        self.chrom, self.chromStart, self.chromEnd, self.name, self.score,
        self.strand, self.thickStart, self.thickEnd, self.itemRgb,
        self.blockCount)
    sizes = ','.join(str(size) for size in self.blockSizes)
    starts = ','.join(str(start) for start in self.blockStarts)
    line = '\t'.join([head, sizes, starts])
    if self.seq:
        line += '\t%s' % self.seq
    return line
47 def get_splice_junctions(self): | 56 def get_splice_junctions(self): |
48 splice_juncs = [] | 57 splice_juncs = [] |
49 for i in range(self.blockCount - 1): | 58 for i in range(self.blockCount - 1): |
50 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | 59 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) |
51 splice_juncs.append(splice_junc) | 60 splice_juncs.append(splice_junc) |
78 for i in range(3): | 87 for i in range(3): |
79 translation = self.get_translation(sequence=seq[i:]) | 88 translation = self.get_translation(sequence=seq[i:]) |
80 if translation: | 89 if translation: |
81 translations.append(translation) | 90 translations.append(translation) |
82 return translations | 91 return translations |
83 ## [[start,end,seq],[start,end,seq],[start,end,seq]] | 92 ## (start,end) |
def get_subrange(self,tstart,tstop):
    """Map offsets in the spliced (block-concatenated) sequence to genomic bounds.

    Blocks are walked in transcript order: as listed, or last-to-first when
    the strand is '-'.  ``tstart`` and ``tstop`` are offsets into that
    concatenation.  Returns ``(chromStart, chromEnd)``.

    An offset that falls inside no block (for example ``tstop`` equal to the
    total block length) leaves the corresponding bound at the entry's own
    ``chromStart`` / ``chromEnd``.
    NOTE(review): a ``tstop`` that lands exactly on a block boundary is
    resolved against the *following* block - confirm that is intended.
    """
    chromStart = self.chromStart
    chromEnd = self.chromEnd
    # list(...) so the in-place reverse also works where range() is lazy.
    order = list(range(self.blockCount))
    if self.strand == '-':
        order.reverse()
    bStart = 0  # spliced offset at which the current block begins
    for x in order:
        bEnd = bStart + self.blockSizes[x]
        block_start = self.chromStart + self.blockStarts[x]
        block_end = block_start + self.blockSizes[x]
        if bStart <= tstart < bEnd:
            if self.strand == '+':
                chromStart = block_start + (tstart - bStart)
            else:
                # Reverse strand: offsets count down from the block's right
                # edge, mirroring the tstop branch below.  (Previously this
                # counted up from the left edge, truncating the range.)
                chromEnd = block_end - (tstart - bStart)
        if bStart <= tstop < bEnd:
            if self.strand == '+':
                chromEnd = block_start + (tstop - bStart)
            else:
                chromStart = block_end - (tstop - bStart)
        bStart = bEnd
    return (chromStart,chromEnd)
114 #get the blocks for sub range | |
def get_blocks(self,chromStart,chromEnd):
    """Clip this entry's blocks to the genomic window chromStart..chromEnd.

    Returns ``(blockCount, blockSizes, blockStarts)`` for the clipped
    blocks, with starts expressed relative to ``chromStart``.  A block that
    merely touches the window edge is kept as a zero-length block.  The
    scan stops at the first block that begins beyond ``chromEnd``.
    """
    clipped = []  # (start relative to window, size) per retained block
    for idx in range(self.blockCount):
        left = self.chromStart + self.blockStarts[idx]
        right = left + self.blockSizes[idx]
        if left > chromEnd:
            break
        if not right < chromStart:
            lo = max(chromStart,left)
            clipped.append((lo - chromStart, min(chromEnd,right) - lo))
    rel_starts = [start for start, _ in clipped]
    sizes = [size for _, size in clipped]
    return (len(clipped),sizes,rel_starts)
132 ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)] | |
84 ## filter: ignore translation if stop codon in first exon after ignore_left_bp | 133 ## filter: ignore translation if stop codon in first exon after ignore_left_bp |
def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0,debug=False):
    """Translate the spliced sequence in three frames and locate each result.

    Returns a 6-slot list; slot ``i`` (0..2) is either ``None`` or a tuple
    ``(chromStart, chromEnd, peptide, blockCount, blockSizes, blockStarts)``
    for reading frame ``i``.  Slots 3..5 are never filled here.

    untrimmed       -- keep the whole translation instead of cutting it to
                       the span between the last '*' before the first
                       splice site and the first '*' at or after it.
    filtering       -- drop a frame whose trimmed start lies beyond the
                       ignored leading codons.
    ignore_left_bp / ignore_right_bp
                    -- base pairs to ignore; the left value is used for '+'
                       entries, the right value otherwise.
    debug           -- write intermediate values to stderr.
    """
    translations = [None,None,None,None,None,None]
    seq = self.get_spliced_seq()
    # Floor division: these are codon indexes passed to str.find/rfind and
    # must stay integers even where "/" is true division.
    ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) // 3
    block_sum = sum(self.blockSizes)
    # Copy so that reversing for '-' entries does not mutate self.blockSizes.
    exon_sizes = list(self.blockSizes)
    if self.strand == '-':
        exon_sizes.reverse()
    splice_sites = [sum(exon_sizes[:x]) // 3 for x in range(1,len(exon_sizes))]
    if debug:
        sys.stderr.write("splice_sites: %s\n" % splice_sites)
    junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
    if not seq:
        return translations
    for i in range(3):
        translation = self.get_translation(sequence=seq[i:])
        if not translation:
            continue
        tstart = 0
        tstop = len(translation)
        # Bases left over after the last whole codon of this frame.
        offset = (block_sum - i) % 3
        if debug:
            sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i,tstart,tstop,offset,translation))
        if not untrimmed:
            tstart = translation.rfind('*',0,junc) + 1
            stop = translation.find('*',junc)
            tstop = stop if stop >= 0 else len(translation)
        trimmed = translation[tstart:tstop]
        if debug:
            sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i,tstart,tstop,offset,trimmed))
        if filtering and tstart > ignore:
            continue
        #get genomic locations for start and end
        if self.strand == '+':
            chromStart = self.chromStart + i + (tstart * 3)
            chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
        else:
            chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
            chromEnd = self.chromEnd - i - (tstart * 3)
        #get the blocks for this translation
        (tblockCount,tblockSizes,tblockStarts) = self.get_blocks(chromStart,chromEnd)
        translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts)
        if debug:
            sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount,tblockStarts,tblockSizes))
    return translations
118 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): | 179 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): |
119 ## Ensembl fasta ID format | 180 ## Ensembl fasta ID format |
120 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | 181 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT |
121 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding | 182 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding |
158 #Parse Command Line | 219 #Parse Command Line |
159 parser = optparse.OptionParser() | 220 parser = optparse.OptionParser() |
160 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) | 221 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) |
161 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') | 222 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') |
162 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) | 223 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) |
224 parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' ) | |
225 parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' ) | |
226 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' ) | |
227 parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' ) | |
163 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) | 228 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) |
229 parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' ) | |
164 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) | 230 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) |
231 parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' ) | |
165 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) | 232 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) |
166 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) | 233 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) |
167 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) | 234 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) |
168 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) | 235 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) |
169 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) | 236 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) |
180 print >> sys.stderr, "failed: %s" % e | 247 print >> sys.stderr, "failed: %s" % e |
181 exit(2) | 248 exit(2) |
182 else: | 249 else: |
183 inputFile = sys.stdin | 250 inputFile = sys.stdin |
184 # Output files | 251 # Output files |
252 bed_fh = None | |
253 gff_fh = None | |
254 gff_fa_file = None | |
255 gff_fa = None | |
185 outFile = None | 256 outFile = None |
186 if options.output == None: | 257 if options.output == None: |
187 #write to stdout | 258 #write to stdout |
188 outFile = sys.stdout | 259 outFile = sys.stdout |
260 if options.gff: | |
261 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_',suffix=".fa",dir=os.getcwd()).name | |
262 gff_fa = open(gff_fa_file,'w') | |
189 else: | 263 else: |
190 try: | 264 try: |
191 outPath = os.path.abspath(options.output) | 265 outPath = os.path.abspath(options.output) |
192 outFile = open(outPath, 'w') | 266 outFile = open(outPath, 'w') |
193 except Exception, e: | 267 except Exception, e: |
194 print >> sys.stderr, "failed: %s" % e | 268 print >> sys.stderr, "failed: %s" % e |
195 exit(3) | 269 exit(3) |
270 if options.gff: | |
271 gff_fa_file = outPath | |
272 if options.bed: | |
273 bed_fh = open(options.bed,'w') | |
274 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations','test')) | |
275 if options.gff: | |
276 gff_fh = open(options.gff,'w') | |
277 gff_fh.write("##gff-version 3.2.1\n") | |
278 if options.reference: | |
279 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference)) | |
196 leading_bp = 0 | 280 leading_bp = 0 |
197 trailing_bp = 0 | 281 trailing_bp = 0 |
198 if options.leading_bp: | 282 if options.leading_bp: |
199 if options.leading_bp >= 0: | 283 if options.leading_bp >= 0: |
200 leading_bp = options.leading_bp | 284 leading_bp = options.leading_bp |
231 print >> sys.stderr, "" | 315 print >> sys.stderr, "" |
232 if options.bed_format: | 316 if options.bed_format: |
233 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) | 317 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) |
234 outFile.write(tx_entry) | 318 outFile.write(tx_entry) |
235 else: | 319 else: |
236 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp) | 320 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp,debug=options.debug) |
237 for i,tx in enumerate(translations): | 321 for i,tx in enumerate(translations): |
238 if tx: | 322 if tx: |
239 (chromStart,chromEnd,translation) = tx | 323 (chromStart,chromEnd,translation,blockCount,blockSizes,blockStarts) = tx |
240 if options.min_length != None and len(translation) < options.min_length: | 324 if options.min_length != None and len(translation) < options.min_length: |
241 continue | 325 continue |
242 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: | 326 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: |
243 continue | 327 continue |
244 frame_name = '_%s' % (i + 1) | 328 frame_name = '_%s' % (i + 1) |
329 pep_id = "%s%s%s" % (options.id_prefix,entry.name,frame_name) | |
330 if bed_fh: | |
331 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom),chromStart,chromEnd,pep_id,entry.score,entry.strand,chromStart,chromEnd,entry.itemRgb,blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]),translation)) | |
245 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) | 332 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) |
333 if blockCount: | |
334 location += " blockCount:%d blockSizes:%s blockStarts:%s" % (blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts])) | |
246 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' | 335 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' |
247 seq_id = "%s%s %s %s%s" % (entry.name,frame_name,options.seqtype,location, score) | 336 seq_description = "%s %s%s" % (options.seqtype, location, score) |
248 outFile.write(">%s\n" % seq_id) | 337 seq_id = "%s " % pep_id |
249 outFile.write(translation) | 338 if options.fa_db: |
250 outFile.write('\n') | 339 seq_id = "%s%s%s%s" % (options.fa_db,options.fa_sep,pep_id,options.fa_sep) |
340 fa_id = "%s%s" % (seq_id,seq_description) | |
341 fa_entry = ">%s\n%s\n" % (fa_id,translation) | |
342 outFile.write(fa_entry) | |
343 if gff_fh: | |
344 if gff_fa: | |
345 gff_fa.write(fa_entry) | |
346 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom,chromStart + 1,chromEnd - 1)) | |
347 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom,'splice_junc','gene',chromStart + 1,chromEnd - 1,entry.score,entry.strand,0,pep_id)) | |
348 for x in range(blockCount): | |
349 start = chromStart+blockStarts[x] + 1 | |
350 end = start + blockSizes[x] - 1 | |
351 phase = (3 - sum(blockSizes[:x]) % 3) % 3 | |
352 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom,'splice_junc','CDS',start,end,entry.score,entry.strand,phase,pep_id,pep_id,x)) | |
353 """ | |
354 ##gff-version 3 | |
355 ##sequence-region 19 1 287484 | |
356 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ | |
357 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
358 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
359 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
360 """ | |
361 if bed_fh: | |
362 bed_fh.close() | |
363 if gff_fh: | |
364 if gff_fa: | |
365 gff_fa.close() | |
366 else: | |
367 outFile.close() | |
368 gff_fa = open(gff_fa_file,'r') | |
369 gff_fh.write("##FASTA\n") | |
370 for i, line in enumerate(gff_fa): | |
371 gff_fh.write(line) | |
372 gff_fh.close() | |
251 except Exception, e: | 373 except Exception, e: |
252 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) | 374 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) |
253 | 375 |
254 if __name__ == "__main__" : __main__() | 376 if __name__ == "__main__" : __main__() |
255 | 377 |