Mercurial > repos > jjohnson > translate_bed_sequences
comparison translate_bed_sequences.py @ 0:57e586ee821e
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Jan 2014 18:33:29 -0500 |
parents | |
children | 359addb9b9d4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:57e586ee821e |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2014, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 """ | |
16 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna | |
17 Output: Fasta of 3-frame translations of the spliced sequence | |
18 | |
19 """ | |
20 | |
21 import sys,re,os.path | |
22 import optparse | |
23 from optparse import OptionParser | |
24 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | |
25 | |
26 class BedEntry( object ): | |
27 def __init__(self, line): | |
28 self.line = line | |
29 try: | |
30 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,seq) = line.split('\t')[0:13] | |
31 self.chrom = chrom | |
32 self.chromStart = int(chromStart) | |
33 self.chromEnd = int(chromEnd) | |
34 self.name = name | |
35 self.score = int(score) | |
36 self.strand = strand | |
37 self.thickStart = int(thickStart) | |
38 self.thickEnd = int(thickEnd) | |
39 self.itemRgb = itemRgb | |
40 self.blockCount = int(blockCount) | |
41 self.blockSizes = [int(x) for x in blockSizes.split(',')] | |
42 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
43 self.seq = seq | |
44 except Exception, e: | |
45 print >> sys.stderr, "Unable to read Bed entry" % e | |
46 exit(1) | |
47 def get_splice_junctions(self): | |
48 splice_juncs = [] | |
49 for i in range(self.blockCount - 1): | |
50 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | |
51 splice_juncs.append(splice_junc) | |
52 return splice_juncs | |
53 def get_exon_seqs(self): | |
54 exons = [] | |
55 for i in range(self.blockCount): | |
56 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | |
57 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) | |
58 if self.strand == '-': #reverse complement | |
59 exons.reverse() | |
60 for i,s in enumerate(exons): | |
61 exons[i] = reverse_complement(s) | |
62 return exons | |
63 def get_spliced_seq(self): | |
64 seq = ''.join(self.get_exon_seqs()) | |
65 return seq | |
66 def get_translation(self,sequence=None): | |
67 translation = None | |
68 seq = sequence if sequence else self.get_spliced_seq() | |
69 if seq: | |
70 seqlen = len(seq) / 3 * 3; | |
71 if seqlen >= 3: | |
72 translation = translate(seq[:seqlen]) | |
73 return translation | |
74 def get_translations(self): | |
75 translations = [] | |
76 seq = self.get_spliced_seq() | |
77 if seq: | |
78 for i in range(3): | |
79 translation = self.get_translation(sequence=seq[i:]) | |
80 if translation: | |
81 translations.append(translation) | |
82 return translations | |
83 ## [[start,end,seq],[start,end,seq],[start,end,seq]] | |
84 ## filter: ignore translation if stop codon in first exon after ignore_left_bp | |
85 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0): | |
86 translations = [None,None,None] | |
87 seq = self.get_spliced_seq() | |
88 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 | |
89 block_sum = sum(self.blockSizes) | |
90 exon_sizes = self.blockSizes | |
91 if self.strand == '-': | |
92 exon_sizes.reverse() | |
93 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
94 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | |
95 if seq: | |
96 for i in range(3): | |
97 translation = self.get_translation(sequence=seq[i:]) | |
98 if translation: | |
99 tstart = 0 | |
100 tstop = len(translation) | |
101 if not untrimmed: | |
102 tstart = translation.rfind('*',0,junc) + 1 | |
103 tstop = translation.find('*',junc) + 1 | |
104 rtrim = tstop if tstop == 0 else len(translation) - tstop | |
105 tstop = tstop if tstop else len(translation) | |
106 if filtering and tstart > ignore: | |
107 continue | |
108 trimmed = translation[tstart:tstop] | |
109 #get genomic locations for start and end | |
110 offset = (block_sum - i) % 3 | |
111 if self.strand == '+': | |
112 chromStart = self.chromStart + i + (tstart * 3) | |
113 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 | |
114 else: | |
115 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 | |
116 chromEnd = self.chromEnd - i - (tstart * 3) | |
117 translations[i] = [chromStart,chromEnd,trimmed] | |
118 return translations | |
119 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): | |
120 ## Ensembl fasta ID format | |
121 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | |
122 # >ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding | |
123 frame_name = '' | |
124 chromStart = self.chromStart | |
125 chromEnd = self.chromEnd | |
126 strand = 1 if self.strand == '+' else -1 | |
127 if frame != None: | |
128 block_sum = sum(self.blockSizes) | |
129 offset = (block_sum - frame) % 3 | |
130 frame_name = '_' + str(frame + 1) | |
131 if self.strand == '+': | |
132 chromStart += frame | |
133 chromEnd -= offset | |
134 else: | |
135 chromStart += offset | |
136 chromEnd -= frame | |
137 location = "chromosome:%s:%s:%s:%s:%s" % (reference,self.chrom,chromStart,chromEnd,strand) | |
138 seq_id = "%s%s %s %s" % (self.name,frame_name,seqtype,location) | |
139 return seq_id | |
140 def get_line(self, start_offset = 0, end_offset = 0): | |
141 if start_offset or end_offset: | |
142 s_offset = start_offset if start_offset else 0 | |
143 e_offset = end_offset if end_offset else 0 | |
144 if s_offset > self.chromStart: | |
145 s_offset = self.chromStart | |
146 chrStart = self.chromStart - s_offset | |
147 chrEnd = self.chromEnd + e_offset | |
148 blkSizes = self.blockSizes | |
149 blkSizes[0] += s_offset | |
150 blkSizes[-1] += e_offset | |
151 blkStarts = self.blockStarts | |
152 for i in range(1,self.blockCount): | |
153 blkStarts[i] += s_offset | |
154 items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] | |
155 return '\t'.join(items) + '\n' | |
156 return self.line | |
157 | |
158 def __main__(): | |
159 #Parse Command Line | |
160 parser = optparse.OptionParser() | |
161 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) | |
162 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') | |
163 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) | |
164 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:novel', help='SEQTYPE:STATUS for fasta ID line' ) | |
165 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) | |
166 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) | |
167 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) | |
168 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) | |
169 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) | |
170 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) | |
171 parser.add_option( '-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)' ) | |
172 parser.add_option( '-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons' ) | |
173 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
174 (options, args) = parser.parse_args() | |
175 # Input files | |
176 if options.input != None: | |
177 try: | |
178 inputPath = os.path.abspath(options.input) | |
179 inputFile = open(inputPath, 'r') | |
180 except Exception, e: | |
181 print >> sys.stderr, "failed: %s" % e | |
182 exit(2) | |
183 else: | |
184 inputFile = sys.stdin | |
185 # Output files | |
186 outFile = None | |
187 if options.output == None: | |
188 #write to stdout | |
189 outFile = sys.stdout | |
190 else: | |
191 try: | |
192 outPath = os.path.abspath(options.output) | |
193 outFile = open(outPath, 'w') | |
194 except Exception, e: | |
195 print >> sys.stderr, "failed: %s" % e | |
196 exit(3) | |
197 leading_bp = 0 | |
198 trailing_bp = 0 | |
199 if options.leading_bp: | |
200 if options.leading_bp >= 0: | |
201 leading_bp = options.leading_bp | |
202 else: | |
203 print >> sys.stderr, "failed: leading_bp must be positive" | |
204 exit(5) | |
205 if options.trailing_bp: | |
206 if options.trailing_bp >= 0: | |
207 trailing_bp = options.trailing_bp | |
208 else: | |
209 print >> sys.stderr, "failed: trailing_bp must be positive" | |
210 exit(5) | |
211 # Scan bed file | |
212 try: | |
213 for i, line in enumerate( inputFile ): | |
214 if line.startswith('track'): | |
215 if outFile and options.bed_format: | |
216 outFile.write(line) | |
217 continue | |
218 entry = BedEntry(line) | |
219 strand = 1 if entry.strand == '+' else -1 | |
220 translations = entry.get_translations() | |
221 if options.debug: | |
222 exon_seqs = entry.get_exon_seqs() | |
223 exon_sizes = [len(seq) for seq in exon_seqs] | |
224 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
225 print >> sys.stderr, entry.name | |
226 print >> sys.stderr, line.rstrip('\r\n') | |
227 print >> sys.stderr, "exons: %s" % exon_seqs | |
228 print >> sys.stderr, "%s" % splice_sites | |
229 for i,translation in enumerate(translations): | |
230 print >> sys.stderr, "frame %d: %s" % (i+1,translation) | |
231 print >> sys.stderr, "splice: %s" % (''.join(['^' if (((j*3)+i)/3) in splice_sites else '-' for j in range(len(translation))])) | |
232 print >> sys.stderr, "" | |
233 if options.bed_format: | |
234 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) | |
235 outFile.write(tx_entry) | |
236 else: | |
237 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp) | |
238 for i,tx in enumerate(translations): | |
239 if tx: | |
240 (chromStart,chromEnd,translation) = tx | |
241 if options.min_length != None and len(translation) < options.min_length: | |
242 continue | |
243 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: | |
244 continue | |
245 frame_name = '_%s' % (i + 1) | |
246 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) | |
247 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' | |
248 seq_id = "%s%s %s %s%s" % (entry.name,frame_name,options.seqtype,location, score) | |
249 outFile.write(">%s\n" % seq_id) | |
250 outFile.write(translation) | |
251 outFile.write('\n') | |
252 except Exception, e: | |
253 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) | |
254 | |
255 if __name__ == "__main__" : __main__() | |
256 |