Mercurial > repos > jjohnson > translate_bed_sequences
annotate translate_bed_sequences.py @ 4:aa93f7910259
Fix test case
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 30 Jan 2014 13:26:58 -0600 |
parents | 3b526a780849 |
children | c626a939eef7 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2014, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 """ | |
16 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna | |
17 Output: Fasta of 3-frame translations of the spliced sequence | |
18 | |
19 """ | |
20 | |
21 import sys,re,os.path | |
22 import optparse | |
23 from optparse import OptionParser | |
24 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | |
25 | |
26 class BedEntry( object ): | |
27 def __init__(self, line): | |
28 self.line = line | |
29 try: | |
30 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,seq) = line.split('\t')[0:13] | |
31 self.chrom = chrom | |
32 self.chromStart = int(chromStart) | |
33 self.chromEnd = int(chromEnd) | |
34 self.name = name | |
35 self.score = int(score) | |
36 self.strand = strand | |
37 self.thickStart = int(thickStart) | |
38 self.thickEnd = int(thickEnd) | |
39 self.itemRgb = itemRgb | |
40 self.blockCount = int(blockCount) | |
41 self.blockSizes = [int(x) for x in blockSizes.split(',')] | |
42 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
43 self.seq = seq | |
44 except Exception, e: | |
45 print >> sys.stderr, "Unable to read Bed entry" % e | |
46 exit(1) | |
47 def get_splice_junctions(self): | |
48 splice_juncs = [] | |
49 for i in range(self.blockCount - 1): | |
50 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | |
51 splice_juncs.append(splice_junc) | |
52 return splice_juncs | |
53 def get_exon_seqs(self): | |
54 exons = [] | |
55 for i in range(self.blockCount): | |
56 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | |
57 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) | |
58 if self.strand == '-': #reverse complement | |
59 exons.reverse() | |
60 for i,s in enumerate(exons): | |
61 exons[i] = reverse_complement(s) | |
62 return exons | |
63 def get_spliced_seq(self): | |
64 seq = ''.join(self.get_exon_seqs()) | |
65 return seq | |
66 def get_translation(self,sequence=None): | |
67 translation = None | |
68 seq = sequence if sequence else self.get_spliced_seq() | |
69 if seq: | |
70 seqlen = len(seq) / 3 * 3; | |
71 if seqlen >= 3: | |
72 translation = translate(seq[:seqlen]) | |
73 return translation | |
74 def get_translations(self): | |
75 translations = [] | |
76 seq = self.get_spliced_seq() | |
77 if seq: | |
78 for i in range(3): | |
79 translation = self.get_translation(sequence=seq[i:]) | |
80 if translation: | |
81 translations.append(translation) | |
82 return translations | |
83 ## [[start,end,seq],[start,end,seq],[start,end,seq]] | |
84 ## filter: ignore translation if stop codon in first exon after ignore_left_bp | |
85 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0): | |
86 translations = [None,None,None] | |
87 seq = self.get_spliced_seq() | |
88 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 | |
89 block_sum = sum(self.blockSizes) | |
90 exon_sizes = self.blockSizes | |
91 if self.strand == '-': | |
92 exon_sizes.reverse() | |
93 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
94 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | |
95 if seq: | |
96 for i in range(3): | |
97 translation = self.get_translation(sequence=seq[i:]) | |
98 if translation: | |
99 tstart = 0 | |
100 tstop = len(translation) | |
101 if not untrimmed: | |
102 tstart = translation.rfind('*',0,junc) + 1 | |
2
359addb9b9d4
Do not include the stop codon when filtering
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
103 stop = translation.find('*',junc) |
359addb9b9d4
Do not include the stop codon when filtering
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
104 tstop = stop if stop >= 0 else len(translation) |
0 | 105 if filtering and tstart > ignore: |
106 continue | |
107 trimmed = translation[tstart:tstop] | |
108 #get genomic locations for start and end | |
109 offset = (block_sum - i) % 3 | |
110 if self.strand == '+': | |
111 chromStart = self.chromStart + i + (tstart * 3) | |
112 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 | |
113 else: | |
114 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 | |
115 chromEnd = self.chromEnd - i - (tstart * 3) | |
116 translations[i] = [chromStart,chromEnd,trimmed] | |
117 return translations | |
118 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): | |
119 ## Ensembl fasta ID format | |
120 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | |
3
3b526a780849
Change default seqtype from pep:novel to pep:splice
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
121 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding |
0 | 122 frame_name = '' |
123 chromStart = self.chromStart | |
124 chromEnd = self.chromEnd | |
125 strand = 1 if self.strand == '+' else -1 | |
126 if frame != None: | |
127 block_sum = sum(self.blockSizes) | |
128 offset = (block_sum - frame) % 3 | |
129 frame_name = '_' + str(frame + 1) | |
130 if self.strand == '+': | |
131 chromStart += frame | |
132 chromEnd -= offset | |
133 else: | |
134 chromStart += offset | |
135 chromEnd -= frame | |
136 location = "chromosome:%s:%s:%s:%s:%s" % (reference,self.chrom,chromStart,chromEnd,strand) | |
137 seq_id = "%s%s %s %s" % (self.name,frame_name,seqtype,location) | |
138 return seq_id | |
139 def get_line(self, start_offset = 0, end_offset = 0): | |
140 if start_offset or end_offset: | |
141 s_offset = start_offset if start_offset else 0 | |
142 e_offset = end_offset if end_offset else 0 | |
143 if s_offset > self.chromStart: | |
144 s_offset = self.chromStart | |
145 chrStart = self.chromStart - s_offset | |
146 chrEnd = self.chromEnd + e_offset | |
147 blkSizes = self.blockSizes | |
148 blkSizes[0] += s_offset | |
149 blkSizes[-1] += e_offset | |
150 blkStarts = self.blockStarts | |
151 for i in range(1,self.blockCount): | |
152 blkStarts[i] += s_offset | |
153 items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] | |
154 return '\t'.join(items) + '\n' | |
155 return self.line | |
156 | |
157 def __main__(): | |
158 #Parse Command Line | |
159 parser = optparse.OptionParser() | |
160 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) | |
161 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') | |
162 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) | |
3
3b526a780849
Change default seqtype from pep:novel to pep:splice
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
163 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) |
0 | 164 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) |
165 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) | |
166 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) | |
167 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) | |
168 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) | |
169 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) | |
170 parser.add_option( '-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)' ) | |
171 parser.add_option( '-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons' ) | |
172 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
173 (options, args) = parser.parse_args() | |
174 # Input files | |
175 if options.input != None: | |
176 try: | |
177 inputPath = os.path.abspath(options.input) | |
178 inputFile = open(inputPath, 'r') | |
179 except Exception, e: | |
180 print >> sys.stderr, "failed: %s" % e | |
181 exit(2) | |
182 else: | |
183 inputFile = sys.stdin | |
184 # Output files | |
185 outFile = None | |
186 if options.output == None: | |
187 #write to stdout | |
188 outFile = sys.stdout | |
189 else: | |
190 try: | |
191 outPath = os.path.abspath(options.output) | |
192 outFile = open(outPath, 'w') | |
193 except Exception, e: | |
194 print >> sys.stderr, "failed: %s" % e | |
195 exit(3) | |
196 leading_bp = 0 | |
197 trailing_bp = 0 | |
198 if options.leading_bp: | |
199 if options.leading_bp >= 0: | |
200 leading_bp = options.leading_bp | |
201 else: | |
202 print >> sys.stderr, "failed: leading_bp must be positive" | |
203 exit(5) | |
204 if options.trailing_bp: | |
205 if options.trailing_bp >= 0: | |
206 trailing_bp = options.trailing_bp | |
207 else: | |
208 print >> sys.stderr, "failed: trailing_bp must be positive" | |
209 exit(5) | |
210 # Scan bed file | |
211 try: | |
212 for i, line in enumerate( inputFile ): | |
213 if line.startswith('track'): | |
214 if outFile and options.bed_format: | |
215 outFile.write(line) | |
216 continue | |
217 entry = BedEntry(line) | |
218 strand = 1 if entry.strand == '+' else -1 | |
219 translations = entry.get_translations() | |
220 if options.debug: | |
221 exon_seqs = entry.get_exon_seqs() | |
222 exon_sizes = [len(seq) for seq in exon_seqs] | |
223 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
224 print >> sys.stderr, entry.name | |
225 print >> sys.stderr, line.rstrip('\r\n') | |
226 print >> sys.stderr, "exons: %s" % exon_seqs | |
227 print >> sys.stderr, "%s" % splice_sites | |
228 for i,translation in enumerate(translations): | |
229 print >> sys.stderr, "frame %d: %s" % (i+1,translation) | |
230 print >> sys.stderr, "splice: %s" % (''.join(['^' if (((j*3)+i)/3) in splice_sites else '-' for j in range(len(translation))])) | |
231 print >> sys.stderr, "" | |
232 if options.bed_format: | |
233 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) | |
234 outFile.write(tx_entry) | |
235 else: | |
236 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp) | |
237 for i,tx in enumerate(translations): | |
238 if tx: | |
239 (chromStart,chromEnd,translation) = tx | |
240 if options.min_length != None and len(translation) < options.min_length: | |
241 continue | |
242 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: | |
243 continue | |
244 frame_name = '_%s' % (i + 1) | |
245 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) | |
246 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' | |
247 seq_id = "%s%s %s %s%s" % (entry.name,frame_name,options.seqtype,location, score) | |
248 outFile.write(">%s\n" % seq_id) | |
249 outFile.write(translation) | |
250 outFile.write('\n') | |
251 except Exception, e: | |
252 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) | |
253 | |
254 if __name__ == "__main__" : __main__() | |
255 |