0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2017, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 #
|
|
12 #------------------------------------------------------------------------------
|
|
13 """
|
|
14
|
|
15 import argparse
|
7
|
16 import re
|
0
|
17 import sys
|
|
18 from time import sleep
|
|
19
|
|
20 from Bio.Seq import translate
|
|
21
|
|
22 import requests
|
|
23
|
|
24 import digest
|
|
25
|
|
26
|
|
27 server = "https://rest.ensembl.org"
|
|
28 ext = "/info/assembly/homo_sapiens?"
|
7
|
29 max_region = 4000000
|
0
|
30
|
|
31
|
|
32 def ensembl_rest(ext, headers):
|
7
|
33 if True: print >> sys.stderr, "%s" % ext
|
0
|
34 r = requests.get(server+ext, headers=headers)
|
|
35 if r.status_code == 429:
|
|
36 print >> sys.stderr, "response headers: %s\n" % r.headers
|
|
37 if 'Retry-After' in r.headers:
|
|
38 sleep(r.headers['Retry-After'])
|
|
39 r = requests.get(server+ext, headers=headers)
|
|
40 if not r.ok:
|
|
41 r.raise_for_status()
|
|
42 return r
|
|
43
|
|
44
|
|
45 def get_species():
|
|
46 results = dict()
|
|
47 ext = "/info/species"
|
|
48 req_header = {"Content-Type": "application/json"}
|
|
49 r = ensembl_rest(ext, req_header)
|
|
50 for species in r.json()['species']:
|
|
51 results[species['name']] = species
|
|
52 print >> sys.stdout,\
|
|
53 "%s\t%s\t%s\t%s\t%s"\
|
|
54 % (species['name'], species['common_name'],
|
|
55 species['display_name'],
|
|
56 species['strain'],
|
|
57 species['taxon_id'])
|
|
58 return results
|
|
59
|
|
60
|
|
61 def get_biotypes(species):
|
|
62 biotypes = []
|
|
63 ext = "/info/biotypes/%s?" % species
|
|
64 req_header = {"Content-Type": "application/json"}
|
|
65 r = ensembl_rest(ext, req_header)
|
|
66 for entry in r.json():
|
|
67 if 'biotype' in entry:
|
|
68 biotypes.append(entry['biotype'])
|
|
69 return biotypes
|
|
70
|
|
71
|
|
72 def get_toplevel(species):
|
|
73 coord_systems = dict()
|
|
74 ext = "/info/assembly/%s?" % species
|
|
75 req_header = {"Content-Type": "application/json"}
|
|
76 r = ensembl_rest(ext, req_header)
|
|
77 toplevel = r.json()
|
|
78 for seq in toplevel['top_level_region']:
|
|
79 if seq['coord_system'] not in coord_systems:
|
|
80 coord_systems[seq['coord_system']] = dict()
|
|
81 coord_system = coord_systems[seq['coord_system']]
|
|
82 coord_system[seq['name']] = int(seq['length'])
|
|
83 return coord_systems
|
|
84
|
|
85
|
7
|
86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None):
|
0
|
87 bed = []
|
|
88 param = params if params else ''
|
|
89 req_header = {"Content-Type": "text/x-bed"}
|
|
90 regions = range(start, length, max_region)
|
|
91 if not regions or regions[-1] < length:
|
|
92 regions.append(length)
|
|
93 for end in regions[1:]:
|
7
|
94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\
|
|
95 % (species, refseq, start, end, strand, param)
|
0
|
96 start = end + 1
|
|
97 r = ensembl_rest(ext, req_header)
|
|
98 if r.text:
|
|
99 bed += r.text.splitlines()
|
|
100 return bed
|
|
101
|
|
102
|
|
103 def get_seq(id, seqtype,params=None):
|
|
104 param = params if params else ''
|
|
105 ext = "/sequence/id/%s?type=%s;%s" % (id, seqtype,param)
|
|
106 req_header = {"Content-Type": "text/plain"}
|
|
107 r = ensembl_rest(ext, req_header)
|
|
108 return r.text
|
|
109
|
|
110
|
|
111 def get_cdna(id,params=None):
|
|
112 return get_seq(id, 'cdna',params=params)
|
|
113
|
|
114
|
|
115 def get_cds(id,params=None):
|
|
116 return get_seq(id, 'cds',params=params)
|
|
117
|
|
118
|
7
|
119 def get_transcript_haplotypes(species,transcript):
|
|
120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript)
|
|
121 req_header = {"Content-Type" : "application/json"}
|
|
122 r = ensembl_rest(ext, req_header)
|
|
123 decoded = r.json()
|
|
124
|
|
125
|
0
|
126 def bed_from_line(line):
|
|
127 fields = line.rstrip('\r\n').split('\t')
|
7
|
128 if len(fields) < 12:
|
|
129 return None
|
0
|
130 (chrom, chromStart, chromEnd, name, score, strand,
|
|
131 thickStart, thickEnd, itemRgb,
|
|
132 blockCount, blockSizes, blockStarts) = fields[0:12]
|
|
133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd,
|
|
134 name=name, score=score, strand=strand,
|
|
135 thickStart=thickStart, thickEnd=thickEnd,
|
|
136 itemRgb=itemRgb,
|
|
137 blockCount=blockCount,
|
|
138 blockSizes=blockSizes.rstrip(','),
|
|
139 blockStarts=blockStarts.rstrip(','))
|
7
|
140 if len(fields) == 20:
|
|
141 bed_entry.second_name = fields[12]
|
|
142 bed_entry.cds_start_status = fields[13]
|
|
143 bed_entry.cds_end_status = fields[14]
|
|
144 bed_entry.exon_frames = fields[15]
|
|
145 bed_entry.biotype = fields[16]
|
|
146 bed_entry.gene_name = fields[17]
|
|
147 bed_entry.second_gene_name = fields[18]
|
|
148 bed_entry.gene_type = fields[19]
|
0
|
149 return bed_entry
|
|
150
|
|
151
|
|
152 class BedEntry(object):
|
|
153 def __init__(self, chrom=None, chromStart=None, chromEnd=None,
|
|
154 name=None, score=None, strand=None,
|
|
155 thickStart=None, thickEnd=None, itemRgb=None,
|
|
156 blockCount=None, blockSizes=None, blockStarts=None):
|
|
157 self.chrom = chrom
|
|
158 self.chromStart = int(chromStart)
|
|
159 self.chromEnd = int(chromEnd)
|
|
160 self.name = name
|
|
161 self.score = int(score) if score is not None else 0
|
|
162 self.strand = '-' if str(strand).startswith('-') else '+'
|
|
163 self.thickStart = int(thickStart) if thickStart else self.chromStart
|
|
164 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
|
|
165 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
|
|
166 self.blockCount = int(blockCount)
|
|
167 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode):
|
|
168 self.blockSizes = [int(x) for x in blockSizes.split(',')]
|
|
169 elif isinstance(blockSizes, list):
|
|
170 self.blockSizes = [int(x) for x in blockSizes]
|
|
171 else:
|
|
172 self.blockSizes = blockSizes
|
|
173 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode):
|
|
174 self.blockStarts = [int(x) for x in blockStarts.split(',')]
|
|
175 elif isinstance(blockStarts, list):
|
|
176 self.blockStarts = [int(x) for x in blockStarts]
|
|
177 else:
|
|
178 self.blockStarts = blockStarts
|
7
|
179 self.second_name = None
|
|
180 self.cds_start_status = None
|
|
181 self.cds_end_status = None
|
|
182 self.exon_frames = None
|
|
183 self.biotype = None
|
|
184 self.gene_name = None
|
|
185 self.second_gene_name = None
|
|
186 self.gene_type = None
|
0
|
187 self.seq = None
|
|
188 self.pep = None
|
|
189
|
|
190 def __str__(self):
|
|
191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
|
|
192 self.chrom, self.chromStart, self.chromEnd,
|
|
193 self.name, self.score, self.strand,
|
|
194 self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount,
|
|
195 ','.join([str(x) for x in self.blockSizes]),
|
|
196 ','.join([str(x) for x in self.blockStarts]))
|
|
197
|
|
198 # (start, end)
|
|
199 def get_subrange(self, tstart, tstop, debug=False):
|
|
200 chromStart = self.chromStart
|
|
201 chromEnd = self.chromEnd
|
|
202 if debug:
|
|
203 print >> sys.stderr, "%s" % (str(self))
|
|
204 r = range(self.blockCount)
|
|
205 if self.strand == '-':
|
|
206 r.reverse()
|
|
207 bStart = 0
|
|
208 bEnd = 0
|
|
209 for x in r:
|
|
210 bEnd = bStart + self.blockSizes[x]
|
|
211 if bStart <= tstart < bEnd:
|
|
212 if self.strand == '+':
|
|
213 chromStart = self.chromStart + self.blockStarts[x] +\
|
|
214 (tstart - bStart)
|
|
215 else:
|
|
216 chromEnd = self.chromStart + self.blockStarts[x] +\
|
|
217 self.blockSizes[x] - (tstart - bStart)
|
|
218 if bStart <= tstop < bEnd:
|
|
219 if self.strand == '+':
|
|
220 chromEnd = self.chromStart + self.blockStarts[x] +\
|
|
221 (tstop - bStart)
|
|
222 else:
|
|
223 chromStart = self.chromStart + self.blockStarts[x] +\
|
|
224 self.blockSizes[x] - (tstop - bStart)
|
|
225 if debug:
|
|
226 print >> sys.stderr,\
|
|
227 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\
|
|
228 % (x, self.strand, bStart, bEnd,
|
|
229 tstart, tstop, chromStart, chromEnd)
|
|
230 bStart += self.blockSizes[x]
|
|
231 return(chromStart, chromEnd)
|
|
232
|
|
233 # get the blocks for sub range
|
|
234 def get_blocks(self, chromStart, chromEnd):
|
|
235 tblockCount = 0
|
|
236 tblockSizes = []
|
|
237 tblockStarts = []
|
|
238 for x in range(self.blockCount):
|
|
239 bStart = self.chromStart + self.blockStarts[x]
|
|
240 bEnd = bStart + self.blockSizes[x]
|
|
241 if bStart > chromEnd:
|
|
242 break
|
|
243 if bEnd < chromStart:
|
|
244 continue
|
|
245 cStart = max(chromStart, bStart)
|
|
246 tblockStarts.append(cStart - chromStart)
|
|
247 tblockSizes.append(min(chromEnd, bEnd) - cStart)
|
|
248 tblockCount += 1
|
|
249 return (tblockCount, tblockSizes, tblockStarts)
|
|
250
|
|
251 def trim(self, tstart, tstop, debug=False):
|
|
252 (tchromStart, tchromEnd) =\
|
|
253 self.get_subrange(tstart, tstop, debug=debug)
|
|
254 (tblockCount, tblockSizes, tblockStarts) =\
|
|
255 self.get_blocks(tchromStart, tchromEnd)
|
|
256 tbed = BedEntry(
|
|
257 chrom=self.chrom, chromStart=tchromStart, chromEnd=tchromEnd,
|
|
258 name=self.name, score=self.score, strand=self.strand,
|
|
259 thickStart=tchromStart, thickEnd=tchromEnd, itemRgb=self.itemRgb,
|
|
260 blockCount=tblockCount,
|
|
261 blockSizes=tblockSizes, blockStarts=tblockStarts)
|
|
262 if self.seq:
|
|
263 ts = tchromStart-self.chromStart
|
|
264 te = tchromEnd - tchromStart + ts
|
|
265 tbed.seq = self.seq[ts:te]
|
|
266 return tbed
|
|
267
|
|
268
|
|
269 def __main__():
|
|
270 parser = argparse.ArgumentParser(
|
|
271 description='Retrieve Ensembl cDNAs and three frame translate')
|
|
272 parser.add_argument(
|
|
273 '-s', '--species', default='human',
|
|
274 help='Ensembl Species to retrieve')
|
|
275 parser.add_argument(
|
7
|
276 '-R', '--regions', action='append', default=[],
|
|
277 help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+')
|
|
278 parser.add_argument(
|
0
|
279 '-B', '--biotypes', action='append', default=[],
|
|
280 help='Restrict Ensembl biotypes to retrieve')
|
|
281 parser.add_argument(
|
|
282 '-i', '--input', default=None,
|
|
283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
|
|
284 parser.add_argument(
|
|
285 '-t', '--transcripts', default=None,
|
|
286 help='Path to output cDNA transcripts.bed (-) for stdout')
|
|
287 parser.add_argument(
|
|
288 '-r', '--raw', action='store_true',
|
|
289 help='Report transcript exacty as returned from Ensembl')
|
|
290 parser.add_argument(
|
|
291 '-f', '--fasta', default=None,
|
|
292 help='Path to output translations.fasta')
|
|
293 parser.add_argument(
|
|
294 '-b', '--bed', default=None,
|
|
295 help='Path to output translations.bed')
|
|
296 parser.add_argument(
|
|
297 '-m', '--min_length', type=int, default=7,
|
|
298 help='Minimum length of protein translation to report')
|
|
299 parser.add_argument(
|
|
300 '-e', '--enzyme', default=None,
|
|
301 help='Digest translation with enzyme')
|
|
302 parser.add_argument(
|
|
303 '-a', '--all', action='store_true',
|
|
304 help='Include reference protein translations')
|
|
305 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
|
|
306 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
|
|
307 args = parser.parse_args()
|
|
308 # print >> sys.stderr, "args: %s" % args
|
|
309 species = args.species
|
|
310 input_rdr = None
|
|
311 if args.input is not None:
|
|
312 input_rdr = open(args.input, 'r') if args.input != '-' else sys.stdin
|
|
313 tx_wtr = None
|
|
314 if args.transcripts is not None:
|
|
315 tx_wtr = open(args.transcripts, 'w')\
|
|
316 if args.transcripts != '-' else sys.stdout
|
|
317 fa_wtr = open(args.fasta, 'w') if args.fasta is not None else None
|
|
318 bed_wtr = open(args.bed, 'w') if args.bed is not None else None
|
|
319
|
|
320 enzyme = digest.expasy_rules.get(args.enzyme,args.enzyme)
|
|
321
|
|
322 # print >> sys.stderr, "args biotypes: %s" % args.biotypes
|
|
323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')]
|
|
324 # print >> sys.stderr, "args biotypes: %s" % biotypea
|
|
325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()])
|
|
326 # print >> sys.stderr, "biotypes: %s" % biotypes
|
|
327
|
7
|
328 selected_regions = dict() # chrom:(start,end)
|
|
329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
|
|
330 if args.regions:
|
|
331 for entry in args.regions:
|
|
332 if not entry:
|
|
333 continue
|
|
334 regs = [x.strip() for x in entry.split(',') if x.strip()]
|
|
335 for reg in regs:
|
|
336 m = re.match(region_pat,reg)
|
|
337 if m:
|
|
338 (chrom,start,end,strand) = m.groups()
|
|
339 if chrom:
|
|
340 if chrom not in selected_regions:
|
|
341 selected_regions[chrom] = []
|
|
342 selected_regions[chrom].append([start,end,strand])
|
|
343
|
0
|
344 translations = dict() # start : end : seq
|
|
345
|
|
346 def unique_prot(tbed, seq):
|
|
347 if tbed.chromStart not in translations:
|
|
348 translations[tbed.chromStart] = dict()
|
|
349 translations[tbed.chromStart][tbed.chromEnd] = []
|
|
350 translations[tbed.chromStart][tbed.chromEnd].append(seq)
|
|
351 elif tbed.chromEnd not in translations[tbed.chromStart]:
|
|
352 translations[tbed.chromStart][tbed.chromEnd] = []
|
|
353 translations[tbed.chromStart][tbed.chromEnd].append(seq)
|
|
354 elif seq not in translations[tbed.chromStart][tbed.chromEnd]:
|
|
355 translations[tbed.chromStart][tbed.chromEnd].append(seq)
|
|
356 else:
|
|
357 return False
|
|
358 return True
|
|
359
|
|
360 def translate_bed(bed):
|
|
361 translate_count = 0
|
|
362 if any([fa_wtr, bed_wtr]):
|
|
363 transcript_id = bed.name
|
|
364 refprot = None
|
|
365 if not args.all:
|
|
366 try:
|
|
367 cds = get_cds(transcript_id)
|
|
368 if len(cds) % 3 != 0:
|
|
369 cds = cds[:-(len(cds) % 3)]
|
|
370 refprot = translate(cds) if cds else None
|
|
371 except:
|
|
372 refprot = None
|
|
373 cdna = get_cdna(transcript_id)
|
|
374 cdna_len = len(cdna)
|
|
375 for offset in range(3):
|
|
376 seqend = cdna_len - (cdna_len - offset) % 3
|
|
377 aaseq = translate(cdna[offset:seqend])
|
|
378 aa_start = 0
|
|
379 while aa_start < len(aaseq):
|
|
380 aa_end = aaseq.find('*', aa_start)
|
|
381 if aa_end < 0:
|
|
382 aa_end = len(aaseq)
|
|
383 prot = aaseq[aa_start:aa_end]
|
|
384 if enzyme and refprot:
|
|
385 frags = digest._cleave(prot,enzyme)
|
|
386 for frag in reversed(frags):
|
|
387 if frag in refprot:
|
|
388 prot = prot[:prot.rfind(frag)]
|
|
389 else:
|
|
390 break
|
|
391 if len(prot) < args.min_length:
|
|
392 pass
|
|
393 elif refprot and prot in refprot:
|
|
394 pass
|
|
395 else:
|
|
396 tstart = aa_start*3+offset
|
|
397 tend = aa_end*3+offset
|
|
398 prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
|
|
399 tbed = bed.trim(tstart, tend)
|
|
400 if args.all or unique_prot(tbed, prot):
|
|
401 translate_count += 1
|
|
402 tbed.name = prot_acc
|
|
403 bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
|
|
404 bed_wtr.flush()
|
|
405 fa_id = ">%s\n" % (prot_acc)
|
|
406 fa_wtr.write(fa_id)
|
|
407 fa_wtr.write(prot)
|
|
408 fa_wtr.write("\n")
|
|
409 fa_wtr.flush()
|
|
410 aa_start = aa_end + 1
|
|
411 return translate_count
|
|
412
|
7
|
413 def translate_region(species,ref,start,stop,strand):
|
|
414 translation_count = 0
|
|
415 regions = range(start, stop, max_region)
|
|
416 if not regions or regions[-1] < stop:
|
|
417 regions.append(stop)
|
|
418 for end in regions[1:]:
|
|
419 bedlines = get_transcripts_bed(species, ref, start, end, strand=strand, params=biotypes)
|
|
420 if args.verbose or args.debug:
|
|
421 print >> sys.stderr,\
|
|
422 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\
|
|
423 % (species, ref, start, end, len(bedlines))
|
|
424 # start, end, seq
|
|
425 for i, bedline in enumerate(bedlines):
|
|
426 try:
|
|
427 bed = bed_from_line(bedline)\
|
|
428 if any([not args.raw, fa_wtr, bed_wtr])\
|
|
429 else None
|
|
430 if tx_wtr:
|
|
431 tx_wtr.write(bedline if args.raw else str(bed))
|
|
432 tx_wtr.write("\n")
|
|
433 tx_wtr.flush()
|
|
434 if bed:
|
|
435 translation_count += translate_bed(bed)
|
|
436 except Exception as e:
|
|
437 print >> sys.stderr,\
|
|
438 "BED error (%s) : %s\n" % (e, bedline)
|
|
439 start = end + 1
|
|
440 return translation_count
|
|
441
|
0
|
442 if input_rdr:
|
|
443 translation_count = 0
|
|
444 for i, bedline in enumerate(input_rdr):
|
|
445 try:
|
|
446 bed = bed_from_line(bedline)
|
7
|
447 if bed is None:
|
|
448 continue
|
|
449 if bed.biotype and biotypea and bed.biotype not in biotypea:
|
|
450 continue
|
0
|
451 translation_count += translate_bed(bed)
|
|
452 except:
|
|
453 print >> sys.stderr, "BED format error: %s\n" % bedline
|
|
454 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
|
|
455 print >> sys.stderr,\
|
|
456 "%s\tcDNA translations:%d" % (species, translation_count)
|
|
457 else:
|
|
458 coord_systems = get_toplevel(species)
|
|
459 if 'chromosome' in coord_systems:
|
7
|
460 ref_lengths = dict()
|
0
|
461 for ref in sorted(coord_systems['chromosome'].keys()):
|
|
462 length = coord_systems['chromosome'][ref]
|
7
|
463 ref_lengths[ref] = length
|
0
|
464 if not any([tx_wtr, fa_wtr, bed_wtr]):
|
|
465 print >> sys.stderr,\
|
|
466 "%s\t%s\tlength: %d" % (species, ref, length)
|
7
|
467 if selected_regions:
|
0
|
468 translation_count = 0
|
7
|
469 for ref in sorted(selected_regions.keys()):
|
|
470 if ref in ref_lengths:
|
|
471 for reg in selected_regions[ref]:
|
|
472 (_start,_stop,_strand) = reg
|
|
473 start = int(_start) if _start else 0
|
|
474 stop = int(_stop) if _stop else ref_lengths[ref]
|
|
475 strand = '' if not _strand else ':1' if _strand == '+' else ':-1'
|
|
476 translation_count += translate_region(species,ref,start,stop,strand)
|
|
477 else:
|
|
478 strand = ''
|
0
|
479 start = 0
|
7
|
480 for ref in sorted(ref_lengths.keys()):
|
|
481 length = ref_lengths[ref]
|
|
482 translation_count = 0
|
|
483 if args.debug:
|
0
|
484 print >> sys.stderr,\
|
7
|
485 "Retrieving transcripts: %s\t%s\tlength: %d"\
|
|
486 % (species, ref, length)
|
|
487 translation_count += translate_region(species,ref,start,length,strand)
|
|
488 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
|
|
489 print >> sys.stderr,\
|
|
490 "%s\t%s\tlength: %d\tcDNA translations:%d"\
|
|
491 % (species, ref, length, translation_count)
|
0
|
492
|
|
493
|
|
494 if __name__ == "__main__":
|
|
495 __main__()
|