comparison ensembl_cdna_translate.py @ 7:d59e3ce10e74 draft

Uploaded
author jjohnson
date Wed, 13 Dec 2017 11:15:34 -0500
parents b7f2f5e3390c
children 5c92d0be6514
comparison
equal deleted inserted replaced
6:f8e0e5e237a5 7:d59e3ce10e74
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 """ 13 """
14 14
15 import argparse 15 import argparse
16 import re
16 import sys 17 import sys
17 from time import sleep 18 from time import sleep
18 19
19 from Bio.Seq import translate 20 from Bio.Seq import translate
20 21
23 import digest 24 import digest
24 25
25 26
26 server = "https://rest.ensembl.org" 27 server = "https://rest.ensembl.org"
27 ext = "/info/assembly/homo_sapiens?" 28 ext = "/info/assembly/homo_sapiens?"
28 max_region = 5000000 29 max_region = 4000000
29 30
30 31
31 def ensembl_rest(ext, headers): 32 def ensembl_rest(ext, headers):
32 # if True: print >> sys.stderr, "%s" % ext 33 if True: print >> sys.stderr, "%s" % ext
33 r = requests.get(server+ext, headers=headers) 34 r = requests.get(server+ext, headers=headers)
34 if r.status_code == 429: 35 if r.status_code == 429:
35 print >> sys.stderr, "response headers: %s\n" % r.headers 36 print >> sys.stderr, "response headers: %s\n" % r.headers
36 if 'Retry-After' in r.headers: 37 if 'Retry-After' in r.headers:
37 sleep(r.headers['Retry-After']) 38 sleep(r.headers['Retry-After'])
80 coord_system = coord_systems[seq['coord_system']] 81 coord_system = coord_systems[seq['coord_system']]
81 coord_system[seq['name']] = int(seq['length']) 82 coord_system[seq['name']] = int(seq['length'])
82 return coord_systems 83 return coord_systems
83 84
84 85
85 def get_transcripts_bed(species, refseq, start, length,params=None): 86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None):
86 bed = [] 87 bed = []
87 param = params if params else '' 88 param = params if params else ''
88 req_header = {"Content-Type": "text/x-bed"} 89 req_header = {"Content-Type": "text/x-bed"}
89 regions = range(start, length, max_region) 90 regions = range(start, length, max_region)
90 if not regions or regions[-1] < length: 91 if not regions or regions[-1] < length:
91 regions.append(length) 92 regions.append(length)
92 for end in regions[1:]: 93 for end in regions[1:]:
93 ext = "/overlap/region/%s/%s:%d-%d?feature=transcript;%s"\ 94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\
94 % (species, refseq, start, end, param) 95 % (species, refseq, start, end, strand, param)
95 start = end + 1 96 start = end + 1
96 r = ensembl_rest(ext, req_header) 97 r = ensembl_rest(ext, req_header)
97 if r.text: 98 if r.text:
98 bed += r.text.splitlines() 99 bed += r.text.splitlines()
99 return bed 100 return bed
113 114
114 def get_cds(id,params=None): 115 def get_cds(id,params=None):
115 return get_seq(id, 'cds',params=params) 116 return get_seq(id, 'cds',params=params)
116 117
117 118
119 def get_transcript_haplotypes(species,transcript):
120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript)
121 req_header = {"Content-Type" : "application/json"}
122 r = ensembl_rest(ext, req_header)
123 decoded = r.json()
124
125
118 def bed_from_line(line): 126 def bed_from_line(line):
119 fields = line.rstrip('\r\n').split('\t') 127 fields = line.rstrip('\r\n').split('\t')
128 if len(fields) < 12:
129 return None
120 (chrom, chromStart, chromEnd, name, score, strand, 130 (chrom, chromStart, chromEnd, name, score, strand,
121 thickStart, thickEnd, itemRgb, 131 thickStart, thickEnd, itemRgb,
122 blockCount, blockSizes, blockStarts) = fields[0:12] 132 blockCount, blockSizes, blockStarts) = fields[0:12]
123 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, 133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd,
124 name=name, score=score, strand=strand, 134 name=name, score=score, strand=strand,
125 thickStart=thickStart, thickEnd=thickEnd, 135 thickStart=thickStart, thickEnd=thickEnd,
126 itemRgb=itemRgb, 136 itemRgb=itemRgb,
127 blockCount=blockCount, 137 blockCount=blockCount,
128 blockSizes=blockSizes.rstrip(','), 138 blockSizes=blockSizes.rstrip(','),
129 blockStarts=blockStarts.rstrip(',')) 139 blockStarts=blockStarts.rstrip(','))
140 if len(fields) == 20:
141 bed_entry.second_name = fields[12]
142 bed_entry.cds_start_status = fields[13]
143 bed_entry.cds_end_status = fields[14]
144 bed_entry.exon_frames = fields[15]
145 bed_entry.biotype = fields[16]
146 bed_entry.gene_name = fields[17]
147 bed_entry.second_gene_name = fields[18]
148 bed_entry.gene_type = fields[19]
130 return bed_entry 149 return bed_entry
131 150
132 151
133 class BedEntry(object): 152 class BedEntry(object):
134 def __init__(self, chrom=None, chromStart=None, chromEnd=None, 153 def __init__(self, chrom=None, chromStart=None, chromEnd=None,
155 self.blockStarts = [int(x) for x in blockStarts.split(',')] 174 self.blockStarts = [int(x) for x in blockStarts.split(',')]
156 elif isinstance(blockStarts, list): 175 elif isinstance(blockStarts, list):
157 self.blockStarts = [int(x) for x in blockStarts] 176 self.blockStarts = [int(x) for x in blockStarts]
158 else: 177 else:
159 self.blockStarts = blockStarts 178 self.blockStarts = blockStarts
179 self.second_name = None
180 self.cds_start_status = None
181 self.cds_end_status = None
182 self.exon_frames = None
183 self.biotype = None
184 self.gene_name = None
185 self.second_gene_name = None
186 self.gene_type = None
160 self.seq = None 187 self.seq = None
161 self.pep = None 188 self.pep = None
162 189
163 def __str__(self): 190 def __str__(self):
164 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( 191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
244 description='Retrieve Ensembl cDNAs and three frame translate') 271 description='Retrieve Ensembl cDNAs and three frame translate')
245 parser.add_argument( 272 parser.add_argument(
246 '-s', '--species', default='human', 273 '-s', '--species', default='human',
247 help='Ensembl Species to retrieve') 274 help='Ensembl Species to retrieve')
248 parser.add_argument( 275 parser.add_argument(
276 '-R', '--regions', action='append', default=[],
277 help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+')
278 parser.add_argument(
249 '-B', '--biotypes', action='append', default=[], 279 '-B', '--biotypes', action='append', default=[],
250 help='Restrict Ensembl biotypes to retrieve') 280 help='Restrict Ensembl biotypes to retrieve')
251 parser.add_argument( 281 parser.add_argument(
252 '-i', '--input', default=None, 282 '-i', '--input', default=None,
253 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') 283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
292 # print >> sys.stderr, "args biotypes: %s" % args.biotypes 322 # print >> sys.stderr, "args biotypes: %s" % args.biotypes
293 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] 323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')]
294 # print >> sys.stderr, "args biotypes: %s" % biotypea 324 # print >> sys.stderr, "args biotypes: %s" % biotypea
295 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) 325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()])
296 # print >> sys.stderr, "biotypes: %s" % biotypes 326 # print >> sys.stderr, "biotypes: %s" % biotypes
327
328 selected_regions = dict() # chrom:(start,end)
329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
330 if args.regions:
331 for entry in args.regions:
332 if not entry:
333 continue
334 regs = [x.strip() for x in entry.split(',') if x.strip()]
335 for reg in regs:
336 m = re.match(region_pat,reg)
337 if m:
338 (chrom,start,end,strand) = m.groups()
339 if chrom:
340 if chrom not in selected_regions:
341 selected_regions[chrom] = []
342 selected_regions[chrom].append([start,end,strand])
297 343
298 translations = dict() # start : end : seq 344 translations = dict() # start : end : seq
299 345
300 def unique_prot(tbed, seq): 346 def unique_prot(tbed, seq):
301 if tbed.chromStart not in translations: 347 if tbed.chromStart not in translations:
362 fa_wtr.write("\n") 408 fa_wtr.write("\n")
363 fa_wtr.flush() 409 fa_wtr.flush()
364 aa_start = aa_end + 1 410 aa_start = aa_end + 1
365 return translate_count 411 return translate_count
366 412
413 def translate_region(species,ref,start,stop,strand):
414 translation_count = 0
415 regions = range(start, stop, max_region)
416 if not regions or regions[-1] < stop:
417 regions.append(stop)
418 for end in regions[1:]:
419 bedlines = get_transcripts_bed(species, ref, start, end, strand=strand, params=biotypes)
420 if args.verbose or args.debug:
421 print >> sys.stderr,\
422 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\
423 % (species, ref, start, end, len(bedlines))
424 # start, end, seq
425 for i, bedline in enumerate(bedlines):
426 try:
427 bed = bed_from_line(bedline)\
428 if any([not args.raw, fa_wtr, bed_wtr])\
429 else None
430 if tx_wtr:
431 tx_wtr.write(bedline if args.raw else str(bed))
432 tx_wtr.write("\n")
433 tx_wtr.flush()
434 if bed:
435 translation_count += translate_bed(bed)
436 except Exception as e:
437 print >> sys.stderr,\
438 "BED error (%s) : %s\n" % (e, bedline)
439 start = end + 1
440 return translation_count
441
367 if input_rdr: 442 if input_rdr:
368 translation_count = 0 443 translation_count = 0
369 for i, bedline in enumerate(input_rdr): 444 for i, bedline in enumerate(input_rdr):
370 try: 445 try:
371 bed = bed_from_line(bedline) 446 bed = bed_from_line(bedline)
447 if bed is None:
448 continue
449 if bed.biotype and biotypea and bed.biotype not in biotypea:
450 continue
372 translation_count += translate_bed(bed) 451 translation_count += translate_bed(bed)
373 except: 452 except:
374 print >> sys.stderr, "BED format error: %s\n" % bedline 453 print >> sys.stderr, "BED format error: %s\n" % bedline
375 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): 454 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
376 print >> sys.stderr,\ 455 print >> sys.stderr,\
377 "%s\tcDNA translations:%d" % (species, translation_count) 456 "%s\tcDNA translations:%d" % (species, translation_count)
378 else: 457 else:
379 coord_systems = get_toplevel(species) 458 coord_systems = get_toplevel(species)
380 if 'chromosome' in coord_systems: 459 if 'chromosome' in coord_systems:
460 ref_lengths = dict()
381 for ref in sorted(coord_systems['chromosome'].keys()): 461 for ref in sorted(coord_systems['chromosome'].keys()):
382 length = coord_systems['chromosome'][ref] 462 length = coord_systems['chromosome'][ref]
463 ref_lengths[ref] = length
383 if not any([tx_wtr, fa_wtr, bed_wtr]): 464 if not any([tx_wtr, fa_wtr, bed_wtr]):
384 print >> sys.stderr,\ 465 print >> sys.stderr,\
385 "%s\t%s\tlength: %d" % (species, ref, length) 466 "%s\t%s\tlength: %d" % (species, ref, length)
386 continue 467 if selected_regions:
387 if args.debug:
388 print >> sys.stderr,\
389 "Retrieving transcripts: %s\t%s\tlength: %d"\
390 % (species, ref, length)
391 translation_count = 0 468 translation_count = 0
469 for ref in sorted(selected_regions.keys()):
470 if ref in ref_lengths:
471 for reg in selected_regions[ref]:
472 (_start,_stop,_strand) = reg
473 start = int(_start) if _start else 0
474 stop = int(_stop) if _stop else ref_lengths[ref]
475 strand = '' if not _strand else ':1' if _strand == '+' else ':-1'
476 translation_count += translate_region(species,ref,start,stop,strand)
477 else:
478 strand = ''
392 start = 0 479 start = 0
393 regions = range(start, length, max_region) 480 for ref in sorted(ref_lengths.keys()):
394 if not regions or regions[-1] < length: 481 length = ref_lengths[ref]
395 regions.append(length) 482 translation_count = 0
396 for end in regions[1:]: 483 if args.debug:
397 bedlines = get_transcripts_bed(species, ref, start, end, params=biotypes)
398 if args.verbose or args.debug:
399 print >> sys.stderr,\ 484 print >> sys.stderr,\
400 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ 485 "Retrieving transcripts: %s\t%s\tlength: %d"\
401 % (species, ref, start, end, len(bedlines)) 486 % (species, ref, length)
402 # start, end, seq 487 translation_count += translate_region(species,ref,start,length,strand)
403 for i, bedline in enumerate(bedlines): 488 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
404 try: 489 print >> sys.stderr,\
405 bed = bed_from_line(bedline)\ 490 "%s\t%s\tlength: %d\tcDNA translations:%d"\
406 if any([not args.raw, fa_wtr, bed_wtr])\ 491 % (species, ref, length, translation_count)
407 else None
408 if tx_wtr:
409 tx_wtr.write(bedline if args.raw else str(bed))
410 tx_wtr.write("\n")
411 tx_wtr.flush()
412 if bed:
413 translation_count += translate_bed(bed)
414 except Exception as e:
415 print >> sys.stderr,\
416 "BED error (%s) : %s\n" % (e, bedline)
417 start = end + 1
418
419 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
420 print >> sys.stderr,\
421 "%s\t%s\tlength: %d\tcDNA translations:%d"\
422 % (species, ref, length, translation_count)
423 492
424 493
425 if __name__ == "__main__": 494 if __name__ == "__main__":
426 __main__() 495 __main__()