Mercurial > repos > jjohnson > ensembl_cdna_translate
comparison ensembl_cdna_translate.py @ 7:d59e3ce10e74 draft
Uploaded
author | jjohnson |
---|---|
date | Wed, 13 Dec 2017 11:15:34 -0500 |
parents | b7f2f5e3390c |
children | 5c92d0be6514 |
comparison
equal
deleted
inserted
replaced
6:f8e0e5e237a5 | 7:d59e3ce10e74 |
---|---|
11 # | 11 # |
12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
13 """ | 13 """ |
14 | 14 |
15 import argparse | 15 import argparse |
16 import re | |
16 import sys | 17 import sys |
17 from time import sleep | 18 from time import sleep |
18 | 19 |
19 from Bio.Seq import translate | 20 from Bio.Seq import translate |
20 | 21 |
23 import digest | 24 import digest |
24 | 25 |
25 | 26 |
26 server = "https://rest.ensembl.org" | 27 server = "https://rest.ensembl.org" |
27 ext = "/info/assembly/homo_sapiens?" | 28 ext = "/info/assembly/homo_sapiens?" |
28 max_region = 5000000 | 29 max_region = 4000000 |
29 | 30 |
30 | 31 |
31 def ensembl_rest(ext, headers): | 32 def ensembl_rest(ext, headers): |
32 # if True: print >> sys.stderr, "%s" % ext | 33 if True: print >> sys.stderr, "%s" % ext |
33 r = requests.get(server+ext, headers=headers) | 34 r = requests.get(server+ext, headers=headers) |
34 if r.status_code == 429: | 35 if r.status_code == 429: |
35 print >> sys.stderr, "response headers: %s\n" % r.headers | 36 print >> sys.stderr, "response headers: %s\n" % r.headers |
36 if 'Retry-After' in r.headers: | 37 if 'Retry-After' in r.headers: |
37 sleep(r.headers['Retry-After']) | 38 sleep(r.headers['Retry-After']) |
80 coord_system = coord_systems[seq['coord_system']] | 81 coord_system = coord_systems[seq['coord_system']] |
81 coord_system[seq['name']] = int(seq['length']) | 82 coord_system[seq['name']] = int(seq['length']) |
82 return coord_systems | 83 return coord_systems |
83 | 84 |
84 | 85 |
85 def get_transcripts_bed(species, refseq, start, length,params=None): | 86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None): |
86 bed = [] | 87 bed = [] |
87 param = params if params else '' | 88 param = params if params else '' |
88 req_header = {"Content-Type": "text/x-bed"} | 89 req_header = {"Content-Type": "text/x-bed"} |
89 regions = range(start, length, max_region) | 90 regions = range(start, length, max_region) |
90 if not regions or regions[-1] < length: | 91 if not regions or regions[-1] < length: |
91 regions.append(length) | 92 regions.append(length) |
92 for end in regions[1:]: | 93 for end in regions[1:]: |
93 ext = "/overlap/region/%s/%s:%d-%d?feature=transcript;%s"\ | 94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\ |
94 % (species, refseq, start, end, param) | 95 % (species, refseq, start, end, strand, param) |
95 start = end + 1 | 96 start = end + 1 |
96 r = ensembl_rest(ext, req_header) | 97 r = ensembl_rest(ext, req_header) |
97 if r.text: | 98 if r.text: |
98 bed += r.text.splitlines() | 99 bed += r.text.splitlines() |
99 return bed | 100 return bed |
113 | 114 |
114 def get_cds(id,params=None): | 115 def get_cds(id,params=None): |
115 return get_seq(id, 'cds',params=params) | 116 return get_seq(id, 'cds',params=params) |
116 | 117 |
117 | 118 |
119 def get_transcript_haplotypes(species,transcript): | |
120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript) | |
121 req_header = {"Content-Type" : "application/json"} | |
122 r = ensembl_rest(ext, req_header) | |
123 decoded = r.json() | |
124 | |
125 | |
118 def bed_from_line(line): | 126 def bed_from_line(line): |
119 fields = line.rstrip('\r\n').split('\t') | 127 fields = line.rstrip('\r\n').split('\t') |
128 if len(fields) < 12: | |
129 return None | |
120 (chrom, chromStart, chromEnd, name, score, strand, | 130 (chrom, chromStart, chromEnd, name, score, strand, |
121 thickStart, thickEnd, itemRgb, | 131 thickStart, thickEnd, itemRgb, |
122 blockCount, blockSizes, blockStarts) = fields[0:12] | 132 blockCount, blockSizes, blockStarts) = fields[0:12] |
123 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, | 133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, |
124 name=name, score=score, strand=strand, | 134 name=name, score=score, strand=strand, |
125 thickStart=thickStart, thickEnd=thickEnd, | 135 thickStart=thickStart, thickEnd=thickEnd, |
126 itemRgb=itemRgb, | 136 itemRgb=itemRgb, |
127 blockCount=blockCount, | 137 blockCount=blockCount, |
128 blockSizes=blockSizes.rstrip(','), | 138 blockSizes=blockSizes.rstrip(','), |
129 blockStarts=blockStarts.rstrip(',')) | 139 blockStarts=blockStarts.rstrip(',')) |
140 if len(fields) == 20: | |
141 bed_entry.second_name = fields[12] | |
142 bed_entry.cds_start_status = fields[13] | |
143 bed_entry.cds_end_status = fields[14] | |
144 bed_entry.exon_frames = fields[15] | |
145 bed_entry.biotype = fields[16] | |
146 bed_entry.gene_name = fields[17] | |
147 bed_entry.second_gene_name = fields[18] | |
148 bed_entry.gene_type = fields[19] | |
130 return bed_entry | 149 return bed_entry |
131 | 150 |
132 | 151 |
133 class BedEntry(object): | 152 class BedEntry(object): |
134 def __init__(self, chrom=None, chromStart=None, chromEnd=None, | 153 def __init__(self, chrom=None, chromStart=None, chromEnd=None, |
155 self.blockStarts = [int(x) for x in blockStarts.split(',')] | 174 self.blockStarts = [int(x) for x in blockStarts.split(',')] |
156 elif isinstance(blockStarts, list): | 175 elif isinstance(blockStarts, list): |
157 self.blockStarts = [int(x) for x in blockStarts] | 176 self.blockStarts = [int(x) for x in blockStarts] |
158 else: | 177 else: |
159 self.blockStarts = blockStarts | 178 self.blockStarts = blockStarts |
179 self.second_name = None | |
180 self.cds_start_status = None | |
181 self.cds_end_status = None | |
182 self.exon_frames = None | |
183 self.biotype = None | |
184 self.gene_name = None | |
185 self.second_gene_name = None | |
186 self.gene_type = None | |
160 self.seq = None | 187 self.seq = None |
161 self.pep = None | 188 self.pep = None |
162 | 189 |
163 def __str__(self): | 190 def __str__(self): |
164 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( | 191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( |
244 description='Retrieve Ensembl cDNAs and three frame translate') | 271 description='Retrieve Ensembl cDNAs and three frame translate') |
245 parser.add_argument( | 272 parser.add_argument( |
246 '-s', '--species', default='human', | 273 '-s', '--species', default='human', |
247 help='Ensembl Species to retrieve') | 274 help='Ensembl Species to retrieve') |
248 parser.add_argument( | 275 parser.add_argument( |
276 '-R', '--regions', action='append', default=[], | |
277 help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+') | |
278 parser.add_argument( | |
249 '-B', '--biotypes', action='append', default=[], | 279 '-B', '--biotypes', action='append', default=[], |
250 help='Restrict Ensembl biotypes to retrieve') | 280 help='Restrict Ensembl biotypes to retrieve') |
251 parser.add_argument( | 281 parser.add_argument( |
252 '-i', '--input', default=None, | 282 '-i', '--input', default=None, |
253 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') | 283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') |
292 # print >> sys.stderr, "args biotypes: %s" % args.biotypes | 322 # print >> sys.stderr, "args biotypes: %s" % args.biotypes |
293 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] | 323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] |
294 # print >> sys.stderr, "args biotypes: %s" % biotypea | 324 # print >> sys.stderr, "args biotypes: %s" % biotypea |
295 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) | 325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) |
296 # print >> sys.stderr, "biotypes: %s" % biotypes | 326 # print >> sys.stderr, "biotypes: %s" % biotypes |
327 | |
328 selected_regions = dict() # chrom:(start,end) | |
329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' | |
330 if args.regions: | |
331 for entry in args.regions: | |
332 if not entry: | |
333 continue | |
334 regs = [x.strip() for x in entry.split(',') if x.strip()] | |
335 for reg in regs: | |
336 m = re.match(region_pat,reg) | |
337 if m: | |
338 (chrom,start,end,strand) = m.groups() | |
339 if chrom: | |
340 if chrom not in selected_regions: | |
341 selected_regions[chrom] = [] | |
342 selected_regions[chrom].append([start,end,strand]) | |
297 | 343 |
298 translations = dict() # start : end : seq | 344 translations = dict() # start : end : seq |
299 | 345 |
300 def unique_prot(tbed, seq): | 346 def unique_prot(tbed, seq): |
301 if tbed.chromStart not in translations: | 347 if tbed.chromStart not in translations: |
362 fa_wtr.write("\n") | 408 fa_wtr.write("\n") |
363 fa_wtr.flush() | 409 fa_wtr.flush() |
364 aa_start = aa_end + 1 | 410 aa_start = aa_end + 1 |
365 return translate_count | 411 return translate_count |
366 | 412 |
413 def translate_region(species,ref,start,stop,strand): | |
414 translation_count = 0 | |
415 regions = range(start, stop, max_region) | |
416 if not regions or regions[-1] < stop: | |
417 regions.append(stop) | |
418 for end in regions[1:]: | |
419 bedlines = get_transcripts_bed(species, ref, start, end, strand=strand, params=biotypes) | |
420 if args.verbose or args.debug: | |
421 print >> sys.stderr,\ | |
422 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ | |
423 % (species, ref, start, end, len(bedlines)) | |
424 # start, end, seq | |
425 for i, bedline in enumerate(bedlines): | |
426 try: | |
427 bed = bed_from_line(bedline)\ | |
428 if any([not args.raw, fa_wtr, bed_wtr])\ | |
429 else None | |
430 if tx_wtr: | |
431 tx_wtr.write(bedline if args.raw else str(bed)) | |
432 tx_wtr.write("\n") | |
433 tx_wtr.flush() | |
434 if bed: | |
435 translation_count += translate_bed(bed) | |
436 except Exception as e: | |
437 print >> sys.stderr,\ | |
438 "BED error (%s) : %s\n" % (e, bedline) | |
439 start = end + 1 | |
440 return translation_count | |
441 | |
367 if input_rdr: | 442 if input_rdr: |
368 translation_count = 0 | 443 translation_count = 0 |
369 for i, bedline in enumerate(input_rdr): | 444 for i, bedline in enumerate(input_rdr): |
370 try: | 445 try: |
371 bed = bed_from_line(bedline) | 446 bed = bed_from_line(bedline) |
447 if bed is None: | |
448 continue | |
449 if bed.biotype and biotypea and bed.biotype not in biotypea: | |
450 continue | |
372 translation_count += translate_bed(bed) | 451 translation_count += translate_bed(bed) |
373 except: | 452 except: |
374 print >> sys.stderr, "BED format error: %s\n" % bedline | 453 print >> sys.stderr, "BED format error: %s\n" % bedline |
375 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): | 454 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): |
376 print >> sys.stderr,\ | 455 print >> sys.stderr,\ |
377 "%s\tcDNA translations:%d" % (species, translation_count) | 456 "%s\tcDNA translations:%d" % (species, translation_count) |
378 else: | 457 else: |
379 coord_systems = get_toplevel(species) | 458 coord_systems = get_toplevel(species) |
380 if 'chromosome' in coord_systems: | 459 if 'chromosome' in coord_systems: |
460 ref_lengths = dict() | |
381 for ref in sorted(coord_systems['chromosome'].keys()): | 461 for ref in sorted(coord_systems['chromosome'].keys()): |
382 length = coord_systems['chromosome'][ref] | 462 length = coord_systems['chromosome'][ref] |
463 ref_lengths[ref] = length | |
383 if not any([tx_wtr, fa_wtr, bed_wtr]): | 464 if not any([tx_wtr, fa_wtr, bed_wtr]): |
384 print >> sys.stderr,\ | 465 print >> sys.stderr,\ |
385 "%s\t%s\tlength: %d" % (species, ref, length) | 466 "%s\t%s\tlength: %d" % (species, ref, length) |
386 continue | 467 if selected_regions: |
387 if args.debug: | |
388 print >> sys.stderr,\ | |
389 "Retrieving transcripts: %s\t%s\tlength: %d"\ | |
390 % (species, ref, length) | |
391 translation_count = 0 | 468 translation_count = 0 |
469 for ref in sorted(selected_regions.keys()): | |
470 if ref in ref_lengths: | |
471 for reg in selected_regions[ref]: | |
472 (_start,_stop,_strand) = reg | |
473 start = int(_start) if _start else 0 | |
474 stop = int(_stop) if _stop else ref_lengths[ref] | |
475 strand = '' if not _strand else ':1' if _strand == '+' else ':-1' | |
476 translation_count += translate_region(species,ref,start,stop,strand) | |
477 else: | |
478 strand = '' | |
392 start = 0 | 479 start = 0 |
393 regions = range(start, length, max_region) | 480 for ref in sorted(ref_lengths.keys()): |
394 if not regions or regions[-1] < length: | 481 length = ref_lengths[ref] |
395 regions.append(length) | 482 translation_count = 0 |
396 for end in regions[1:]: | 483 if args.debug: |
397 bedlines = get_transcripts_bed(species, ref, start, end, params=biotypes) | |
398 if args.verbose or args.debug: | |
399 print >> sys.stderr,\ | 484 print >> sys.stderr,\ |
400 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ | 485 "Retrieving transcripts: %s\t%s\tlength: %d"\ |
401 % (species, ref, start, end, len(bedlines)) | 486 % (species, ref, length) |
402 # start, end, seq | 487 translation_count += translate_region(species,ref,start,length,strand) |
403 for i, bedline in enumerate(bedlines): | 488 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): |
404 try: | 489 print >> sys.stderr,\ |
405 bed = bed_from_line(bedline)\ | 490 "%s\t%s\tlength: %d\tcDNA translations:%d"\ |
406 if any([not args.raw, fa_wtr, bed_wtr])\ | 491 % (species, ref, length, translation_count) |
407 else None | |
408 if tx_wtr: | |
409 tx_wtr.write(bedline if args.raw else str(bed)) | |
410 tx_wtr.write("\n") | |
411 tx_wtr.flush() | |
412 if bed: | |
413 translation_count += translate_bed(bed) | |
414 except Exception as e: | |
415 print >> sys.stderr,\ | |
416 "BED error (%s) : %s\n" % (e, bedline) | |
417 start = end + 1 | |
418 | |
419 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): | |
420 print >> sys.stderr,\ | |
421 "%s\t%s\tlength: %d\tcDNA translations:%d"\ | |
422 % (species, ref, length, translation_count) | |
423 | 492 |
424 | 493 |
425 if __name__ == "__main__": | 494 if __name__ == "__main__": |
426 __main__() | 495 __main__() |