8
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2017, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 #
|
|
12 #------------------------------------------------------------------------------
|
|
13 """
|
|
14
|
|
15 import sys
|
|
16 import requests
|
|
17 from time import sleep
|
|
18
|
|
19
|
|
# Base URL of the Ensembl REST service; every request path is appended to it.
server = "https://rest.ensembl.org"
# Module-level endpoint path. NOTE(review): the functions visible here all
# build their own local ``ext``, so this value looks unused -- confirm before
# removing.
ext = "/info/assembly/homo_sapiens?"
# Step (in bases) used to split a sequence into windows when fetching
# overlapping transcripts region by region.
max_region = 4000000
|
|
23
|
|
24
|
|
def ensembl_rest(ext, headers):
    """Send a GET request to the Ensembl REST server and return the response.

    The requested path is echoed to stderr. If the server answers with
    HTTP 429 and supplies a ``Retry-After`` header, the call sleeps for that
    many seconds and repeats the request once.

    Args:
        ext: Request path (and query string) appended to ``server``.
        headers: Dict of HTTP headers passed through to ``requests.get``.

    Returns:
        The ``requests`` response object of the last request made.

    Raises:
        Whatever ``Response.raise_for_status()`` raises when the final
        response is not OK.
        ValueError: If ``Retry-After`` is not a plain number of seconds.
    """
    # Trace each request; written with write() so it works on Python 2 and 3.
    sys.stderr.write("%s\n" % ext)
    r = requests.get(server + ext, headers=headers)
    if r.status_code == 429:
        # Rate limited: dump the headers so the limits can be inspected.
        sys.stderr.write("response headers: %s\n\n" % r.headers)
        if 'Retry-After' in r.headers:
            # Header values are strings, but sleep() requires a number.
            sleep(float(r.headers['Retry-After']))
            r = requests.get(server + ext, headers=headers)
    if not r.ok:
        r.raise_for_status()
    return r
|
|
36
|
|
37
|
|
def get_species():
    """Fetch the species list from ``/info/species`` and print it as TSV.

    For every species entry one tab-separated line is written to stdout with
    the ``name``, ``common_name``, ``display_name``, ``strain`` and
    ``taxon_id`` fields.

    Returns:
        Dict mapping each species ``name`` to its full entry from the
        response.

    Raises:
        KeyError: If the response or an entry lacks one of the fields above.
    """
    results = dict()
    ext = "/info/species"
    req_header = {"Content-Type": "application/json"}
    r = ensembl_rest(ext, req_header)
    for species in r.json()['species']:
        results[species['name']] = species
        # write() instead of the Python 2 print statement: same output,
        # but also valid on Python 3.
        sys.stdout.write(
            "%s\t%s\t%s\t%s\t%s\n"
            % (species['name'], species['common_name'],
               species['display_name'],
               species['strain'],
               species['taxon_id']))
    return results
|
|
52
|
|
53
|
|
def get_biotypes(species):
    """Return the biotype names reported by ``/info/biotypes/<species>``.

    Args:
        species: Species identifier interpolated into the request path.

    Returns:
        List of the ``biotype`` values, in response order; entries without
        a ``biotype`` key are skipped.
    """
    response = ensembl_rest("/info/biotypes/%s?" % species,
                            {"Content-Type": "application/json"})
    return [item['biotype'] for item in response.json() if 'biotype' in item]
|
|
63
|
|
64
|
|
def get_toplevel(species):
    """Collect the top-level sequence lengths of a species assembly.

    Queries ``/info/assembly/<species>`` and groups the entries of its
    ``top_level_region`` list by coordinate system.

    Args:
        species: Species identifier interpolated into the request path.

    Returns:
        Dict of dicts: ``{coord_system: {sequence_name: length_as_int}}``.
    """
    response = ensembl_rest("/info/assembly/%s?" % species,
                            {"Content-Type": "application/json"})
    by_system = {}
    for region in response.json()['top_level_region']:
        # Create the per-coordinate-system dict on first sight.
        lengths = by_system.setdefault(region['coord_system'], {})
        lengths[region['name']] = int(region['length'])
    return by_system
|
|
77
|
|
78
|
|
def get_transcripts_bed(species, refseq, start, length, strand='', params=None):
    """Fetch transcripts overlapping a sequence as BED lines, window by window.

    The span from ``start`` to ``length`` is cut into windows of at most
    ``max_region`` bases, and ``/overlap/region`` is queried once per window
    with a ``text/x-bed`` content type.

    Args:
        species: Species identifier used in the request path.
        refseq: Name of the sequence region to query.
        start: First coordinate to request.
        length: Last coordinate to request.
        strand: Text appended verbatim after the end coordinate
            (presumably something like ``":1"`` -- confirm against callers).
        params: Extra query-string text appended after
            ``feature=transcript;``; ``None`` or empty adds nothing.

    Returns:
        List of response text lines from all windows, in request order.
        Empty when ``start`` is not below ``length``.
    """
    bed = []
    param = params if params else ''
    req_header = {"Content-Type": "text/x-bed"}
    # Window boundaries. range() never yields ``length`` itself, so it is
    # always added as the final end point. list() keeps this working where
    # range() is a lazy object rather than a list.
    regions = list(range(start, length, max_region))
    regions.append(length)
    # The first boundary is the initial ``start``; each later one ends a window.
    for end in regions[1:]:
        ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\
            % (species, refseq, start, end, strand, param)
        # The next window begins right after this one.
        start = end + 1
        r = ensembl_rest(ext, req_header)
        if r.text:
            bed += r.text.splitlines()
    return bed
|
|
94
|
|
95
|
|
def get_seq(id, seqtype,params=None):
    """Return the sequence text for an identifier from ``/sequence/id``.

    Args:
        id: Identifier interpolated into the request path.
        seqtype: Value sent as the ``type`` query parameter.
        params: Extra query-string text appended after the type;
            ``None`` or empty adds nothing.

    Returns:
        The response body as text (requested as ``text/plain``).
    """
    extra = params or ''
    path = "/sequence/id/%s?type=%s;%s" % (id, seqtype, extra)
    response = ensembl_rest(path, {"Content-Type": "text/plain"})
    return response.text
|
|
102
|
|
103
|
|
def get_cdna(id,params=None):
    """Return the ``cdna`` sequence text for ``id`` (see ``get_seq``)."""
    sequence = get_seq(id, 'cdna', params=params)
    return sequence
|
|
106
|
|
107
|
|
def get_cds(id,params=None):
    """Return the ``cds`` sequence text for ``id`` (see ``get_seq``)."""
    sequence = get_seq(id, 'cds', params=params)
    return sequence
|
|
110
|
|
111
|
|
def get_genomic(id,params=None):
    """Return the ``genomic`` sequence text for ``id`` (see ``get_seq``)."""
    sequence = get_seq(id, 'genomic', params=params)
    return sequence
|
|
114
|
|
115
|
|
116 def get_transcript_haplotypes(species,transcript):
|
|
117 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript)
|
|
118 req_header = {"Content-Type" : "application/json"}
|
|
119 r = ensembl_rest(ext, req_header)
|
|
120 decoded = r.json()
|