comparison ensembl_cdna_translate.py @ 8:5c92d0be6514 draft

Uploaded
author jjohnson
date Thu, 14 Dec 2017 13:32:00 -0500
parents d59e3ce10e74
children
comparison
equal deleted inserted replaced
7:d59e3ce10e74 8:5c92d0be6514
19 19
20 from Bio.Seq import translate 20 from Bio.Seq import translate
21 21
22 import requests 22 import requests
23 23
24 from bedutil import BedEntry, bed_from_line
24 import digest 25 import digest
25 26 from ensembl_rest import get_toplevel, get_transcripts_bed, get_cds, get_cdna, max_region
26 27 from twobitreader import TwoBitFile
27 server = "https://rest.ensembl.org"
28 ext = "/info/assembly/homo_sapiens?"
29 max_region = 4000000
30
31
32 def ensembl_rest(ext, headers):
33 if True: print >> sys.stderr, "%s" % ext
34 r = requests.get(server+ext, headers=headers)
35 if r.status_code == 429:
36 print >> sys.stderr, "response headers: %s\n" % r.headers
37 if 'Retry-After' in r.headers:
38 sleep(r.headers['Retry-After'])
39 r = requests.get(server+ext, headers=headers)
40 if not r.ok:
41 r.raise_for_status()
42 return r
43
44
45 def get_species():
46 results = dict()
47 ext = "/info/species"
48 req_header = {"Content-Type": "application/json"}
49 r = ensembl_rest(ext, req_header)
50 for species in r.json()['species']:
51 results[species['name']] = species
52 print >> sys.stdout,\
53 "%s\t%s\t%s\t%s\t%s"\
54 % (species['name'], species['common_name'],
55 species['display_name'],
56 species['strain'],
57 species['taxon_id'])
58 return results
59
60
61 def get_biotypes(species):
62 biotypes = []
63 ext = "/info/biotypes/%s?" % species
64 req_header = {"Content-Type": "application/json"}
65 r = ensembl_rest(ext, req_header)
66 for entry in r.json():
67 if 'biotype' in entry:
68 biotypes.append(entry['biotype'])
69 return biotypes
70
71
72 def get_toplevel(species):
73 coord_systems = dict()
74 ext = "/info/assembly/%s?" % species
75 req_header = {"Content-Type": "application/json"}
76 r = ensembl_rest(ext, req_header)
77 toplevel = r.json()
78 for seq in toplevel['top_level_region']:
79 if seq['coord_system'] not in coord_systems:
80 coord_systems[seq['coord_system']] = dict()
81 coord_system = coord_systems[seq['coord_system']]
82 coord_system[seq['name']] = int(seq['length'])
83 return coord_systems
84
85
86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None):
87 bed = []
88 param = params if params else ''
89 req_header = {"Content-Type": "text/x-bed"}
90 regions = range(start, length, max_region)
91 if not regions or regions[-1] < length:
92 regions.append(length)
93 for end in regions[1:]:
94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\
95 % (species, refseq, start, end, strand, param)
96 start = end + 1
97 r = ensembl_rest(ext, req_header)
98 if r.text:
99 bed += r.text.splitlines()
100 return bed
101
102
103 def get_seq(id, seqtype,params=None):
104 param = params if params else ''
105 ext = "/sequence/id/%s?type=%s;%s" % (id, seqtype,param)
106 req_header = {"Content-Type": "text/plain"}
107 r = ensembl_rest(ext, req_header)
108 return r.text
109
110
111 def get_cdna(id,params=None):
112 return get_seq(id, 'cdna',params=params)
113
114
115 def get_cds(id,params=None):
116 return get_seq(id, 'cds',params=params)
117
118
119 def get_transcript_haplotypes(species,transcript):
120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript)
121 req_header = {"Content-Type" : "application/json"}
122 r = ensembl_rest(ext, req_header)
123 decoded = r.json()
124
125
126 def bed_from_line(line):
127 fields = line.rstrip('\r\n').split('\t')
128 if len(fields) < 12:
129 return None
130 (chrom, chromStart, chromEnd, name, score, strand,
131 thickStart, thickEnd, itemRgb,
132 blockCount, blockSizes, blockStarts) = fields[0:12]
133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd,
134 name=name, score=score, strand=strand,
135 thickStart=thickStart, thickEnd=thickEnd,
136 itemRgb=itemRgb,
137 blockCount=blockCount,
138 blockSizes=blockSizes.rstrip(','),
139 blockStarts=blockStarts.rstrip(','))
140 if len(fields) == 20:
141 bed_entry.second_name = fields[12]
142 bed_entry.cds_start_status = fields[13]
143 bed_entry.cds_end_status = fields[14]
144 bed_entry.exon_frames = fields[15]
145 bed_entry.biotype = fields[16]
146 bed_entry.gene_name = fields[17]
147 bed_entry.second_gene_name = fields[18]
148 bed_entry.gene_type = fields[19]
149 return bed_entry
150
151
152 class BedEntry(object):
153 def __init__(self, chrom=None, chromStart=None, chromEnd=None,
154 name=None, score=None, strand=None,
155 thickStart=None, thickEnd=None, itemRgb=None,
156 blockCount=None, blockSizes=None, blockStarts=None):
157 self.chrom = chrom
158 self.chromStart = int(chromStart)
159 self.chromEnd = int(chromEnd)
160 self.name = name
161 self.score = int(score) if score is not None else 0
162 self.strand = '-' if str(strand).startswith('-') else '+'
163 self.thickStart = int(thickStart) if thickStart else self.chromStart
164 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
165 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
166 self.blockCount = int(blockCount)
167 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode):
168 self.blockSizes = [int(x) for x in blockSizes.split(',')]
169 elif isinstance(blockSizes, list):
170 self.blockSizes = [int(x) for x in blockSizes]
171 else:
172 self.blockSizes = blockSizes
173 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode):
174 self.blockStarts = [int(x) for x in blockStarts.split(',')]
175 elif isinstance(blockStarts, list):
176 self.blockStarts = [int(x) for x in blockStarts]
177 else:
178 self.blockStarts = blockStarts
179 self.second_name = None
180 self.cds_start_status = None
181 self.cds_end_status = None
182 self.exon_frames = None
183 self.biotype = None
184 self.gene_name = None
185 self.second_gene_name = None
186 self.gene_type = None
187 self.seq = None
188 self.pep = None
189
190 def __str__(self):
191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
192 self.chrom, self.chromStart, self.chromEnd,
193 self.name, self.score, self.strand,
194 self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount,
195 ','.join([str(x) for x in self.blockSizes]),
196 ','.join([str(x) for x in self.blockStarts]))
197
198 # (start, end)
199 def get_subrange(self, tstart, tstop, debug=False):
200 chromStart = self.chromStart
201 chromEnd = self.chromEnd
202 if debug:
203 print >> sys.stderr, "%s" % (str(self))
204 r = range(self.blockCount)
205 if self.strand == '-':
206 r.reverse()
207 bStart = 0
208 bEnd = 0
209 for x in r:
210 bEnd = bStart + self.blockSizes[x]
211 if bStart <= tstart < bEnd:
212 if self.strand == '+':
213 chromStart = self.chromStart + self.blockStarts[x] +\
214 (tstart - bStart)
215 else:
216 chromEnd = self.chromStart + self.blockStarts[x] +\
217 self.blockSizes[x] - (tstart - bStart)
218 if bStart <= tstop < bEnd:
219 if self.strand == '+':
220 chromEnd = self.chromStart + self.blockStarts[x] +\
221 (tstop - bStart)
222 else:
223 chromStart = self.chromStart + self.blockStarts[x] +\
224 self.blockSizes[x] - (tstop - bStart)
225 if debug:
226 print >> sys.stderr,\
227 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\
228 % (x, self.strand, bStart, bEnd,
229 tstart, tstop, chromStart, chromEnd)
230 bStart += self.blockSizes[x]
231 return(chromStart, chromEnd)
232
233 # get the blocks for sub range
234 def get_blocks(self, chromStart, chromEnd):
235 tblockCount = 0
236 tblockSizes = []
237 tblockStarts = []
238 for x in range(self.blockCount):
239 bStart = self.chromStart + self.blockStarts[x]
240 bEnd = bStart + self.blockSizes[x]
241 if bStart > chromEnd:
242 break
243 if bEnd < chromStart:
244 continue
245 cStart = max(chromStart, bStart)
246 tblockStarts.append(cStart - chromStart)
247 tblockSizes.append(min(chromEnd, bEnd) - cStart)
248 tblockCount += 1
249 return (tblockCount, tblockSizes, tblockStarts)
250
251 def trim(self, tstart, tstop, debug=False):
252 (tchromStart, tchromEnd) =\
253 self.get_subrange(tstart, tstop, debug=debug)
254 (tblockCount, tblockSizes, tblockStarts) =\
255 self.get_blocks(tchromStart, tchromEnd)
256 tbed = BedEntry(
257 chrom=self.chrom, chromStart=tchromStart, chromEnd=tchromEnd,
258 name=self.name, score=self.score, strand=self.strand,
259 thickStart=tchromStart, thickEnd=tchromEnd, itemRgb=self.itemRgb,
260 blockCount=tblockCount,
261 blockSizes=tblockSizes, blockStarts=tblockStarts)
262 if self.seq:
263 ts = tchromStart-self.chromStart
264 te = tchromEnd - tchromStart + ts
265 tbed.seq = self.seq[ts:te]
266 return tbed
267 28
268 29
269 def __main__(): 30 def __main__():
270 parser = argparse.ArgumentParser( 31 parser = argparse.ArgumentParser(
271 description='Retrieve Ensembl cDNAs and three frame translate') 32 description='Retrieve Ensembl cDNAs and three frame translate')
279 '-B', '--biotypes', action='append', default=[], 40 '-B', '--biotypes', action='append', default=[],
280 help='Restrict Ensembl biotypes to retrieve') 41 help='Restrict Ensembl biotypes to retrieve')
281 parser.add_argument( 42 parser.add_argument(
282 '-i', '--input', default=None, 43 '-i', '--input', default=None,
283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') 44 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
45 parser.add_argument(
46 '-T', '--twobit', default=None,
47 help='Genome reference sequence in 2bit format')
284 parser.add_argument( 48 parser.add_argument(
285 '-t', '--transcripts', default=None, 49 '-t', '--transcripts', default=None,
286 help='Path to output cDNA transcripts.bed (-) for stdout') 50 help='Path to output cDNA transcripts.bed (-) for stdout')
287 parser.add_argument( 51 parser.add_argument(
288 '-r', '--raw', action='store_true', 52 '-r', '--raw', action='store_true',
323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] 87 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')]
324 # print >> sys.stderr, "args biotypes: %s" % biotypea 88 # print >> sys.stderr, "args biotypes: %s" % biotypea
325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) 89 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()])
326 # print >> sys.stderr, "biotypes: %s" % biotypes 90 # print >> sys.stderr, "biotypes: %s" % biotypes
327 91
92 twobit = TwoBitFile(args.twobit) if args.twobit else None
93
328 selected_regions = dict() # chrom:(start,end) 94 selected_regions = dict() # chrom:(start,end)
329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' 95 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
330 if args.regions: 96 if args.regions:
331 for entry in args.regions: 97 for entry in args.regions:
332 if not entry: 98 if not entry:
338 (chrom,start,end,strand) = m.groups() 104 (chrom,start,end,strand) = m.groups()
339 if chrom: 105 if chrom:
340 if chrom not in selected_regions: 106 if chrom not in selected_regions:
341 selected_regions[chrom] = [] 107 selected_regions[chrom] = []
342 selected_regions[chrom].append([start,end,strand]) 108 selected_regions[chrom].append([start,end,strand])
109 if args.debug: print >> sys.stderr, "selected_regions: %s" % selected_regions
343 110
344 translations = dict() # start : end : seq 111 translations = dict() # start : end : seq
345 112
346 def unique_prot(tbed, seq): 113 def unique_prot(tbed, seq):
347 if tbed.chromStart not in translations: 114 if tbed.chromStart not in translations:
355 translations[tbed.chromStart][tbed.chromEnd].append(seq) 122 translations[tbed.chromStart][tbed.chromEnd].append(seq)
356 else: 123 else:
357 return False 124 return False
358 return True 125 return True
359 126
127 def get_sequence(chrom,start,end):
128 if twobit:
129 if chrom in twobit:
130 return twobit[chrom][start:end]
131 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
132 if contig in twobit:
133 return twobit[contig][start:end]
134 return None
135
360 def translate_bed(bed): 136 def translate_bed(bed):
361 translate_count = 0 137 translate_count = 0
362 if any([fa_wtr, bed_wtr]): 138 if any([fa_wtr, bed_wtr]):
363 transcript_id = bed.name 139 transcript_id = bed.name
364 refprot = None 140 refprot = None
141 if twobit:
142 bed.seq = get_sequence(bed.chrom,bed.chromStart,bed.chromEnd)
143 else:
144 bed.cdna = get_cdna(transcript_id)
145 cdna = bed.get_cdna()
146 cdna_len = len(cdna)
365 if not args.all: 147 if not args.all:
366 try: 148 try:
367 cds = get_cds(transcript_id) 149 cds = bed.get_cds()
150 if cds is None:
151 cds = get_cds(transcript_id)
368 if len(cds) % 3 != 0: 152 if len(cds) % 3 != 0:
369 cds = cds[:-(len(cds) % 3)] 153 cds = cds[:-(len(cds) % 3)]
370 refprot = translate(cds) if cds else None 154 refprot = translate(cds) if cds else None
371 except: 155 except:
372 refprot = None 156 refprot = None
373 cdna = get_cdna(transcript_id)
374 cdna_len = len(cdna)
375 for offset in range(3): 157 for offset in range(3):
376 seqend = cdna_len - (cdna_len - offset) % 3 158 seqend = cdna_len - (cdna_len - offset) % 3
377 aaseq = translate(cdna[offset:seqend]) 159 aaseq = translate(cdna[offset:seqend])
378 aa_start = 0 160 aa_start = 0
379 while aa_start < len(aaseq): 161 while aa_start < len(aaseq):