Mercurial > repos > jjohnson > ensembl_cdna_translate
comparison ensembl_cdna_translate.py @ 8:5c92d0be6514 draft
Uploaded
author | jjohnson |
---|---|
date | Thu, 14 Dec 2017 13:32:00 -0500 |
parents | d59e3ce10e74 |
children |
comparison
equal
deleted
inserted
replaced
7:d59e3ce10e74 | 8:5c92d0be6514 |
---|---|
19 | 19 |
20 from Bio.Seq import translate | 20 from Bio.Seq import translate |
21 | 21 |
22 import requests | 22 import requests |
23 | 23 |
24 from bedutil import BedEntry, bed_from_line | |
24 import digest | 25 import digest |
25 | 26 from ensembl_rest import get_toplevel, get_transcripts_bed, get_cds, get_cdna, max_region |
26 | 27 from twobitreader import TwoBitFile |
27 server = "https://rest.ensembl.org" | |
28 ext = "/info/assembly/homo_sapiens?" | |
29 max_region = 4000000 | |
30 | |
31 | |
32 def ensembl_rest(ext, headers): | |
33 if True: print >> sys.stderr, "%s" % ext | |
34 r = requests.get(server+ext, headers=headers) | |
35 if r.status_code == 429: | |
36 print >> sys.stderr, "response headers: %s\n" % r.headers | |
37 if 'Retry-After' in r.headers: | |
38 sleep(r.headers['Retry-After']) | |
39 r = requests.get(server+ext, headers=headers) | |
40 if not r.ok: | |
41 r.raise_for_status() | |
42 return r | |
43 | |
44 | |
45 def get_species(): | |
46 results = dict() | |
47 ext = "/info/species" | |
48 req_header = {"Content-Type": "application/json"} | |
49 r = ensembl_rest(ext, req_header) | |
50 for species in r.json()['species']: | |
51 results[species['name']] = species | |
52 print >> sys.stdout,\ | |
53 "%s\t%s\t%s\t%s\t%s"\ | |
54 % (species['name'], species['common_name'], | |
55 species['display_name'], | |
56 species['strain'], | |
57 species['taxon_id']) | |
58 return results | |
59 | |
60 | |
61 def get_biotypes(species): | |
62 biotypes = [] | |
63 ext = "/info/biotypes/%s?" % species | |
64 req_header = {"Content-Type": "application/json"} | |
65 r = ensembl_rest(ext, req_header) | |
66 for entry in r.json(): | |
67 if 'biotype' in entry: | |
68 biotypes.append(entry['biotype']) | |
69 return biotypes | |
70 | |
71 | |
72 def get_toplevel(species): | |
73 coord_systems = dict() | |
74 ext = "/info/assembly/%s?" % species | |
75 req_header = {"Content-Type": "application/json"} | |
76 r = ensembl_rest(ext, req_header) | |
77 toplevel = r.json() | |
78 for seq in toplevel['top_level_region']: | |
79 if seq['coord_system'] not in coord_systems: | |
80 coord_systems[seq['coord_system']] = dict() | |
81 coord_system = coord_systems[seq['coord_system']] | |
82 coord_system[seq['name']] = int(seq['length']) | |
83 return coord_systems | |
84 | |
85 | |
86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None): | |
87 bed = [] | |
88 param = params if params else '' | |
89 req_header = {"Content-Type": "text/x-bed"} | |
90 regions = range(start, length, max_region) | |
91 if not regions or regions[-1] < length: | |
92 regions.append(length) | |
93 for end in regions[1:]: | |
94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\ | |
95 % (species, refseq, start, end, strand, param) | |
96 start = end + 1 | |
97 r = ensembl_rest(ext, req_header) | |
98 if r.text: | |
99 bed += r.text.splitlines() | |
100 return bed | |
101 | |
102 | |
103 def get_seq(id, seqtype,params=None): | |
104 param = params if params else '' | |
105 ext = "/sequence/id/%s?type=%s;%s" % (id, seqtype,param) | |
106 req_header = {"Content-Type": "text/plain"} | |
107 r = ensembl_rest(ext, req_header) | |
108 return r.text | |
109 | |
110 | |
111 def get_cdna(id,params=None): | |
112 return get_seq(id, 'cdna',params=params) | |
113 | |
114 | |
115 def get_cds(id,params=None): | |
116 return get_seq(id, 'cds',params=params) | |
117 | |
118 | |
119 def get_transcript_haplotypes(species,transcript): | |
120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript) | |
121 req_header = {"Content-Type" : "application/json"} | |
122 r = ensembl_rest(ext, req_header) | |
123 decoded = r.json() | |
124 | |
125 | |
126 def bed_from_line(line): | |
127 fields = line.rstrip('\r\n').split('\t') | |
128 if len(fields) < 12: | |
129 return None | |
130 (chrom, chromStart, chromEnd, name, score, strand, | |
131 thickStart, thickEnd, itemRgb, | |
132 blockCount, blockSizes, blockStarts) = fields[0:12] | |
133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, | |
134 name=name, score=score, strand=strand, | |
135 thickStart=thickStart, thickEnd=thickEnd, | |
136 itemRgb=itemRgb, | |
137 blockCount=blockCount, | |
138 blockSizes=blockSizes.rstrip(','), | |
139 blockStarts=blockStarts.rstrip(',')) | |
140 if len(fields) == 20: | |
141 bed_entry.second_name = fields[12] | |
142 bed_entry.cds_start_status = fields[13] | |
143 bed_entry.cds_end_status = fields[14] | |
144 bed_entry.exon_frames = fields[15] | |
145 bed_entry.biotype = fields[16] | |
146 bed_entry.gene_name = fields[17] | |
147 bed_entry.second_gene_name = fields[18] | |
148 bed_entry.gene_type = fields[19] | |
149 return bed_entry | |
150 | |
151 | |
152 class BedEntry(object): | |
153 def __init__(self, chrom=None, chromStart=None, chromEnd=None, | |
154 name=None, score=None, strand=None, | |
155 thickStart=None, thickEnd=None, itemRgb=None, | |
156 blockCount=None, blockSizes=None, blockStarts=None): | |
157 self.chrom = chrom | |
158 self.chromStart = int(chromStart) | |
159 self.chromEnd = int(chromEnd) | |
160 self.name = name | |
161 self.score = int(score) if score is not None else 0 | |
162 self.strand = '-' if str(strand).startswith('-') else '+' | |
163 self.thickStart = int(thickStart) if thickStart else self.chromStart | |
164 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd | |
165 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' | |
166 self.blockCount = int(blockCount) | |
167 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode): | |
168 self.blockSizes = [int(x) for x in blockSizes.split(',')] | |
169 elif isinstance(blockSizes, list): | |
170 self.blockSizes = [int(x) for x in blockSizes] | |
171 else: | |
172 self.blockSizes = blockSizes | |
173 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode): | |
174 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
175 elif isinstance(blockStarts, list): | |
176 self.blockStarts = [int(x) for x in blockStarts] | |
177 else: | |
178 self.blockStarts = blockStarts | |
179 self.second_name = None | |
180 self.cds_start_status = None | |
181 self.cds_end_status = None | |
182 self.exon_frames = None | |
183 self.biotype = None | |
184 self.gene_name = None | |
185 self.second_gene_name = None | |
186 self.gene_type = None | |
187 self.seq = None | |
188 self.pep = None | |
189 | |
190 def __str__(self): | |
191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( | |
192 self.chrom, self.chromStart, self.chromEnd, | |
193 self.name, self.score, self.strand, | |
194 self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount, | |
195 ','.join([str(x) for x in self.blockSizes]), | |
196 ','.join([str(x) for x in self.blockStarts])) | |
197 | |
198 # (start, end) | |
199 def get_subrange(self, tstart, tstop, debug=False): | |
200 chromStart = self.chromStart | |
201 chromEnd = self.chromEnd | |
202 if debug: | |
203 print >> sys.stderr, "%s" % (str(self)) | |
204 r = range(self.blockCount) | |
205 if self.strand == '-': | |
206 r.reverse() | |
207 bStart = 0 | |
208 bEnd = 0 | |
209 for x in r: | |
210 bEnd = bStart + self.blockSizes[x] | |
211 if bStart <= tstart < bEnd: | |
212 if self.strand == '+': | |
213 chromStart = self.chromStart + self.blockStarts[x] +\ | |
214 (tstart - bStart) | |
215 else: | |
216 chromEnd = self.chromStart + self.blockStarts[x] +\ | |
217 self.blockSizes[x] - (tstart - bStart) | |
218 if bStart <= tstop < bEnd: | |
219 if self.strand == '+': | |
220 chromEnd = self.chromStart + self.blockStarts[x] +\ | |
221 (tstop - bStart) | |
222 else: | |
223 chromStart = self.chromStart + self.blockStarts[x] +\ | |
224 self.blockSizes[x] - (tstop - bStart) | |
225 if debug: | |
226 print >> sys.stderr,\ | |
227 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\ | |
228 % (x, self.strand, bStart, bEnd, | |
229 tstart, tstop, chromStart, chromEnd) | |
230 bStart += self.blockSizes[x] | |
231 return(chromStart, chromEnd) | |
232 | |
233 # get the blocks for sub range | |
234 def get_blocks(self, chromStart, chromEnd): | |
235 tblockCount = 0 | |
236 tblockSizes = [] | |
237 tblockStarts = [] | |
238 for x in range(self.blockCount): | |
239 bStart = self.chromStart + self.blockStarts[x] | |
240 bEnd = bStart + self.blockSizes[x] | |
241 if bStart > chromEnd: | |
242 break | |
243 if bEnd < chromStart: | |
244 continue | |
245 cStart = max(chromStart, bStart) | |
246 tblockStarts.append(cStart - chromStart) | |
247 tblockSizes.append(min(chromEnd, bEnd) - cStart) | |
248 tblockCount += 1 | |
249 return (tblockCount, tblockSizes, tblockStarts) | |
250 | |
251 def trim(self, tstart, tstop, debug=False): | |
252 (tchromStart, tchromEnd) =\ | |
253 self.get_subrange(tstart, tstop, debug=debug) | |
254 (tblockCount, tblockSizes, tblockStarts) =\ | |
255 self.get_blocks(tchromStart, tchromEnd) | |
256 tbed = BedEntry( | |
257 chrom=self.chrom, chromStart=tchromStart, chromEnd=tchromEnd, | |
258 name=self.name, score=self.score, strand=self.strand, | |
259 thickStart=tchromStart, thickEnd=tchromEnd, itemRgb=self.itemRgb, | |
260 blockCount=tblockCount, | |
261 blockSizes=tblockSizes, blockStarts=tblockStarts) | |
262 if self.seq: | |
263 ts = tchromStart-self.chromStart | |
264 te = tchromEnd - tchromStart + ts | |
265 tbed.seq = self.seq[ts:te] | |
266 return tbed | |
267 | 28 |
268 | 29 |
269 def __main__(): | 30 def __main__(): |
270 parser = argparse.ArgumentParser( | 31 parser = argparse.ArgumentParser( |
271 description='Retrieve Ensembl cDNAs and three frame translate') | 32 description='Retrieve Ensembl cDNAs and three frame translate') |
279 '-B', '--biotypes', action='append', default=[], | 40 '-B', '--biotypes', action='append', default=[], |
280 help='Restrict Ensembl biotypes to retrieve') | 41 help='Restrict Ensembl biotypes to retrieve') |
281 parser.add_argument( | 42 parser.add_argument( |
282 '-i', '--input', default=None, | 43 '-i', '--input', default=None, |
283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') | 44 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') |
45 parser.add_argument( | |
46 '-T', '--twobit', default=None, | |
47 help='Genome reference sequence in 2bit format') | |
284 parser.add_argument( | 48 parser.add_argument( |
285 '-t', '--transcripts', default=None, | 49 '-t', '--transcripts', default=None, |
286 help='Path to output cDNA transcripts.bed (-) for stdout') | 50 help='Path to output cDNA transcripts.bed (-) for stdout') |
287 parser.add_argument( | 51 parser.add_argument( |
288 '-r', '--raw', action='store_true', | 52 '-r', '--raw', action='store_true', |
323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] | 87 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] |
324 # print >> sys.stderr, "args biotypes: %s" % biotypea | 88 # print >> sys.stderr, "args biotypes: %s" % biotypea |
325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) | 89 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) |
326 # print >> sys.stderr, "biotypes: %s" % biotypes | 90 # print >> sys.stderr, "biotypes: %s" % biotypes |
327 | 91 |
92 twobit = TwoBitFile(args.twobit) if args.twobit else None | |
93 | |
328 selected_regions = dict() # chrom:(start,end) | 94 selected_regions = dict() # chrom:(start,end) |
329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' | 95 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' |
330 if args.regions: | 96 if args.regions: |
331 for entry in args.regions: | 97 for entry in args.regions: |
332 if not entry: | 98 if not entry: |
338 (chrom,start,end,strand) = m.groups() | 104 (chrom,start,end,strand) = m.groups() |
339 if chrom: | 105 if chrom: |
340 if chrom not in selected_regions: | 106 if chrom not in selected_regions: |
341 selected_regions[chrom] = [] | 107 selected_regions[chrom] = [] |
342 selected_regions[chrom].append([start,end,strand]) | 108 selected_regions[chrom].append([start,end,strand]) |
109 if args.debug: print >> sys.stderr, "selected_regions: %s" % selected_regions | |
343 | 110 |
344 translations = dict() # start : end : seq | 111 translations = dict() # start : end : seq |
345 | 112 |
346 def unique_prot(tbed, seq): | 113 def unique_prot(tbed, seq): |
347 if tbed.chromStart not in translations: | 114 if tbed.chromStart not in translations: |
355 translations[tbed.chromStart][tbed.chromEnd].append(seq) | 122 translations[tbed.chromStart][tbed.chromEnd].append(seq) |
356 else: | 123 else: |
357 return False | 124 return False |
358 return True | 125 return True |
359 | 126 |
127 def get_sequence(chrom,start,end): | |
128 if twobit: | |
129 if chrom in twobit: | |
130 return twobit[chrom][start:end] | |
131 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom | |
132 if contig in twobit: | |
133 return twobit[contig][start:end] | |
134 return None | |
135 | |
360 def translate_bed(bed): | 136 def translate_bed(bed): |
361 translate_count = 0 | 137 translate_count = 0 |
362 if any([fa_wtr, bed_wtr]): | 138 if any([fa_wtr, bed_wtr]): |
363 transcript_id = bed.name | 139 transcript_id = bed.name |
364 refprot = None | 140 refprot = None |
141 if twobit: | |
142 bed.seq = get_sequence(bed.chrom,bed.chromStart,bed.chromEnd) | |
143 else: | |
144 bed.cdna = get_cdna(transcript_id) | |
145 cdna = bed.get_cdna() | |
146 cdna_len = len(cdna) | |
365 if not args.all: | 147 if not args.all: |
366 try: | 148 try: |
367 cds = get_cds(transcript_id) | 149 cds = bed.get_cds() |
150 if cds is None: | |
151 cds = get_cds(transcript_id) | |
368 if len(cds) % 3 != 0: | 152 if len(cds) % 3 != 0: |
369 cds = cds[:-(len(cds) % 3)] | 153 cds = cds[:-(len(cds) % 3)] |
370 refprot = translate(cds) if cds else None | 154 refprot = translate(cds) if cds else None |
371 except: | 155 except: |
372 refprot = None | 156 refprot = None |
373 cdna = get_cdna(transcript_id) | |
374 cdna_len = len(cdna) | |
375 for offset in range(3): | 157 for offset in range(3): |
376 seqend = cdna_len - (cdna_len - offset) % 3 | 158 seqend = cdna_len - (cdna_len - offset) % 3 |
377 aaseq = translate(cdna[offset:seqend]) | 159 aaseq = translate(cdna[offset:seqend]) |
378 aa_start = 0 | 160 aa_start = 0 |
379 while aa_start < len(aaseq): | 161 while aa_start < len(aaseq): |