annotate ensembl_cdna_translate.py @ 0:a8218b11216f draft

Uploaded
author jjohnson
date Wed, 29 Nov 2017 15:55:59 -0500
parents
children b7f2f5e3390c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
2 """
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
3 #
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
4 #------------------------------------------------------------------------------
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
5 # University of Minnesota
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
6 # Copyright 2017, Regents of the University of Minnesota
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
7 #------------------------------------------------------------------------------
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
8 # Author:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
9 #
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
10 # James E Johnson
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
11 #
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
12 #------------------------------------------------------------------------------
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
13 """
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
14
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
15 import argparse
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
16 import sys
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
17 from time import sleep
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
18
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
19 from Bio.Seq import translate
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
20
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
21 import requests
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
22
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
23 import digest
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
24
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
25
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
# Base URL of the Ensembl REST service; request paths are appended to it.
server = "https://rest.ensembl.org"
# Module-level request path. The request helpers visible in this file each
# build their own local ``ext`` instead of reading this one.
ext = "/info/assembly/homo_sapiens?"
# Step (in bases) used when a sequence is split into several region requests.
max_region = 5000000
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
29
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
30
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def ensembl_rest(ext, headers):
    """Issue a GET for ``server + ext`` and return the response object.

    The request path is echoed to stderr before the request is made.  When
    the server answers with status 429 and supplies a ``Retry-After``
    header, the call sleeps for that many seconds and repeats the request
    once.

    :param ext: request path (with query string) appended to ``server``
    :param headers: dict of HTTP request headers
    :returns: the ``requests`` response of the last request made
    :raises: whatever ``raise_for_status()`` raises when the final response
        is not ``ok``
    """
    url = server + ext
    # write() instead of the ``print >>`` statement so the function behaves
    # the same under Python 2 and Python 3.
    sys.stderr.write("%s\n" % ext)
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        sys.stderr.write("response headers: %s\n\n" % r.headers)
        if 'Retry-After' in r.headers:
            # Header values arrive as strings; sleep() only accepts numbers.
            sleep(float(r.headers['Retry-After']))
            r = requests.get(url, headers=headers)
    if not r.ok:
        r.raise_for_status()
    return r
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
42
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
43
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_species():
    """Fetch ``/info/species`` and return the species records keyed by name.

    As a side effect one tab-separated line per species (name, common_name,
    display_name, strain, taxon_id) is written to stdout.

    :returns: dict mapping ``species['name']`` to the species record
    """
    results = {}
    req_header = {"Content-Type": "application/json"}
    r = ensembl_rest("/info/species", req_header)
    for species in r.json()['species']:
        results[species['name']] = species
        # write() rather than ``print >> sys.stdout`` so the listing is
        # produced identically under Python 2 and Python 3.
        sys.stdout.write(
            "%s\t%s\t%s\t%s\t%s\n"
            % (species['name'], species['common_name'],
               species['display_name'],
               species['strain'],
               species['taxon_id']))
    return results
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
58
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
59
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_biotypes(species):
    """Return the biotype names reported for *species*.

    Requests ``/info/biotypes/<species>`` as JSON and collects the
    ``'biotype'`` value of every entry that contains that key.
    """
    response = ensembl_rest("/info/biotypes/%s?" % species,
                            {"Content-Type": "application/json"})
    return [item['biotype'] for item in response.json() if 'biotype' in item]
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
69
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
70
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_toplevel(species):
    """Return the top-level sequence lengths of *species* per coord system.

    Requests ``/info/assembly/<species>`` as JSON and groups the entries of
    ``'top_level_region'`` into ``{coord_system: {name: length}}`` with the
    lengths converted to ``int``.
    """
    response = ensembl_rest("/info/assembly/%s?" % species,
                            {"Content-Type": "application/json"})
    by_system = {}
    for region in response.json()['top_level_region']:
        members = by_system.setdefault(region['coord_system'], {})
        members[region['name']] = int(region['length'])
    return by_system
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
83
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
84
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_transcripts_bed(species, refseq, start, length, params=None):
    """Collect transcript BED lines for ``refseq`` in ``max_region`` steps.

    The span from *start* to *length* is cut at multiples of ``max_region``
    and one ``/overlap/region`` request (``feature=transcript``, BED
    content type) is issued per piece.

    :param species: species name placed in the request path
    :param refseq: sequence/region name placed in the request path
    :param start: first coordinate to request
    :param length: last coordinate to request
    :param params: optional extra query string appended after ``;``
    :returns: list of the BED text lines from all responses, in order
    """
    bed = []
    param = params if params else ''
    req_header = {"Content-Type": "text/x-bed"}
    # list() is required: on Python 3 range() is immutable and has no
    # append(); on Python 2 this is a harmless copy.
    regions = list(range(start, length, max_region))
    if not regions or regions[-1] < length:
        regions.append(length)
    # regions[0] is the initial start, so the piece ends begin at index 1.
    for end in regions[1:]:
        ext = "/overlap/region/%s/%s:%d-%d?feature=transcript;%s"\
            % (species, refseq, start, end, param)
        start = end + 1
        r = ensembl_rest(ext, req_header)
        if r.text:
            bed += r.text.splitlines()
    return bed
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
100
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
101
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_seq(id, seqtype, params=None):
    """Return the text of ``/sequence/id/<id>`` for the given sequence type.

    :param id: identifier placed in the request path
    :param seqtype: value of the ``type`` query parameter
    :param params: optional extra query string appended after ``;``
    """
    extra = params or ''
    path = "/sequence/id/%s?type=%s;%s" % (id, seqtype, extra)
    return ensembl_rest(path, {"Content-Type": "text/plain"}).text
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
108
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
109
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_cdna(id, params=None):
    """Return the ``cdna`` sequence text for *id* (see ``get_seq``)."""
    sequence = get_seq(id, 'cdna', params=params)
    return sequence
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
112
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
113
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def get_cds(id, params=None):
    """Return the ``cds`` sequence text for *id* (see ``get_seq``)."""
    sequence = get_seq(id, 'cds', params=params)
    return sequence
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
116
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
117
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def bed_from_line(line):
    """Build a ``BedEntry`` from one tab-separated BED text line.

    Only the first twelve columns are used; a line with fewer columns makes
    the tuple unpacking fail with ``ValueError``.  Trailing commas on the
    block size and block start columns are removed before they are handed
    to ``BedEntry``.
    """
    columns = line.rstrip('\r\n').split('\t')[0:12]
    (chrom, start, end, label, score, strand,
     thick_start, thick_end, rgb,
     n_blocks, sizes, starts) = columns
    return BedEntry(
        chrom=chrom, chromStart=start, chromEnd=end,
        name=label, score=score, strand=strand,
        thickStart=thick_start, thickEnd=thick_end,
        itemRgb=rgb,
        blockCount=n_blocks,
        blockSizes=sizes.rstrip(','),
        blockStarts=starts.rstrip(','))
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
131
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
132
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
class BedEntry(object):
    """One 12-column BED record plus helpers for cutting out sub-ranges.

    Numeric columns are stored as ``int``; ``blockSizes`` and
    ``blockStarts`` are stored as lists of ``int`` when given as a
    comma-separated string or a list.  ``seq`` and ``pep`` start as ``None``
    and are filled in by callers.
    """

    def __init__(self, chrom=None, chromStart=None, chromEnd=None,
                 name=None, score=None, strand=None,
                 thickStart=None, thickEnd=None, itemRgb=None,
                 blockCount=None, blockSizes=None, blockStarts=None):
        """Store the BED columns, converting the numeric ones to ``int``.

        ``chromStart``, ``chromEnd`` and ``blockCount`` are required (they
        are passed straight to ``int``).  Defaults applied to the others:
        score 0, strand '+' unless the value starts with '-', thick range
        equal to the chrom range, itemRgb '100,100,100'.
        """
        self.chrom = chrom
        self.chromStart = int(chromStart)
        self.chromEnd = int(chromEnd)
        self.name = name
        self.score = int(score) if score is not None else 0
        self.strand = '-' if str(strand).startswith('-') else '+'
        self.thickStart = int(thickStart) if thickStart else self.chromStart
        self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
        self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
        self.blockCount = int(blockCount)
        # Both columns go through the same helper so each one is tested on
        # its own type (the original tested blockSizes when deciding how to
        # parse blockStarts).
        self.blockSizes = self._int_list(blockSizes)
        self.blockStarts = self._int_list(blockStarts)
        self.seq = None
        self.pep = None

    @staticmethod
    def _int_list(value):
        """Return *value* as a list of ints when it is a string or a list.

        Strings are split on ','.  Any other type is returned unchanged.
        """
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # this works without naming ``unicode`` (a NameError on Python 3).
        if isinstance(value, (str, type(u''))):
            return [int(x) for x in value.split(',')]
        if isinstance(value, list):
            return [int(x) for x in value]
        return value

    def __str__(self):
        """Return the record as one tab-separated 12-column BED line."""
        return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
            self.chrom, self.chromStart, self.chromEnd,
            self.name, self.score, self.strand,
            self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount,
            ','.join([str(x) for x in self.blockSizes]),
            ','.join([str(x) for x in self.blockStarts]))

    # (start, end)
    def get_subrange(self, tstart, tstop, debug=False):
        """Map block-relative offsets to chromosome coordinates.

        ``tstart``/``tstop`` are offsets into the concatenated blocks,
        counted from the first block on the '+' strand and from the last
        block on the '-' strand.  An offset that falls in no block leaves
        the corresponding end at ``self.chromStart`` / ``self.chromEnd``.

        :returns: ``(chromStart, chromEnd)``
        """
        chromStart = self.chromStart
        chromEnd = self.chromEnd
        if debug:
            sys.stderr.write("%s\n" % (str(self)))
        # list() so the order can be reversed on Python 3 as well.
        order = list(range(self.blockCount))
        if self.strand == '-':
            order.reverse()
        bStart = 0
        bEnd = 0
        for x in order:
            bEnd = bStart + self.blockSizes[x]
            if bStart <= tstart < bEnd:
                if self.strand == '+':
                    chromStart = self.chromStart + self.blockStarts[x] +\
                        (tstart - bStart)
                else:
                    chromEnd = self.chromStart + self.blockStarts[x] +\
                        self.blockSizes[x] - (tstart - bStart)
            # NOTE(review): tstop uses the same half-open test as tstart, so
            # a tstop lying exactly on a block boundary is mapped into the
            # following block - confirm this is the intended behaviour.
            if bStart <= tstop < bEnd:
                if self.strand == '+':
                    chromEnd = self.chromStart + self.blockStarts[x] +\
                        (tstop - bStart)
                else:
                    chromStart = self.chromStart + self.blockStarts[x] +\
                        self.blockSizes[x] - (tstop - bStart)
            if debug:
                sys.stderr.write(
                    "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d\n"
                    % (x, self.strand, bStart, bEnd,
                       tstart, tstop, chromStart, chromEnd))
            bStart += self.blockSizes[x]
        return(chromStart, chromEnd)

    # get the blocks for sub range
    def get_blocks(self, chromStart, chromEnd):
        """Clip this entry's blocks to ``chromStart``..``chromEnd``.

        :returns: ``(blockCount, blockSizes, blockStarts)`` with the starts
            relative to the given ``chromStart``
        """
        tblockCount = 0
        tblockSizes = []
        tblockStarts = []
        for x in range(self.blockCount):
            bStart = self.chromStart + self.blockStarts[x]
            bEnd = bStart + self.blockSizes[x]
            if bStart > chromEnd:
                # Stop at the first block that starts past the range.
                break
            if bEnd < chromStart:
                continue
            cStart = max(chromStart, bStart)
            tblockStarts.append(cStart - chromStart)
            tblockSizes.append(min(chromEnd, bEnd) - cStart)
            tblockCount += 1
        return (tblockCount, tblockSizes, tblockStarts)

    def trim(self, tstart, tstop, debug=False):
        """Return a new ``BedEntry`` restricted to ``tstart``..``tstop``.

        The range is mapped with ``get_subrange`` and the blocks clipped
        with ``get_blocks``; the thick range of the result equals its chrom
        range.  When ``self.seq`` is set, the result's ``seq`` is the slice
        of it selected by the new coordinates relative to
        ``self.chromStart``.
        """
        (tchromStart, tchromEnd) =\
            self.get_subrange(tstart, tstop, debug=debug)
        (tblockCount, tblockSizes, tblockStarts) =\
            self.get_blocks(tchromStart, tchromEnd)
        tbed = BedEntry(
            chrom=self.chrom, chromStart=tchromStart, chromEnd=tchromEnd,
            name=self.name, score=self.score, strand=self.strand,
            thickStart=tchromStart, thickEnd=tchromEnd, itemRgb=self.itemRgb,
            blockCount=tblockCount,
            blockSizes=tblockSizes, blockStarts=tblockStarts)
        if self.seq:
            ts = tchromStart-self.chromStart
            te = tchromEnd - tchromStart + ts
            tbed.seq = self.seq[ts:te]
        return tbed
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
240
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
241
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
242 def __main__():
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
243 parser = argparse.ArgumentParser(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
244 description='Retrieve Ensembl cDNAs and three frame translate')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
245 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
246 '-s', '--species', default='human',
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
247 help='Ensembl Species to retrieve')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
248 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
249 '-B', '--biotypes', action='append', default=[],
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
250 help='Restrict Ensembl biotypes to retrieve')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
251 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
252 '-i', '--input', default=None,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
253 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
254 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
255 '-t', '--transcripts', default=None,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
256 help='Path to output cDNA transcripts.bed (-) for stdout')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
257 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
258 '-r', '--raw', action='store_true',
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
259 help='Report transcript exacty as returned from Ensembl')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
260 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
261 '-f', '--fasta', default=None,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
262 help='Path to output translations.fasta')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
263 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
264 '-b', '--bed', default=None,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
265 help='Path to output translations.bed')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
266 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
267 '-m', '--min_length', type=int, default=7,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
268 help='Minimum length of protein translation to report')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
269 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
270 '-e', '--enzyme', default=None,
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
271 help='Digest translation with enzyme')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
272 parser.add_argument(
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
273 '-a', '--all', action='store_true',
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
274 help='Include reference protein translations')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
275 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
276 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
277 args = parser.parse_args()
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
278 # print >> sys.stderr, "args: %s" % args
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
279 species = args.species
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
280 input_rdr = None
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
281 if args.input is not None:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
282 input_rdr = open(args.input, 'r') if args.input != '-' else sys.stdin
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
283 tx_wtr = None
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
284 if args.transcripts is not None:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
285 tx_wtr = open(args.transcripts, 'w')\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
286 if args.transcripts != '-' else sys.stdout
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
287 fa_wtr = open(args.fasta, 'w') if args.fasta is not None else None
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
288 bed_wtr = open(args.bed, 'w') if args.bed is not None else None
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
289
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
290 enzyme = digest.expasy_rules.get(args.enzyme,args.enzyme)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
291
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
292 # print >> sys.stderr, "args biotypes: %s" % args.biotypes
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
293 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')]
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
294 # print >> sys.stderr, "args biotypes: %s" % biotypea
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
295 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()])
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
296 # print >> sys.stderr, "biotypes: %s" % biotypes
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
297
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
298 translations = dict() # start : end : seq
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
299
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def unique_prot(tbed, seq):
    """Record *seq* for the entry's coordinates; report whether it was new.

    Closure defined inside ``__main__``: it updates the enclosing
    ``translations`` mapping, keyed by ``tbed.chromStart`` and then
    ``tbed.chromEnd``, whose values are lists of sequences.

    :returns: ``False`` when *seq* was already recorded for the same start
        and end, otherwise ``True`` (after recording it)
    """
    by_end = translations.setdefault(tbed.chromStart, dict())
    seen = by_end.setdefault(tbed.chromEnd, [])
    if seq in seen:
        return False
    seen.append(seq)
    return True
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
313
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
def translate_bed(bed):
    """Three-frame translate the cDNA of ``bed.name`` and write the results.

    Closure defined inside ``__main__``; it reads ``args``, ``enzyme``,
    ``fa_wtr`` and ``bed_wtr`` from the enclosing scope.  Nothing is done
    unless at least one of the two writers is set.

    For each of the three frame offsets the cDNA is translated and split at
    stop codons ('*').  Unless ``args.all`` is set, the transcript's CDS
    translation is fetched as a reference; candidates contained in it are
    dropped and, when an enzyme is configured, trailing cleavage fragments
    found in the reference are trimmed off first.  Candidates shorter than
    ``args.min_length`` are dropped as well.

    :param bed: ``BedEntry`` whose ``name`` is the transcript identifier
    :returns: number of translations written
    """
    translate_count = 0
    if not any([fa_wtr, bed_wtr]):
        return translate_count
    transcript_id = bed.name
    refprot = None
    if not args.all:
        try:
            cds = get_cds(transcript_id)
            if len(cds) % 3 != 0:
                # Drop the trailing partial codon before translating.
                cds = cds[:-(len(cds) % 3)]
            refprot = translate(cds) if cds else None
        except Exception:
            # Best effort: without a usable CDS there is no reference
            # protein to compare against.
            refprot = None
    cdna = get_cdna(transcript_id)
    cdna_len = len(cdna)
    for offset in range(3):
        # End of the last complete codon in this frame.
        seqend = cdna_len - (cdna_len - offset) % 3
        aaseq = translate(cdna[offset:seqend])
        aa_start = 0
        while aa_start < len(aaseq):
            aa_end = aaseq.find('*', aa_start)
            if aa_end < 0:
                aa_end = len(aaseq)
            prot = aaseq[aa_start:aa_end]
            if enzyme and refprot:
                frags = digest._cleave(prot, enzyme)
                # Strip fragments from the C-terminal end for as long as
                # they also occur in the reference protein.
                # NOTE(review): tend below is still computed from aa_end,
                # not from the trimmed length - confirm this is intended.
                for frag in reversed(frags):
                    if frag in refprot:
                        prot = prot[:prot.rfind(frag)]
                    else:
                        break
            long_enough = len(prot) >= args.min_length
            if long_enough and not (refprot and prot in refprot):
                tstart = aa_start*3+offset
                tend = aa_end*3+offset
                prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                tbed = bed.trim(tstart, tend)
                if args.all or unique_prot(tbed, prot):
                    translate_count += 1
                    tbed.name = prot_acc
                    # Either writer may be None when only one of the two
                    # outputs was requested; write to those that exist.
                    if bed_wtr is not None:
                        bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
                        bed_wtr.flush()
                    if fa_wtr is not None:
                        fa_id = ">%s\n" % (prot_acc)
                        fa_wtr.write(fa_id)
                        fa_wtr.write(prot)
                        fa_wtr.write("\n")
                        fa_wtr.flush()
            aa_start = aa_end + 1
    return translate_count
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
366
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
367 if input_rdr:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
368 translation_count = 0
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
369 for i, bedline in enumerate(input_rdr):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
370 try:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
371 bed = bed_from_line(bedline)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
372 translation_count += translate_bed(bed)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
373 except:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
374 print >> sys.stderr, "BED format error: %s\n" % bedline
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
375 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
376 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
377 "%s\tcDNA translations:%d" % (species, translation_count)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
378 else:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
379 coord_systems = get_toplevel(species)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
380 if 'chromosome' in coord_systems:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
381 for ref in sorted(coord_systems['chromosome'].keys()):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
382 length = coord_systems['chromosome'][ref]
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
383 if not any([tx_wtr, fa_wtr, bed_wtr]):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
384 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
385 "%s\t%s\tlength: %d" % (species, ref, length)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
386 continue
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
387 if args.debug:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
388 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
389 "Retrieving transcripts: %s\t%s\tlength: %d"\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
390 % (species, ref, length)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
391 translation_count = 0
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
392 start = 0
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
393 regions = range(start, length, max_region)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
394 if not regions or regions[-1] < length:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
395 regions.append(length)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
396 for end in regions[1:]:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
397 bedlines = get_transcripts_bed(species, ref, start, end, params=biotypes)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
398 if args.verbose or args.debug:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
399 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
400 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
401 % (species, ref, start, end, len(bedlines))
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
402 # start, end, seq
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
403 for i, bedline in enumerate(bedlines):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
404 try:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
405 bed = bed_from_line(bedline)\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
406 if any([not args.raw, fa_wtr, bed_wtr])\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
407 else None
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
408 if tx_wtr:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
409 tx_wtr.write(bedline if args.raw else str(bed))
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
410 tx_wtr.write("\n")
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
411 tx_wtr.flush()
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
412 if bed:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
413 translation_count += translate_bed(bed)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
414 except Exception as e:
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
415 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
416 "BED error (%s) : %s\n" % (e, bedline)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
417 start = end + 1
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
418
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
419 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
420 print >> sys.stderr,\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
421 "%s\t%s\tlength: %d\tcDNA translations:%d"\
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
422 % (species, ref, length, translation_count)
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
423
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
424
a8218b11216f Uploaded
jjohnson
parents:
diff changeset
# Script entry point: invoke the command-line driver only when this file is
# executed directly, so that importing the module does not trigger a run.
if __name__ == "__main__":
    __main__()