0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2017, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 #
|
|
12 #------------------------------------------------------------------------------
|
|
13 """
|
|
14
|
|
15 import argparse
|
7
|
16 import re
|
0
|
17 import sys
|
|
18 from time import sleep
|
|
19
|
|
20 from Bio.Seq import translate
|
|
21
|
|
22 import requests
|
|
23
|
8
|
24 from bedutil import BedEntry, bed_from_line
|
0
|
25 import digest
|
8
|
26 from ensembl_rest import get_toplevel, get_transcripts_bed, get_cds, get_cdna, max_region
|
|
27 from twobitreader import TwoBitFile
|
0
|
28
|
|
29
|
|
def _warn(message):
    """Write one diagnostic line to stderr; a trailing newline is appended."""
    sys.stderr.write("%s\n" % message)


def _build_arg_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs and three frame translate')
    parser.add_argument(
        '-s', '--species', default='human',
        help='Ensembl Species to retrieve')
    parser.add_argument(
        '-R', '--regions', action='append', default=[],
        help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+')
    parser.add_argument(
        '-B', '--biotypes', action='append', default=[],
        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument(
        '-i', '--input', default=None,
        help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
    parser.add_argument(
        '-T', '--twobit', default=None,
        help='Genome reference sequence in 2bit format')
    parser.add_argument(
        '-t', '--transcripts', default=None,
        help='Path to output cDNA transcripts.bed (-) for stdout')
    parser.add_argument(
        '-r', '--raw', action='store_true',
        help='Report transcript exactly as returned from Ensembl')
    parser.add_argument(
        '-f', '--fasta', default=None,
        help='Path to output translations.fasta')
    parser.add_argument(
        '-b', '--bed', default=None,
        help='Path to output translations.bed')
    parser.add_argument(
        '-m', '--min_length', type=int, default=7,
        help='Minimum length of protein translation to report')
    parser.add_argument(
        '-e', '--enzyme', default=None,
        help='Digest translation with enzyme')
    parser.add_argument(
        '-a', '--all', action='store_true',
        help='Include reference protein translations')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    return parser


def _parse_biotypes(values):
    """Flatten repeated and/or comma-separated -B values.

    Returns the bare biotype names (whitespace stripped, empty items
    dropped) in the order given.
    """
    return [name.strip()
            for value in values
            for name in value.split(',')
            if name.strip()]


def _parse_regions(values):
    """Parse repeated and/or comma-separated -R values.

    Each item has the form ``chrom[:start[-end[strand]]]`` where strand is
    ``+`` or ``-``.  Returns ``{chrom: [[start, end, strand], ...]}``; the
    start/end/strand items are the matched strings, or None when that part
    was not given.  Items that do not match the pattern are skipped.
    """
    region_pat = re.compile(r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?')
    selected = dict()
    for value in values:
        if not value:
            continue
        for reg in value.split(','):
            reg = reg.strip()
            if not reg:
                continue
            m = region_pat.match(reg)
            if not m:
                continue
            chrom, start, end, strand = m.groups()
            if chrom:
                selected.setdefault(chrom, []).append([start, end, strand])
    return selected


def _region_windows(start, stop, step):
    """Split start..stop into consecutive (window_start, window_end) pairs.

    Window ends fall every ``step`` positions after ``start``, with ``stop``
    as the final end; each window starts one past the previous window's end.
    Returns an empty list when ``start >= stop``.
    """
    # list() keeps this working on Python 3, where range() cannot be appended to.
    bounds = list(range(start, stop, step))
    if not bounds or bounds[-1] < stop:
        bounds.append(stop)
    windows = []
    for end in bounds[1:]:
        windows.append((start, end))
        start = end + 1
    return windows


def _run(args, input_rdr, tx_wtr, fa_wtr, bed_wtr):
    """Retrieve or read transcripts and write their three-frame translations.

    args      -- parsed command line (see _build_arg_parser)
    input_rdr -- open BED input, or None to query Ensembl instead
    tx_wtr    -- writer for transcript BED lines, or None
    fa_wtr    -- writer for translation FASTA, or None
    bed_wtr   -- writer for translation BED, or None
    """
    species = args.species
    enzyme = digest.expasy_rules.get(args.enzyme, args.enzyme)
    # Bare names are used to filter BED input; the 'biotype=...' form is
    # only for the query string passed to get_transcripts_bed.
    biotype_names = _parse_biotypes(args.biotypes)
    biotypes = ';'.join(['biotype=%s' % bt for bt in biotype_names])
    twobit = TwoBitFile(args.twobit) if args.twobit else None
    selected_regions = _parse_regions(args.regions)
    if args.debug:
        _warn("selected_regions: %s" % selected_regions)
    want_translations = fa_wtr is not None or bed_wtr is not None

    translations = dict()  # chromStart -> chromEnd -> set of sequences seen

    def unique_prot(tbed, seq):
        """Record seq for tbed's start/end; return False if already recorded."""
        seen = translations.setdefault(
            tbed.chromStart, dict()).setdefault(tbed.chromEnd, set())
        if seq in seen:
            return False
        seen.add(seq)
        return True

    def get_sequence(chrom, start, end):
        """Return twobit sequence for chrom[start:end], or None if not found.

        The name is tried as given, then with a 'chr' prefix removed or added.
        """
        if twobit:
            if chrom in twobit:
                return twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in twobit:
                return twobit[contig][start:end]
        return None

    def translate_bed(bed):
        """Write the translations of one transcript; return how many were written."""
        if not want_translations:
            return 0
        translate_count = 0
        transcript_id = bed.name
        refprot = None
        if twobit:
            bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
        else:
            bed.cdna = get_cdna(transcript_id)
        cdna = bed.get_cdna()
        cdna_len = len(cdna)
        if not args.all:
            # Best effort: without a reference protein nothing is filtered out.
            try:
                cds = bed.get_cds()
                if cds is None:
                    cds = get_cds(transcript_id)
                if len(cds) % 3 != 0:
                    cds = cds[:-(len(cds) % 3)]
                refprot = translate(cds) if cds else None
            except Exception:
                refprot = None
        for offset in range(3):
            # Trim the tail so the translated slice is a whole number of codons.
            seqend = cdna_len - (cdna_len - offset) % 3
            aaseq = translate(cdna[offset:seqend])
            aa_start = 0
            while aa_start < len(aaseq):
                # Each stretch between stop codons ('*') is one candidate.
                aa_end = aaseq.find('*', aa_start)
                if aa_end < 0:
                    aa_end = len(aaseq)
                prot = aaseq[aa_start:aa_end]
                if enzyme and refprot:
                    # Walking back from the last fragment, cut off fragments
                    # that occur in the reference protein; stop at the first
                    # one that does not.
                    frags = digest._cleave(prot, enzyme)
                    for frag in reversed(frags):
                        if frag in refprot:
                            prot = prot[:prot.rfind(frag)]
                        else:
                            break
                in_reference = bool(refprot) and prot in refprot
                if len(prot) >= args.min_length and not in_reference:
                    tstart = aa_start * 3 + offset
                    # NOTE(review): tend uses the untrimmed stretch end even
                    # when the enzyme step above shortened prot -- confirm
                    # this is intended.
                    tend = aa_end * 3 + offset
                    prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                    tbed = bed.trim(tstart, tend)
                    if args.all or unique_prot(tbed, prot):
                        translate_count += 1
                        tbed.name = prot_acc
                        # Either writer may be absent; only one is required.
                        if bed_wtr is not None:
                            bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
                            bed_wtr.flush()
                        if fa_wtr is not None:
                            fa_wtr.write(">%s\n" % (prot_acc))
                            fa_wtr.write(prot)
                            fa_wtr.write("\n")
                            fa_wtr.flush()
                aa_start = aa_end + 1
        return translate_count

    def translate_region(species, ref, start, stop, strand):
        """Fetch transcripts for ref:start-stop in windows and translate them.

        Returns the number of translations written.
        """
        translation_count = 0
        need_bed = (not args.raw) or want_translations
        for (win_start, win_end) in _region_windows(start, stop, max_region):
            bedlines = get_transcripts_bed(
                species, ref, win_start, win_end,
                strand=strand, params=biotypes)
            if args.verbose or args.debug:
                _warn("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"
                      % (species, ref, win_start, win_end, len(bedlines)))
            for bedline in bedlines:
                try:
                    bed = bed_from_line(bedline) if need_bed else None
                    if tx_wtr:
                        tx_wtr.write(bedline if args.raw else str(bed))
                        tx_wtr.write("\n")
                        tx_wtr.flush()
                    if bed:
                        translation_count += translate_bed(bed)
                except Exception as e:
                    _warn("BED error (%s) : %s\n" % (e, bedline))
        return translation_count

    report_counts = args.debug or (args.verbose and want_translations)

    if input_rdr is not None:
        translation_count = 0
        for bedline in input_rdr:
            try:
                bed = bed_from_line(bedline)
                if bed is None:
                    continue
                if bed.biotype and biotype_names\
                        and bed.biotype not in biotype_names:
                    continue
                translation_count += translate_bed(bed)
            except Exception as e:
                _warn("BED format error (%s): %s\n" % (e, bedline))
        if report_counts:
            _warn("%s\tcDNA translations:%d" % (species, translation_count))
        return

    coord_systems = get_toplevel(species)
    if 'chromosome' not in coord_systems:
        return
    list_only = tx_wtr is None and not want_translations
    ref_lengths = dict()
    for ref in sorted(coord_systems['chromosome'].keys()):
        length = coord_systems['chromosome'][ref]
        ref_lengths[ref] = length
        if list_only:
            _warn("%s\t%s\tlength: %d" % (species, ref, length))

    if selected_regions:
        translation_count = 0
        for ref in sorted(selected_regions.keys()):
            if ref not in ref_lengths:
                continue
            for (_start, _stop, _strand) in selected_regions[ref]:
                start = int(_start) if _start else 0
                stop = int(_stop) if _stop else ref_lengths[ref]
                strand = '' if not _strand\
                    else ':1' if _strand == '+' else ':-1'
                translation_count += translate_region(
                    species, ref, start, stop, strand)
        if report_counts:
            _warn("%s\tcDNA translations:%d" % (species, translation_count))
    else:
        for ref in sorted(ref_lengths.keys()):
            length = ref_lengths[ref]
            if args.debug:
                _warn("Retrieving transcripts: %s\t%s\tlength: %d"
                      % (species, ref, length))
            translation_count = translate_region(species, ref, 0, length, '')
            if report_counts:
                _warn("%s\t%s\tlength: %d\tcDNA translations:%d"
                      % (species, ref, length, translation_count))


def __main__():
    """Command-line entry point: parse arguments, open files, run, clean up."""
    args = _build_arg_parser().parse_args()
    input_rdr = tx_wtr = fa_wtr = bed_wtr = None
    try:
        if args.input is not None:
            input_rdr = sys.stdin if args.input == '-'\
                else open(args.input, 'r')
        if args.transcripts is not None:
            tx_wtr = sys.stdout if args.transcripts == '-'\
                else open(args.transcripts, 'w')
        if args.fasta is not None:
            fa_wtr = open(args.fasta, 'w')
        if args.bed is not None:
            bed_wtr = open(args.bed, 'w')
        _run(args, input_rdr, tx_wtr, fa_wtr, bed_wtr)
    finally:
        # Close only what was opened here; leave the standard streams alone.
        for handle in (input_rdr, tx_wtr, fa_wtr, bed_wtr):
            if handle is None or handle is sys.stdin or handle is sys.stdout:
                continue
            handle.close()
|
0
|
274
|
|
275
|
|
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()
|