0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 TranscriptMapper - A utility class used to perform coordinate conversions
|
|
24 between a number of coordinate systems relating to transcripts
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript);
|
|
29
|
|
30 @coords = $trmapper->cdna2genomic( 123, 554 );
|
|
31
|
|
32 @coords = $trmapper->genomic2cdna( 141, 500, -1 );
|
|
33
|
|
34 @coords = $trmapper->genomic2cds( 141, 500, -1 );
|
|
35
|
|
36 @coords = $trmapper->pep2genomic( 10, 60 );
|
|
37
|
|
38 @coords = $trmapper->genomic2pep( 123, 400, 1 );
|
|
39
|
|
40 =head1 DESCRIPTION
|
|
41
|
|
42 This is a utility class which can be used to perform coordinate conversions
|
|
43 between a number of coordinate systems relating to transcripts.
|
|
44
|
|
45 =head1 METHODS
|
|
46
|
|
47 =cut
|
|
48
|
|
49 package Bio::EnsEMBL::TranscriptMapper;
|
|
50
|
|
51 use strict;
|
|
52 use warnings;
|
|
53
|
|
54 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
55
|
|
56 use Bio::EnsEMBL::Mapper;
|
|
57 use Bio::EnsEMBL::Mapper::Gap;
|
|
58 use Bio::EnsEMBL::Mapper::Coordinate;
|
|
59
|
|
60
|
|
61
|
|
62 =head2 new
|
|
63
|
|
64 Arg [1] : Bio::EnsEMBL::Transcript $transcript
|
|
65 The transcript for which a TranscriptMapper should be created.
|
|
66 Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript)
|
|
67 Description: Creates a TranscriptMapper object which can be used to perform
|
|
68 various coordinate transformations relating to transcripts.
|
|
69 Note that the TranscriptMapper uses the transcript state at the
|
|
70 time of creation to perform the conversions, and that a new
|
|
71 TranscriptMapper must be created if the Transcript is altered.
|
|
72 'Genomic' coordinates are coordinates which are relative to the
|
|
73 slice that the Transcript is on.
|
|
74 Returntype : Bio::EnsEMBL::TranscriptMapper
|
|
75 Exceptions : throws if a transcript is not an argument
|
|
76 Caller : Transcript::get_TranscriptMapper
|
|
77 Status : Stable
|
|
78
|
|
79 =cut
|
|
80
|
|
81 sub new {
|
|
82 my $caller = shift;
|
|
83 my $transcript = shift;
|
|
84
|
|
85 my $class = ref($caller) || $caller;
|
|
86
|
|
87 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
|
|
88 throw("Transcript argument is required.");
|
|
89 }
|
|
90
|
|
91
|
|
92 my $exons = $transcript->get_all_Exons();
|
|
93 my $start_phase;
|
|
94 if(@$exons) {
|
|
95 $start_phase = $exons->[0]->phase;
|
|
96 } else {
|
|
97 $start_phase = -1;
|
|
98 }
|
|
99
|
|
100 # Create a cdna <-> genomic mapper and load it with exon coords
|
|
101 my $mapper = _load_mapper($transcript,$start_phase);
|
|
102
|
|
103 my $self = bless({'exon_coord_mapper' => $mapper,
|
|
104 'start_phase' => $start_phase,
|
|
105 'cdna_coding_start' => $transcript->cdna_coding_start(),
|
|
106 'cdna_coding_end' => $transcript->cdna_coding_end()},
|
|
107 $class);
|
|
108 }
|
|
109
|
|
110
|
|
111 =head2 _load_mapper
|
|
112
|
|
113 Arg [1] : Bio::EnsEMBL::Transcript $transcript
|
|
114 The transcript for which a mapper should be created.
|
|
115 Example : my $mapper = _load_mapper($transcript);
|
|
116 Description: loads the mapper
|
|
117 Returntype : Bio::EnsEMBL::Mapper
|
|
118 Exceptions : none
|
|
119 Caller : Internal
|
|
120 Status : Stable
|
|
121
|
|
122 =cut
|
|
123
|
|
124 sub _load_mapper {
|
|
125 my $transcript = shift;
|
|
126 my $start_phase = shift;
|
|
127
|
|
128 my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic');
|
|
129
|
|
130 my $edits_on = $transcript->edits_enabled();
|
|
131 my @edits;
|
|
132
|
|
133 if($edits_on) {
|
|
134 @edits = @{$transcript->get_all_SeqEdits()};
|
|
135 @edits = sort {$a->start() <=> $b->start()} @edits;
|
|
136 }
|
|
137
|
|
138 my $edit_shift = 0;
|
|
139
|
|
140 my $cdna_start = undef;
|
|
141
|
|
142 my $cdna_end = 0;
|
|
143
|
|
144
|
|
145 foreach my $ex (@{$transcript->get_all_Exons}) {
|
|
146 my $gen_start = $ex->start();
|
|
147 my $gen_end = $ex->end();
|
|
148
|
|
149 $cdna_start = $cdna_end + 1;
|
|
150 $cdna_end = $cdna_start + $ex->length() - 1;
|
|
151
|
|
152 my $strand = $ex->strand();
|
|
153
|
|
154 # add deletions and insertions into pairs when SeqEdits turned on
|
|
155 # ignore mismatches (i.e. treat as matches)
|
|
156 if($edits_on) {
|
|
157 while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) {
|
|
158
|
|
159 my $edit = shift(@edits);
|
|
160 my $len_diff = $edit->length_diff();
|
|
161
|
|
162 if($len_diff) {
|
|
163 # break pair into two parts, finish first pair just before edit
|
|
164
|
|
165 my $prev_cdna_end = $edit->start() + $edit_shift - 1;
|
|
166 my $prev_cdna_start = $cdna_start;
|
|
167 my $prev_len = $prev_cdna_end - $prev_cdna_start + 1;
|
|
168
|
|
169 my $prev_gen_end;
|
|
170 my $prev_gen_start;
|
|
171 if($strand == 1) {
|
|
172 $prev_gen_start = $gen_start;
|
|
173 $prev_gen_end = $gen_start + $prev_len - 1;
|
|
174 } else {
|
|
175 $prev_gen_start = $gen_end - $prev_len + 1;
|
|
176 $prev_gen_end = $gen_end;
|
|
177 }
|
|
178
|
|
179 if($prev_len > 0) { # only create map pair if not boundary case
|
|
180 $mapper->add_map_coordinates
|
|
181 ('cdna', $prev_cdna_start, $prev_cdna_end, $strand,
|
|
182 'genome', $prev_gen_start,$prev_gen_end);
|
|
183 }
|
|
184
|
|
185 $cdna_start = $prev_cdna_end + 1;
|
|
186
|
|
187 if($strand == 1) {
|
|
188 $gen_start = $prev_gen_end + 1;
|
|
189 } else {
|
|
190 $gen_end = $prev_gen_start - 1;
|
|
191 }
|
|
192
|
|
193 $cdna_end += $len_diff;
|
|
194
|
|
195 if($len_diff > 0) {
|
|
196 # insert in cdna, shift cdna coords along
|
|
197 $cdna_start += $len_diff;
|
|
198 } else {
|
|
199 # delete in cdna (insert in genomic), shift genomic coords along
|
|
200
|
|
201 if($strand == 1) {
|
|
202 $gen_start -= $len_diff;
|
|
203 } else {
|
|
204 $gen_end += $len_diff;
|
|
205 }
|
|
206 }
|
|
207
|
|
208 $edit_shift += $len_diff;
|
|
209 }
|
|
210 }
|
|
211 }
|
|
212
|
|
213 my $pair_len = $cdna_end - $cdna_start + 1;
|
|
214
|
|
215 if($pair_len > 0) {
|
|
216 $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand,
|
|
217 'genome', $gen_start, $gen_end);
|
|
218 }
|
|
219 }
|
|
220
|
|
221 return $mapper;
|
|
222 }
|
|
223
|
|
224
|
|
225 =head2 cdna2genomic
|
|
226
|
|
227 Arg [1] : $start
|
|
228 The start position in cdna coordinates
|
|
229 Arg [2] : $end
|
|
230 The end position in cdna coordinates
|
|
231 Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end);
|
|
232 Description: Converts cdna coordinates to genomic coordinates. The
|
|
233 return value is a list of coordinates and gaps.
|
|
234 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
|
|
235 Bio::EnsEMBL::Mapper::Gap objects
|
|
236 Exceptions : throws if no start or end
|
|
237 Caller : general
|
|
238 Status : Stable
|
|
239
|
|
240 =cut
|
|
241
|
|
242
|
|
243 sub cdna2genomic {
|
|
244 my ($self,$start,$end) = @_;
|
|
245
|
|
246 if( !defined $end ) {
|
|
247 throw("Must call with start/end");
|
|
248 }
|
|
249
|
|
250 my $mapper = $self->{'exon_coord_mapper'};
|
|
251
|
|
252 return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" );
|
|
253
|
|
254 }
|
|
255
|
|
256
|
|
257 =head2 genomic2cdna
|
|
258
|
|
259 Arg [1] : $start
|
|
260 The start position in genomic coordinates
|
|
261 Arg [2] : $end
|
|
262 The end position in genomic coordinates
|
|
263 Arg [3] : $strand
|
|
264 The strand of the genomic coordinates (default value 1)
|
|
265 Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd);
|
|
266 Description: Converts genomic coordinates to cdna coordinates. The
|
|
267 return value is a list of coordinates and gaps. Gaps
|
|
268 represent intronic or upstream/downstream regions which do
|
|
269 not comprise this transcripts cdna. Coordinate objects
|
|
270 represent genomic regions which map to exons (utrs included).
|
|
271 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
|
|
272 Bio::EnsEMBL::Mapper::Gap objects
|
|
273 Exceptions : throws if start, end or strand not defined
|
|
274 Caller : general
|
|
275 Status : Stable
|
|
276
|
|
277 =cut
|
|
278
|
|
279 sub genomic2cdna {
|
|
280 my ($self, $start, $end, $strand) = @_;
|
|
281
|
|
282 unless(defined $start && defined $end && defined $strand) {
|
|
283 throw("start, end and strand arguments are required\n");
|
|
284 }
|
|
285
|
|
286 my $mapper = $self->{'exon_coord_mapper'};
|
|
287
|
|
288 return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic");
|
|
289
|
|
290 }
|
|
291
|
|
292
|
|
293 =head2 cds2genomic
|
|
294
|
|
295 Arg [1] : int $start
|
|
296 start position in cds coords
|
|
297 Arg [2] : int $end
|
|
298 end position in cds coords
|
|
299 Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306);
|
|
300 Description: Converts cds coordinates into genomic coordinates. The
|
|
301 coordinates returned are relative to the same slice that the
|
|
302 transcript used to construct this TranscriptMapper was on.
|
|
303 Returntype : list of Bio::EnsEMBL::Mapper::Gap and
|
|
304 Bio::EnsEMBL::Mapper::Coordinate objects
|
|
305 Exceptions : throws if no end
|
|
306 Caller : general
|
|
307 Status : at risk
|
|
308
|
|
309 =cut
|
|
310
|
|
311 sub cds2genomic {
|
|
312 my ( $self, $start, $end ) = @_;
|
|
313
|
|
314 if ( !( defined($start) && defined($end) ) ) {
|
|
315 throw("Must call with start and end");
|
|
316 }
|
|
317
|
|
318 # Move start end into translate cDNA coordinates now.
|
|
319 $start = $start +( $self->{'cdna_coding_start'} - 1 ) ;
|
|
320 $end = $end + ( $self->{'cdna_coding_start'} - 1 );
|
|
321
|
|
322 return $self->cdna2genomic( $start, $end );
|
|
323 }
|
|
324
|
|
325 =head2 pep2genomic
|
|
326
|
|
327 Arg [1] : int $start
|
|
328 start position in peptide coords
|
|
329 Arg [2] : int $end
|
|
330 end position in peptide coords
|
|
331 Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102);
|
|
332 Description: Converts peptide coordinates into genomic coordinates. The
|
|
333 coordinates returned are relative to the same slice that the
|
|
334 transcript used to construct this TranscriptMapper was on.
|
|
335 Returntype : list of Bio::EnsEMBL::Mapper::Gap and
|
|
336 Bio::EnsEMBL::Mapper::Coordinate objects
|
|
337 Exceptions : throws if no end
|
|
338 Caller : general
|
|
339 Status : Stable
|
|
340
|
|
341 =cut
|
|
342
|
|
343 sub pep2genomic {
|
|
344 my ( $self, $start, $end ) = @_;
|
|
345
|
|
346 if ( !( defined($start) && defined($end) ) ) {
|
|
347 throw("Must call with start and end");
|
|
348 }
|
|
349
|
|
350 # Take possible N-padding at beginning of CDS into account.
|
|
351 my $start_phase = $self->{'start_phase'};
|
|
352 my $shift = ( $start_phase > 0 ) ? $start_phase : 0;
|
|
353
|
|
354 # Move start end into translate cDNA coordinates now.
|
|
355 $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
|
|
356 $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
|
|
357
|
|
358 return $self->cdna2genomic( $start, $end );
|
|
359 }
|
|
360
|
|
361
|
|
362 =head2 genomic2cds
|
|
363
|
|
364 Arg [1] : int $start
|
|
365 The genomic start position
|
|
366 Arg [2] : int $end
|
|
367 The genomic end position
|
|
368 Arg [3] : int $strand
|
|
369 The genomic strand
|
|
370 Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand);
|
|
371 Description: Converts genomic coordinates into CDS coordinates of the
|
|
372 transcript that was used to create this transcript mapper.
|
|
373 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
|
|
374 Bio::EnsEMBL::Mapper::Gap objects
|
|
375 Exceptions : throw if start, end or strand not defined
|
|
376 Caller : general
|
|
377 Status : Stable
|
|
378
|
|
379 =cut
|
|
380
|
|
381 sub genomic2cds {
|
|
382 my ($self, $start, $end, $strand) = @_;
|
|
383
|
|
384 if(!defined($start) || !defined($end) || !defined($strand)) {
|
|
385 throw("start, end and strand arguments are required");
|
|
386 }
|
|
387
|
|
388 if($start > $end + 1) {
|
|
389 throw("start arg must be less than or equal to end arg + 1");
|
|
390 }
|
|
391
|
|
392 my $cdna_cstart = $self->{'cdna_coding_start'};
|
|
393 my $cdna_cend = $self->{'cdna_coding_end'};
|
|
394
|
|
395 #this is a pseudogene if there is no coding region
|
|
396 if(!defined($cdna_cstart)) {
|
|
397 #return a gap of the entire requested region, there is no CDS
|
|
398 return Bio::EnsEMBL::Mapper::Gap->new($start,$end);
|
|
399 }
|
|
400
|
|
401 my @coords = $self->genomic2cdna($start, $end, $strand);
|
|
402
|
|
403 my @out;
|
|
404
|
|
405 foreach my $coord (@coords) {
|
|
406 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
|
|
407 push @out, $coord;
|
|
408 } else {
|
|
409 my $start = $coord->start;
|
|
410 my $end = $coord->end;
|
|
411
|
|
412 if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) {
|
|
413 #is all gap - does not map to peptide
|
|
414 push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end);
|
|
415 } else {
|
|
416 #we know area is at least partially overlapping CDS
|
|
417
|
|
418 my $cds_start = $start - $cdna_cstart + 1;
|
|
419 my $cds_end = $end - $cdna_cstart + 1;
|
|
420
|
|
421 if($start < $cdna_cstart) {
|
|
422 #start of coordinates are in the 5prime UTR
|
|
423 push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1);
|
|
424
|
|
425 #start is now relative to start of CDS
|
|
426 $cds_start = 1;
|
|
427 }
|
|
428
|
|
429 my $end_gap = undef;
|
|
430 if($end > $cdna_cend) {
|
|
431 #end of coordinates are in the 3prime UTR
|
|
432 $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end);
|
|
433 #adjust end to relative to CDS start
|
|
434 $cds_end = $cdna_cend - $cdna_cstart + 1;
|
|
435 }
|
|
436
|
|
437 #start and end are now entirely in CDS and relative to CDS start
|
|
438 $coord->start($cds_start);
|
|
439 $coord->end($cds_end);
|
|
440
|
|
441 push @out, $coord;
|
|
442
|
|
443 if($end_gap) {
|
|
444 #push out the region which was in the 3prime utr
|
|
445 push @out, $end_gap;
|
|
446 }
|
|
447 }
|
|
448 }
|
|
449 }
|
|
450
|
|
451 return @out;
|
|
452
|
|
453 }
|
|
454
|
|
455
|
|
456 =head2 genomic2pep
|
|
457
|
|
458 Arg [1] : $start
|
|
459 The start position in genomic coordinates
|
|
460 Arg [2] : $end
|
|
461 The end position in genomic coordinates
|
|
462 Arg [3] : $strand
|
|
463 The strand of the genomic coordinates
|
|
464 Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand);
|
|
465 Description: Converts genomic coordinates to peptide coordinates. The
|
|
466 return value is a list of coordinates and gaps.
|
|
467 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
|
|
468 Bio::EnsEMBL::Mapper::Gap objects
|
|
469 Exceptions : throw if start, end or strand not defined
|
|
470 Caller : general
|
|
471 Status : Stable
|
|
472
|
|
473 =cut
|
|
474
|
|
475 sub genomic2pep {
|
|
476 my ($self, $start, $end, $strand) = @_;
|
|
477
|
|
478 unless(defined $start && defined $end && defined $strand) {
|
|
479 throw("start, end and strand arguments are required");
|
|
480 }
|
|
481
|
|
482 my @coords = $self->genomic2cds($start, $end, $strand);
|
|
483
|
|
484 my @out;
|
|
485
|
|
486 my $start_phase = $self->{'start_phase'};
|
|
487
|
|
488 #take into account possible N padding at beginning of CDS
|
|
489 my $shift = ($start_phase > 0) ? $start_phase : 0;
|
|
490
|
|
491 foreach my $coord (@coords) {
|
|
492 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
|
|
493 push @out, $coord;
|
|
494 } else {
|
|
495
|
|
496 #start and end are now entirely in CDS and relative to CDS start
|
|
497
|
|
498 #convert to peptide coordinates
|
|
499 my $pep_start = int(($coord->start + $shift + 2) / 3);
|
|
500 my $pep_end = int(($coord->end + $shift + 2) / 3);
|
|
501 $coord->start($pep_start);
|
|
502 $coord->end($pep_end);
|
|
503
|
|
504 push @out, $coord;
|
|
505 }
|
|
506 }
|
|
507
|
|
508 return @out;
|
|
509 }
|
|
510
|
|
511
|
|
512 1;
|