comparison variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 TranscriptMapper - A utility class used to perform coordinate conversions
24 between a number of coordinate systems relating to transcripts
25
26 =head1 SYNOPSIS
27
28 my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript);
29
30 @coords = $trmapper->cdna2genomic( 123, 554 );
31
32 @coords = $trmapper->genomic2cdna( 141, 500, -1 );
33
34 @coords = $trmapper->genomic2cds( 141, 500, -1 );
35
36 @coords = $trmapper->pep2genomic( 10, 60 );
37
38 @coords = $trmapper->genomic2pep( 123, 400, 1 );
39
40 =head1 DESCRIPTION
41
42 This is a utility class which can be used to perform coordinate conversions
43 between a number of coordinate systems relating to transcripts.
44
45 =head1 METHODS
46
47 =cut
48
49 package Bio::EnsEMBL::TranscriptMapper;
50
51 use strict;
52 use warnings;
53
54 use Bio::EnsEMBL::Utils::Exception qw(throw);
55
56 use Bio::EnsEMBL::Mapper;
57 use Bio::EnsEMBL::Mapper::Gap;
58 use Bio::EnsEMBL::Mapper::Coordinate;
59
60
61
62 =head2 new
63
64 Arg [1] : Bio::EnsEMBL::Transcript $transcript
65 The transcript for which a TranscriptMapper should be created.
66 Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript)
67 Description: Creates a TranscriptMapper object which can be used to perform
68 various coordinate transformations relating to transcripts.
69 Note that the TranscriptMapper uses the transcript state at the
70 time of creation to perform the conversions, and that a new
71 TranscriptMapper must be created if the Transcript is altered.
72 'Genomic' coordinates are coordinates which are relative to the
73 slice that the Transcript is on.
74 Returntype : Bio::EnsEMBL::TranscriptMapper
75 Exceptions : throws if a transcript is not an argument
76 Caller : Transcript::get_TranscriptMapper
77 Status : Stable
78
79 =cut
80
81 sub new {
82 my $caller = shift;
83 my $transcript = shift;
84
85 my $class = ref($caller) || $caller;
86
87 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
88 throw("Transcript argument is required.");
89 }
90
91
92 my $exons = $transcript->get_all_Exons();
93 my $start_phase;
94 if(@$exons) {
95 $start_phase = $exons->[0]->phase;
96 } else {
97 $start_phase = -1;
98 }
99
100 # Create a cdna <-> genomic mapper and load it with exon coords
101 my $mapper = _load_mapper($transcript,$start_phase);
102
103 my $self = bless({'exon_coord_mapper' => $mapper,
104 'start_phase' => $start_phase,
105 'cdna_coding_start' => $transcript->cdna_coding_start(),
106 'cdna_coding_end' => $transcript->cdna_coding_end()},
107 $class);
108 }
109
110
111 =head2 _load_mapper
112
113 Arg [1] : Bio::EnsEMBL::Transcript $transcript
114 The transcript for which a mapper should be created.
115 Example : my $mapper = _load_mapper($transcript);
116 Description: loads the mapper
117 Returntype : Bio::EnsEMBL::Mapper
118 Exceptions : none
119 Caller : Internal
120 Status : Stable
121
122 =cut
123
124 sub _load_mapper {
125 my $transcript = shift;
126 my $start_phase = shift;
127
128 my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic');
129
130 my $edits_on = $transcript->edits_enabled();
131 my @edits;
132
133 if($edits_on) {
134 @edits = @{$transcript->get_all_SeqEdits()};
135 @edits = sort {$a->start() <=> $b->start()} @edits;
136 }
137
138 my $edit_shift = 0;
139
140 my $cdna_start = undef;
141
142 my $cdna_end = 0;
143
144
145 foreach my $ex (@{$transcript->get_all_Exons}) {
146 my $gen_start = $ex->start();
147 my $gen_end = $ex->end();
148
149 $cdna_start = $cdna_end + 1;
150 $cdna_end = $cdna_start + $ex->length() - 1;
151
152 my $strand = $ex->strand();
153
154 # add deletions and insertions into pairs when SeqEdits turned on
155 # ignore mismatches (i.e. treat as matches)
156 if($edits_on) {
157 while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) {
158
159 my $edit = shift(@edits);
160 my $len_diff = $edit->length_diff();
161
162 if($len_diff) {
163 # break pair into two parts, finish first pair just before edit
164
165 my $prev_cdna_end = $edit->start() + $edit_shift - 1;
166 my $prev_cdna_start = $cdna_start;
167 my $prev_len = $prev_cdna_end - $prev_cdna_start + 1;
168
169 my $prev_gen_end;
170 my $prev_gen_start;
171 if($strand == 1) {
172 $prev_gen_start = $gen_start;
173 $prev_gen_end = $gen_start + $prev_len - 1;
174 } else {
175 $prev_gen_start = $gen_end - $prev_len + 1;
176 $prev_gen_end = $gen_end;
177 }
178
179 if($prev_len > 0) { # only create map pair if not boundary case
180 $mapper->add_map_coordinates
181 ('cdna', $prev_cdna_start, $prev_cdna_end, $strand,
182 'genome', $prev_gen_start,$prev_gen_end);
183 }
184
185 $cdna_start = $prev_cdna_end + 1;
186
187 if($strand == 1) {
188 $gen_start = $prev_gen_end + 1;
189 } else {
190 $gen_end = $prev_gen_start - 1;
191 }
192
193 $cdna_end += $len_diff;
194
195 if($len_diff > 0) {
196 # insert in cdna, shift cdna coords along
197 $cdna_start += $len_diff;
198 } else {
199 # delete in cdna (insert in genomic), shift genomic coords along
200
201 if($strand == 1) {
202 $gen_start -= $len_diff;
203 } else {
204 $gen_end += $len_diff;
205 }
206 }
207
208 $edit_shift += $len_diff;
209 }
210 }
211 }
212
213 my $pair_len = $cdna_end - $cdna_start + 1;
214
215 if($pair_len > 0) {
216 $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand,
217 'genome', $gen_start, $gen_end);
218 }
219 }
220
221 return $mapper;
222 }
223
224
225 =head2 cdna2genomic
226
227 Arg [1] : $start
228 The start position in cdna coordinates
229 Arg [2] : $end
230 The end position in cdna coordinates
231 Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end);
232 Description: Converts cdna coordinates to genomic coordinates. The
233 return value is a list of coordinates and gaps.
234 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
235 Bio::EnsEMBL::Mapper::Gap objects
236 Exceptions : throws if no start or end
237 Caller : general
238 Status : Stable
239
240 =cut
241
242
243 sub cdna2genomic {
244 my ($self,$start,$end) = @_;
245
246 if( !defined $end ) {
247 throw("Must call with start/end");
248 }
249
250 my $mapper = $self->{'exon_coord_mapper'};
251
252 return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" );
253
254 }
255
256
257 =head2 genomic2cdna
258
259 Arg [1] : $start
260 The start position in genomic coordinates
261 Arg [2] : $end
262 The end position in genomic coordinates
263 Arg [3] : $strand
264 The strand of the genomic coordinates (default value 1)
265 Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd);
266 Description: Converts genomic coordinates to cdna coordinates. The
267 return value is a list of coordinates and gaps. Gaps
268 represent intronic or upstream/downstream regions which do
269 not comprise this transcripts cdna. Coordinate objects
270 represent genomic regions which map to exons (utrs included).
271 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
272 Bio::EnsEMBL::Mapper::Gap objects
273 Exceptions : throws if start, end or strand not defined
274 Caller : general
275 Status : Stable
276
277 =cut
278
279 sub genomic2cdna {
280 my ($self, $start, $end, $strand) = @_;
281
282 unless(defined $start && defined $end && defined $strand) {
283 throw("start, end and strand arguments are required\n");
284 }
285
286 my $mapper = $self->{'exon_coord_mapper'};
287
288 return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic");
289
290 }
291
292
293 =head2 cds2genomic
294
295 Arg [1] : int $start
296 start position in cds coords
297 Arg [2] : int $end
298 end position in cds coords
299 Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306);
300 Description: Converts cds coordinates into genomic coordinates. The
301 coordinates returned are relative to the same slice that the
302 transcript used to construct this TranscriptMapper was on.
303 Returntype : list of Bio::EnsEMBL::Mapper::Gap and
304 Bio::EnsEMBL::Mapper::Coordinate objects
305 Exceptions : throws if no end
306 Caller : general
307 Status : at risk
308
309 =cut
310
311 sub cds2genomic {
312 my ( $self, $start, $end ) = @_;
313
314 if ( !( defined($start) && defined($end) ) ) {
315 throw("Must call with start and end");
316 }
317
318 # Move start end into translate cDNA coordinates now.
319 $start = $start +( $self->{'cdna_coding_start'} - 1 ) ;
320 $end = $end + ( $self->{'cdna_coding_start'} - 1 );
321
322 return $self->cdna2genomic( $start, $end );
323 }
324
325 =head2 pep2genomic
326
327 Arg [1] : int $start
328 start position in peptide coords
329 Arg [2] : int $end
330 end position in peptide coords
331 Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102);
332 Description: Converts peptide coordinates into genomic coordinates. The
333 coordinates returned are relative to the same slice that the
334 transcript used to construct this TranscriptMapper was on.
335 Returntype : list of Bio::EnsEMBL::Mapper::Gap and
336 Bio::EnsEMBL::Mapper::Coordinate objects
337 Exceptions : throws if no end
338 Caller : general
339 Status : Stable
340
341 =cut
342
343 sub pep2genomic {
344 my ( $self, $start, $end ) = @_;
345
346 if ( !( defined($start) && defined($end) ) ) {
347 throw("Must call with start and end");
348 }
349
350 # Take possible N-padding at beginning of CDS into account.
351 my $start_phase = $self->{'start_phase'};
352 my $shift = ( $start_phase > 0 ) ? $start_phase : 0;
353
354 # Move start end into translate cDNA coordinates now.
355 $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
356 $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
357
358 return $self->cdna2genomic( $start, $end );
359 }
360
361
362 =head2 genomic2cds
363
364 Arg [1] : int $start
365 The genomic start position
366 Arg [2] : int $end
367 The genomic end position
368 Arg [3] : int $strand
369 The genomic strand
370 Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand);
371 Description: Converts genomic coordinates into CDS coordinates of the
372 transcript that was used to create this transcript mapper.
373 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
374 Bio::EnsEMBL::Mapper::Gap objects
375 Exceptions : throw if start, end or strand not defined
376 Caller : general
377 Status : Stable
378
379 =cut
380
381 sub genomic2cds {
382 my ($self, $start, $end, $strand) = @_;
383
384 if(!defined($start) || !defined($end) || !defined($strand)) {
385 throw("start, end and strand arguments are required");
386 }
387
388 if($start > $end + 1) {
389 throw("start arg must be less than or equal to end arg + 1");
390 }
391
392 my $cdna_cstart = $self->{'cdna_coding_start'};
393 my $cdna_cend = $self->{'cdna_coding_end'};
394
395 #this is a pseudogene if there is no coding region
396 if(!defined($cdna_cstart)) {
397 #return a gap of the entire requested region, there is no CDS
398 return Bio::EnsEMBL::Mapper::Gap->new($start,$end);
399 }
400
401 my @coords = $self->genomic2cdna($start, $end, $strand);
402
403 my @out;
404
405 foreach my $coord (@coords) {
406 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
407 push @out, $coord;
408 } else {
409 my $start = $coord->start;
410 my $end = $coord->end;
411
412 if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) {
413 #is all gap - does not map to peptide
414 push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end);
415 } else {
416 #we know area is at least partially overlapping CDS
417
418 my $cds_start = $start - $cdna_cstart + 1;
419 my $cds_end = $end - $cdna_cstart + 1;
420
421 if($start < $cdna_cstart) {
422 #start of coordinates are in the 5prime UTR
423 push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1);
424
425 #start is now relative to start of CDS
426 $cds_start = 1;
427 }
428
429 my $end_gap = undef;
430 if($end > $cdna_cend) {
431 #end of coordinates are in the 3prime UTR
432 $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end);
433 #adjust end to relative to CDS start
434 $cds_end = $cdna_cend - $cdna_cstart + 1;
435 }
436
437 #start and end are now entirely in CDS and relative to CDS start
438 $coord->start($cds_start);
439 $coord->end($cds_end);
440
441 push @out, $coord;
442
443 if($end_gap) {
444 #push out the region which was in the 3prime utr
445 push @out, $end_gap;
446 }
447 }
448 }
449 }
450
451 return @out;
452
453 }
454
455
456 =head2 genomic2pep
457
458 Arg [1] : $start
459 The start position in genomic coordinates
460 Arg [2] : $end
461 The end position in genomic coordinates
462 Arg [3] : $strand
463 The strand of the genomic coordinates
464 Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand);
465 Description: Converts genomic coordinates to peptide coordinates. The
466 return value is a list of coordinates and gaps.
467 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
468 Bio::EnsEMBL::Mapper::Gap objects
469 Exceptions : throw if start, end or strand not defined
470 Caller : general
471 Status : Stable
472
473 =cut
474
475 sub genomic2pep {
476 my ($self, $start, $end, $strand) = @_;
477
478 unless(defined $start && defined $end && defined $strand) {
479 throw("start, end and strand arguments are required");
480 }
481
482 my @coords = $self->genomic2cds($start, $end, $strand);
483
484 my @out;
485
486 my $start_phase = $self->{'start_phase'};
487
488 #take into account possible N padding at beginning of CDS
489 my $shift = ($start_phase > 0) ? $start_phase : 0;
490
491 foreach my $coord (@coords) {
492 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
493 push @out, $coord;
494 } else {
495
496 #start and end are now entirely in CDS and relative to CDS start
497
498 #convert to peptide coordinates
499 my $pep_start = int(($coord->start + $shift + 2) / 3);
500 my $pep_end = int(($coord->end + $shift + 2) / 3);
501 $coord->start($pep_start);
502 $coord->end($pep_end);
503
504 push @out, $coord;
505 }
506 }
507
508 return @out;
509 }
510
511
512 1;