Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 TranscriptMapper - A utility class used to perform coordinate conversions | |
24 between a number of coordinate systems relating to transcripts | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript); | |
29 | |
30 @coords = $trmapper->cdna2genomic( 123, 554 ); | |
31 | |
32 @coords = $trmapper->genomic2cdna( 141, 500, -1 ); | |
33 | |
34 @coords = $trmapper->genomic2cds( 141, 500, -1 ); | |
35 | |
36 @coords = $trmapper->pep2genomic( 10, 60 ); | |
37 | |
38 @coords = $trmapper->genomic2pep( 123, 400, 1 ); | |
39 | |
40 =head1 DESCRIPTION | |
41 | |
42 This is a utility class which can be used to perform coordinate conversions | |
43 between a number of coordinate systems relating to transcripts. | |
44 | |
45 =head1 METHODS | |
46 | |
47 =cut | |
48 | |
49 package Bio::EnsEMBL::TranscriptMapper; | |
50 | |
51 use strict; | |
52 use warnings; | |
53 | |
54 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
55 | |
56 use Bio::EnsEMBL::Mapper; | |
57 use Bio::EnsEMBL::Mapper::Gap; | |
58 use Bio::EnsEMBL::Mapper::Coordinate; | |
59 | |
60 | |
61 | |
62 =head2 new | |
63 | |
64 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
65 The transcript for which a TranscriptMapper should be created. | |
66 Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript) | |
67 Description: Creates a TranscriptMapper object which can be used to perform | |
68 various coordinate transformations relating to transcripts. | |
69 Note that the TranscriptMapper uses the transcript state at the | |
70 time of creation to perform the conversions, and that a new | |
71 TranscriptMapper must be created if the Transcript is altered. | |
72 'Genomic' coordinates are coordinates which are relative to the | |
73 slice that the Transcript is on. | |
74 Returntype : Bio::EnsEMBL::TranscriptMapper | |
75 Exceptions : throws if a transcript is not an argument | |
76 Caller : Transcript::get_TranscriptMapper | |
77 Status : Stable | |
78 | |
79 =cut | |
80 | |
81 sub new { | |
82 my $caller = shift; | |
83 my $transcript = shift; | |
84 | |
85 my $class = ref($caller) || $caller; | |
86 | |
87 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { | |
88 throw("Transcript argument is required."); | |
89 } | |
90 | |
91 | |
92 my $exons = $transcript->get_all_Exons(); | |
93 my $start_phase; | |
94 if(@$exons) { | |
95 $start_phase = $exons->[0]->phase; | |
96 } else { | |
97 $start_phase = -1; | |
98 } | |
99 | |
100 # Create a cdna <-> genomic mapper and load it with exon coords | |
101 my $mapper = _load_mapper($transcript,$start_phase); | |
102 | |
103 my $self = bless({'exon_coord_mapper' => $mapper, | |
104 'start_phase' => $start_phase, | |
105 'cdna_coding_start' => $transcript->cdna_coding_start(), | |
106 'cdna_coding_end' => $transcript->cdna_coding_end()}, | |
107 $class); | |
108 } | |
109 | |
110 | |
111 =head2 _load_mapper | |
112 | |
113 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
114 The transcript for which a mapper should be created. | |
115 Example : my $mapper = _load_mapper($transcript); | |
116 Description: loads the mapper | |
117 Returntype : Bio::EnsEMBL::Mapper | |
118 Exceptions : none | |
119 Caller : Internal | |
120 Status : Stable | |
121 | |
122 =cut | |
123 | |
124 sub _load_mapper { | |
125 my $transcript = shift; | |
126 my $start_phase = shift; | |
127 | |
128 my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic'); | |
129 | |
130 my $edits_on = $transcript->edits_enabled(); | |
131 my @edits; | |
132 | |
133 if($edits_on) { | |
134 @edits = @{$transcript->get_all_SeqEdits()}; | |
135 @edits = sort {$a->start() <=> $b->start()} @edits; | |
136 } | |
137 | |
138 my $edit_shift = 0; | |
139 | |
140 my $cdna_start = undef; | |
141 | |
142 my $cdna_end = 0; | |
143 | |
144 | |
145 foreach my $ex (@{$transcript->get_all_Exons}) { | |
146 my $gen_start = $ex->start(); | |
147 my $gen_end = $ex->end(); | |
148 | |
149 $cdna_start = $cdna_end + 1; | |
150 $cdna_end = $cdna_start + $ex->length() - 1; | |
151 | |
152 my $strand = $ex->strand(); | |
153 | |
154 # add deletions and insertions into pairs when SeqEdits turned on | |
155 # ignore mismatches (i.e. treat as matches) | |
156 if($edits_on) { | |
157 while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) { | |
158 | |
159 my $edit = shift(@edits); | |
160 my $len_diff = $edit->length_diff(); | |
161 | |
162 if($len_diff) { | |
163 # break pair into two parts, finish first pair just before edit | |
164 | |
165 my $prev_cdna_end = $edit->start() + $edit_shift - 1; | |
166 my $prev_cdna_start = $cdna_start; | |
167 my $prev_len = $prev_cdna_end - $prev_cdna_start + 1; | |
168 | |
169 my $prev_gen_end; | |
170 my $prev_gen_start; | |
171 if($strand == 1) { | |
172 $prev_gen_start = $gen_start; | |
173 $prev_gen_end = $gen_start + $prev_len - 1; | |
174 } else { | |
175 $prev_gen_start = $gen_end - $prev_len + 1; | |
176 $prev_gen_end = $gen_end; | |
177 } | |
178 | |
179 if($prev_len > 0) { # only create map pair if not boundary case | |
180 $mapper->add_map_coordinates | |
181 ('cdna', $prev_cdna_start, $prev_cdna_end, $strand, | |
182 'genome', $prev_gen_start,$prev_gen_end); | |
183 } | |
184 | |
185 $cdna_start = $prev_cdna_end + 1; | |
186 | |
187 if($strand == 1) { | |
188 $gen_start = $prev_gen_end + 1; | |
189 } else { | |
190 $gen_end = $prev_gen_start - 1; | |
191 } | |
192 | |
193 $cdna_end += $len_diff; | |
194 | |
195 if($len_diff > 0) { | |
196 # insert in cdna, shift cdna coords along | |
197 $cdna_start += $len_diff; | |
198 } else { | |
199 # delete in cdna (insert in genomic), shift genomic coords along | |
200 | |
201 if($strand == 1) { | |
202 $gen_start -= $len_diff; | |
203 } else { | |
204 $gen_end += $len_diff; | |
205 } | |
206 } | |
207 | |
208 $edit_shift += $len_diff; | |
209 } | |
210 } | |
211 } | |
212 | |
213 my $pair_len = $cdna_end - $cdna_start + 1; | |
214 | |
215 if($pair_len > 0) { | |
216 $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand, | |
217 'genome', $gen_start, $gen_end); | |
218 } | |
219 } | |
220 | |
221 return $mapper; | |
222 } | |
223 | |
224 | |
225 =head2 cdna2genomic | |
226 | |
227 Arg [1] : $start | |
228 The start position in cdna coordinates | |
229 Arg [2] : $end | |
230 The end position in cdna coordinates | |
231 Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end); | |
232 Description: Converts cdna coordinates to genomic coordinates. The | |
233 return value is a list of coordinates and gaps. | |
234 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
235 Bio::EnsEMBL::Mapper::Gap objects | |
236 Exceptions : throws if no start or end | |
237 Caller : general | |
238 Status : Stable | |
239 | |
240 =cut | |
241 | |
242 | |
243 sub cdna2genomic { | |
244 my ($self,$start,$end) = @_; | |
245 | |
246 if( !defined $end ) { | |
247 throw("Must call with start/end"); | |
248 } | |
249 | |
250 my $mapper = $self->{'exon_coord_mapper'}; | |
251 | |
252 return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" ); | |
253 | |
254 } | |
255 | |
256 | |
257 =head2 genomic2cdna | |
258 | |
259 Arg [1] : $start | |
260 The start position in genomic coordinates | |
261 Arg [2] : $end | |
262 The end position in genomic coordinates | |
263 Arg [3] : $strand | |
264 The strand of the genomic coordinates (default value 1) | |
265 Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd); | |
266 Description: Converts genomic coordinates to cdna coordinates. The | |
267 return value is a list of coordinates and gaps. Gaps | |
268 represent intronic or upstream/downstream regions which do | |
269 not comprise this transcripts cdna. Coordinate objects | |
270 represent genomic regions which map to exons (utrs included). | |
271 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
272 Bio::EnsEMBL::Mapper::Gap objects | |
273 Exceptions : throws if start, end or strand not defined | |
274 Caller : general | |
275 Status : Stable | |
276 | |
277 =cut | |
278 | |
279 sub genomic2cdna { | |
280 my ($self, $start, $end, $strand) = @_; | |
281 | |
282 unless(defined $start && defined $end && defined $strand) { | |
283 throw("start, end and strand arguments are required\n"); | |
284 } | |
285 | |
286 my $mapper = $self->{'exon_coord_mapper'}; | |
287 | |
288 return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic"); | |
289 | |
290 } | |
291 | |
292 | |
293 =head2 cds2genomic | |
294 | |
295 Arg [1] : int $start | |
296 start position in cds coords | |
297 Arg [2] : int $end | |
298 end position in cds coords | |
299 Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306); | |
300 Description: Converts cds coordinates into genomic coordinates. The | |
301 coordinates returned are relative to the same slice that the | |
302 transcript used to construct this TranscriptMapper was on. | |
303 Returntype : list of Bio::EnsEMBL::Mapper::Gap and | |
304 Bio::EnsEMBL::Mapper::Coordinate objects | |
305 Exceptions : throws if no end | |
306 Caller : general | |
307 Status : at risk | |
308 | |
309 =cut | |
310 | |
311 sub cds2genomic { | |
312 my ( $self, $start, $end ) = @_; | |
313 | |
314 if ( !( defined($start) && defined($end) ) ) { | |
315 throw("Must call with start and end"); | |
316 } | |
317 | |
318 # Move start end into translate cDNA coordinates now. | |
319 $start = $start +( $self->{'cdna_coding_start'} - 1 ) ; | |
320 $end = $end + ( $self->{'cdna_coding_start'} - 1 ); | |
321 | |
322 return $self->cdna2genomic( $start, $end ); | |
323 } | |
324 | |
325 =head2 pep2genomic | |
326 | |
327 Arg [1] : int $start | |
328 start position in peptide coords | |
329 Arg [2] : int $end | |
330 end position in peptide coords | |
331 Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102); | |
332 Description: Converts peptide coordinates into genomic coordinates. The | |
333 coordinates returned are relative to the same slice that the | |
334 transcript used to construct this TranscriptMapper was on. | |
335 Returntype : list of Bio::EnsEMBL::Mapper::Gap and | |
336 Bio::EnsEMBL::Mapper::Coordinate objects | |
337 Exceptions : throws if no end | |
338 Caller : general | |
339 Status : Stable | |
340 | |
341 =cut | |
342 | |
343 sub pep2genomic { | |
344 my ( $self, $start, $end ) = @_; | |
345 | |
346 if ( !( defined($start) && defined($end) ) ) { | |
347 throw("Must call with start and end"); | |
348 } | |
349 | |
350 # Take possible N-padding at beginning of CDS into account. | |
351 my $start_phase = $self->{'start_phase'}; | |
352 my $shift = ( $start_phase > 0 ) ? $start_phase : 0; | |
353 | |
354 # Move start end into translate cDNA coordinates now. | |
355 $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift; | |
356 $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift; | |
357 | |
358 return $self->cdna2genomic( $start, $end ); | |
359 } | |
360 | |
361 | |
362 =head2 genomic2cds | |
363 | |
364 Arg [1] : int $start | |
365 The genomic start position | |
366 Arg [2] : int $end | |
367 The genomic end position | |
368 Arg [3] : int $strand | |
369 The genomic strand | |
370 Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand); | |
371 Description: Converts genomic coordinates into CDS coordinates of the | |
372 transcript that was used to create this transcript mapper. | |
373 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
374 Bio::EnsEMBL::Mapper::Gap objects | |
375 Exceptions : throw if start, end or strand not defined | |
376 Caller : general | |
377 Status : Stable | |
378 | |
379 =cut | |
380 | |
381 sub genomic2cds { | |
382 my ($self, $start, $end, $strand) = @_; | |
383 | |
384 if(!defined($start) || !defined($end) || !defined($strand)) { | |
385 throw("start, end and strand arguments are required"); | |
386 } | |
387 | |
388 if($start > $end + 1) { | |
389 throw("start arg must be less than or equal to end arg + 1"); | |
390 } | |
391 | |
392 my $cdna_cstart = $self->{'cdna_coding_start'}; | |
393 my $cdna_cend = $self->{'cdna_coding_end'}; | |
394 | |
395 #this is a pseudogene if there is no coding region | |
396 if(!defined($cdna_cstart)) { | |
397 #return a gap of the entire requested region, there is no CDS | |
398 return Bio::EnsEMBL::Mapper::Gap->new($start,$end); | |
399 } | |
400 | |
401 my @coords = $self->genomic2cdna($start, $end, $strand); | |
402 | |
403 my @out; | |
404 | |
405 foreach my $coord (@coords) { | |
406 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
407 push @out, $coord; | |
408 } else { | |
409 my $start = $coord->start; | |
410 my $end = $coord->end; | |
411 | |
412 if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) { | |
413 #is all gap - does not map to peptide | |
414 push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end); | |
415 } else { | |
416 #we know area is at least partially overlapping CDS | |
417 | |
418 my $cds_start = $start - $cdna_cstart + 1; | |
419 my $cds_end = $end - $cdna_cstart + 1; | |
420 | |
421 if($start < $cdna_cstart) { | |
422 #start of coordinates are in the 5prime UTR | |
423 push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1); | |
424 | |
425 #start is now relative to start of CDS | |
426 $cds_start = 1; | |
427 } | |
428 | |
429 my $end_gap = undef; | |
430 if($end > $cdna_cend) { | |
431 #end of coordinates are in the 3prime UTR | |
432 $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end); | |
433 #adjust end to relative to CDS start | |
434 $cds_end = $cdna_cend - $cdna_cstart + 1; | |
435 } | |
436 | |
437 #start and end are now entirely in CDS and relative to CDS start | |
438 $coord->start($cds_start); | |
439 $coord->end($cds_end); | |
440 | |
441 push @out, $coord; | |
442 | |
443 if($end_gap) { | |
444 #push out the region which was in the 3prime utr | |
445 push @out, $end_gap; | |
446 } | |
447 } | |
448 } | |
449 } | |
450 | |
451 return @out; | |
452 | |
453 } | |
454 | |
455 | |
456 =head2 genomic2pep | |
457 | |
458 Arg [1] : $start | |
459 The start position in genomic coordinates | |
460 Arg [2] : $end | |
461 The end position in genomic coordinates | |
462 Arg [3] : $strand | |
463 The strand of the genomic coordinates | |
464 Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand); | |
465 Description: Converts genomic coordinates to peptide coordinates. The | |
466 return value is a list of coordinates and gaps. | |
467 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
468 Bio::EnsEMBL::Mapper::Gap objects | |
469 Exceptions : throw if start, end or strand not defined | |
470 Caller : general | |
471 Status : Stable | |
472 | |
473 =cut | |
474 | |
475 sub genomic2pep { | |
476 my ($self, $start, $end, $strand) = @_; | |
477 | |
478 unless(defined $start && defined $end && defined $strand) { | |
479 throw("start, end and strand arguments are required"); | |
480 } | |
481 | |
482 my @coords = $self->genomic2cds($start, $end, $strand); | |
483 | |
484 my @out; | |
485 | |
486 my $start_phase = $self->{'start_phase'}; | |
487 | |
488 #take into account possible N padding at beginning of CDS | |
489 my $shift = ($start_phase > 0) ? $start_phase : 0; | |
490 | |
491 foreach my $coord (@coords) { | |
492 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
493 push @out, $coord; | |
494 } else { | |
495 | |
496 #start and end are now entirely in CDS and relative to CDS start | |
497 | |
498 #convert to peptide coordinates | |
499 my $pep_start = int(($coord->start + $shift + 2) / 3); | |
500 my $pep_end = int(($coord->end + $shift + 2) / 3); | |
501 $coord->start($pep_start); | |
502 $coord->end($pep_end); | |
503 | |
504 push @out, $coord; | |
505 } | |
506 } | |
507 | |
508 return @out; | |
509 } | |
510 | |
511 | |
512 1; |