Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 TranscriptMapper - A utility class used to perform coordinate conversions | |
| 24 between a number of coordinate systems relating to transcripts | |
| 25 | |
| 26 =head1 SYNOPSIS | |
| 27 | |
| 28 my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript); | |
| 29 | |
| 30 @coords = $trmapper->cdna2genomic( 123, 554 ); | |
| 31 | |
| 32 @coords = $trmapper->genomic2cdna( 141, 500, -1 ); | |
| 33 | |
| 34 @coords = $trmapper->genomic2cds( 141, 500, -1 ); | |
| 35 | |
| 36 @coords = $trmapper->pep2genomic( 10, 60 ); | |
| 37 | |
| 38 @coords = $trmapper->genomic2pep( 123, 400, 1 ); | |
| 39 | |
| 40 =head1 DESCRIPTION | |
| 41 | |
| 42 This is a utility class which can be used to perform coordinate conversions | |
| 43 between a number of coordinate systems relating to transcripts. | |
| 44 | |
| 45 =head1 METHODS | |
| 46 | |
| 47 =cut | |
| 48 | |
| 49 package Bio::EnsEMBL::TranscriptMapper; | |
| 50 | |
| 51 use strict; | |
| 52 use warnings; | |
| 53 | |
| 54 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
| 55 | |
| 56 use Bio::EnsEMBL::Mapper; | |
| 57 use Bio::EnsEMBL::Mapper::Gap; | |
| 58 use Bio::EnsEMBL::Mapper::Coordinate; | |
| 59 | |
| 60 | |
| 61 | |
| 62 =head2 new | |
| 63 | |
| 64 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
| 65 The transcript for which a TranscriptMapper should be created. | |
| 66 Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript) | |
| 67 Description: Creates a TranscriptMapper object which can be used to perform | |
| 68 various coordinate transformations relating to transcripts. | |
| 69 Note that the TranscriptMapper uses the transcript state at the | |
| 70 time of creation to perform the conversions, and that a new | |
| 71 TranscriptMapper must be created if the Transcript is altered. | |
| 72 'Genomic' coordinates are coordinates which are relative to the | |
| 73 slice that the Transcript is on. | |
| 74 Returntype : Bio::EnsEMBL::TranscriptMapper | |
| 75 Exceptions : throws if a transcript is not an argument | |
| 76 Caller : Transcript::get_TranscriptMapper | |
| 77 Status : Stable | |
| 78 | |
| 79 =cut | |
| 80 | |
| 81 sub new { | |
| 82 my $caller = shift; | |
| 83 my $transcript = shift; | |
| 84 | |
| 85 my $class = ref($caller) || $caller; | |
| 86 | |
| 87 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { | |
| 88 throw("Transcript argument is required."); | |
| 89 } | |
| 90 | |
| 91 | |
| 92 my $exons = $transcript->get_all_Exons(); | |
| 93 my $start_phase; | |
| 94 if(@$exons) { | |
| 95 $start_phase = $exons->[0]->phase; | |
| 96 } else { | |
| 97 $start_phase = -1; | |
| 98 } | |
| 99 | |
| 100 # Create a cdna <-> genomic mapper and load it with exon coords | |
| 101 my $mapper = _load_mapper($transcript,$start_phase); | |
| 102 | |
| 103 my $self = bless({'exon_coord_mapper' => $mapper, | |
| 104 'start_phase' => $start_phase, | |
| 105 'cdna_coding_start' => $transcript->cdna_coding_start(), | |
| 106 'cdna_coding_end' => $transcript->cdna_coding_end()}, | |
| 107 $class); | |
| 108 } | |
| 109 | |
| 110 | |
| 111 =head2 _load_mapper | |
| 112 | |
| 113 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
| 114 The transcript for which a mapper should be created. | |
| 115 Example : my $mapper = _load_mapper($transcript); | |
| 116 Description: loads the mapper | |
| 117 Returntype : Bio::EnsEMBL::Mapper | |
| 118 Exceptions : none | |
| 119 Caller : Internal | |
| 120 Status : Stable | |
| 121 | |
| 122 =cut | |
| 123 | |
| 124 sub _load_mapper { | |
| 125 my $transcript = shift; | |
| 126 my $start_phase = shift; | |
| 127 | |
| 128 my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic'); | |
| 129 | |
| 130 my $edits_on = $transcript->edits_enabled(); | |
| 131 my @edits; | |
| 132 | |
| 133 if($edits_on) { | |
| 134 @edits = @{$transcript->get_all_SeqEdits()}; | |
| 135 @edits = sort {$a->start() <=> $b->start()} @edits; | |
| 136 } | |
| 137 | |
| 138 my $edit_shift = 0; | |
| 139 | |
| 140 my $cdna_start = undef; | |
| 141 | |
| 142 my $cdna_end = 0; | |
| 143 | |
| 144 | |
| 145 foreach my $ex (@{$transcript->get_all_Exons}) { | |
| 146 my $gen_start = $ex->start(); | |
| 147 my $gen_end = $ex->end(); | |
| 148 | |
| 149 $cdna_start = $cdna_end + 1; | |
| 150 $cdna_end = $cdna_start + $ex->length() - 1; | |
| 151 | |
| 152 my $strand = $ex->strand(); | |
| 153 | |
| 154 # add deletions and insertions into pairs when SeqEdits turned on | |
| 155 # ignore mismatches (i.e. treat as matches) | |
| 156 if($edits_on) { | |
| 157 while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) { | |
| 158 | |
| 159 my $edit = shift(@edits); | |
| 160 my $len_diff = $edit->length_diff(); | |
| 161 | |
| 162 if($len_diff) { | |
| 163 # break pair into two parts, finish first pair just before edit | |
| 164 | |
| 165 my $prev_cdna_end = $edit->start() + $edit_shift - 1; | |
| 166 my $prev_cdna_start = $cdna_start; | |
| 167 my $prev_len = $prev_cdna_end - $prev_cdna_start + 1; | |
| 168 | |
| 169 my $prev_gen_end; | |
| 170 my $prev_gen_start; | |
| 171 if($strand == 1) { | |
| 172 $prev_gen_start = $gen_start; | |
| 173 $prev_gen_end = $gen_start + $prev_len - 1; | |
| 174 } else { | |
| 175 $prev_gen_start = $gen_end - $prev_len + 1; | |
| 176 $prev_gen_end = $gen_end; | |
| 177 } | |
| 178 | |
| 179 if($prev_len > 0) { # only create map pair if not boundary case | |
| 180 $mapper->add_map_coordinates | |
| 181 ('cdna', $prev_cdna_start, $prev_cdna_end, $strand, | |
| 182 'genome', $prev_gen_start,$prev_gen_end); | |
| 183 } | |
| 184 | |
| 185 $cdna_start = $prev_cdna_end + 1; | |
| 186 | |
| 187 if($strand == 1) { | |
| 188 $gen_start = $prev_gen_end + 1; | |
| 189 } else { | |
| 190 $gen_end = $prev_gen_start - 1; | |
| 191 } | |
| 192 | |
| 193 $cdna_end += $len_diff; | |
| 194 | |
| 195 if($len_diff > 0) { | |
| 196 # insert in cdna, shift cdna coords along | |
| 197 $cdna_start += $len_diff; | |
| 198 } else { | |
| 199 # delete in cdna (insert in genomic), shift genomic coords along | |
| 200 | |
| 201 if($strand == 1) { | |
| 202 $gen_start -= $len_diff; | |
| 203 } else { | |
| 204 $gen_end += $len_diff; | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 $edit_shift += $len_diff; | |
| 209 } | |
| 210 } | |
| 211 } | |
| 212 | |
| 213 my $pair_len = $cdna_end - $cdna_start + 1; | |
| 214 | |
| 215 if($pair_len > 0) { | |
| 216 $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand, | |
| 217 'genome', $gen_start, $gen_end); | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 return $mapper; | |
| 222 } | |
| 223 | |
| 224 | |
| 225 =head2 cdna2genomic | |
| 226 | |
| 227 Arg [1] : $start | |
| 228 The start position in cdna coordinates | |
| 229 Arg [2] : $end | |
| 230 The end position in cdna coordinates | |
| 231 Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end); | |
| 232 Description: Converts cdna coordinates to genomic coordinates. The | |
| 233 return value is a list of coordinates and gaps. | |
| 234 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
| 235 Bio::EnsEMBL::Mapper::Gap objects | |
| 236 Exceptions : throws if no start or end | |
| 237 Caller : general | |
| 238 Status : Stable | |
| 239 | |
| 240 =cut | |
| 241 | |
| 242 | |
| 243 sub cdna2genomic { | |
| 244 my ($self,$start,$end) = @_; | |
| 245 | |
| 246 if( !defined $end ) { | |
| 247 throw("Must call with start/end"); | |
| 248 } | |
| 249 | |
| 250 my $mapper = $self->{'exon_coord_mapper'}; | |
| 251 | |
| 252 return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" ); | |
| 253 | |
| 254 } | |
| 255 | |
| 256 | |
| 257 =head2 genomic2cdna | |
| 258 | |
| 259 Arg [1] : $start | |
| 260 The start position in genomic coordinates | |
| 261 Arg [2] : $end | |
| 262 The end position in genomic coordinates | |
| 263 Arg [3] : $strand | |
| 264 The strand of the genomic coordinates (default value 1) | |
| 265 Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd); | |
| 266 Description: Converts genomic coordinates to cdna coordinates. The | |
| 267 return value is a list of coordinates and gaps. Gaps | |
| 268 represent intronic or upstream/downstream regions which do | |
| 269 not comprise this transcripts cdna. Coordinate objects | |
| 270 represent genomic regions which map to exons (utrs included). | |
| 271 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
| 272 Bio::EnsEMBL::Mapper::Gap objects | |
| 273 Exceptions : throws if start, end or strand not defined | |
| 274 Caller : general | |
| 275 Status : Stable | |
| 276 | |
| 277 =cut | |
| 278 | |
| 279 sub genomic2cdna { | |
| 280 my ($self, $start, $end, $strand) = @_; | |
| 281 | |
| 282 unless(defined $start && defined $end && defined $strand) { | |
| 283 throw("start, end and strand arguments are required\n"); | |
| 284 } | |
| 285 | |
| 286 my $mapper = $self->{'exon_coord_mapper'}; | |
| 287 | |
| 288 return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic"); | |
| 289 | |
| 290 } | |
| 291 | |
| 292 | |
| 293 =head2 cds2genomic | |
| 294 | |
| 295 Arg [1] : int $start | |
| 296 start position in cds coords | |
| 297 Arg [2] : int $end | |
| 298 end position in cds coords | |
| 299 Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306); | |
| 300 Description: Converts cds coordinates into genomic coordinates. The | |
| 301 coordinates returned are relative to the same slice that the | |
| 302 transcript used to construct this TranscriptMapper was on. | |
| 303 Returntype : list of Bio::EnsEMBL::Mapper::Gap and | |
| 304 Bio::EnsEMBL::Mapper::Coordinate objects | |
| 305 Exceptions : throws if no end | |
| 306 Caller : general | |
| 307 Status : at risk | |
| 308 | |
| 309 =cut | |
| 310 | |
| 311 sub cds2genomic { | |
| 312 my ( $self, $start, $end ) = @_; | |
| 313 | |
| 314 if ( !( defined($start) && defined($end) ) ) { | |
| 315 throw("Must call with start and end"); | |
| 316 } | |
| 317 | |
| 318 # Move start end into translate cDNA coordinates now. | |
| 319 $start = $start +( $self->{'cdna_coding_start'} - 1 ) ; | |
| 320 $end = $end + ( $self->{'cdna_coding_start'} - 1 ); | |
| 321 | |
| 322 return $self->cdna2genomic( $start, $end ); | |
| 323 } | |
| 324 | |
| 325 =head2 pep2genomic | |
| 326 | |
| 327 Arg [1] : int $start | |
| 328 start position in peptide coords | |
| 329 Arg [2] : int $end | |
| 330 end position in peptide coords | |
| 331 Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102); | |
| 332 Description: Converts peptide coordinates into genomic coordinates. The | |
| 333 coordinates returned are relative to the same slice that the | |
| 334 transcript used to construct this TranscriptMapper was on. | |
| 335 Returntype : list of Bio::EnsEMBL::Mapper::Gap and | |
| 336 Bio::EnsEMBL::Mapper::Coordinate objects | |
| 337 Exceptions : throws if no end | |
| 338 Caller : general | |
| 339 Status : Stable | |
| 340 | |
| 341 =cut | |
| 342 | |
| 343 sub pep2genomic { | |
| 344 my ( $self, $start, $end ) = @_; | |
| 345 | |
| 346 if ( !( defined($start) && defined($end) ) ) { | |
| 347 throw("Must call with start and end"); | |
| 348 } | |
| 349 | |
| 350 # Take possible N-padding at beginning of CDS into account. | |
| 351 my $start_phase = $self->{'start_phase'}; | |
| 352 my $shift = ( $start_phase > 0 ) ? $start_phase : 0; | |
| 353 | |
| 354 # Move start end into translate cDNA coordinates now. | |
| 355 $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift; | |
| 356 $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift; | |
| 357 | |
| 358 return $self->cdna2genomic( $start, $end ); | |
| 359 } | |
| 360 | |
| 361 | |
| 362 =head2 genomic2cds | |
| 363 | |
| 364 Arg [1] : int $start | |
| 365 The genomic start position | |
| 366 Arg [2] : int $end | |
| 367 The genomic end position | |
| 368 Arg [3] : int $strand | |
| 369 The genomic strand | |
| 370 Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand); | |
| 371 Description: Converts genomic coordinates into CDS coordinates of the | |
| 372 transcript that was used to create this transcript mapper. | |
| 373 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
| 374 Bio::EnsEMBL::Mapper::Gap objects | |
| 375 Exceptions : throw if start, end or strand not defined | |
| 376 Caller : general | |
| 377 Status : Stable | |
| 378 | |
| 379 =cut | |
| 380 | |
| 381 sub genomic2cds { | |
| 382 my ($self, $start, $end, $strand) = @_; | |
| 383 | |
| 384 if(!defined($start) || !defined($end) || !defined($strand)) { | |
| 385 throw("start, end and strand arguments are required"); | |
| 386 } | |
| 387 | |
| 388 if($start > $end + 1) { | |
| 389 throw("start arg must be less than or equal to end arg + 1"); | |
| 390 } | |
| 391 | |
| 392 my $cdna_cstart = $self->{'cdna_coding_start'}; | |
| 393 my $cdna_cend = $self->{'cdna_coding_end'}; | |
| 394 | |
| 395 #this is a pseudogene if there is no coding region | |
| 396 if(!defined($cdna_cstart)) { | |
| 397 #return a gap of the entire requested region, there is no CDS | |
| 398 return Bio::EnsEMBL::Mapper::Gap->new($start,$end); | |
| 399 } | |
| 400 | |
| 401 my @coords = $self->genomic2cdna($start, $end, $strand); | |
| 402 | |
| 403 my @out; | |
| 404 | |
| 405 foreach my $coord (@coords) { | |
| 406 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
| 407 push @out, $coord; | |
| 408 } else { | |
| 409 my $start = $coord->start; | |
| 410 my $end = $coord->end; | |
| 411 | |
| 412 if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) { | |
| 413 #is all gap - does not map to peptide | |
| 414 push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end); | |
| 415 } else { | |
| 416 #we know area is at least partially overlapping CDS | |
| 417 | |
| 418 my $cds_start = $start - $cdna_cstart + 1; | |
| 419 my $cds_end = $end - $cdna_cstart + 1; | |
| 420 | |
| 421 if($start < $cdna_cstart) { | |
| 422 #start of coordinates are in the 5prime UTR | |
| 423 push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1); | |
| 424 | |
| 425 #start is now relative to start of CDS | |
| 426 $cds_start = 1; | |
| 427 } | |
| 428 | |
| 429 my $end_gap = undef; | |
| 430 if($end > $cdna_cend) { | |
| 431 #end of coordinates are in the 3prime UTR | |
| 432 $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end); | |
| 433 #adjust end to relative to CDS start | |
| 434 $cds_end = $cdna_cend - $cdna_cstart + 1; | |
| 435 } | |
| 436 | |
| 437 #start and end are now entirely in CDS and relative to CDS start | |
| 438 $coord->start($cds_start); | |
| 439 $coord->end($cds_end); | |
| 440 | |
| 441 push @out, $coord; | |
| 442 | |
| 443 if($end_gap) { | |
| 444 #push out the region which was in the 3prime utr | |
| 445 push @out, $end_gap; | |
| 446 } | |
| 447 } | |
| 448 } | |
| 449 } | |
| 450 | |
| 451 return @out; | |
| 452 | |
| 453 } | |
| 454 | |
| 455 | |
| 456 =head2 genomic2pep | |
| 457 | |
| 458 Arg [1] : $start | |
| 459 The start position in genomic coordinates | |
| 460 Arg [2] : $end | |
| 461 The end position in genomic coordinates | |
| 462 Arg [3] : $strand | |
| 463 The strand of the genomic coordinates | |
| 464 Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand); | |
| 465 Description: Converts genomic coordinates to peptide coordinates. The | |
| 466 return value is a list of coordinates and gaps. | |
| 467 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and | |
| 468 Bio::EnsEMBL::Mapper::Gap objects | |
| 469 Exceptions : throw if start, end or strand not defined | |
| 470 Caller : general | |
| 471 Status : Stable | |
| 472 | |
| 473 =cut | |
| 474 | |
| 475 sub genomic2pep { | |
| 476 my ($self, $start, $end, $strand) = @_; | |
| 477 | |
| 478 unless(defined $start && defined $end && defined $strand) { | |
| 479 throw("start, end and strand arguments are required"); | |
| 480 } | |
| 481 | |
| 482 my @coords = $self->genomic2cds($start, $end, $strand); | |
| 483 | |
| 484 my @out; | |
| 485 | |
| 486 my $start_phase = $self->{'start_phase'}; | |
| 487 | |
| 488 #take into account possible N padding at beginning of CDS | |
| 489 my $shift = ($start_phase > 0) ? $start_phase : 0; | |
| 490 | |
| 491 foreach my $coord (@coords) { | |
| 492 if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
| 493 push @out, $coord; | |
| 494 } else { | |
| 495 | |
| 496 #start and end are now entirely in CDS and relative to CDS start | |
| 497 | |
| 498 #convert to peptide coordinates | |
| 499 my $pep_start = int(($coord->start + $shift + 2) / 3); | |
| 500 my $pep_end = int(($coord->end + $shift + 2) / 3); | |
| 501 $coord->start($pep_start); | |
| 502 $coord->end($pep_end); | |
| 503 | |
| 504 push @out, $coord; | |
| 505 } | |
| 506 } | |
| 507 | |
| 508 return @out; | |
| 509 } | |
| 510 | |
| 511 | |
| 512 1; |
