Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,512 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +TranscriptMapper - A utility class used to perform coordinate conversions +between a number of coordinate systems relating to transcripts + +=head1 SYNOPSIS + + my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript); + + @coords = $trmapper->cdna2genomic( 123, 554 ); + + @coords = $trmapper->genomic2cdna( 141, 500, -1 ); + + @coords = $trmapper->genomic2cds( 141, 500, -1 ); + + @coords = $trmapper->pep2genomic( 10, 60 ); + + @coords = $trmapper->genomic2pep( 123, 400, 1 ); + +=head1 DESCRIPTION + +This is a utility class which can be used to perform coordinate conversions +between a number of coordinate systems relating to transcripts. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::TranscriptMapper; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Mapper::Gap; +use Bio::EnsEMBL::Mapper::Coordinate; + + + +=head2 new + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which a TranscriptMapper should be created. + Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript) + Description: Creates a TranscriptMapper object which can be used to perform + various coordinate transformations relating to transcripts. + Note that the TranscriptMapper uses the transcript state at the + time of creation to perform the conversions, and that a new + TranscriptMapper must be created if the Transcript is altered. + 'Genomic' coordinates are coordinates which are relative to the + slice that the Transcript is on. + Returntype : Bio::EnsEMBL::TranscriptMapper + Exceptions : throws if a transcript is not an argument + Caller : Transcript::get_TranscriptMapper + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $transcript = shift; + + my $class = ref($caller) || $caller; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw("Transcript argument is required."); + } + + + my $exons = $transcript->get_all_Exons(); + my $start_phase; + if(@$exons) { + $start_phase = $exons->[0]->phase; + } else { + $start_phase = -1; + } + + # Create a cdna <-> genomic mapper and load it with exon coords + my $mapper = _load_mapper($transcript,$start_phase); + + my $self = bless({'exon_coord_mapper' => $mapper, + 'start_phase' => $start_phase, + 'cdna_coding_start' => $transcript->cdna_coding_start(), + 'cdna_coding_end' => $transcript->cdna_coding_end()}, + $class); +} + + +=head2 _load_mapper + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which a mapper should be created. + Example : my $mapper = _load_mapper($transcript); + Description: loads the mapper + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : Internal + Status : Stable + +=cut + +sub _load_mapper { + my $transcript = shift; + my $start_phase = shift; + + my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic'); + + my $edits_on = $transcript->edits_enabled(); + my @edits; + + if($edits_on) { + @edits = @{$transcript->get_all_SeqEdits()}; + @edits = sort {$a->start() <=> $b->start()} @edits; + } + + my $edit_shift = 0; + + my $cdna_start = undef; + + my $cdna_end = 0; + + + foreach my $ex (@{$transcript->get_all_Exons}) { + my $gen_start = $ex->start(); + my $gen_end = $ex->end(); + + $cdna_start = $cdna_end + 1; + $cdna_end = $cdna_start + $ex->length() - 1; + + my $strand = $ex->strand(); + + # add deletions and insertions into pairs when SeqEdits turned on + # ignore mismatches (i.e. treat as matches) + if($edits_on) { + while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) { + + my $edit = shift(@edits); + my $len_diff = $edit->length_diff(); + + if($len_diff) { + # break pair into two parts, finish first pair just before edit + + my $prev_cdna_end = $edit->start() + $edit_shift - 1; + my $prev_cdna_start = $cdna_start; + my $prev_len = $prev_cdna_end - $prev_cdna_start + 1; + + my $prev_gen_end; + my $prev_gen_start; + if($strand == 1) { + $prev_gen_start = $gen_start; + $prev_gen_end = $gen_start + $prev_len - 1; + } else { + $prev_gen_start = $gen_end - $prev_len + 1; + $prev_gen_end = $gen_end; + } + + if($prev_len > 0) { # only create map pair if not boundary case + $mapper->add_map_coordinates + ('cdna', $prev_cdna_start, $prev_cdna_end, $strand, + 'genome', $prev_gen_start,$prev_gen_end); + } + + $cdna_start = $prev_cdna_end + 1; + + if($strand == 1) { + $gen_start = $prev_gen_end + 1; + } else { + $gen_end = $prev_gen_start - 1; + } + + $cdna_end += $len_diff; + + if($len_diff > 0) { + # insert in cdna, shift cdna coords along + $cdna_start += $len_diff; + } else { + # delete in cdna (insert in genomic), shift genomic coords along + + if($strand == 1) { + $gen_start -= $len_diff; + } else { + $gen_end += $len_diff; + } + } + + $edit_shift += $len_diff; + } + } + } + + my $pair_len = $cdna_end - $cdna_start + 1; + + if($pair_len > 0) { + $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand, + 'genome', $gen_start, $gen_end); + } + } + + return $mapper; +} + + +=head2 cdna2genomic + + Arg [1] : $start + The start position in cdna coordinates + Arg [2] : $end + The end position in cdna coordinates + Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end); + Description: Converts cdna coordinates to genomic coordinates. The + return value is a list of coordinates and gaps. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throws if no start or end + Caller : general + Status : Stable + +=cut + + +sub cdna2genomic { + my ($self,$start,$end) = @_; + + if( !defined $end ) { + throw("Must call with start/end"); + } + + my $mapper = $self->{'exon_coord_mapper'}; + + return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" ); + +} + + +=head2 genomic2cdna + + Arg [1] : $start + The start position in genomic coordinates + Arg [2] : $end + The end position in genomic coordinates + Arg [3] : $strand + The strand of the genomic coordinates (default value 1) + Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd); + Description: Converts genomic coordinates to cdna coordinates. The + return value is a list of coordinates and gaps. Gaps + represent intronic or upstream/downstream regions which do + not comprise this transcripts cdna. Coordinate objects + represent genomic regions which map to exons (utrs included). + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throws if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2cdna { + my ($self, $start, $end, $strand) = @_; + + unless(defined $start && defined $end && defined $strand) { + throw("start, end and strand arguments are required\n"); + } + + my $mapper = $self->{'exon_coord_mapper'}; + + return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic"); + +} + + +=head2 cds2genomic + + Arg [1] : int $start + start position in cds coords + Arg [2] : int $end + end position in cds coords + Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306); + Description: Converts cds coordinates into genomic coordinates. The + coordinates returned are relative to the same slice that the + transcript used to construct this TranscriptMapper was on. + Returntype : list of Bio::EnsEMBL::Mapper::Gap and + Bio::EnsEMBL::Mapper::Coordinate objects + Exceptions : throws if no end + Caller : general + Status : at risk + +=cut + +sub cds2genomic { + my ( $self, $start, $end ) = @_; + + if ( !( defined($start) && defined($end) ) ) { + throw("Must call with start and end"); + } + + # Move start end into translate cDNA coordinates now. + $start = $start +( $self->{'cdna_coding_start'} - 1 ) ; + $end = $end + ( $self->{'cdna_coding_start'} - 1 ); + + return $self->cdna2genomic( $start, $end ); +} + +=head2 pep2genomic + + Arg [1] : int $start + start position in peptide coords + Arg [2] : int $end + end position in peptide coords + Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102); + Description: Converts peptide coordinates into genomic coordinates. The + coordinates returned are relative to the same slice that the + transcript used to construct this TranscriptMapper was on. + Returntype : list of Bio::EnsEMBL::Mapper::Gap and + Bio::EnsEMBL::Mapper::Coordinate objects + Exceptions : throws if no end + Caller : general + Status : Stable + +=cut + +sub pep2genomic { + my ( $self, $start, $end ) = @_; + + if ( !( defined($start) && defined($end) ) ) { + throw("Must call with start and end"); + } + + # Take possible N-padding at beginning of CDS into account. + my $start_phase = $self->{'start_phase'}; + my $shift = ( $start_phase > 0 ) ? $start_phase : 0; + + # Move start end into translate cDNA coordinates now. + $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift; + $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift; + + return $self->cdna2genomic( $start, $end ); +} + + +=head2 genomic2cds + + Arg [1] : int $start + The genomic start position + Arg [2] : int $end + The genomic end position + Arg [3] : int $strand + The genomic strand + Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand); + Description: Converts genomic coordinates into CDS coordinates of the + transcript that was used to create this transcript mapper. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throw if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2cds { + my ($self, $start, $end, $strand) = @_; + + if(!defined($start) || !defined($end) || !defined($strand)) { + throw("start, end and strand arguments are required"); + } + + if($start > $end + 1) { + throw("start arg must be less than or equal to end arg + 1"); + } + + my $cdna_cstart = $self->{'cdna_coding_start'}; + my $cdna_cend = $self->{'cdna_coding_end'}; + + #this is a pseudogene if there is no coding region + if(!defined($cdna_cstart)) { + #return a gap of the entire requested region, there is no CDS + return Bio::EnsEMBL::Mapper::Gap->new($start,$end); + } + + my @coords = $self->genomic2cdna($start, $end, $strand); + + my @out; + + foreach my $coord (@coords) { + if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + push @out, $coord; + } else { + my $start = $coord->start; + my $end = $coord->end; + + if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) { + #is all gap - does not map to peptide + push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end); + } else { + #we know area is at least partially overlapping CDS + + my $cds_start = $start - $cdna_cstart + 1; + my $cds_end = $end - $cdna_cstart + 1; + + if($start < $cdna_cstart) { + #start of coordinates are in the 5prime UTR + push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1); + + #start is now relative to start of CDS + $cds_start = 1; + } + + my $end_gap = undef; + if($end > $cdna_cend) { + #end of coordinates are in the 3prime UTR + $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end); + #adjust end to relative to CDS start + $cds_end = $cdna_cend - $cdna_cstart + 1; + } + + #start and end are now entirely in CDS and relative to CDS start + $coord->start($cds_start); + $coord->end($cds_end); + + push @out, $coord; + + if($end_gap) { + #push out the region which was in the 3prime utr + push @out, $end_gap; + } + } + } + } + + return @out; + +} + + +=head2 genomic2pep + + Arg [1] : $start + The start position in genomic coordinates + Arg [2] : $end + The end position in genomic coordinates + Arg [3] : $strand + The strand of the genomic coordinates + Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand); + Description: Converts genomic coordinates to peptide coordinates. The + return value is a list of coordinates and gaps. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throw if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2pep { + my ($self, $start, $end, $strand) = @_; + + unless(defined $start && defined $end && defined $strand) { + throw("start, end and strand arguments are required"); + } + + my @coords = $self->genomic2cds($start, $end, $strand); + + my @out; + + my $start_phase = $self->{'start_phase'}; + + #take into account possible N padding at beginning of CDS + my $shift = ($start_phase > 0) ? $start_phase : 0; + + foreach my $coord (@coords) { + if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + push @out, $coord; + } else { + + #start and end are now entirely in CDS and relative to CDS start + + #convert to peptide coordinates + my $pep_start = int(($coord->start + $shift + 2) / 3); + my $pep_end = int(($coord->end + $shift + 2) / 3); + $coord->start($pep_start); + $coord->end($pep_end); + + push @out, $coord; + } + } + + return @out; +} + + +1;