Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariation.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,680 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseTranscriptVariation + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::BaseTranscriptVariation; + +=head1 DESCRIPTION + +A helper class for representing an overlap of a Transcript and a +Variation (either sequence or structural). Should not be invoked directly. + +=cut + +package Bio::EnsEMBL::Variation::BaseTranscriptVariation; + +use strict; +use warnings; + +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlap); + +=head2 transcript_stable_id + + Description: Returns the stable_id of the associated Transcript + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub transcript_stable_id { + my $self = shift; + return $self->SUPER::_feature_stable_id(@_); +} + +=head2 transcript + + Arg [1] : (optional) Bio::EnsEMBL::Transcript + Description: Get/set the associated Transcript + Returntype : Bio::EnsEMBL::Transcript + Exceptions : throws if argument is wrong type + Status : At Risk + +=cut + +sub transcript { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript') if $transcript; + return $self->SUPER::feature($transcript, 'Transcript'); +} + +=head2 feature + + Arg [1] : (optional) Bio::EnsEMBL::Transcript + Description: Get/set the associated Transcript (overriding the superclass feature method) + Returntype : Bio::EnsEMBL::Transcript + Exceptions : throws if argument is wrong type + Status : At Risk + +=cut + +sub feature { + my $self = shift; + return $self->transcript(@_); +} + +=head2 cdna_start + + Arg [1] : (optional) int $start + Example : $cdna_start = $tv->cdna_start; + Description: Getter/Setter for the start position of this variation on the + transcript in cDNA coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_start { + my ($self, $cdna_start) = @_; + + $self->{cdna_start} = $cdna_start if defined $cdna_start; + + unless (exists $self->{cdna_start}) { + my $cdna_coords = $self->cdna_coords; + + my ($first, $last) = ($cdna_coords->[0], $cdna_coords->[-1]); + + $self->{cdna_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start; + $self->{cdna_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end; + } + + return $self->{cdna_start}; +} + +=head2 cdna_end + + Arg [1] : (optional) int $end + Example : $cdna_end = $tv->cdna_end; + Description: Getter/Setter for the end position of this variation on the + transcript in cDNA coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_end { + my ($self, $cdna_end) = @_; + + $self->{cdna_end} = $cdna_end if defined $cdna_end; + + # call cdna_start to calculate the start and end + $self->cdna_start unless exists $self->{cdna_end}; + + return $self->{cdna_end}; +} + +=head2 cds_start + + Arg [1] : (optional) int $start + Example : $cds_start = $tv->cds_start; + Description: Getter/Setter for the start position of this variation on the + transcript in CDS coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_start { + my ($self, $cds_start) = @_; + + $self->{cds_start} = $cds_start if defined $cds_start; + + unless (exists $self->{cds_start}) { + my $cds_coords = $self->cds_coords; + + my ($first, $last) = ($cds_coords->[0], $cds_coords->[-1]); + my $exon_phase = $self->transcript->start_Exon->phase; + + $self->{cds_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start + ($exon_phase > 0 ? $exon_phase : 0); + $self->{cds_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end + ($exon_phase > 0 ? $exon_phase : 0); + } + + return $self->{cds_start}; +} + +=head2 cds_end + + Arg [1] : (optional) int $end + Example : $cds_end = $tv->cds_end; + Description: Getter/Setter for the end position of this variation on the + transcript in CDS coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_end { + my ($self, $cds_end) = @_; + + $self->{cds_end} = $cds_end if defined $cds_end; + + # call cds_start to calculate the start and end + $self->cds_start unless exists $self->{cds_end}; + + return $self->{cds_end}; +} + +=head2 translation_start + + Arg [1] : (optional) int $start + Example : $translation_start = $tv->translation_start; + Description: Getter/Setter for the start position of this variation on the + transcript in peptide coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_start { + my ($self, $translation_start) = @_; + + $self->{translation_start} = $translation_start if defined $translation_start; + + unless (exists $self->{translation_start}) { + my $translation_coords = $self->translation_coords; + + my ($first, $last) = ($translation_coords->[0], $translation_coords->[-1]); + + $self->{translation_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start; + $self->{translation_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end; + } + + return $self->{translation_start}; +} + + +=head2 translation_end + + Arg [1] : (optional) int $end + Example : $transaltion_end = $tv->translation_end; + Description: Getter/Setter for the end position of this variation on the + transcript in peptide coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_end { + my ($self, $translation_end) = @_; + + $self->{translation_end} = $translation_end if defined $translation_end; + + # call translation_start to calculate the start and end + $self->translation_start unless exists $self->{translation_end}; + + return $self->{translation_end}; +} + +=head2 cdna_coords + + Description: Use the TranscriptMapper to calculate the cDNA + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_coords { + my $self = shift; + + unless ($self->{_cdna_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_cdna_coords} = [ $self->_mapper->genomic2cdna($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_cdna_coords}; +} + +=head2 cds_coords + + Description: Use the TranscriptMapper to calculate the CDS + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_coords { + my $self = shift; + + unless ($self->{_cds_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_cds_coords} = [ $self->_mapper->genomic2cds($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_cds_coords}; +} + +=head2 translation_coords + + Description: Use the TranscriptMapper to calculate the peptide + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_coords { + my $self = shift; + + unless ($self->{_translation_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_translation_coords} = [ $self->_mapper->genomic2pep($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_translation_coords}; +} + + +=head2 distance_to_transcript + + Arg [1] : (optional) int $distance + Example : $distance = $tv->distance_to_transcript; + Description: Getter/Setter for the distance of this variant to the transcript. + This is the shortest distance between variant start/end and + transcript start/end, so if a variant falls 5' of a transcript + on the forward strand this distance will be that between the + variant end and the transcript start; if it falls 3' it will be + the distance between the variant start and the transcript end. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub distance_to_transcript { + my ($self, $distance) = @_; + + $self->{distance_to_transcript} = $distance if defined $distance; + + unless (exists $self->{distance_to_transcript}) { + my $vf = $self->base_variation_feature; + my $tr = $self->transcript; + + my @dists = ( + $vf->start - $tr->start, + $vf->start - $tr->end, + $vf->end - $tr->start, + $vf->end - $tr->end + ); + + # make positive if <0 and sort + @dists = sort {$a <=> $b} map {$_ < 0 ? 0 - $_ : $_} @dists; + + $self->{distance_to_transcript} = $dists[0]; + } + + return $self->{distance_to_transcript}; +} + +=head2 get_overlapping_ProteinFeatures + + Description: Find any ProteinFeatures (e.g. pfam or interpro domains etc.) that + the associated variation feature lies in + Returntype : listref of Bio::EnsEMBL::ProteinFeatures (possibly empty) + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_overlapping_ProteinFeatures { + my $self = shift; + + unless (exists $self->{_protein_features}) { + + $self->{_protein_features } = []; + + my $tl = $self->transcript->translation; + + if (defined $tl) { + + my $tl_start = $self->translation_start; + my $tl_end = $self->translation_end; + + if (defined $tl_start && defined $tl_end) { + for my $feat (@{ $tl->get_all_ProteinFeatures }) { + if (overlap($feat->start, $feat->end, $tl_start, $tl_end)) { + push @{ $self->{_protein_features} }, $feat; + } + } + } + } + } + + return $self->{_protein_features}; +} + +=head2 exon_number + + Description: Identify which exon(s) this variant falls in + Returntype : '/'-separated string containing the exon number(s) and the total + number of exons in this transcript, or undef if this variant + does not fall in any exons + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub exon_number { + my $self = shift; + $self->_exon_intron_number unless exists $self->{exon_number}; + return $self->{exon_number}; +} + +=head2 intron_number + + Description: Identify which intron(s) this variant falls in + Returntype : '/'-separated string containing the intron number(s) and the total + number of introns in this transcript, or undef if this variant + does not fall in any introns + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub intron_number { + my $self = shift; + $self->_exon_intron_number unless exists $self->{intron_number}; + return $self->{intron_number}; +} + +sub _exon_intron_number { + my $self = shift; + + # work out which exon or intron this variant falls in + + # ensure the keys exist so even if we don't fall in an exon + # or intron we'll only call this method once + + $self->{exon_number} = $self->{intron_number} = undef; + + my $vf = $self->base_variation_feature; + + my $vf_start = $vf->start; + my $vf_end = $vf->end; + + my $strand = $self->transcript->strand; + + my $exons = $self->_exons; + + my $tot_exons = scalar(@$exons); + + my $exon_count = 0; + + my $prev_exon; + + my (@overlapped_exons, @overlapped_introns); + + for my $exon (@$exons) { + + $exon_count++; + + if (overlap($vf_start, $vf_end, $exon->start, $exon->end)) { + push @overlapped_exons, $exon_count; + #$self->{exon_number} = defined($self->{exon_number}) ? $self->{exon_number}.",".$exon_count : $exon_count; + } + + if ($prev_exon) { + my $intron_start = $strand == 1 ? $prev_exon->end + 1 : $exon->end + 1; + my $intron_end = $strand == 1 ? $exon->start - 1 : $prev_exon->start - 1; + + if ($prev_exon && overlap($vf_start, $vf_end, $intron_start, $intron_end)) { + push @overlapped_introns, $exon_count - 1; + #$self->{intron_number} = defined($self->{intron_number}) ? $self->{intron_number}.",".($exon_count - 1) : ($exon_count - 1); + } + } + + $prev_exon = $exon; + } + + if(@overlapped_exons) { + $self->{exon_number} = (scalar @overlapped_exons > 1 ? $overlapped_exons[0].'-'.$overlapped_exons[-1] : $overlapped_exons[0]).'/'.$tot_exons; + } + if(@overlapped_introns) { + $self->{intron_number} = (scalar @overlapped_introns > 1 ? $overlapped_introns[0].'-'.$overlapped_introns[-1] : $overlapped_introns[0]).'/'.($tot_exons - 1); + } +} + +sub _intron_effects { + my $self = shift; + + # internal method used by Bio::EnsEMBL::Variation::Utils::VariationEffect + # when calculating various consequence types + + # this method is a major bottle neck in the effect calculation code so + # we cache results and use local variables instead of method calls where + # possible to speed things up - caveat bug-fixer! + + unless ($self->{_intron_effects}) { + + my $vf = $self->base_variation_feature; + + my $intron_effects = {}; + + my $found_effect = 0; + + my $vf_start = $vf->start; + my $vf_end = $vf->end; + + my $insertion = $vf_start == $vf_end+1; + + for my $intron (@{ $self->_introns }) { + + my $intron_start = $intron->start; + my $intron_end = $intron->end; + + # under various circumstances the genebuild process can introduce + # artificial short (<= 12 nucleotide) introns into transcripts + # (e.g. to deal with errors in the reference sequence etc.), we + # don't want to categorise variations that fall in these introns + # as intronic, or as any kind of splice variant + + my $frameshift_intron = ( abs($intron_end - $intron_start) <= 12 ); + + if ($frameshift_intron) { + if (overlap($vf_start, $vf_end, $intron_start, $intron_end)) { + $intron_effects->{within_frameshift_intron} = 1; + next; + } + } + + if (overlap($vf_start, $vf_end, $intron_start, $intron_start+1)) { + $intron_effects->{start_splice_site} = 1; + } + + if (overlap($vf_start, $vf_end, $intron_end-1, $intron_end)) { + $intron_effects->{end_splice_site} = 1; + } + + # we need to special case insertions between the donor and acceptor sites + + if (overlap($vf_start, $vf_end, $intron_start+2, $intron_end-2) or + ($insertion && ($vf_start == $intron_start+2 || $vf_end == $intron_end-2)) ) { + $intron_effects->{intronic} = 1; + } + + # the definition of splice_region (SO:0001630) is "within 1-3 bases + # of the exon or 3-8 bases of the intron", the intron start is the + # first base of the intron so we only need to add or subtract 7 from + # it to get the correct coordinate. We also need to special case + # insertions between the edge of an exon and a donor or acceptor site + # and between a donor or acceptor site and the intron + + if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or + overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or + overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or + overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or + ($insertion && ( + $vf_start == $intron_start || + $vf_end == $intron_end || + $vf_start == $intron_start+2 || + $vf_end == $intron_end-2 + ) )) { + + $intron_effects->{splice_region} = 1; + } + } + + $self->{_intron_effects} = $intron_effects; + } + + return $self->{_intron_effects}; +} + +# NB: the methods below all cache their data in the associated transcript itself, this +# gives a significant speed up when you are calculating the effect of all variations +# on a transcript, and means that the cache will be freed when the transcript itself +# is garbage collected rather than us having to maintain a transcript feature cache +# ourselves + +sub _introns { + my $self = shift; + + my $tran = $self->transcript; + + my $introns = $tran->{_variation_effect_feature_cache}->{introns} ||= $tran->get_all_Introns; + + return $introns; +} + +sub _exons { + my $self = shift; + + my $tran = $self->transcript; + + my $exons = $tran->{_variation_effect_feature_cache}->{exons} ||= $tran->get_all_Exons; + + return $exons; +} + +sub _mapper { + my $self = shift; + + my $tran = $self->transcript; + + my $mapper = $tran->{_variation_effect_feature_cache}->{mapper} ||= $tran->get_TranscriptMapper; + + return $mapper; +} +sub _translateable_seq { + my $self = shift; + + my $tran = $self->transcript; + + my $tran_seq = $tran->{_variation_effect_feature_cache}->{translateable_seq} ||= $tran->translateable_seq; + + return $tran_seq; +} + +sub _peptide { + my $self = shift; + + my $tran = $self->transcript; + + my $peptide = $tran->{_variation_effect_feature_cache}->{peptide}; + + unless ($peptide) { + my $translation = $tran->translate; + $peptide = $translation ? $translation->seq : undef; + $tran->{_variation_effect_feature_cache}->{peptide} = $peptide; + } + + return $peptide; +} + +sub _translation_md5 { + my $self = shift; + + my $tran = $self->transcript; + + unless (exists $tran->{_variation_effect_feature_cache}->{translation_md5}) { + $tran->{_variation_effect_feature_cache}->{translation_md5} = + $self->_peptide ? md5_hex($self->_peptide) : undef; + } + + return $tran->{_variation_effect_feature_cache}->{translation_md5}; +} + +sub _codon_table { + my $self = shift; + + my $tran = $self->transcript; + + my $codon_table = $tran->{_variation_effect_feature_cache}->{codon_table}; + + unless ($codon_table) { + # for mithocondrial dna we need to to use a different codon table + my $attrib = $tran->slice->get_all_Attributes('codon_table')->[0]; + + # default to the vertebrate codon table which is denoted as 1 + $codon_table = $attrib ? $attrib->value : 1; + + $tran->{_variation_effect_feature_cache}->{codon_table} = $codon_table + } + + return $codon_table; +} + +1; \ No newline at end of file