view variant_effect_predictor/Bio/EnsEMBL/Upstream.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::Upstream - Object that defines an upstream region

=head1 SYNOPSIS

  use Bio::EnsEMBL::Upstream;

  my $upstream = Bio::EnsEMBL::Upstream->new(
    -transcript => $transcript,
    -length     => 2000           # bp
  );

  # Retrieve coordinates of upstream region
  my $upstream_region_start = $upstream->upstart;
  my $upstream_region_end   = $upstream->upend;

  # Retrieve coordinates in 'downstream' first intron
  my $intron_region_start = $upstream->downstart;
  my $intron_region_end   = $upstream->downend;

  # Coordinates are returned in the same scheme as the input transcript.
  # However, the coordinates of an upstream region can be transformed to
  # any other scheme using a slice

  $upstream->transform($slice);

  # Coordinates can be retrieved in scheme in the same manner as the
  # above.

=head1 DESCRIPTION

An object that determines the upstream region of a transcript.  Such a
region is non-coding and ensures that other genes or transcripts are
not present.  Ultimately, these objects can be used to looking for
promoter elements.  To this end, it is also possible to derive a region
downstream of the first exon, within the first intron and where promoter
elements sometimes are found.

=head1 METHODS

=cut

package Bio::EnsEMBL::Upstream;

use strict;
use vars qw(@ISA);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor;

use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::Argument  qw(rearrange);

@ISA = qw(Bio::EnsEMBL::Feature);


=head2 new

  Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
  Arg [length]     : (optional) int $length
  Example    : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript, 
						       -length => 2000);
  Description: Creates a new upstream object
  Returntype : Bio::EnsEMBL::Upstream
  Exceptions : none
  Caller     : Bio::EnsEMBL::Transcript, general
  Status     : Stable

=cut

sub new {
  my ($class, @args) = @_;
  my $self = {};
  
  bless $self, $class;
  
  my ($transcript, 
      $length) = rearrange([qw(TRANSCRIPT
			       LENGTH
			      )],@args);
	
  $self->transcript($transcript) 	if defined $transcript;
  $self->length($length) 		if $length;
  
  return $self
}

=head2 transcript

  Arg        : (optional) Bio::EnsEMBL::Transcript
  Example    : $self->transcript($transcript);
  Description: Getter/setter for transcript object
  Returntype : Bio::EnsEMBL::Transcript
  Exceptions : Throws if argument is not undefined 
               or a Bio::EnsEMBL::Transcript
  Caller     : $self->new, $self->_derive_coords, 
               $self->_first_coding_Exon
  Status     : Stable

=cut


sub transcript {
  my $self = shift;
  
  if (@_){
    $self->{_transcript} = shift;
    
    if (defined $self->{_transcript}) {
      throw("Transcript is not a Bio::EnsEMBL::Transcript") 
	if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
      $self->_flush_cache;
    }
  }
  
  return $self->{_transcript}
}

=head2 length

  Arg        : (optional) int $length
  Example    : $self->length(2000); # bp
  Description: Getter/setter for upstream region length.
  Returntype : int
  Exceptions : Throws if length is requested before it has been set.
  Caller     : $self->new, $self->_derive_coords
  Status     : Stable

=cut

sub length {
  my $self = shift;
  
  if (@_){
    $self->{_length} = shift;
    $self->_flush_cache;
  }
  
  throw("Region length has not been set.")
    unless $self->{_length};
  
  return $self->{_length}
}

=head2 _flush_cache

  Arg        : none
  Example    : $self->_flush_cache;
  Description: Empties cached coordinates (called when 
	       coordinate scheme or region length has changed).
  Returntype : none
  Exceptions : none
  Caller     : $self->length, $self->transform
  Status     : Stable

=cut

sub _flush_cache {
  my $self = shift;
  
  $self->upstart(undef);
  $self->upend(undef);
  $self->downstart(undef);
  $self->downend(undef);
}

=head2 upstart

  Arg        : none
  Example    : $self->upstart;
  Description: Returns the start coordinate of the region 
               upstream of the transcript.  This coordinate 
               is always the furthest from the translation 
               initiation codon, whereas upend always abutts 
               the translation initiation codon.
  Returntype : int
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub upstart {
  my $self = shift;
  
  if (@_) {
    $self->{_upstart} = shift @_;
    return
  }

  if (! defined $self->{_upstart}) {
    $self->_derive_coords('up');
  }

  return $self->{_upstart}
}

=head2 upend

  Arg        : none
  Example    : $self->upend;
  Description: Returns the end coordinate of the region 
               upstream of the transcript.  This coordinate 
               always always abutts the translation 
               initiation codon, whereas upstart always 
               returns the coorindate furthest from the 
               translation initiation codon.
  Returntype : int
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub upend {
  my $self = shift;
	
  if (@_) {
    $self->{_upend} = shift @_;
    return
  }

  if (! defined $self->{_upend}) {
    $self->_derive_coords('up');
  }

  return $self->{_upend}
}

=head2 downstart

  Arg        : none
  Example    : $self->downstart;
  Description: Returns the start coordinate of the region 
               in the first intron of the transcript.  This 
               coordinate is always closest to the first 
               exon (irregardless of strand).
  Returntype : int
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub downstart {
  my $self = shift;
	
  if (@_) {
    $self->{_downstart} = shift @_;
    return
  }

  if (! defined $self->{_downstart}) {
    $self->_derive_coords('down');
  }

  return $self->{_downstart}
}

=head2 downend

  Arg        : none
  Example    : $self->downend;
  Description: Returns the end coordinate of the region 
               in the first intron of the transcript.  This 
               coordinate is always furthest from the first 
               exon (irregardless of strand).
  Returntype : int
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub downend {
  my $self = shift;

  if (@_) {
    $self->{_downend} = shift @_;
    return
  }

  if (! defined $self->{_downend}) {
    $self->_derive_coords('down');
  }

  return $self->{_downend}
}

=head2 transform

  Arg        : 
  Example    : 
  Description: Not yet implemented
  Returntype : 
  Exceptions : 
  Caller     : 
  Status     : At Risk

=cut


# Over-riding inherited class.  As yet unimplemented.

sub transform {
  my $self = shift;

  throw("No transform method implemented for " . $self);
}

=head2 derive_upstream_coords

  Arg        : none
  Example    : my ($upstart, $upend) 
                       = $self->derive_upstream_coords;
  Description: Derives upstream coordinates (for 
               compatability with older scripts).
  Returntype : arrayref
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub derive_upstream_coords {
  my $self = shift;

  return [$self->upstart, $self->upend]
}

=head2 derive_downstream_coords

  Arg        : none
  Example    : my ($downstart, $downend) 
                       = $self->derive_downstream_coords;
  Description: Derives downstream coordinates (for 
               compatability with older scripts).
  Returntype : arrayref
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub derive_downstream_coords {
  my $self = shift;

  return [$self->downstart, $self->downend]
}

=head2 _derive_coords

  Arg        : string $direction (either 'up' or 'down').
  Example    : $self->_derive_coords('up');
  Description: Determines the coordinates of either upstream 
               or downstream region.
  Returntype : none
  Exceptions : Throws if argument is not either 'up' or 'down'
  Caller     : $self->upstart, $self->upend, $self->downstart, 
               $self->downend
  Status     : Stable

=cut

sub _derive_coords {
  my ($self, $direction) = @_;

  # Check direction
  throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
    unless (($direction eq 'up')||($direction eq 'down'));

  # Put things in easily accessible places.
  my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
  my $region_length = $self->length;

  # Whatever coord system the gene is currently is, transform to the toplevel.
  my $transcript = $self->transcript->transform('toplevel');

  # Use our transformed transcript to determine the upstream region coords.
  # End should always be just before the coding start (like ATG), including 3' UTR.
  # Start is the outer limit of the region upstream (furthest from ATG).

  my $region_start;
  my $region_end;

  if ($transcript->strand == 1){
    if ($direction eq 'up'){
      $region_end = $transcript->coding_region_start - 1;
      $region_start = $region_end - $region_length;
    } elsif ($direction eq 'down'){
      $region_end = $self->_first_coding_Exon->end + 1;
      $region_start = $region_end + $region_length;
    }
  } elsif ($transcript->strand == -1) {
    if ($direction eq 'up'){
      $region_end = $transcript->coding_region_end + 1;
      $region_start = $region_end + $region_length;

    } elsif ($direction eq 'down'){
      $region_end = $self->_first_coding_Exon->start - 1;
      $region_start = $region_end - $region_length;
    }
  }

  # Trim the upstream/downstream region to remove extraneous coding sequences 
  # from other genes and/or transcripts.
    
  my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);

  my $region_slice 
      = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
						$transcript->slice->seq_region_name, 
						$slice_low_coord, 
						$slice_high_coord);

  if ($transcript->strand == 1) {
    if ($direction eq 'up') {
      $region_start += $self->_bases_to_trim('left_end', $region_slice);
    } elsif ($direction eq 'down') {
      $region_start -= $self->_bases_to_trim('right_end', $region_slice);
    }
  } elsif ($transcript->strand == -1) {
    if ($direction eq 'up') {
      $region_start -= $self->_bases_to_trim('right_end', $region_slice);
    } elsif ($direction eq 'down') {
      $region_start += $self->_bases_to_trim('left_end', $region_slice);
    }
  }

  # Always return start < end

  ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);

  if ($direction eq 'up') {
    $self->upstart($region_start);
    $self->upend($region_end);
  } elsif ($direction eq 'down') {
    $self->downstart($region_start);
    $self->downend($region_end);
  }
}

=head2 _bases_to_trim

  Arg        : string $end_to_trim (either 'right_end' or 
               'left_end').
  Arg        : Bio::EnsEMBL::Slice
  Example    : $self->_derive_coords('right_end', $slice);
  Description: Finds exons from other genes/transcripts that
               invade our upstream/downstream slice and 
               returns the number of bases that should be 
               truncated from the appropriate end of the 
               upstream/downstream region.
  Returntype : in
  Exceptions : Throws if argument is not either 'right_end' 
               or 'left_end'
  Caller     : $self->_derive_coords
  Status     : Stable

=cut

# Method to look for coding regions that invade the upstream region.  For
# now, this method returns the number of bases to trim.  I doesn't yet
# do anything special if an exon is completely swallowed (truncates at 
# the end of the overlapping exon and discards any non-coding sequence 
# further upstream) or overlaps the 'wrong' end of the region (cases where 
# two alternate exons share one end of sequence - does this happen?).

# The input argument 'end' defines the end of the slice that should be 
# truncated. 

sub _bases_to_trim {
    my ($self, $end_to_trim, $slice) = @_;

    throw "Slice end argument must be either left_end or right_end" 
	unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end'); 

    my @overlap_coords;
    my $slice_length = $slice->length;
    my $right_trim = 0;
    my $left_trim  = 0;

    foreach my $exon (@{$slice->get_all_Exons}){
      next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;

      my $start = $exon->start;
      my $end   = $exon->end;

      # Choose from four possible exon arrangements

      #  -----|********************|----- Slice
      #  --|=========================|--- Exon arrangement 1
      #  ----------|======|-------------- Exon arrangement 2
      #  --|=======|--------------------- Exon arrangement 3
      #  -------------------|=========|-- Exon arrangement 4


      if ($start <=  0 && $end >= $slice_length) {     # exon arrangement 1
	$right_trim = $slice_length - 1;
	$left_trim  = $slice_length - 1;
	last;

      } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
	my $this_right_trim = ($slice_length - $start) + 1;

	$right_trim = $this_right_trim 
	  if $this_right_trim > $right_trim;

	$left_trim  = $end 
	  if $end > $left_trim;

      } elsif ($start <= 0 && $end < $slice_length) {  # exon arrangement 3
	$right_trim = $slice_length; # a bit draconian
	$left_trim  = $end 
	  if $end > $left_trim;

      } elsif ($start > 0 && $end >= $slice_length) {  # exon arrangement 4
	my $this_right_trim = ($slice_length - $start) + 1;

	$right_trim = $this_right_trim 
	  if $this_right_trim > $right_trim;

	$left_trim = $slice_length; # also a bit draconian
      }

    }

    return $right_trim  if $end_to_trim eq 'right_end';
    return $left_trim   if $end_to_trim eq 'left_end';
}

=head2 _first_coding_Exon

  Arg        : none
  Example    : $self->_first_coding_Exon;
  Description: Finds the first exon of our transcript that 
               contains coding bases.
  Returntype : Bio::EnsEMBL::Exon
  Exceptions : none
  Caller     : $self->_derive_coords, $self->_bases_to_trim
  Status     : Stable

=cut

sub _first_coding_Exon {
  my $self = shift;

  unless ($self->{_first_coding_exon}){

    my $exons = $self->transcript->get_all_translateable_Exons;

    $self->{_first_coding_exon} =  $exons->[0]  
      if $self->transcript->strand == 1;
    $self->{_first_coding_exon} =  $exons->[-1] 
      if $self->transcript->strand == -1;
  }

  return $self->{_first_coding_exon}
}


return 1;