diff variant_effect_predictor/Bio/EnsEMBL/Upstream.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Upstream.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,582 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::Upstream - Object that defines an upstream region
+
+=head1 SYNOPSIS
+
+  use Bio::EnsEMBL::Upstream;
+
+  my $upstream = Bio::EnsEMBL::Upstream->new(
+    -transcript => $transcript,
+    -length     => 2000           # bp
+  );
+
+  # Retrieve coordinates of upstream region
+  my $upstream_region_start = $upstream->upstart;
+  my $upstream_region_end   = $upstream->upend;
+
+  # Retrieve coordinates in 'downstream' first intron
+  my $intron_region_start = $upstream->downstart;
+  my $intron_region_end   = $upstream->downend;
+
+  # Coordinates are returned in the same scheme as the input transcript.
+  # However, the coordinates of an upstream region can be transformed to
+  # any other scheme using a slice
+
+  $upstream->transform($slice);
+
+  # Coordinates can be retrieved in scheme in the same manner as the
+  # above.
+
+=head1 DESCRIPTION
+
+An object that determines the upstream region of a transcript.  Such a
+region is non-coding and ensures that other genes or transcripts are
+not present.  Ultimately, these objects can be used to looking for
+promoter elements.  To this end, it is also possible to derive a region
+downstream of the first exon, within the first intron and where promoter
+elements sometimes are found.
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Upstream;
+
+use strict;
+use vars qw(@ISA);
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw);
+use Bio::EnsEMBL::Utils::Argument  qw(rearrange);
+
+@ISA = qw(Bio::EnsEMBL::Feature);
+
+
+=head2 new
+
+  Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
+  Arg [length]     : (optional) int $length
+  Example    : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript, 
+						       -length => 2000);
+  Description: Creates a new upstream object
+  Returntype : Bio::EnsEMBL::Upstream
+  Exceptions : none
+  Caller     : Bio::EnsEMBL::Transcript, general
+  Status     : Stable
+
+=cut
+
+sub new {
+  my ($class, @args) = @_;
+  my $self = {};
+  
+  bless $self, $class;
+  
+  my ($transcript, 
+      $length) = rearrange([qw(TRANSCRIPT
+			       LENGTH
+			      )],@args);
+	
+  $self->transcript($transcript) 	if defined $transcript;
+  $self->length($length) 		if $length;
+  
+  return $self
+}
+
+=head2 transcript
+
+  Arg        : (optional) Bio::EnsEMBL::Transcript
+  Example    : $self->transcript($transcript);
+  Description: Getter/setter for transcript object
+  Returntype : Bio::EnsEMBL::Transcript
+  Exceptions : Throws if argument is not undefined 
+               or a Bio::EnsEMBL::Transcript
+  Caller     : $self->new, $self->_derive_coords, 
+               $self->_first_coding_Exon
+  Status     : Stable
+
+=cut
+
+
+sub transcript {
+  my $self = shift;
+  
+  if (@_){
+    $self->{_transcript} = shift;
+    
+    if (defined $self->{_transcript}) {
+      throw("Transcript is not a Bio::EnsEMBL::Transcript") 
+	if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
+      $self->_flush_cache;
+    }
+  }
+  
+  return $self->{_transcript}
+}
+
+=head2 length
+
+  Arg        : (optional) int $length
+  Example    : $self->length(2000); # bp
+  Description: Getter/setter for upstream region length.
+  Returntype : int
+  Exceptions : Throws if length is requested before it has been set.
+  Caller     : $self->new, $self->_derive_coords
+  Status     : Stable
+
+=cut
+
+sub length {
+  my $self = shift;
+  
+  if (@_){
+    $self->{_length} = shift;
+    $self->_flush_cache;
+  }
+  
+  throw("Region length has not been set.")
+    unless $self->{_length};
+  
+  return $self->{_length}
+}
+
+=head2 _flush_cache
+
+  Arg        : none
+  Example    : $self->_flush_cache;
+  Description: Empties cached coordinates (called when 
+	       coordinate scheme or region length has changed).
+  Returntype : none
+  Exceptions : none
+  Caller     : $self->length, $self->transform
+  Status     : Stable
+
+=cut
+
+sub _flush_cache {
+  my $self = shift;
+  
+  $self->upstart(undef);
+  $self->upend(undef);
+  $self->downstart(undef);
+  $self->downend(undef);
+}
+
+=head2 upstart
+
+  Arg        : none
+  Example    : $self->upstart;
+  Description: Returns the start coordinate of the region 
+               upstream of the transcript.  This coordinate 
+               is always the furthest from the translation 
+               initiation codon, whereas upend always abutts 
+               the translation initiation codon.
+  Returntype : int
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub upstart {
+  my $self = shift;
+  
+  if (@_) {
+    $self->{_upstart} = shift @_;
+    return
+  }
+
+  if (! defined $self->{_upstart}) {
+    $self->_derive_coords('up');
+  }
+
+  return $self->{_upstart}
+}
+
+=head2 upend
+
+  Arg        : none
+  Example    : $self->upend;
+  Description: Returns the end coordinate of the region 
+               upstream of the transcript.  This coordinate 
+               always always abutts the translation 
+               initiation codon, whereas upstart always 
+               returns the coorindate furthest from the 
+               translation initiation codon.
+  Returntype : int
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub upend {
+  my $self = shift;
+	
+  if (@_) {
+    $self->{_upend} = shift @_;
+    return
+  }
+
+  if (! defined $self->{_upend}) {
+    $self->_derive_coords('up');
+  }
+
+  return $self->{_upend}
+}
+
+=head2 downstart
+
+  Arg        : none
+  Example    : $self->downstart;
+  Description: Returns the start coordinate of the region 
+               in the first intron of the transcript.  This 
+               coordinate is always closest to the first 
+               exon (irregardless of strand).
+  Returntype : int
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub downstart {
+  my $self = shift;
+	
+  if (@_) {
+    $self->{_downstart} = shift @_;
+    return
+  }
+
+  if (! defined $self->{_downstart}) {
+    $self->_derive_coords('down');
+  }
+
+  return $self->{_downstart}
+}
+
+=head2 downend
+
+  Arg        : none
+  Example    : $self->downend;
+  Description: Returns the end coordinate of the region 
+               in the first intron of the transcript.  This 
+               coordinate is always furthest from the first 
+               exon (irregardless of strand).
+  Returntype : int
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub downend {
+  my $self = shift;
+
+  if (@_) {
+    $self->{_downend} = shift @_;
+    return
+  }
+
+  if (! defined $self->{_downend}) {
+    $self->_derive_coords('down');
+  }
+
+  return $self->{_downend}
+}
+
+=head2 transform
+
+  Arg        : 
+  Example    : 
+  Description: Not yet implemented
+  Returntype : 
+  Exceptions : 
+  Caller     : 
+  Status     : At Risk
+
+=cut
+
+
+# Over-riding inherited class.  As yet unimplemented.
+
+sub transform {
+  my $self = shift;
+
+  throw("No transform method implemented for " . $self);
+}
+
+=head2 derive_upstream_coords
+
+  Arg        : none
+  Example    : my ($upstart, $upend) 
+                       = $self->derive_upstream_coords;
+  Description: Derives upstream coordinates (for 
+               compatability with older scripts).
+  Returntype : arrayref
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub derive_upstream_coords {
+  my $self = shift;
+
+  return [$self->upstart, $self->upend]
+}
+
+=head2 derive_downstream_coords
+
+  Arg        : none
+  Example    : my ($downstart, $downend) 
+                       = $self->derive_downstream_coords;
+  Description: Derives downstream coordinates (for 
+               compatability with older scripts).
+  Returntype : arrayref
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub derive_downstream_coords {
+  my $self = shift;
+
+  return [$self->downstart, $self->downend]
+}
+
+=head2 _derive_coords
+
+  Arg        : string $direction (either 'up' or 'down').
+  Example    : $self->_derive_coords('up');
+  Description: Determines the coordinates of either upstream 
+               or downstream region.
+  Returntype : none
+  Exceptions : Throws if argument is not either 'up' or 'down'
+  Caller     : $self->upstart, $self->upend, $self->downstart, 
+               $self->downend
+  Status     : Stable
+
+=cut
+
+sub _derive_coords {
+  my ($self, $direction) = @_;
+
+  # Check direction
+  throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
+    unless (($direction eq 'up')||($direction eq 'down'));
+
+  # Put things in easily accessible places.
+  my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
+  my $region_length = $self->length;
+
+  # Whatever coord system the gene is currently is, transform to the toplevel.
+  my $transcript = $self->transcript->transform('toplevel');
+
+  # Use our transformed transcript to determine the upstream region coords.
+  # End should always be just before the coding start (like ATG), including 3' UTR.
+  # Start is the outer limit of the region upstream (furthest from ATG).
+
+  my $region_start;
+  my $region_end;
+
+  if ($transcript->strand == 1){
+    if ($direction eq 'up'){
+      $region_end = $transcript->coding_region_start - 1;
+      $region_start = $region_end - $region_length;
+    } elsif ($direction eq 'down'){
+      $region_end = $self->_first_coding_Exon->end + 1;
+      $region_start = $region_end + $region_length;
+    }
+  } elsif ($transcript->strand == -1) {
+    if ($direction eq 'up'){
+      $region_end = $transcript->coding_region_end + 1;
+      $region_start = $region_end + $region_length;
+
+    } elsif ($direction eq 'down'){
+      $region_end = $self->_first_coding_Exon->start - 1;
+      $region_start = $region_end - $region_length;
+    }
+  }
+
+  # Trim the upstream/downstream region to remove extraneous coding sequences 
+  # from other genes and/or transcripts.
+    
+  my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);
+
+  my $region_slice 
+      = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
+						$transcript->slice->seq_region_name, 
+						$slice_low_coord, 
+						$slice_high_coord);
+
+  if ($transcript->strand == 1) {
+    if ($direction eq 'up') {
+      $region_start += $self->_bases_to_trim('left_end', $region_slice);
+    } elsif ($direction eq 'down') {
+      $region_start -= $self->_bases_to_trim('right_end', $region_slice);
+    }
+  } elsif ($transcript->strand == -1) {
+    if ($direction eq 'up') {
+      $region_start -= $self->_bases_to_trim('right_end', $region_slice);
+    } elsif ($direction eq 'down') {
+      $region_start += $self->_bases_to_trim('left_end', $region_slice);
+    }
+  }
+
+  # Always return start < end
+
+  ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);
+
+  if ($direction eq 'up') {
+    $self->upstart($region_start);
+    $self->upend($region_end);
+  } elsif ($direction eq 'down') {
+    $self->downstart($region_start);
+    $self->downend($region_end);
+  }
+}
+
+=head2 _bases_to_trim
+
+  Arg        : string $end_to_trim (either 'right_end' or 
+               'left_end').
+  Arg        : Bio::EnsEMBL::Slice
+  Example    : $self->_derive_coords('right_end', $slice);
+  Description: Finds exons from other genes/transcripts that
+               invade our upstream/downstream slice and 
+               returns the number of bases that should be 
+               truncated from the appropriate end of the 
+               upstream/downstream region.
+  Returntype : in
+  Exceptions : Throws if argument is not either 'right_end' 
+               or 'left_end'
+  Caller     : $self->_derive_coords
+  Status     : Stable
+
+=cut
+
+# Method to look for coding regions that invade the upstream region.  For
+# now, this method returns the number of bases to trim.  I doesn't yet
+# do anything special if an exon is completely swallowed (truncates at 
+# the end of the overlapping exon and discards any non-coding sequence 
+# further upstream) or overlaps the 'wrong' end of the region (cases where 
+# two alternate exons share one end of sequence - does this happen?).
+
+# The input argument 'end' defines the end of the slice that should be 
+# truncated. 
+
+sub _bases_to_trim {
+    my ($self, $end_to_trim, $slice) = @_;
+
+    throw "Slice end argument must be either left_end or right_end" 
+	unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end'); 
+
+    my @overlap_coords;
+    my $slice_length = $slice->length;
+    my $right_trim = 0;
+    my $left_trim  = 0;
+
+    foreach my $exon (@{$slice->get_all_Exons}){
+      next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;
+
+      my $start = $exon->start;
+      my $end   = $exon->end;
+
+      # Choose from four possible exon arrangements
+
+      #  -----|********************|----- Slice
+      #  --|=========================|--- Exon arrangement 1
+      #  ----------|======|-------------- Exon arrangement 2
+      #  --|=======|--------------------- Exon arrangement 3
+      #  -------------------|=========|-- Exon arrangement 4
+
+
+      if ($start <=  0 && $end >= $slice_length) {     # exon arrangement 1
+	$right_trim = $slice_length - 1;
+	$left_trim  = $slice_length - 1;
+	last;
+
+      } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
+	my $this_right_trim = ($slice_length - $start) + 1;
+
+	$right_trim = $this_right_trim 
+	  if $this_right_trim > $right_trim;
+
+	$left_trim  = $end 
+	  if $end > $left_trim;
+
+      } elsif ($start <= 0 && $end < $slice_length) {  # exon arrangement 3
+	$right_trim = $slice_length; # a bit draconian
+	$left_trim  = $end 
+	  if $end > $left_trim;
+
+      } elsif ($start > 0 && $end >= $slice_length) {  # exon arrangement 4
+	my $this_right_trim = ($slice_length - $start) + 1;
+
+	$right_trim = $this_right_trim 
+	  if $this_right_trim > $right_trim;
+
+	$left_trim = $slice_length; # also a bit draconian
+      }
+
+    }
+
+    return $right_trim  if $end_to_trim eq 'right_end';
+    return $left_trim   if $end_to_trim eq 'left_end';
+}
+
+=head2 _first_coding_Exon
+
+  Arg        : none
+  Example    : $self->_first_coding_Exon;
+  Description: Finds the first exon of our transcript that 
+               contains coding bases.
+  Returntype : Bio::EnsEMBL::Exon
+  Exceptions : none
+  Caller     : $self->_derive_coords, $self->_bases_to_trim
+  Status     : Stable
+
+=cut
+
+sub _first_coding_Exon {
+  my $self = shift;
+
+  unless ($self->{_first_coding_exon}){
+
+    my $exons = $self->transcript->get_all_translateable_Exons;
+
+    $self->{_first_coding_exon} =  $exons->[0]  
+      if $self->transcript->strand == 1;
+    $self->{_first_coding_exon} =  $exons->[-1] 
+      if $self->transcript->strand == -1;
+  }
+
+  return $self->{_first_coding_exon}
+}
+
+
+return 1;