diff variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,459 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::Utils::AssemblyProjector -
+utility class to post-process projections from one assembly to another
+
+=head1 SYNOPSIS
+
+  # connect to an old database
+  my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor(
+    -host   => 'ensembldb.ensembl.org',
+    -port   => 3306,
+    -user   => ensro,
+    -dbname => 'mus_musculus_core_46_36g',
+    -group  => 'core_old',
+  );
+
+  # connect to the new database containing the mapping between old and
+  # new assembly
+  my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor(
+    -host   => 'ensembldb.ensembl.org',
+    -port   => 3306,
+    -user   => ensro,
+    -dbname => 'mus_musculus_core_47_37',
+    -group  => 'core_new',
+  );
+
+  my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new(
+    -OLD_ASSEMBLY    => 'NCBIM36',
+    -NEW_ASSEMBLY    => 'NCBIM37',
+    -ADAPTOR         => $dba_new,
+    -EXTERNAL_SOURCE => 1,
+    -MERGE_FRAGMENTS => 1,
+    -CHECK_LENGTH    => 0,
+  );
+
+  # fetch a slice on the old assembly
+  my $slice_adaptor = $dba_old->get_SliceAdaptor;
+  my $slice =
+    $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef,
+    undef, 'NCBIM36' );
+
+  my $new_slice = $assembly_projector->old_to_new($slice);
+
+  print $new_slice->name, " (", $assembly_projector->last_status, ")\n";
+
+=head1 DESCRIPTION
+
+This class implements some utility functions for converting coordinates
+between assemblies. A mapping between the two assemblies has to present
+the database for this to work, see the 'Related Modules' section below
+on how to generate the mapping.
+
+In addition to the "raw" projecting of features and slices, the methods
+in this module also apply some sensible rules to the results of the
+projection (like discarding unwanted results or merging fragmented
+projections). These are the rules (depending on configuration):
+
+Discard the projected feature/slice if:
+
+  1. it doesn't project at all (no segments returned)
+  2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
+     than one segment)
+  3. [if CHECK_LENGTH is set] the projection doesn't have the same
+     length as the original feature/slice
+  4. all segments are on same chromosome and strand
+
+If a projection fails any of these rules, undef is returned instead of
+a projected feature/slice. You can use the last_status() method to find
+out about the results of the rules tests.
+
+Also note that when projecting features, only a shallow projection is
+performed, i.e. other features attached to your features (e.g. the
+transcripts of a gene) are not projected automatically, so it will be
+the responsability of the user code project all levels of features
+involved.
+
+=head1 METHODS
+
+  new
+  project
+  old_to_new
+  new_to_old
+  adaptor
+  external_source
+  old_assembly
+  new_assembly
+  merge_fragments
+  check_length
+
+=head1 RELATED MODULES
+
+The process of creating a whole genome alignment between two assemblies
+(which is the basis for the use of the methods in this class) is done by
+a series of scripts. Please see
+
+  ensembl/misc-scripts/assembly/README
+
+for a high-level description of this process, and POD in the individual
+scripts for the details.
+
+=cut
+
+package Bio::EnsEMBL::Utils::AssemblyProjector;
+
+use strict;
+use warnings;
+no warnings qw(uninitialized);
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+use Bio::EnsEMBL::Slice;
+use Scalar::Util qw(weaken);
+
+=head2 new
+
+  Arg [ADAPTOR]         : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor
+                          for a database containing the assembly mapping
+  Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if
+                          source is from a different database
+  Arg [OLD_ASSEMBLY]    : name of the old assembly
+  Arg [OLD_ASSEMBLY]    : name of the new assembly
+  Arg [OBJECT_TYPE]     : (optional) object type ('slice' or 'feature')
+  Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged
+                          to return a single object spanning all segments
+                          (default: true)
+  Arg [CHECK_LENGTH]    : (optional) Boolean - determines if projected objects
+                          have to have same length as original (default: false)
+  Example     : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new(
+                  -DBADAPTOR    => $dba,
+                  -OLD_ASSEMBLY => NCBIM36,
+                  -NEW_ASSEMBLY => NCBIM37,
+                );
+  Description : Constructor.
+  Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object
+  Exceptions  : thrown on missing arguments
+                thrown on invalid OBJECT_TYPE
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my ($adaptor, $external_source, $old_assembly, $new_assembly,
+      $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE 
+        OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_);
+
+  unless ($adaptor and ref($adaptor) and
+          $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
+    throw("You must provide a DBAdaptor to a database containing the assembly mapping.");
+  }
+
+  unless ($old_assembly and $new_assembly) {
+    throw("You must provide an old and new assembly name.");
+  }
+
+  my $self = {};
+  bless ($self, $class);
+
+  # initialise
+  $self->adaptor($adaptor);
+  $self->{'old_assembly'} = $old_assembly;
+  $self->{'new_assembly'} = $new_assembly;
+  
+  # by default, merge fragments
+  $self->{'merge_fragments'} = $merge_fragments || 1;
+
+  # by default, do not check length
+  $self->{'check_length'} = $check_length || 0;
+
+  # by default, features and slices are expected in same database as the 
+  # assembly mapping
+  $self->{'external_source'} = $external_source || 0;
+
+  return $self;
+}
+
+
+=head2 project
+
+  Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
+                the object to project
+  Arg[2]      : String $to_assembly - assembly to project to
+  Example     : my $new_slice = $assembly_projector->project($old_slice, 
+                  'NCBIM37');
+  Description : Projects a Slice or Feature to the specified assembly.
+
+                Several tests are performed on the result to discard unwanted
+                results. All projection segments have to be on the same
+                seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be
+                bridged by creating a single object from first_segment_start to
+                last_segment_end. If -CHECK_LENGTH is set, the projected object
+                will have to have the same length as the original. You can use
+                the last_status() method to find out what the result of some of
+                these rule tests were. Please see the comments in the code for
+                more details about these rules.
+
+                The return value of this method will always be a single object,
+                or undef if the projection fails any of the rules.
+                
+                Note that when projecting features, only a "shallow" projection
+                is performed, i.e. attached features aren't projected
+                automatically! (e.g. if you project a gene, its transcripts will
+                have to be projected manually before storing the new gene)
+  Return type : same a Arg 1, or undef if projection fails any of the rules
+  Exceptions  : thrown on invalid arguments
+  Caller      : general, $self->old_to_new, $self->new_to_old
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub project {
+  my ($self, $object, $to_assembly) = @_;
+
+  throw("Need an assembly version to project to.") unless ($to_assembly);
+  throw("Need an object to project.") unless ($object and ref($object));
+
+  my ($slice, $object_type);
+
+  if ($object->isa('Bio::EnsEMBL::Feature')) {
+    $object_type = 'feature';
+  } elsif ($object->isa('Bio::EnsEMBL::Slice')) {
+    $object_type = 'slice';
+  } else {
+    throw("Need a Feature or Slice to project.");
+  }
+
+  # if the feature or slice is sourced from another db, we have to "transfer"
+  # it to the db that contains the assembly mapping. the transfer is very 
+  # shallow but that should do for our purposes
+  if ($self->external_source) {
+    my $slice_adaptor = $self->adaptor->get_SliceAdaptor;
+
+    if ($object_type eq 'feature') {
+    
+      # createa a new slice from the target db
+      my $f_slice = $object->slice;
+      my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name);
+      
+      # now change the feature so that it appears it's from the target db
+      $object->slice($target_slice);
+    
+    } else {
+    
+      # createa a new slice from the target db
+      $object = $slice_adaptor->fetch_by_name($object->name);
+      
+    }
+  }
+
+  if ($object_type eq 'feature') {
+    $slice = $object->feature_Slice;
+  } else {
+    $slice = $object;
+  }
+
+  # warn if trying to project to assembly version the object already is on
+  if ($slice->coord_system->version eq $to_assembly) {
+    warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.").");
+  }
+
+  # now project the slice
+  my $cs_name = $slice->coord_system_name;
+  my @segments = @{ $slice->project($cs_name, $to_assembly) };
+
+  # we need to reverse the projection segment list if the orignial 
+  if ($slice->strand == -1) {
+    @segments = reverse(@segments);
+  }
+
+  # apply rules to projection results
+  # 
+  # discard the projected feature/slice if
+  #   1. it doesn't project at all (no segments returned)
+  #   2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
+  #      than one segment)
+  #   3. [if CHECK_LENGTH is set] the projection doesn't have the same length
+  #      as the original feature/slice
+  #   4. all segments are on same chromosome and strand
+
+  # keep track of the status of applied rules
+  my @status = ();
+
+  # test (1)
+  return undef unless (@segments);
+  #warn "DEBUG: passed test 1\n";
+
+  # test (2)
+  return undef if (!($self->merge_fragments) and scalar(@segments) > 1);
+  push @status, 'fragmented' if (scalar(@segments) > 1);
+  #warn "DEBUG: passed test 2\n";
+
+  # test (3)
+  my $first_slice = $segments[0]->to_Slice;
+  my $last_slice = $segments[-1]->to_Slice;
+  my $length_mismatch = (($last_slice->end - $first_slice->start + 1) !=
+    $object->length);
+  return undef if ($self->check_length and $length_mismatch);
+  push @status, 'length_mismatch' if ($length_mismatch);
+  #warn "DEBUG: passed test 3\n";
+
+  # test (4)
+  my %sr_names = ();
+  my %strands = ();
+  foreach my $seg (@segments) {
+    my $sl = $seg->to_Slice;
+    $sr_names{$sl->seq_region_name}++;
+    $strands{$sl->strand}++;
+  }
+  return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1);
+  #warn "DEBUG: passed test 4\n";
+
+  # remember rule status
+  $self->last_status(join('|', @status));
+
+  # everything looks fine, so adjust the coords of your feature/slice
+  my $new_slice = $first_slice;
+  $new_slice->{'end'} = $last_slice->end;
+
+  if ($object_type eq 'slice') {
+    return $new_slice;
+  } else {
+  
+    $object->start($new_slice->start);
+    $object->end($new_slice->end);
+    $object->strand($new_slice->strand);
+    $object->slice($new_slice->seq_region_Slice);
+
+    # undef dbID and adaptor so you can store the feature in the target db
+    $object->dbID(undef);
+    $object->adaptor(undef);
+
+    return $object;
+  }
+  
+}
+
+
+=head2 old_to_new
+
+  Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
+                the object to project
+  Example     : my $new_slice = $assembly_projector->old_to_new($old_slice);
+  Description : Projects a Slice or Feature from old to new assembly.
+                This method is just a convenience wrapper for $self->project.
+  Return type : same a Arg 1, or undef
+  Exceptions  : none
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub old_to_new {
+  my ($self, $object) = @_;
+  return $self->project($object, $self->new_assembly);
+}
+
+
+=head2 new_to_old
+
+  Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
+                the object to project
+  Example     : my $old_slice = $assembly_projector->new_to_old($new_slice, 1);
+  Description : Projects a Slice or Feature from new to old assembly.
+                This method is just a convenience wrapper for $self->project.
+  Return type : same a Arg 1, or undef
+  Exceptions  : none
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new_to_old {
+  my ($self, $object) = @_;
+  return $self->project($object, $self->old_assembly);
+}
+
+
+#
+# accessors
+#
+sub adaptor {
+  my $self = shift;
+  weaken($self->{'adaptor'} = shift) if (@_);
+  return $self->{'adaptor'};
+}
+
+
+sub external_source {
+  my $self = shift;
+  $self->{'external_source'} = shift if (@_);
+  return $self->{'external_source'};
+}
+
+
+sub old_assembly {
+  my $self = shift;
+  $self->{'old_assembly'} = shift if (@_);
+  return $self->{'old_assembly'};
+}
+
+
+sub new_assembly {
+  my $self = shift;
+  $self->{'new_assembly'} = shift if (@_);
+  return $self->{'new_assembly'};
+}
+
+
+sub merge_fragments {
+  my $self = shift;
+  $self->{'merge_fragments'} = shift if (@_);
+  return $self->{'merge_fragments'};
+}
+
+
+sub check_length {
+  my $self = shift;
+  $self->{'check_length'} = shift if (@_);
+  return $self->{'check_length'};
+}
+
+
+sub last_status {
+  my $self = shift;
+  $self->{'last_status'} = shift if (@_);
+  return $self->{'last_status'};
+}
+
+
+1;
+
+