diff variant_effect_predictor/Bio/EnsEMBL/AlignStrainSlice.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/AlignStrainSlice.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,344 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::AlignStrainSlice - Represents the slice of the genome aligned with certain strains (applying the variations/indels)
+
+=head1 SYNOPSIS
+
+  $sa = $db->get_SliceAdaptor;
+
+  $slice =
+    $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
+
+  $strainSlice1 = $slice->get_by_Strain($strain_name1);
+  $strainSlice2 = $slice->get_by_Strain($strain_name2);
+
+  my @strainSlices;
+  push @strainSlices, $strainSlice1;
+  push @strainSlices, $strainSlice2;
+
+  $alignSlice = Bio::EnsEMBL::AlignStrainSlice->new(
+    -SLICE   => $slice,
+    -STRAINS => \@strainSlices
+  );
+
+  # Get coordinates of variation in alignSlice
+  my $alleleFeatures = $strainSlice1->get_all_AlleleFeature_Slice();
+
+  foreach my $af ( @{$alleleFeatures} ) {
+    my $new_feature = $alignSlice->alignFeature( $af, $strainSlice1 );
+    print( "Coordinates of the feature in AlignSlice are: ",
+      $new_feature->start, "-", $new_feature->end, "\n" );
+  }
+
+=head1 DESCRIPTION
+
+A AlignStrainSlice object represents a region of a genome align for
+certain strains.  It can be used to align certain strains to a reference
+slice.
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::AlignStrainSlice;
+use strict;
+
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+use Bio::EnsEMBL::Mapper;
+use Bio::EnsEMBL::Mapper::RangeRegistry;
+use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning);
+
+=head2 new
+
+    Arg[1]      : Bio::EnsEMBL::Slice $Slice
+    Arg[2]      : listref of Bio::EnsEMBL::StrainSlice $strainSlice
+    Example     : push @strainSlices, $strainSlice1;
+                  push @strainSlices, $strainSlice2;
+                  .....
+                  push @strainSlices, $strainSliceN;
+                  $alignStrainSlice = Bio::EnsEMBL::AlignStrainSlice->new(-SLICE => $slice,
+									  -STRAIN => \@strainSlices);
+    Description : Creates a new Bio::EnsEMBL::AlignStrainSlice object that will contain a mapper between
+                  the Slice object, plus all the indels from the different Strains
+    ReturnType  : Bio::EnsEMBL::AlignStrainSlice
+    Exceptions  : none
+    Caller      : general
+
+=cut
+
+sub new{
+    my $caller = shift;
+    my $class = ref($caller) || $caller;
+
+    my ($slice, $strainSlices) = rearrange([qw(SLICE STRAINS)],@_);
+
+    #check that both StrainSlice and Slice are identical (must have been defined in the same slice)
+    foreach my $strainSlice (@{$strainSlices}){
+	if (($strainSlice->start != $slice->start) || ($strainSlice->end != $slice->end) || ($strainSlice->seq_region_name ne $slice->seq_region_name)){
+	    warning("Not possible to create Align object from different Slices");
+	    return [];
+	}
+    }
+
+    return bless{'slice' => $slice,
+		 'strains' => $strainSlices}, $class;
+}
+
+=head2 alignFeature
+
+    Arg[1]      : Bio::EnsEMBL::Feature $feature
+    Arg[2]      : Bio::EnsEMBL::StrainSlice $strainSlice
+    Example     : $new_feature = $alignSlice->alignFeature($feature, $strainSlice);
+    Description : Creates a new Bio::EnsEMBL::Feature object that aligned to 
+                  the AlignStrainSlice object.
+    ReturnType  : Bio::EnsEMBL::Feature
+    Exceptions  : none
+    Caller      : general
+
+=cut
+
+sub alignFeature{
+    my $self = shift;
+    my $feature = shift;
+
+    #check that the object is a Feature
+    if (!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')){	
+	throw("Bio::EnsEMBL::Feature object expected");
+    }
+    #and align it to the AlignStrainSlice object
+    my $mapper_strain = $self->mapper();
+
+    my @results;
+  
+    if ($feature->start > $feature->end){
+	#this is an Indel, map it with the special method
+	@results = $mapper_strain->map_indel('Slice',$feature->start, $feature->end, $feature->strand,'Slice');
+	#and modify the coordinates according to the length of the indel
+	$results[0]->end($results[0]->start + $feature->length_diff -1);
+    }
+    else{
+	@results = $mapper_strain->map_coordinates('Slice',$feature->start, $feature->end, $feature->strand,'Slice');
+     }
+    #get need start and end of the new feature, aligned ot AlignStrainSlice
+    my @results_ordered = sort {$a->start <=> $b->start} @results;
+
+    my %new_feature = %$feature; #make a shallow copy of the Feature
+    $new_feature{'start'}= $results_ordered[0]->start();
+    $new_feature{'end'} = $results_ordered[-1]->end();  #get last element of the array, the end of the slice
+
+    return bless \%new_feature, ref($feature);
+    
+}
+
+
+#getter for the mapper between the Slice and the different StrainSlice objects
+sub mapper{
+    my $self = shift;
+    
+    if (!defined $self->{'mapper'}){
+	#get the alleleFeatures in all the strains
+	if (!defined $self->{'indels'}){
+	    #when the list of indels is not defined, get them
+	    $self->{'indels'} = $self->_get_indels();
+	}
+	my $indels = $self->{'indels'}; #gaps in reference slice
+	my $mapper = Bio::EnsEMBL::Mapper->new('Slice', 'AlignStrainSlice');
+	my $start_slice = 1;
+	my $end_slice;
+	my $start_align = 1;
+	my $end_align;
+	my $length_indel = 0;
+	my $length_acum_indel = 0;
+	foreach my $indel (@{$indels}){
+	    $end_slice = $indel->[0] - 1;
+	    $end_align = $indel->[0] - 1 + $length_acum_indel; #we must consider length previous indels
+
+	    $length_indel = $indel->[1] - $indel->[0] + 1;
+	    
+
+	    $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'AlignStrainSlice',$start_align,$end_align);
+	    
+	    $mapper->add_indel_coordinates('Slice',$end_slice + 1,$end_slice,1,'AlignStrainSlice',$end_align + 1,$end_align + $length_indel);
+	    $start_slice = $end_slice + 1;
+	    $start_align = $indel->[1] + 1 + $length_acum_indel; #we must consider legnth previous indels
+	    
+	    $length_acum_indel += $length_indel;
+	}
+	if ($start_slice <= $self->length){
+	    $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'AlignStrainSlice',$start_align,$start_align + $self->length - $start_slice)
+	}
+	$self->{'mapper'} = $mapper;
+	
+    }
+    return $self->{'mapper'};
+}
+
+#returns the length of the AlignSlice: length of the Slice plus the gaps
+sub length{
+    my $self = shift;
+    my $length;
+    if (!defined $self->{'indels'}){
+	#when the list of indels is not defined, get them
+	$self->{'indels'} = $self->_get_indels();	
+    }
+    $length = $self->{'slice'}->length;
+    map {$length += ($_->[1] - $_->[0] + 1)} @{$self->{'indels'}};
+    return $length;
+}
+
+=head2 strains
+
+  Args       : None
+  Description: Returns list with all strains used to
+               define this AlignStrainSlice object
+  Returntype : listref of Bio::EnsEMBL::StrainSlice objects
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub strains{
+    my $self = shift;
+
+    return $self->{'strains'};
+}
+
+=head2 Slice
+
+  Args       : None
+  Description: Returns slice where the AlignStrainSlice
+               is defined
+  Returntype : Bio::EnsEMBL::Slice object
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub Slice{
+    my $self = shift;
+    return $self->{'slice'};
+}
+#method to retrieve, in order, a list with all the indels in the different strains
+sub _get_indels{
+    my $self = shift;
+    
+    #go throuh all the strains getting ONLY the indels (length_diff <> 0)
+    my @indels;
+    foreach my $strainSlice (@{$self->strains}){
+	my $differences = $strainSlice->get_all_AlleleFeatures_Slice(); #need to check there are differences....
+	foreach my $af (@{$differences}){
+	    #if length is 0, but is a -, it is still a gap in the strain
+	    if (($af->length_diff != 0) || ($af->length_diff == 0 && $af->allele_string =~ /-/)){
+		push @indels, $af;
+	    }
+	}
+    }
+    #need to overlap the gaps using the RangeRegistry module
+    my $range_registry = Bio::EnsEMBL::Mapper::RangeRegistry->new();
+    foreach my $indel (@indels){
+	#in the reference and the strain there is a gap
+	$range_registry->check_and_register(1,$indel->start,$indel->start) if ($indel->length_diff == 0);
+	#deletion in reference slice
+	$range_registry->check_and_register(1,$indel->start, $indel->end ) if ($indel->length_diff < 0);
+	#insertion in reference slice
+	$range_registry->check_and_register(1,$indel->start,$indel->start + $indel->length_diff - 1) if ($indel->length_diff > 0);
+    }
+    #and return all the gap coordinates....
+    return $range_registry->get_ranges(1);
+}
+
+=head2 get_all_Slices
+
+  Args       : none
+  Description: This Slice is made of several Bio::EnsEMBL::StrainSlices
+               sequence. This method returns these StrainSlices (or part of
+               them) with the original coordinates 
+  Returntype : listref of Bio::EnsEMBL::StrainSlice objects
+  Exceptions : end should be at least as big as start
+  Caller     : general
+
+=cut
+
+sub get_all_Slices {
+  my $self = shift;
+
+  my @strains;
+  #add the reference strain
+  my $dbVar = $self->Slice->adaptor->db->get_db_adaptor('variation');
+  unless($dbVar) {
+	warning("Variation database must be attached to core database to " .
+		"retrieve variation information" );
+	return '';
+    }
+  my $indAdaptor = $dbVar->get_IndividualAdaptor();
+  my $ref_name =  $indAdaptor->get_reference_strain_name;
+  my $ref_strain = Bio::EnsEMBL::StrainSlice->new(
+					  -START   => $self->Slice->{'start'},
+					  -END     => $self->Slice->{'end'},
+					  -STRAND  => $self->Slice->{'strand'},
+					  -ADAPTOR => $self->Slice->adaptor(),
+					  -SEQ    => $self->Slice->{'seq'},
+					  -SEQ_REGION_NAME => $self->Slice->{'seq_region_name'},
+					  -SEQ_REGION_LENGTH => $self->Slice->{'seq_region_length'},
+					  -COORD_SYSTEM    => $self->Slice->{'coord_system'},
+					  -STRAIN_NAME     => $ref_name,
+					   );
+  #this is a fake reference alisce, should not contain any alleleFeature
+  undef $ref_strain->{'alleleFeatures'};
+  
+  push @strains, @{$self->strains};
+  my $new_feature;
+  my $indel;
+  my $aligned_features;
+  my $indels = (); #reference to a hash containing indels in the different strains
+  #we need to realign all Features in the different Slices and add '-' in the reference Slice
+  foreach my $strain (@{$self->strains}){
+      foreach my $af (@{$strain->get_all_AlleleFeatures_Slice()}){
+	  $new_feature = $self->alignFeature($af); #align feature in AlignSlice coordinates
+	  push @{$aligned_features},$new_feature if($new_feature->seq_region_start <= $strain->end); #some features might map outside slice
+	  if ($af->start != $af->end){ #an indel, need to add to the reference, and realign in the strain	     
+              #make a shallow copy of the indel - clear it first!
+          $indel = undef;
+	      %{$indel} = %{$new_feature};
+	      bless $indel, ref($new_feature);
+	      $indel->allele_string('-');
+	      push @{$indels},$indel; #and include in the list of potential indels
+	  }
+      }
+      next if (!defined $aligned_features);
+      undef $strain->{'alleleFeatures'}; #remove all features before adding new aligned features
+      push @{$strain->{'alleleFeatures'}}, @{$aligned_features};
+      undef $aligned_features;
+  }  
+  push @strains, $ref_strain;
+  #need to add indels in the different strains, if not present
+  if (defined $indels){
+      foreach my $strain (@strains){
+	  #inlcude the indels in the StrainSlice object
+	  push @{$strain->{'alignIndels'}},@{$indels};
+      }
+  }
+  return \@strains;
+}
+
+1;