diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/StrainSliceAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/StrainSliceAdaptor.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,361 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2009 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DBSQL::StrainSliceAdaptor - adaptor/factory for MappedSlices
+representing alternative assemblies
+
+=head1 SYNOPSIS
+
+  my $slice =
+    $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 );
+
+  my $msc = Bio::EnsEMBL::MappedSliceContainer->new(-SLICE => $slice);
+
+  # create a new strain slice adaptor and attach it to the MSC
+  my $ssa = Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new($sa->db);
+  $msc->set_StrainSliceAdaptor($ssa);
+  
+  # now attach strain
+  $msc->attach_StrainSlice('Watson');
+
+=head1 DESCRIPTION
+
+NOTE: this code is under development and not fully functional nor tested
+yet.  Use only for development.
+
+This adaptor is a factory for creating MappedSlices representing
+strains and attaching them to a MappedSliceContainer. A mapper will be created
+to map between the reference slice and the common container slice coordinate
+system.
+
+=head1 METHODS
+
+  new
+  fetch_by_name
+
+=head1 REALTED MODULES
+
+  Bio::EnsEMBL::MappedSlice
+  Bio::EnsEMBL::MappedSliceContainer
+  Bio::EnsEMBL::AlignStrainSlice
+  Bio::EnsEMBL::StrainSlice
+
+=cut
+
+package Bio::EnsEMBL::DBSQL::StrainSliceAdaptor;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::MappedSlice;
+use Bio::EnsEMBL::Mapper;
+use Bio::EnsEMBL::DBSQL::BaseAdaptor;
+
+our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
+
+
+=head2 new
+
+  Example     : my $strain_slice_adaptor =
+                  Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new;
+  Description : Constructor.
+  Return type : Bio::EnsEMBL::DBSQL::StrainSliceAdaptor
+  Exceptions  : none
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new {
+  my $caller = shift;
+
+  my $class = ref($caller) || $caller;
+  my $self = $class->SUPER::new(@_);
+  
+  return $self;
+}
+
+
+=head2 fetch_by_name
+
+  Arg[1]      : Bio::EnsEMBL::MappedSliceContainer $container - the container
+                  to attach MappedSlices to
+  Arg[2]      : String $name - the name of the strain to fetch
+  Example     : my ($mapped_slice) = @{ $msc->fetch_by_name('Watson') };
+  Description : Creates a MappedSlice representing a version of the container's
+                reference slice with variant alleles from the named strain
+  Return type : listref of Bio::EnsEMBL::MappedSlice
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : general, Bio::EnsEMBL::MappedSliceContainer
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub fetch_by_name {
+  my $self = shift;
+  my $container = shift;
+  my $name = shift;
+
+  # argueent check
+  unless ($container and ref($container) and
+          $container->isa('Bio::EnsEMBL::MappedSliceContainer')) {
+    throw("Need a MappedSliceContainer.");
+  }
+
+  unless ($name) {
+    throw("Need a strain name.");
+  }
+
+  my $slice = $container->ref_slice;
+
+  # get a connection to the variation DB
+  my $variation_db = $self->db->get_db_adaptor('variation');
+
+  unless($variation_db) {
+    warning("Variation database must be attached to core database to retrieve variation information");
+    return '';
+  }
+  
+  # now get an allele feature adaptor
+  my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor;
+  
+  # check we got it
+  unless(defined $af_adaptor) {
+    warning("Not possible to retrieve AlleleFeatureAdaptor from variation database");
+    return '';
+  }
+  
+  # now get an individual adaptor
+  my $ind_adaptor = $variation_db->get_IndividualAdaptor;
+  
+  # check we got it
+  unless(defined $ind_adaptor) {
+    warning("Not possible to retrieve IndividualAdaptor from variation database");
+    return '';
+  }
+  
+  # fetch individual object for this strain name
+  my $ind = shift @{$ind_adaptor->fetch_all_by_name($name)};
+  
+  # check we got a result
+  unless(defined $ind) {
+    warn("Strain ".$name." not found in the database");
+    return '';
+  }
+  
+  
+  ## MAP STRAIN SLICE TO REF SLICE
+  ################################
+  
+  # create a mapper
+  my $mapper = Bio::EnsEMBL::Mapper->new('mapped_slice', 'ref_slice');
+  
+  # create a mapped_slice object  
+  my $mapped_slice = Bio::EnsEMBL::MappedSlice->new(
+    -ADAPTOR   => $self,
+    -CONTAINER => $container,
+    -NAME      => $slice->name."\#strain_$name",
+  );
+  
+  # get the strain slice
+  my $strain_slice = $slice->get_by_strain($ind->name);
+  
+  # get all allele features for this slice and individual
+  #my @afs = sort {$a->start() <=> $b->start()} @{$af_adaptor->fetch_all_by_Slice($slice, $ind)};
+  
+  # get allele features with coverage info
+  my $afs = $strain_slice->get_all_AlleleFeatures_Slice(1);
+  
+  # check we got some data
+  #warning("No strain genotype data available for slice ".$slice->name." and strain ".$ind->name) if ! defined $afs[0];
+  
+  
+  my $start_slice = $slice->start;
+  my $start_strain = 1;
+  my $sr_name = $slice->seq_region_name;
+  #my $sr_name = 'ref_slice';
+  my ($end_slice, $end_strain, $allele_length);
+  
+  my $indel_flag = 0;
+  my $total_length_diff = 0;
+  
+  # check for AFs
+  if(defined($afs) && scalar @$afs) {
+  
+    # go through each AF
+    foreach my $af(@$afs) {
+      
+      # find out if it changes the length
+      if($af->length_diff != 0) {
+        
+        $indel_flag = 1;
+        $total_length_diff += $af->length_diff;
+        
+        # get the allele length
+        $allele_length = $af->length + $af->length_diff();
+        
+        $end_slice = $slice->start + $af->start() - 2;
+  
+        if ($end_slice >= $start_slice){
+            $end_strain = $end_slice - $start_slice + $start_strain;
+            
+            #add the sequence that maps
+            $mapper->add_map_coordinates('mapped_slice', $start_strain, $end_strain, 1, $sr_name, $start_slice, $end_slice);
+            
+            #add the indel
+            $mapper->add_indel_coordinates('mapped_slice', $end_strain+1, $end_strain+$allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length);
+            
+            $start_strain = $end_strain + $allele_length + 1;
+        }
+        
+        else{
+          
+            #add the indel
+            $mapper->add_indel_coordinates('mapped_slice', $end_strain+1,$end_strain + $allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length);
+            
+            $start_strain += $allele_length;
+        }
+        
+        $start_slice = $end_slice + $af->length+ 1;
+      }
+    }
+  }
+  
+  # add the remaining coordinates (or the whole length if no indels found)
+  $mapper->add_map_coordinates('mapped_slice', $start_strain, $start_strain + ($slice->end - $start_slice), 1, $sr_name, $start_slice, $slice->end);
+  
+  # add the slice/mapper pair
+  $mapped_slice->add_Slice_Mapper_pair($strain_slice, $mapper);  
+  
+  
+  
+  ## MAP REF_SLICE TO CONTAINER SLICE
+  ###################################
+  
+  if($total_length_diff > 0) {
+  
+    # create a new mapper
+    my $new_mapper = Bio::EnsEMBL::Mapper->new('ref_slice', 'container');
+    
+    # get existing pairs
+    my @existing_pairs = $container->mapper->list_pairs('container', 1, $container->container_slice->length, 'container');
+    my @new_pairs = $mapper->list_pairs('mapped_slice', 1, $strain_slice->length(), 'mapped_slice');
+    
+    # we need a list of indels (specifically inserts)
+    my @indels;
+    
+    # go through existing first
+    foreach my $pair(@existing_pairs) {
+      
+      if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) {
+        my $indel;
+        $indel->{'length_diff'} = ($pair->to->end - $pair->to->start) - ($pair->from->end - $pair->from->start);
+        
+        # we're only interested in inserts here, not deletions
+        next unless $indel->{'length_diff'} > 0;
+        
+        $indel->{'ref_start'} = $pair->from->start;
+        $indel->{'ref_end'} = $pair->from->end;
+        $indel->{'length'} = $pair->to->end - $pair->to->start;
+        
+        push @indels, $indel;
+      }
+    }
+    
+    # now new ones
+    foreach my $pair(@new_pairs) {
+      
+      if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) {
+        my $indel;
+        $indel->{'length_diff'} = ($pair->from->end - $pair->from->start) - ($pair->to->end - $pair->to->start);
+        
+        # we're only interested in inserts here, not deletions
+        next unless $indel->{'length_diff'} > 0;
+        
+        $indel->{'ref_start'} = $pair->to->start;
+        $indel->{'ref_end'} = $pair->to->end;
+        $indel->{'length'} = $pair->from->end - $pair->from->start;
+        
+        push @indels, $indel;
+      }
+    }
+    
+    # sort them
+    @indels = sort {
+      $a->{'ref_start'} <=> $b->{'ref_start'} ||  # by position
+      $b->{'length_diff'} <=> $a->{'length_diff'} # then by length diff so we only keep the longest
+    } @indels;
+    
+    # clean them
+    my @new_indels = ();
+    my $p = $indels[0];
+    push @new_indels, $indels[0] if scalar @indels;
+    
+    for my $i(1..$#indels) {
+      my $c = $indels[$i];
+      
+      if($c->{'ref_start'} != $p->{'ref_start'} && $c->{'ref_end'} != $p->{'ref_end'}) {
+        push @new_indels, $c;
+        $p = $c;
+      }
+    }
+    
+    $start_slice = $slice->start;
+    $start_strain = 1;
+    $sr_name = $slice->seq_region_name;
+    #$sr_name = 'ref_slice';
+    
+    foreach my $indel(@new_indels) {
+      
+      $end_slice = $indel->{'ref_end'};
+      $end_strain = $start_strain + ($end_slice - $start_slice);
+      
+      $allele_length = $indel->{'length'} + $indel->{'length_diff'};
+      
+      $new_mapper->add_map_coordinates($sr_name, $start_slice, $end_slice, 1, 'container', $start_strain, $end_strain);
+      
+      $new_mapper->add_indel_coordinates($sr_name,$end_slice+1,$end_slice + $indel->{'length'}, 1, 'container', $end_strain+1, $end_strain+$allele_length);
+      
+      $start_strain = $end_strain + $allele_length + 1;
+      $start_slice = $end_slice + $indel->{'length'} + 1;
+    }
+    
+    $new_mapper->add_map_coordinates($sr_name, $start_slice, $slice->end, 1, 'container', $start_strain, $start_strain + ($slice->end - $start_slice));
+    
+    # replace the mapper with the new mapper
+    $container->mapper($new_mapper);
+    
+    # change the container slice's length according to length diff
+    $container->container_slice($container->container_slice->expand(undef, $total_length_diff, 1));
+  }
+  
+  return [$mapped_slice];
+}
+
+
+1;
+