view variant_effect_predictor/Bio/EnsEMBL/DBSQL/StrainSliceAdaptor.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2009 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::DBSQL::StrainSliceAdaptor - adaptor/factory for MappedSlices
representing alternative assemblies

=head1 SYNOPSIS

  my $slice =
    $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 );

  my $msc = Bio::EnsEMBL::MappedSliceContainer->new(-SLICE => $slice);

  # create a new strain slice adaptor and attach it to the MSC
  my $ssa = Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new($sa->db);
  $msc->set_StrainSliceAdaptor($ssa);
  
  # now attach strain
  $msc->attach_StrainSlice('Watson');

=head1 DESCRIPTION

NOTE: this code is under development and not fully functional nor tested
yet.  Use only for development.

This adaptor is a factory for creating MappedSlices representing
strains and attaching them to a MappedSliceContainer. A mapper will be created
to map between the reference slice and the common container slice coordinate
system.

=head1 METHODS

  new
  fetch_by_name

=head1 REALTED MODULES

  Bio::EnsEMBL::MappedSlice
  Bio::EnsEMBL::MappedSliceContainer
  Bio::EnsEMBL::AlignStrainSlice
  Bio::EnsEMBL::StrainSlice

=cut

package Bio::EnsEMBL::DBSQL::StrainSliceAdaptor;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::MappedSlice;
use Bio::EnsEMBL::Mapper;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;

our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);


=head2 new

  Example     : my $strain_slice_adaptor =
                  Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new;
  Description : Constructor.
  Return type : Bio::EnsEMBL::DBSQL::StrainSliceAdaptor
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new(@_);
  
  return $self;
}


=head2 fetch_by_name

  Arg[1]      : Bio::EnsEMBL::MappedSliceContainer $container - the container
                  to attach MappedSlices to
  Arg[2]      : String $name - the name of the strain to fetch
  Example     : my ($mapped_slice) = @{ $msc->fetch_by_name('Watson') };
  Description : Creates a MappedSlice representing a version of the container's
                reference slice with variant alleles from the named strain
  Return type : listref of Bio::EnsEMBL::MappedSlice
  Exceptions  : thrown on wrong or missing arguments
  Caller      : general, Bio::EnsEMBL::MappedSliceContainer
  Status      : At Risk
              : under development

=cut

sub fetch_by_name {
  my $self = shift;
  my $container = shift;
  my $name = shift;

  # argueent check
  unless ($container and ref($container) and
          $container->isa('Bio::EnsEMBL::MappedSliceContainer')) {
    throw("Need a MappedSliceContainer.");
  }

  unless ($name) {
    throw("Need a strain name.");
  }

  my $slice = $container->ref_slice;

  # get a connection to the variation DB
  my $variation_db = $self->db->get_db_adaptor('variation');

  unless($variation_db) {
    warning("Variation database must be attached to core database to retrieve variation information");
    return '';
  }
  
  # now get an allele feature adaptor
  my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor;
  
  # check we got it
  unless(defined $af_adaptor) {
    warning("Not possible to retrieve AlleleFeatureAdaptor from variation database");
    return '';
  }
  
  # now get an individual adaptor
  my $ind_adaptor = $variation_db->get_IndividualAdaptor;
  
  # check we got it
  unless(defined $ind_adaptor) {
    warning("Not possible to retrieve IndividualAdaptor from variation database");
    return '';
  }
  
  # fetch individual object for this strain name
  my $ind = shift @{$ind_adaptor->fetch_all_by_name($name)};
  
  # check we got a result
  unless(defined $ind) {
    warn("Strain ".$name." not found in the database");
    return '';
  }
  
  
  ## MAP STRAIN SLICE TO REF SLICE
  ################################
  
  # create a mapper
  my $mapper = Bio::EnsEMBL::Mapper->new('mapped_slice', 'ref_slice');
  
  # create a mapped_slice object  
  my $mapped_slice = Bio::EnsEMBL::MappedSlice->new(
    -ADAPTOR   => $self,
    -CONTAINER => $container,
    -NAME      => $slice->name."\#strain_$name",
  );
  
  # get the strain slice
  my $strain_slice = $slice->get_by_strain($ind->name);
  
  # get all allele features for this slice and individual
  #my @afs = sort {$a->start() <=> $b->start()} @{$af_adaptor->fetch_all_by_Slice($slice, $ind)};
  
  # get allele features with coverage info
  my $afs = $strain_slice->get_all_AlleleFeatures_Slice(1);
  
  # check we got some data
  #warning("No strain genotype data available for slice ".$slice->name." and strain ".$ind->name) if ! defined $afs[0];
  
  
  my $start_slice = $slice->start;
  my $start_strain = 1;
  my $sr_name = $slice->seq_region_name;
  #my $sr_name = 'ref_slice';
  my ($end_slice, $end_strain, $allele_length);
  
  my $indel_flag = 0;
  my $total_length_diff = 0;
  
  # check for AFs
  if(defined($afs) && scalar @$afs) {
  
    # go through each AF
    foreach my $af(@$afs) {
      
      # find out if it changes the length
      if($af->length_diff != 0) {
        
        $indel_flag = 1;
        $total_length_diff += $af->length_diff;
        
        # get the allele length
        $allele_length = $af->length + $af->length_diff();
        
        $end_slice = $slice->start + $af->start() - 2;
  
        if ($end_slice >= $start_slice){
            $end_strain = $end_slice - $start_slice + $start_strain;
            
            #add the sequence that maps
            $mapper->add_map_coordinates('mapped_slice', $start_strain, $end_strain, 1, $sr_name, $start_slice, $end_slice);
            
            #add the indel
            $mapper->add_indel_coordinates('mapped_slice', $end_strain+1, $end_strain+$allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length);
            
            $start_strain = $end_strain + $allele_length + 1;
        }
        
        else{
          
            #add the indel
            $mapper->add_indel_coordinates('mapped_slice', $end_strain+1,$end_strain + $allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length);
            
            $start_strain += $allele_length;
        }
        
        $start_slice = $end_slice + $af->length+ 1;
      }
    }
  }
  
  # add the remaining coordinates (or the whole length if no indels found)
  $mapper->add_map_coordinates('mapped_slice', $start_strain, $start_strain + ($slice->end - $start_slice), 1, $sr_name, $start_slice, $slice->end);
  
  # add the slice/mapper pair
  $mapped_slice->add_Slice_Mapper_pair($strain_slice, $mapper);  
  
  
  
  ## MAP REF_SLICE TO CONTAINER SLICE
  ###################################
  
  if($total_length_diff > 0) {
  
    # create a new mapper
    my $new_mapper = Bio::EnsEMBL::Mapper->new('ref_slice', 'container');
    
    # get existing pairs
    my @existing_pairs = $container->mapper->list_pairs('container', 1, $container->container_slice->length, 'container');
    my @new_pairs = $mapper->list_pairs('mapped_slice', 1, $strain_slice->length(), 'mapped_slice');
    
    # we need a list of indels (specifically inserts)
    my @indels;
    
    # go through existing first
    foreach my $pair(@existing_pairs) {
      
      if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) {
        my $indel;
        $indel->{'length_diff'} = ($pair->to->end - $pair->to->start) - ($pair->from->end - $pair->from->start);
        
        # we're only interested in inserts here, not deletions
        next unless $indel->{'length_diff'} > 0;
        
        $indel->{'ref_start'} = $pair->from->start;
        $indel->{'ref_end'} = $pair->from->end;
        $indel->{'length'} = $pair->to->end - $pair->to->start;
        
        push @indels, $indel;
      }
    }
    
    # now new ones
    foreach my $pair(@new_pairs) {
      
      if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) {
        my $indel;
        $indel->{'length_diff'} = ($pair->from->end - $pair->from->start) - ($pair->to->end - $pair->to->start);
        
        # we're only interested in inserts here, not deletions
        next unless $indel->{'length_diff'} > 0;
        
        $indel->{'ref_start'} = $pair->to->start;
        $indel->{'ref_end'} = $pair->to->end;
        $indel->{'length'} = $pair->from->end - $pair->from->start;
        
        push @indels, $indel;
      }
    }
    
    # sort them
    @indels = sort {
      $a->{'ref_start'} <=> $b->{'ref_start'} ||  # by position
      $b->{'length_diff'} <=> $a->{'length_diff'} # then by length diff so we only keep the longest
    } @indels;
    
    # clean them
    my @new_indels = ();
    my $p = $indels[0];
    push @new_indels, $indels[0] if scalar @indels;
    
    for my $i(1..$#indels) {
      my $c = $indels[$i];
      
      if($c->{'ref_start'} != $p->{'ref_start'} && $c->{'ref_end'} != $p->{'ref_end'}) {
        push @new_indels, $c;
        $p = $c;
      }
    }
    
    $start_slice = $slice->start;
    $start_strain = 1;
    $sr_name = $slice->seq_region_name;
    #$sr_name = 'ref_slice';
    
    foreach my $indel(@new_indels) {
      
      $end_slice = $indel->{'ref_end'};
      $end_strain = $start_strain + ($end_slice - $start_slice);
      
      $allele_length = $indel->{'length'} + $indel->{'length_diff'};
      
      $new_mapper->add_map_coordinates($sr_name, $start_slice, $end_slice, 1, 'container', $start_strain, $end_strain);
      
      $new_mapper->add_indel_coordinates($sr_name,$end_slice+1,$end_slice + $indel->{'length'}, 1, 'container', $end_strain+1, $end_strain+$allele_length);
      
      $start_strain = $end_strain + $allele_length + 1;
      $start_slice = $end_slice + $indel->{'length'} + 1;
    }
    
    $new_mapper->add_map_coordinates($sr_name, $start_slice, $slice->end, 1, 'container', $start_strain, $start_strain + ($slice->end - $start_slice));
    
    # replace the mapper with the new mapper
    $container->mapper($new_mapper);
    
    # change the container slice's length according to length diff
    $container->container_slice($container->container_slice->expand(undef, $total_length_diff, 1));
  }
  
  return [$mapped_slice];
}


1;