diff variant_effect_predictor/Bio/EnsEMBL/Variation/LDFeatureContainer.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/LDFeatureContainer.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,482 @@
+=head1 LICENSE
+
+ Copyright (c) 1999-2012 The European Bioinformatics Institute and
+ Genome Research Limited.  All rights reserved.
+
+ This software is distributed under a modified Apache license.
+ For license details, please see
+
+   http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+ Please email comments or questions to the public Ensembl
+ developers list at <dev@ensembl.org>.
+
+ Questions may also be sent to the Ensembl help desk at
+ <helpdesk@ensembl.org>.
+
+=cut
+
+# Ensembl module for Bio::EnsEMBL::Variation::LDFeatureContainer
+#
+# Copyright (c) 2004 Ensembl
+#
+
+
+=head1 NAME
+
+Bio::EnsEMBL::Variation::LDFeatureContainer - A container with all the ld values for quick access
+
+=head1 SYNOPSIS
+
+    # LDFeature Container representing all the LD values for a certain contig
+    $ldContainer = Bio::EnsEMBL::Variation::LDFeatureContainer->new
+       (-name   => NT_085213,
+        -ldContainer => $ldhash,
+        -variation_features => $vfhash);
+
+    ...
+
+    #get the d_prime values for a certain pair of variation_features
+    d_prime = $ldContainer->get_d_prime($variation_feature_1,$variation_feature_2);
+    #get the list of variation in the container
+    $variations = $ldContainer->get_variations();
+
+    ...
+
+=head1 DESCRIPTION
+
+This is a class representing the LD information for a certain region
+from the ensembl-variation database.
+See B<Bio::EnsEMBL::Variation::Variation>.
+
+=head1 METHODS
+
+=cut
+
+use strict;
+use warnings;
+
+package Bio::EnsEMBL::Variation::LDFeatureContainer;
+
+use Bio::EnsEMBL::Storable;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::Argument  qw(rearrange);
+
+use vars qw(@ISA);
+
+@ISA = qw(Bio::EnsEMBL::Storable);
+
+=head2 new
+
+  Arg [-NAME] :
+    string - name of the feature object that the LD container refers to. (chr1,NT_08542,...)
+
+  Arg [-LDCONTAINER] :
+    reference - hash containing all the LD information present, with the key
+    (variation_feature_1-variation_feature_2) to access the information
+
+  Arg [-VARIATIONFEATURES] :
+    reference - hash containing all the Bio::EnsEMBL::Variation::VariationFeature objects that are present in the Container
+    
+  Example    :
+    $ldContainer = Bio::EnsEMBL::Variation::LDFeatureContainer->new
+       (-name => 'chr1'
+        -ldContainer => {'variation_feature_1-variation_feature_2' => { 'sample_id_1' =>
+	                                                                          { 'd_prime' => 0.5,
+										    'r2'      => 0.421,
+										    'sample_count' => 120
+										    },
+									'sample_id_2' => 
+ 								                  { 'd_prime' => 0.3,
+										    'r2'     => 0.321,
+										    'sample_count' => 35
+										    }
+								    }
+			 
+		     }
+	-variationFeatures => hash of Bio::EnsEMBL::Variation::VariationFeature
+	);
+
+
+  Description: Constructor. Instantiates a new LDFeatureContainer object.
+  Returntype : Bio::EnsEMBL::Variation::LDFeatureContainer
+  Exceptions : none
+  Caller     : general
+  Status     : At Risk
+
+=cut
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my ($ldContainer,$name,$variationFeatures) =  rearrange([qw(LDCONTAINER NAME VARIATIONFEATURES)], @_);
+  if (defined($ldContainer) && ref($ldContainer ne 'HASH')){
+      throw("Reference to a hash object expected as a LDContainer");
+  }
+  $ldContainer ||= {};
+
+  return bless {'name' => $name,
+		'ldContainer' => $ldContainer,
+		'variationFeatures' => $variationFeatures}, $class;
+}
+
+=head2 name
+
+  Arg [1]    : string $newval (optional)
+               The new value to set the name attribute to
+  Example    : $name = $obj->name()
+  Description: Getter/Setter for the name attribute.
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+  Status     : At Risk
+
+=cut
+
+sub name{
+  my $self = shift;
+  return $self->{'name'} = shift if(@_);
+  return $self->{'name'};
+}
+
+
+=head2 get_variations
+
+    Example     : $variations = $obj->get_variations()
+    Description : Gets all the variation objects contained in the LDFeatureContainer
+    ReturnType  : list of Bio::EnsEMBL::Variation::Variation
+    Exceptions  : none
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+sub get_variations{
+    my $self = shift;
+    my @variations;
+
+    foreach my $variation_feature (keys %{$self->{'variationFeatures'}}){
+	push @variations,$self->{'variationFeatures'}->{$variation_feature}->variation();
+    }    
+    return \@variations;
+}
+
+=head2 get_r_square
+
+    Arg [1]     : Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Arg [2]     : Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Arg [3]     : (optional) int - sample_id of population you want to get the r_square value
+    Example     : $r_square = $obj->get_r_square($vf1,$vf2,$sample_id);
+    Description : Get the r_square value for a pair of variation features in the given population. If no population is provided,
+    return the r_square for the default population with more sample counts (in case more than 1)
+    ReturnType  : float
+    Exceptions  : throw on incorrect arguments
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+sub get_r_square{
+    my $self = shift;
+    my $variation_feature_1 = shift;
+    my $variation_feature_2 = shift;
+    my $sample_id = shift;
+    
+    $sample_id ||= 0; #in case no population provided, to avoid warning in the hash
+    my $r_square;
+    my $key;
+
+    #check if the default poppulation has been calculated, otherwise, find it
+    if (! defined $self->{'_default_population'}){
+	$self->{'_default_population'} = $self->_get_major_population;
+    }
+    #first of all, check that both arguments have been properly provided
+    if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){
+	throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected");
+    }
+    else{
+	#check if the ldContainer does not contain pairwise information for the variation features provided
+	if (!exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()} && !exists $self->{'ldContainer'}->{$variation_feature_2->dbID() . '-' . $variation_feature_1->dbID()}){
+	    warning("variation features have no pairwise ld information");
+	} 
+	else{
+	    #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1
+	    if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){
+		$key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID();
+	    }
+	    else{
+		$key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID();
+	    }
+	    #and finally, if population provided or the only population
+	    if (exists $self->{'ldContainer'}->{$key}->{$sample_id}){
+		$r_square = $self->{'ldContainer'}->{$key}->{$sample_id}->{'r2'}
+	    }
+	    else{
+		if (exists $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}){
+		    #there was no population provided, return the r_square for the default population
+		    $r_square = $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}->{'r2'};		    
+		}
+		else{
+		    warning("variation features have no pairwise ld information for default population ", $self->{'_default_population'});
+		}
+	    }
+	}
+	    
+    }
+    return $r_square;
+}
+
+=head2 get_d_prime
+
+    Arg [1]     : Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Arg [2]     : Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Arg [3]     : (optional) int - sample_id of population you want to get the d_prime value
+    Example     : $d_prime = $obj->get_d_prime($vf1,$vf2,$sample_id);
+    Description : Get the d_prime value for a pair of variation features for a known or unknown population. In case of an unknown population, the default
+poulation is used    
+    ReturnType  : float
+    Exceptions  : throw on incorrect arguments
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+sub get_d_prime{
+    my $self = shift;
+    my $variation_feature_1 = shift;
+    my $variation_feature_2 = shift;
+    my $sample_id = shift;
+
+    $sample_id ||= 0; #in case no population provided, to avoid warning in the hash
+    my $d_prime;
+    my $key;
+
+    if (! defined $self->{'_default_population'}){
+	$self->{'_default_population'} = $self->_get_major_population;
+    }
+    #first of all, check that both arguments have been properly provided
+    if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){
+	throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected");
+    }
+    else{
+	#check if the ldContainer does not contain pairwise information for the variation features provided
+	if (!exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()} && !exists $self->{'ldContainer'}->{$variation_feature_2->dbID() . '-' . $variation_feature_1->dbID()}){
+	    warning("variation features have no pairwise ld information");
+	} 
+	else{
+	    #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1
+	    if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){
+		$key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID();
+	    }
+	    else{
+		$key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID();
+	    }
+	    #and finally, if population provided or the only population
+	    if (exists $self->{'ldContainer'}->{$key}->{$sample_id}){
+		$d_prime = $self->{'ldContainer'}->{$key}->{$sample_id}->{'d_prime'};
+	    }
+	    else{
+		if (exists $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}){
+		    #there was no population provided, return the r_square for the default population
+		    $d_prime = $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}->{'d_prime'};
+		}
+		else{
+		    warning("variation features have no pairwise ld information for default population ", $self->{'_default_population'});
+		}
+	    }
+	}	    
+    }
+    return $d_prime;
+}
+
+
+=head2 get_all_ld_values
+
+    Example     : $ld_values = $obj->get_all_ld_values();
+    Description : Get all the information contained in the LDFeatureContainer object
+    ReturnType  : reference to list of hashes [{variation1 => Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, d_prime=>d_prime, r2=>r2, sample_count=>sample_count, sample_id=>population_sample_id}]
+    Exceptions  : no exceptions
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+
+sub get_all_ld_values{
+    my $self = shift;
+    my @ld_values; #contains ALL the ld values in the container
+
+    #the keys in the ldContainer hash
+    my $variation_feature_id_1;
+    my $variation_feature_id_2;
+    
+    if (! defined $self->{'_default_population'}){
+	$self->{'_default_population'} = $self->_get_major_population;
+    }
+    foreach my $key_ld (keys %{$self->{'ldContainer'}}){
+	my %ld_value;  #contains a single ld value in the container {variation_feature variation_feature d_prime r2 snp_distance_count}
+	($variation_feature_id_1, $variation_feature_id_2) =  split /-/,$key_ld; #get the variation_features ids
+	if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){
+	    #add the information to the ld_value hash
+	    $ld_value{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1};
+	    $ld_value{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2};
+	    $ld_value{'d_prime'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'d_prime'};
+	    $ld_value{'r2'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'r2'};
+	    $ld_value{'sample_count'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'sample_count'};
+	    $ld_value{'sample_id'} = $self->{'_default_population'};
+	    push @ld_values, \%ld_value;
+	}
+    }
+    return \@ld_values;
+}
+
+
+=head2 get_all_r_square_values
+
+    Example     : $r_square_values = $obj->get_all_r_square_values();
+    Description : Get all r_square values contained in the LDFeatureContainer object
+    ReturnType  : reference to list of [{variation1=>Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, r2=>r2, sample_id=>population_sample_id}]
+    Exceptions  : no exceptions
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+
+sub get_all_r_square_values{
+    my $self = shift;
+    my @r_squares; #contains ALL the r2 values in the container
+
+    #the keys in the ldContainer hash
+    my $variation_feature_id_1;
+    my $variation_feature_id_2;
+
+   if (! defined $self->{'_default_population'}){
+       $self->{'_default_population'} = $self->_get_major_population;
+   }
+    foreach my $key_ld (keys %{$self->{'ldContainer'}}){
+	my %r_square;  #contains a single r2 value in the container {variation_feature r2 population_id}
+	($variation_feature_id_1, $variation_feature_id_2) =  split /-/,$key_ld; #get the variation_features ids
+	if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){
+	    $r_square{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1};
+	    $r_square{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2};
+	    $r_square{'r2'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'r2'};
+	    $r_square{'sample_id'} = $self->{'_default_population'};	    
+	    #and add all the ld information to the final list
+	    push @r_squares, \%r_square;
+	}
+    }
+    return \@r_squares;
+}
+
+=head2 get_all_d_prime_values
+
+    Example     : $d_prime_values = $obj->get_all_d_prime_values();
+    Description : Get all d_prime values contained in the LDFeatureContainer object
+    ReturnType  : reference to list of [{variation1=>Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, d_prime=>d_prime, sample_id=>population_sample_id}]
+    Exceptions  : no exceptions
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+
+sub get_all_d_prime_values{
+   my $self = shift;
+    my @d_primes; #contains ALL the d_prime values in the container
+
+    #the keys in the ldContainer hash
+    my $variation_feature_id_1;
+    my $variation_feature_id_2;
+
+   if (! defined $self->{'_default_population'}){
+       $self->{'_default_population'} = $self->_get_major_population;
+   }
+   foreach my $key_ld (keys %{$self->{'ldContainer'}}){
+       my %d_prime;  #contains a single d_prime value in the container {variation_feature d_prime population_id}
+       ($variation_feature_id_1, $variation_feature_id_2) =  split /-/,$key_ld; #get the variation_features ids
+       #add the information to the ld_value array
+       if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){
+	   $d_prime{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1};
+	   $d_prime{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2};
+	   $d_prime{'d_prime'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'d_prime'};
+	   $d_prime{'sample_id'} = $self->{'_default_population'};
+	   #and add all the ld information to the final list if exists the value
+	   push @d_primes, \%d_prime;
+       }
+
+   }
+   return \@d_primes;
+}
+
+=head2 get_all_populations
+
+    Arg [1]     : (optional) Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Arg [2]     : (optional) Bio::EnsEMBL::Variation::VariationFeature $variationFeature
+    Example     : $populations = $obj->get_all_populations($vf1,$vf2);
+    Description : If no arguments provided, returns ALL the populations present in the container. When 2 variation features provided, returns the 
+    population/populations where these variation features occurs
+    ReturnType  : ref to an array of int
+    Exceptions  : throw on incorrect arguments
+    Caller      : general
+    Status      : At Risk
+
+=cut
+
+sub get_all_populations{
+    my $self = shift;
+    my $variation_feature_1 = shift;
+    my $variation_feature_2 = shift;
+    my %populations;
+    my @populations;    
+    my $key;
+
+    #if no variation provided, return ALL the populations in the container
+    if (! defined($variation_feature_1) && ! defined($variation_feature_2)){
+	foreach my $key (keys %{$self->{'ldContainer'}}){
+	    map {$populations{$_}++} keys %{$self->{'ldContainer'}->{$key}};
+	}
+	@populations = keys %populations;
+    }
+    else{
+	#first, check if both arguments have been properly provided
+	if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){
+	    throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected");
+	}
+	#if the variation_features are correct, return the list of populations
+	else{
+	    #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1
+	    if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){
+		$key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID();
+	    }
+	    else{
+		$key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID();
+	    }	    
+	    @populations = keys %{$self->{'ldContainer'}->{$key}};
+	}
+    }
+
+    return \@populations;
+}
+
+#returns from the container the population_id with the maximum number of pairwise_ld 
+sub _get_populations {
+    my $self = shift;
+    my %populations;
+
+    foreach my $key (keys %{$self->{'ldContainer'}}){
+	map {$populations{$_}++} keys %{$self->{'ldContainer'}->{$key}};
+    }
+    my @sorted_populations = sort{$populations{$b} <=> $populations{$a}} keys %populations;
+    return @sorted_populations;
+}
+
+sub _get_major_population { 
+  my( $pop ) = $_[0]->_get_populations;
+  return $pop;
+}
+1;
+