diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,529 @@
+#
+# Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix
+#
+# You may distribute this module under the same terms as Perl itself
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix. 
+In EFG this represents the binding affinities of a Transcription Factor to DNA.
+
+=head1 SYNOPSIS
+
+use Bio::EnsEMBL::Funcgen::BindingMatrix;
+
+my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
+                                                        -name  => "MA0122.1",
+                                                        -type => "Jaspar",
+                                                        -description => "Nkx3-2 Jaspar Matrix",
+                                                       );
+$matrix->frequencies("A  [ 4  1 13 24  0  0  6  4  9 ]
+                      C  [ 7  4  1  0  0  0  0  6  7 ]
+                      G  [ 4  5  7  0 24  0 18 12  5 ]
+                      T  [ 9 14  3  0  0 24  0  2  3 ]");
+
+print $matrix->relative_affinity("TGGCCACCA")."\n";
+
+print $matrix->threshold."\n";
+
+=head1 DESCRIPTION
+
+This class represents information about a BindingMatrix, containing the name 
+(e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix 
+is always associated to an Analysis (indicating the origin of the matrix e.g. 
+Jaspar) and a FeatureType (the binding factor).   
+
+=head1 SEE ALSO
+
+Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor
+
+=cut
+
+
+use strict;
+use warnings;
+
+package Bio::EnsEMBL::Funcgen::BindingMatrix;
+
+use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ;
+use Bio::EnsEMBL::Utils::Exception qw( throw warning );
+use Bio::EnsEMBL::Funcgen::Storable;
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Storable);
+
+=head2 new
+
+  Arg [-name]: string - name of Matrix
+  Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained
+  Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix
+  Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix
+  Arg [-description]: (optional) string - descriptiom of Matrix
+  Example    : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
+                                                               -name  => "MA0122.1",
+                                                               -analysis => $analysis,
+                                                               -description => "Jaspar Matrix",
+                                                                );
+  Description: Constructor method for BindingMatrix class
+  Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix
+  Exceptions : Throws if name or/and type not defined
+  Caller     : General
+  Status     : Medium risk
+
+=cut
+
+sub new {
+  my $caller = shift;
+
+  my $obj_class = ref($caller) || $caller;
+  my $self = $obj_class->SUPER::new(@_);
+  
+  my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange
+	( [
+	   'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD'
+	  ], @_);
+  
+  
+  if(! defined $name){
+    throw("Must supply a name\n");
+  }
+
+  if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){
+    throw("You must define a valid Bio::EnsEMBL::Analysis");
+    #leave is stored test to adaptor
+  }
+  
+  if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){
+	throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType");
+	#leave is stored test to adaptor
+  }
+
+  $self->name($name);
+  $self->{analysis} = $analysis;
+  $self->{feature_type} = $ftype;
+  $self->frequencies($freq) if $freq;
+  $self->description($desc) if $desc;
+  $self->threshold($thresh) if $thresh;
+
+  return $self;
+}
+
+
+=head2 feature_type
+
+  Example    : my $ft_name = $matrix->feature_type()->name();
+  Description: Getter for the feature_type attribute for this matrix.
+  Returntype : Bio::EnsEMBL::Funcgen::FeatureType
+  Exceptions : None
+  Caller     : General
+  Status     : At risk
+
+=cut
+
+sub feature_type {
+  my $self = shift;
+  
+  return $self->{'feature_type'};
+}
+
+
+=head2 name
+
+  Arg [1]    : (optional) string - name
+  Example    : my $name = $matrix->name();
+  Description: Getter and setter of name attribute
+  Returntype : string
+  Exceptions : None
+  Caller     : General
+  Status     : Low Risk
+
+=cut
+
+sub name {
+    my $self = shift;
+    $self->{'name'} = shift if @_;
+    return $self->{'name'};
+}
+
+=head2 description
+
+  Arg [1]    : (optional) string - description
+  Example    : my $desc = $matrix->description();
+  Description: Getter and setter of description attribute 
+  Returntype : string
+  Exceptions : None
+  Caller     : General
+  Status     : Low Risk
+
+=cut
+
+sub description {
+    my $self = shift;
+    $self->{'description'} = shift if @_;
+    return $self->{'description'};
+}
+
+=head2 threshold
+
+  Arg [1]    : (optional) float - threshold
+  Example    : my $thresh = $matrix->threshold();
+  Description: Getter and setter of threshold attribute 
+  Returntype : float
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub threshold {
+    my $self = shift;
+    $self->{'threshold'} = shift if @_;
+    return $self->{'threshold'};
+}
+
+
+=head2 analysis
+  Example    : $matrix->analysis()->logic_name();
+  Description: Getter for the feature_type attribute for this matrix.
+  Returntype : Bio::EnsEMBL::Analysis
+  Exceptions : None
+  Caller     : General
+  Status     : At risk
+
+=cut
+
+sub analysis {
+  my $self = shift;
+
+  return $self->{'analysis'};
+}
+
+
+=head2 frequencies
+
+  Arg [1]    : (optional) string - frequencies
+  Example    : $matrix->frequencies($frequencies_string);
+  Description: Getter and setter of frequencies attribute
+
+  The attribute is a string representing the matrix binding
+  affinities in the Jaspar format. E.g. 
+  	">
+  	[ ]
+  	"
+  
+  Returntype : string
+  Exceptions : Throws if the string attribute is not a properly
+  formed matrix in the Jaspar format
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub frequencies {
+  my $self = shift;
+ 
+  my $frequencies = shift if @_; 
+  if($frequencies){
+  	$self->_weights($frequencies);
+  	$self->{'frequencies'} = $frequencies;  	
+  }
+  return $self->{'frequencies'};
+}
+
+=head2 frequencies_revcomp
+
+  Example    : $matrix->frequencies_revcomp();
+  Description: Getter for the reverse complement frequencies attribute
+
+  The attribute represents the reverse complement of frequencies
+  
+  Returntype : string
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub frequencies_revcomp {
+  my $self = shift;
+ 
+  return $self->{'frequencies_revcomp'};
+}
+
+
+=head2 relative_affinity
+
+  Arg [1]    : string - Binding Site Sequence
+  Arg [2]    : (optional) boolean - 1 if results are to be in linear scale (default is log scale)
+  Example    : $matrix->relative_affinity($sequence);
+  Description: Calculates the binding affinity of a given sequence
+	relative to the optimal site for the matrix
+	The site is taken as if it were in the proper orientation
+        Considers a purely random background p(A)=p(C)=p(G)=p(T)
+  Returntype : double
+  Exceptions : Throws if the sequence length does not have the matrix length
+  or if the sequence has unclear bases (N is not accepted)
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub relative_affinity {
+  my ($self, $sequence, $linear) = (shift, shift, shift);
+  $sequence =~ s/^\s+//;
+  $sequence =~ s/\s+$//;
+  
+  throw "No sequence given" if !$sequence;
+  $sequence = uc($sequence);
+  if($sequence =~ /[^ACGT]/){
+    throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted";	
+  }
+  
+  my $weight_matrix = $self->_weights;
+  my $matrix_length = scalar(@{$weight_matrix->{'A'}});
+  if(length($sequence) != $matrix_length){
+    throw "Sequence $sequence does not have length $matrix_length";
+  }
+  
+  my $log_odds = 0;
+  my @bases = split(//,$sequence);
+  for(my $i=0;$i<$matrix_length;$i++){
+    $log_odds += $weight_matrix->{$bases[$i]}->[$i];	
+  }
+  
+  #This log scale may be quite unrealistic... but usefull just for comparisons...
+  if(!$linear){
+    return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind);
+  } else {
+    return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind));
+  }
+
+}
+
+=head2 is_position_informative
+
+  Arg [1]    : int - 1-based position withing the matrix
+  Arg [2]    : (optional) double - threshold [0-2] for information content [default is 1.5]
+  Example    : $matrix->is_position_informative($pos);
+  Description: Returns true if position information content is over threshold
+  Returntype : boolean
+  Exceptions : Throws if position or threshold out of bounds
+  Caller     : General
+  Status     : At High Risk
+
+=cut
+
+sub is_position_informative {
+  my ($self,$position,$threshold) = (shift,shift,shift);
+  throw "Need a position" if(!defined($position));
+  throw "Position out of bounds" if(($position<1) || ($position > $self->length));
+  if(!defined($threshold)){ $threshold = 1.5; }
+  throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2));
+  return ($self->{'ic'}->[$position-1] >= $threshold);
+}
+
+
+
+=head2 length
+
+  Example    : $bm->length();
+  Description: Returns the length of the the matrix (e.g. 19bp long)
+  Returntype : int with the length of this binding matrix 
+  Exceptions : none
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub length {
+  my $self = shift;
+
+  my $weight_matrix = $self->_weights;
+
+  return scalar(@{$weight_matrix->{'A'}});
+}
+
+=head2 _weights
+
+  Arg [1]    : (optional) string - frequencies
+  Example    : _weights($frequencies);
+  Description: Private Getter Setter for the weight matrix based on frequencies
+  Returntype : HASHREF with the weights of this binding matrix 
+  Exceptions : Throws if the frequencies attribute string does not correspond 
+  to 4 rows of an equal number of integer numbers.
+  Caller     : Self
+  Status     : At Risk
+
+=cut
+
+sub _weights {
+	my $self = shift;
+	
+ 	#for the moment use equiprobability and constant pseudo-count
+	my $pseudo = 0.1;
+
+ 	#TODO allow for it to be passed as parameters?
+  	my $frequencies = shift if @_; 
+  	if($frequencies){
+  		$frequencies =~ s/^(>.*?\n)//;
+		my $header = $1;
+
+  		my ($a,$c,$g,$t) = split(/\n/,$frequencies);
+  		my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a));
+  		my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c));
+  		my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g));
+  		my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t));
+		if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){
+			throw "Frequencies provided are not a valid frequency matrix"		
+		}
+		$self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo);
+
+		#Create the reverse complement
+		my @revT = reverse(@As);
+		my @revA = reverse(@Ts);
+		my @revC = reverse(@Gs);
+		my @revG = reverse(@Cs);
+		my $revcomp = $header;
+		$revcomp.= "A [ ".join("\t",@revA)." ]\n";
+		$revcomp.= "C [ ".join("\t",@revC)." ]\n";
+		$revcomp.= "G [ ".join("\t",@revG)." ]\n";
+		$revcomp.= "T [ ".join("\t",@revT)." ]\n";
+		$self->{'frequencies_revcomp'} = $revcomp;
+
+  		my @totals;
+  		for(my $i=0;$i<scalar(@As);$i++){ 
+  			$totals[$i]=$As[$i]+$Cs[$i]+$Gs[$i]+$Ts[$i];
+  		}
+  		
+		my %weights;			
+		#We can allow distinct background per nucleotide, instead of 0.25 for all... pass as parameter
+		#But if the matrix was obtained using in-vivo data, it shouldn't matter the organism nucleotide bias..
+		#We're using 0.1 as pseudo-count... the matrix cannot have very few elements... (e.g. <30 not good)
+		my @was; for(my $i=0;$i<scalar(@As);$i++){ $was[$i] = log((($As[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
+		$weights{'A'} = \@was;
+		my @wcs; for(my $i=0;$i<scalar(@Cs);$i++){ $wcs[$i] = log((($Cs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
+		$weights{'C'} = \@wcs;
+		my @wgs; for(my $i=0;$i<scalar(@Gs);$i++){ $wgs[$i] = log((($Gs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
+		$weights{'G'} = \@wgs;
+		my @wts; for(my $i=0;$i<scalar(@Ts);$i++){ $wts[$i] = log((($Ts[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };	     
+		$weights{'T'} = \@wts;
+	
+		$self->{'weights'} = \%weights;
+
+		my $max = 0; my $min = 0;
+		for(my $i=0;$i<scalar(@As);$i++){
+			my $col = [ $was[$i], $wcs[$i], $wgs[$i], $wts[$i] ];
+			$min += _min($col);
+			$max += _max($col);
+		}
+
+		#Log scale
+		$self->_max_bind($max);
+		$self->_min_bind($min);
+	}
+	
+	return $self->{'weights'};	
+
+}
+
+=head2 _calc_ic
+
+  Example    : _calc_ic($as,$cs,$gs,$ts,$pseudo);
+  Description: Private function to calculate the matrix information content per position
+  Caller     : self
+  Status     : At Risk
+
+=cut
+
+sub _calc_ic {
+  my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift);
+  my @ic = ();
+  for (my $i=0;$i<scalar(@$as);$i++){
+    my $total_i = $as->[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo);
+    my $fas = ($as->[$i] + $pseudo) / $total_i;
+    my $fcs = ($cs->[$i] + $pseudo) / $total_i;
+    my $fgs = ($gs->[$i] + $pseudo) / $total_i;
+    my $fts = ($ts->[$i] + $pseudo) / $total_i;    
+    my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2));
+    push @ic, $ic_i;
+  }
+  $self->{'ic'} = \@ic;
+}
+
+sub _parse_matrix_line {
+	 my ($pat,$line) = (shift,shift);
+	 $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//;	
+	 return $line;
+}
+
+sub _max { return _min_max(shift, 0); }
+
+sub _min { return _min_max(shift, 1); }
+
+sub _min_max {
+	my ($list,$min) = (shift, shift);
+	my $min_max = $list->[0];
+	map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list;
+	return $min_max;
+}
+
+
+=head2 _max_bind
+
+  Arg [1]    : (optional) double - maximum binding affinity
+  Example    : $matrix->_max_bind(10.2);
+  Description: Private Getter and setter of max_bind attribute (not to be called directly)
+  Returntype : float with the maximum binding affinity of the matrix 
+  Exceptions : None
+  Caller     : Self
+  Status     : At Risk
+
+=cut
+
+sub _max_bind {
+  my $self = shift;
+  
+  $self->{'max_bind'} = shift if @_;
+
+  return $self->{'max_bind'};
+}
+
+=head2 _min_bind
+
+  Arg [1]    : (optional) double - minimum binding affinity
+  Example    : $matrix->_min_bind(-10.2);
+  Description: Private Getter and setter of min_bind attribute (not to be called directly)
+  Returntype : float with the minimum binding affinity of the matrix 
+  Exceptions : None
+  Caller     : Self
+  Status     : At Risk
+
+=cut
+
+sub _min_bind {
+  my $self = shift;
+  
+  $self->{'min_bind'} = shift if @_;
+
+  return $self->{'min_bind'};
+}
+
+1;