view variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line source

#
# Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix
#
# You may distribute this module under the same terms as Perl itself

=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.


=head1 NAME

Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix. 
In EFG this represents the binding affinities of a Transcription Factor to DNA.

=head1 SYNOPSIS

use Bio::EnsEMBL::Funcgen::BindingMatrix;

my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
                                                        -name  => "MA0122.1",
                                                        -type => "Jaspar",
                                                        -description => "Nkx3-2 Jaspar Matrix",
                                                       );
$matrix->frequencies("A  [ 4  1 13 24  0  0  6  4  9 ]
                      C  [ 7  4  1  0  0  0  0  6  7 ]
                      G  [ 4  5  7  0 24  0 18 12  5 ]
                      T  [ 9 14  3  0  0 24  0  2  3 ]");

print $matrix->relative_affinity("TGGCCACCA")."\n";

print $matrix->threshold."\n";

=head1 DESCRIPTION

This class represents information about a BindingMatrix, containing the name 
(e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix 
is always associated to an Analysis (indicating the origin of the matrix e.g. 
Jaspar) and a FeatureType (the binding factor).   

=head1 SEE ALSO

Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor

=cut


use strict;
use warnings;

package Bio::EnsEMBL::Funcgen::BindingMatrix;

use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ;
use Bio::EnsEMBL::Utils::Exception qw( throw warning );
use Bio::EnsEMBL::Funcgen::Storable;

use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Storable);

=head2 new

  Arg [-name]: string - name of Matrix
  Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained
  Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix
  Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix
  Arg [-description]: (optional) string - descriptiom of Matrix
  Example    : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
                                                               -name  => "MA0122.1",
                                                               -analysis => $analysis,
                                                               -description => "Jaspar Matrix",
                                                                );
  Description: Constructor method for BindingMatrix class
  Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix
  Exceptions : Throws if name or/and type not defined
  Caller     : General
  Status     : Medium risk

=cut

sub new {
  my $caller = shift;

  my $obj_class = ref($caller) || $caller;
  my $self = $obj_class->SUPER::new(@_);
  
  my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange
	( [
	   'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD'
	  ], @_);
  
  
  if(! defined $name){
    throw("Must supply a name\n");
  }

  if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){
    throw("You must define a valid Bio::EnsEMBL::Analysis");
    #leave is stored test to adaptor
  }
  
  if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){
	throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType");
	#leave is stored test to adaptor
  }

  $self->name($name);
  $self->{analysis} = $analysis;
  $self->{feature_type} = $ftype;
  $self->frequencies($freq) if $freq;
  $self->description($desc) if $desc;
  $self->threshold($thresh) if $thresh;

  return $self;
}


=head2 feature_type

  Example    : my $ft_name = $matrix->feature_type()->name();
  Description: Getter for the feature_type attribute for this matrix.
  Returntype : Bio::EnsEMBL::Funcgen::FeatureType
  Exceptions : None
  Caller     : General
  Status     : At risk

=cut

sub feature_type {
  my $self = shift;
  
  return $self->{'feature_type'};
}


=head2 name

  Arg [1]    : (optional) string - name
  Example    : my $name = $matrix->name();
  Description: Getter and setter of name attribute
  Returntype : string
  Exceptions : None
  Caller     : General
  Status     : Low Risk

=cut

sub name {
    my $self = shift;
    $self->{'name'} = shift if @_;
    return $self->{'name'};
}

=head2 description

  Arg [1]    : (optional) string - description
  Example    : my $desc = $matrix->description();
  Description: Getter and setter of description attribute 
  Returntype : string
  Exceptions : None
  Caller     : General
  Status     : Low Risk

=cut

sub description {
    my $self = shift;
    $self->{'description'} = shift if @_;
    return $self->{'description'};
}

=head2 threshold

  Arg [1]    : (optional) float - threshold
  Example    : my $thresh = $matrix->threshold();
  Description: Getter and setter of threshold attribute 
  Returntype : float
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub threshold {
    my $self = shift;
    $self->{'threshold'} = shift if @_;
    return $self->{'threshold'};
}


=head2 analysis
  Example    : $matrix->analysis()->logic_name();
  Description: Getter for the feature_type attribute for this matrix.
  Returntype : Bio::EnsEMBL::Analysis
  Exceptions : None
  Caller     : General
  Status     : At risk

=cut

sub analysis {
  my $self = shift;

  return $self->{'analysis'};
}


=head2 frequencies

  Arg [1]    : (optional) string - frequencies
  Example    : $matrix->frequencies($frequencies_string);
  Description: Getter and setter of frequencies attribute

  The attribute is a string representing the matrix binding
  affinities in the Jaspar format. E.g. 
  	">
  	[ ]
  	"
  
  Returntype : string
  Exceptions : Throws if the string attribute is not a properly
  formed matrix in the Jaspar format
  Caller     : General
  Status     : At Risk

=cut

sub frequencies {
  my $self = shift;
 
  my $frequencies = shift if @_; 
  if($frequencies){
  	$self->_weights($frequencies);
  	$self->{'frequencies'} = $frequencies;  	
  }
  return $self->{'frequencies'};
}

=head2 frequencies_revcomp

  Example    : $matrix->frequencies_revcomp();
  Description: Getter for the reverse complement frequencies attribute

  The attribute represents the reverse complement of frequencies
  
  Returntype : string
  Caller     : General
  Status     : At Risk

=cut

sub frequencies_revcomp {
  my $self = shift;
 
  return $self->{'frequencies_revcomp'};
}


=head2 relative_affinity

  Arg [1]    : string - Binding Site Sequence
  Arg [2]    : (optional) boolean - 1 if results are to be in linear scale (default is log scale)
  Example    : $matrix->relative_affinity($sequence);
  Description: Calculates the binding affinity of a given sequence
	relative to the optimal site for the matrix
	The site is taken as if it were in the proper orientation
        Considers a purely random background p(A)=p(C)=p(G)=p(T)
  Returntype : double
  Exceptions : Throws if the sequence length does not have the matrix length
  or if the sequence has unclear bases (N is not accepted)
  Caller     : General
  Status     : At Risk

=cut

sub relative_affinity {
  my ($self, $sequence, $linear) = (shift, shift, shift);
  $sequence =~ s/^\s+//;
  $sequence =~ s/\s+$//;
  
  throw "No sequence given" if !$sequence;
  $sequence = uc($sequence);
  if($sequence =~ /[^ACGT]/){
    throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted";	
  }
  
  my $weight_matrix = $self->_weights;
  my $matrix_length = scalar(@{$weight_matrix->{'A'}});
  if(length($sequence) != $matrix_length){
    throw "Sequence $sequence does not have length $matrix_length";
  }
  
  my $log_odds = 0;
  my @bases = split(//,$sequence);
  for(my $i=0;$i<$matrix_length;$i++){
    $log_odds += $weight_matrix->{$bases[$i]}->[$i];	
  }
  
  #This log scale may be quite unrealistic... but usefull just for comparisons...
  if(!$linear){
    return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind);
  } else {
    return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind));
  }

}

=head2 is_position_informative

  Arg [1]    : int - 1-based position withing the matrix
  Arg [2]    : (optional) double - threshold [0-2] for information content [default is 1.5]
  Example    : $matrix->is_position_informative($pos);
  Description: Returns true if position information content is over threshold
  Returntype : boolean
  Exceptions : Throws if position or threshold out of bounds
  Caller     : General
  Status     : At High Risk

=cut

sub is_position_informative {
  my ($self,$position,$threshold) = (shift,shift,shift);
  throw "Need a position" if(!defined($position));
  throw "Position out of bounds" if(($position<1) || ($position > $self->length));
  if(!defined($threshold)){ $threshold = 1.5; }
  throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2));
  return ($self->{'ic'}->[$position-1] >= $threshold);
}



=head2 length

  Example    : $bm->length();
  Description: Returns the length of the the matrix (e.g. 19bp long)
  Returntype : int with the length of this binding matrix 
  Exceptions : none
  Caller     : General
  Status     : At Risk

=cut

sub length {
  my $self = shift;

  my $weight_matrix = $self->_weights;

  return scalar(@{$weight_matrix->{'A'}});
}

=head2 _weights

  Arg [1]    : (optional) string - frequencies
  Example    : _weights($frequencies);
  Description: Private Getter Setter for the weight matrix based on frequencies
  Returntype : HASHREF with the weights of this binding matrix 
  Exceptions : Throws if the frequencies attribute string does not correspond 
  to 4 rows of an equal number of integer numbers.
  Caller     : Self
  Status     : At Risk

=cut

sub _weights {
	my $self = shift;
	
 	#for the moment use equiprobability and constant pseudo-count
	my $pseudo = 0.1;

 	#TODO allow for it to be passed as parameters?
  	my $frequencies = shift if @_; 
  	if($frequencies){
  		$frequencies =~ s/^(>.*?\n)//;
		my $header = $1;

  		my ($a,$c,$g,$t) = split(/\n/,$frequencies);
  		my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a));
  		my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c));
  		my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g));
  		my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t));
		if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){
			throw "Frequencies provided are not a valid frequency matrix"		
		}
		$self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo);

		#Create the reverse complement
		my @revT = reverse(@As);
		my @revA = reverse(@Ts);
		my @revC = reverse(@Gs);
		my @revG = reverse(@Cs);
		my $revcomp = $header;
		$revcomp.= "A [ ".join("\t",@revA)." ]\n";
		$revcomp.= "C [ ".join("\t",@revC)." ]\n";
		$revcomp.= "G [ ".join("\t",@revG)." ]\n";
		$revcomp.= "T [ ".join("\t",@revT)." ]\n";
		$self->{'frequencies_revcomp'} = $revcomp;

  		my @totals;
  		for(my $i=0;$i<scalar(@As);$i++){ 
  			$totals[$i]=$As[$i]+$Cs[$i]+$Gs[$i]+$Ts[$i];
  		}
  		
		my %weights;			
		#We can allow distinct background per nucleotide, instead of 0.25 for all... pass as parameter
		#But if the matrix was obtained using in-vivo data, it shouldn't matter the organism nucleotide bias..
		#We're using 0.1 as pseudo-count... the matrix cannot have very few elements... (e.g. <30 not good)
		my @was; for(my $i=0;$i<scalar(@As);$i++){ $was[$i] = log((($As[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
		$weights{'A'} = \@was;
		my @wcs; for(my $i=0;$i<scalar(@Cs);$i++){ $wcs[$i] = log((($Cs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
		$weights{'C'} = \@wcs;
		my @wgs; for(my $i=0;$i<scalar(@Gs);$i++){ $wgs[$i] = log((($Gs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
		$weights{'G'} = \@wgs;
		my @wts; for(my $i=0;$i<scalar(@Ts);$i++){ $wts[$i] = log((($Ts[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };	     
		$weights{'T'} = \@wts;
	
		$self->{'weights'} = \%weights;

		my $max = 0; my $min = 0;
		for(my $i=0;$i<scalar(@As);$i++){
			my $col = [ $was[$i], $wcs[$i], $wgs[$i], $wts[$i] ];
			$min += _min($col);
			$max += _max($col);
		}

		#Log scale
		$self->_max_bind($max);
		$self->_min_bind($min);
	}
	
	return $self->{'weights'};	

}

=head2 _calc_ic

  Example    : _calc_ic($as,$cs,$gs,$ts,$pseudo);
  Description: Private function to calculate the matrix information content per position
  Caller     : self
  Status     : At Risk

=cut

sub _calc_ic {
  my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift);
  my @ic = ();
  for (my $i=0;$i<scalar(@$as);$i++){
    my $total_i = $as->[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo);
    my $fas = ($as->[$i] + $pseudo) / $total_i;
    my $fcs = ($cs->[$i] + $pseudo) / $total_i;
    my $fgs = ($gs->[$i] + $pseudo) / $total_i;
    my $fts = ($ts->[$i] + $pseudo) / $total_i;    
    my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2));
    push @ic, $ic_i;
  }
  $self->{'ic'} = \@ic;
}

sub _parse_matrix_line {
	 my ($pat,$line) = (shift,shift);
	 $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//;	
	 return $line;
}

sub _max { return _min_max(shift, 0); }

sub _min { return _min_max(shift, 1); }

sub _min_max {
	my ($list,$min) = (shift, shift);
	my $min_max = $list->[0];
	map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list;
	return $min_max;
}


=head2 _max_bind

  Arg [1]    : (optional) double - maximum binding affinity
  Example    : $matrix->_max_bind(10.2);
  Description: Private Getter and setter of max_bind attribute (not to be called directly)
  Returntype : float with the maximum binding affinity of the matrix 
  Exceptions : None
  Caller     : Self
  Status     : At Risk

=cut

sub _max_bind {
  my $self = shift;
  
  $self->{'max_bind'} = shift if @_;

  return $self->{'max_bind'};
}

=head2 _min_bind

  Arg [1]    : (optional) double - minimum binding affinity
  Example    : $matrix->_min_bind(-10.2);
  Description: Private Getter and setter of min_bind attribute (not to be called directly)
  Returntype : float with the minimum binding affinity of the matrix 
  Exceptions : None
  Caller     : Self
  Status     : At Risk

=cut

sub _min_bind {
  my $self = shift;
  
  $self->{'min_bind'} = shift if @_;

  return $self->{'min_bind'};
}

1;