comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix
3 #
4 # You may distribute this module under the same terms as Perl itself
5
6 =head1 LICENSE
7
8 Copyright (c) 1999-2011 The European Bioinformatics Institute and
9 Genome Research Limited. All rights reserved.
10
11 This software is distributed under a modified Apache license.
12 For license details, please see
13
14 http://www.ensembl.org/info/about/code_licence.html
15
16 =head1 CONTACT
17
18 Please email comments or questions to the public Ensembl
19 developers list at <ensembl-dev@ebi.ac.uk>.
20
21 Questions may also be sent to the Ensembl help desk at
22 <helpdesk@ensembl.org>.
23
24
25 =head1 NAME
26
27 Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix.
28 In EFG this represents the binding affinities of a Transcription Factor to DNA.
29
30 =head1 SYNOPSIS
31
32 use Bio::EnsEMBL::Funcgen::BindingMatrix;
33
34 my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
35 -name => "MA0122.1",
36 -type => "Jaspar",
37 -description => "Nkx3-2 Jaspar Matrix",
38 );
39 $matrix->frequencies("A [ 4 1 13 24 0 0 6 4 9 ]
40 C [ 7 4 1 0 0 0 0 6 7 ]
41 G [ 4 5 7 0 24 0 18 12 5 ]
42 T [ 9 14 3 0 0 24 0 2 3 ]");
43
44 print $matrix->relative_affinity("TGGCCACCA")."\n";
45
46 print $matrix->threshold."\n";
47
48 =head1 DESCRIPTION
49
50 This class represents information about a BindingMatrix, containing the name
51 (e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix
52 is always associated to an Analysis (indicating the origin of the matrix e.g.
53 Jaspar) and a FeatureType (the binding factor).
54
55 =head1 SEE ALSO
56
57 Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor
58
59 =cut
60
61
62 use strict;
63 use warnings;
64
65 package Bio::EnsEMBL::Funcgen::BindingMatrix;
66
67 use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ;
68 use Bio::EnsEMBL::Utils::Exception qw( throw warning );
69 use Bio::EnsEMBL::Funcgen::Storable;
70
71 use vars qw(@ISA);
72 @ISA = qw(Bio::EnsEMBL::Funcgen::Storable);
73
74 =head2 new
75
76 Arg [-name]: string - name of Matrix
77 Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained
78 Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix
79 Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix
80 Arg [-description]: (optional) string - descriptiom of Matrix
81 Example : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
82 -name => "MA0122.1",
83 -analysis => $analysis,
84 -description => "Jaspar Matrix",
85 );
86 Description: Constructor method for BindingMatrix class
87 Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix
88 Exceptions : Throws if name or/and type not defined
89 Caller : General
90 Status : Medium risk
91
92 =cut
93
94 sub new {
95 my $caller = shift;
96
97 my $obj_class = ref($caller) || $caller;
98 my $self = $obj_class->SUPER::new(@_);
99
100 my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange
101 ( [
102 'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD'
103 ], @_);
104
105
106 if(! defined $name){
107 throw("Must supply a name\n");
108 }
109
110 if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){
111 throw("You must define a valid Bio::EnsEMBL::Analysis");
112 #leave is stored test to adaptor
113 }
114
115 if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){
116 throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType");
117 #leave is stored test to adaptor
118 }
119
120 $self->name($name);
121 $self->{analysis} = $analysis;
122 $self->{feature_type} = $ftype;
123 $self->frequencies($freq) if $freq;
124 $self->description($desc) if $desc;
125 $self->threshold($thresh) if $thresh;
126
127 return $self;
128 }
129
130
131 =head2 feature_type
132
133 Example : my $ft_name = $matrix->feature_type()->name();
134 Description: Getter for the feature_type attribute for this matrix.
135 Returntype : Bio::EnsEMBL::Funcgen::FeatureType
136 Exceptions : None
137 Caller : General
138 Status : At risk
139
140 =cut
141
142 sub feature_type {
143 my $self = shift;
144
145 return $self->{'feature_type'};
146 }
147
148
149 =head2 name
150
151 Arg [1] : (optional) string - name
152 Example : my $name = $matrix->name();
153 Description: Getter and setter of name attribute
154 Returntype : string
155 Exceptions : None
156 Caller : General
157 Status : Low Risk
158
159 =cut
160
161 sub name {
162 my $self = shift;
163 $self->{'name'} = shift if @_;
164 return $self->{'name'};
165 }
166
167 =head2 description
168
169 Arg [1] : (optional) string - description
170 Example : my $desc = $matrix->description();
171 Description: Getter and setter of description attribute
172 Returntype : string
173 Exceptions : None
174 Caller : General
175 Status : Low Risk
176
177 =cut
178
179 sub description {
180 my $self = shift;
181 $self->{'description'} = shift if @_;
182 return $self->{'description'};
183 }
184
185 =head2 threshold
186
187 Arg [1] : (optional) float - threshold
188 Example : my $thresh = $matrix->threshold();
189 Description: Getter and setter of threshold attribute
190 Returntype : float
191 Exceptions : None
192 Caller : General
193 Status : At Risk
194
195 =cut
196
197 sub threshold {
198 my $self = shift;
199 $self->{'threshold'} = shift if @_;
200 return $self->{'threshold'};
201 }
202
203
204 =head2 analysis
205 Example : $matrix->analysis()->logic_name();
206 Description: Getter for the feature_type attribute for this matrix.
207 Returntype : Bio::EnsEMBL::Analysis
208 Exceptions : None
209 Caller : General
210 Status : At risk
211
212 =cut
213
214 sub analysis {
215 my $self = shift;
216
217 return $self->{'analysis'};
218 }
219
220
221 =head2 frequencies
222
223 Arg [1] : (optional) string - frequencies
224 Example : $matrix->frequencies($frequencies_string);
225 Description: Getter and setter of frequencies attribute
226
227 The attribute is a string representing the matrix binding
228 affinities in the Jaspar format. E.g.
229 ">
230 [ ]
231 "
232
233 Returntype : string
234 Exceptions : Throws if the string attribute is not a properly
235 formed matrix in the Jaspar format
236 Caller : General
237 Status : At Risk
238
239 =cut
240
241 sub frequencies {
242 my $self = shift;
243
244 my $frequencies = shift if @_;
245 if($frequencies){
246 $self->_weights($frequencies);
247 $self->{'frequencies'} = $frequencies;
248 }
249 return $self->{'frequencies'};
250 }
251
252 =head2 frequencies_revcomp
253
254 Example : $matrix->frequencies_revcomp();
255 Description: Getter for the reverse complement frequencies attribute
256
257 The attribute represents the reverse complement of frequencies
258
259 Returntype : string
260 Caller : General
261 Status : At Risk
262
263 =cut
264
265 sub frequencies_revcomp {
266 my $self = shift;
267
268 return $self->{'frequencies_revcomp'};
269 }
270
271
272 =head2 relative_affinity
273
274 Arg [1] : string - Binding Site Sequence
275 Arg [2] : (optional) boolean - 1 if results are to be in linear scale (default is log scale)
276 Example : $matrix->relative_affinity($sequence);
277 Description: Calculates the binding affinity of a given sequence
278 relative to the optimal site for the matrix
279 The site is taken as if it were in the proper orientation
280 Considers a purely random background p(A)=p(C)=p(G)=p(T)
281 Returntype : double
282 Exceptions : Throws if the sequence length does not have the matrix length
283 or if the sequence has unclear bases (N is not accepted)
284 Caller : General
285 Status : At Risk
286
287 =cut
288
289 sub relative_affinity {
290 my ($self, $sequence, $linear) = (shift, shift, shift);
291 $sequence =~ s/^\s+//;
292 $sequence =~ s/\s+$//;
293
294 throw "No sequence given" if !$sequence;
295 $sequence = uc($sequence);
296 if($sequence =~ /[^ACGT]/){
297 throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted";
298 }
299
300 my $weight_matrix = $self->_weights;
301 my $matrix_length = scalar(@{$weight_matrix->{'A'}});
302 if(length($sequence) != $matrix_length){
303 throw "Sequence $sequence does not have length $matrix_length";
304 }
305
306 my $log_odds = 0;
307 my @bases = split(//,$sequence);
308 for(my $i=0;$i<$matrix_length;$i++){
309 $log_odds += $weight_matrix->{$bases[$i]}->[$i];
310 }
311
312 #This log scale may be quite unrealistic... but usefull just for comparisons...
313 if(!$linear){
314 return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind);
315 } else {
316 return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind));
317 }
318
319 }
320
321 =head2 is_position_informative
322
323 Arg [1] : int - 1-based position withing the matrix
324 Arg [2] : (optional) double - threshold [0-2] for information content [default is 1.5]
325 Example : $matrix->is_position_informative($pos);
326 Description: Returns true if position information content is over threshold
327 Returntype : boolean
328 Exceptions : Throws if position or threshold out of bounds
329 Caller : General
330 Status : At High Risk
331
332 =cut
333
334 sub is_position_informative {
335 my ($self,$position,$threshold) = (shift,shift,shift);
336 throw "Need a position" if(!defined($position));
337 throw "Position out of bounds" if(($position<1) || ($position > $self->length));
338 if(!defined($threshold)){ $threshold = 1.5; }
339 throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2));
340 return ($self->{'ic'}->[$position-1] >= $threshold);
341 }
342
343
344
345 =head2 length
346
347 Example : $bm->length();
348 Description: Returns the length of the the matrix (e.g. 19bp long)
349 Returntype : int with the length of this binding matrix
350 Exceptions : none
351 Caller : General
352 Status : At Risk
353
354 =cut
355
356 sub length {
357 my $self = shift;
358
359 my $weight_matrix = $self->_weights;
360
361 return scalar(@{$weight_matrix->{'A'}});
362 }
363
364 =head2 _weights
365
366 Arg [1] : (optional) string - frequencies
367 Example : _weights($frequencies);
368 Description: Private Getter Setter for the weight matrix based on frequencies
369 Returntype : HASHREF with the weights of this binding matrix
370 Exceptions : Throws if the frequencies attribute string does not correspond
371 to 4 rows of an equal number of integer numbers.
372 Caller : Self
373 Status : At Risk
374
375 =cut
376
377 sub _weights {
378 my $self = shift;
379
380 #for the moment use equiprobability and constant pseudo-count
381 my $pseudo = 0.1;
382
383 #TODO allow for it to be passed as parameters?
384 my $frequencies = shift if @_;
385 if($frequencies){
386 $frequencies =~ s/^(>.*?\n)//;
387 my $header = $1;
388
389 my ($a,$c,$g,$t) = split(/\n/,$frequencies);
390 my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a));
391 my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c));
392 my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g));
393 my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t));
394 if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){
395 throw "Frequencies provided are not a valid frequency matrix"
396 }
397 $self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo);
398
399 #Create the reverse complement
400 my @revT = reverse(@As);
401 my @revA = reverse(@Ts);
402 my @revC = reverse(@Gs);
403 my @revG = reverse(@Cs);
404 my $revcomp = $header;
405 $revcomp.= "A [ ".join("\t",@revA)." ]\n";
406 $revcomp.= "C [ ".join("\t",@revC)." ]\n";
407 $revcomp.= "G [ ".join("\t",@revG)." ]\n";
408 $revcomp.= "T [ ".join("\t",@revT)." ]\n";
409 $self->{'frequencies_revcomp'} = $revcomp;
410
411 my @totals;
412 for(my $i=0;$i<scalar(@As);$i++){
413 $totals[$i]=$As[$i]+$Cs[$i]+$Gs[$i]+$Ts[$i];
414 }
415
416 my %weights;
417 #We can allow distinct background per nucleotide, instead of 0.25 for all... pass as parameter
418 #But if the matrix was obtained using in-vivo data, it shouldn't matter the organism nucleotide bias..
419 #We're using 0.1 as pseudo-count... the matrix cannot have very few elements... (e.g. <30 not good)
420 my @was; for(my $i=0;$i<scalar(@As);$i++){ $was[$i] = log((($As[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
421 $weights{'A'} = \@was;
422 my @wcs; for(my $i=0;$i<scalar(@Cs);$i++){ $wcs[$i] = log((($Cs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
423 $weights{'C'} = \@wcs;
424 my @wgs; for(my $i=0;$i<scalar(@Gs);$i++){ $wgs[$i] = log((($Gs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
425 $weights{'G'} = \@wgs;
426 my @wts; for(my $i=0;$i<scalar(@Ts);$i++){ $wts[$i] = log((($Ts[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
427 $weights{'T'} = \@wts;
428
429 $self->{'weights'} = \%weights;
430
431 my $max = 0; my $min = 0;
432 for(my $i=0;$i<scalar(@As);$i++){
433 my $col = [ $was[$i], $wcs[$i], $wgs[$i], $wts[$i] ];
434 $min += _min($col);
435 $max += _max($col);
436 }
437
438 #Log scale
439 $self->_max_bind($max);
440 $self->_min_bind($min);
441 }
442
443 return $self->{'weights'};
444
445 }
446
447 =head2 _calc_ic
448
449 Example : _calc_ic($as,$cs,$gs,$ts,$pseudo);
450 Description: Private function to calculate the matrix information content per position
451 Caller : self
452 Status : At Risk
453
454 =cut
455
456 sub _calc_ic {
457 my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift);
458 my @ic = ();
459 for (my $i=0;$i<scalar(@$as);$i++){
460 my $total_i = $as->[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo);
461 my $fas = ($as->[$i] + $pseudo) / $total_i;
462 my $fcs = ($cs->[$i] + $pseudo) / $total_i;
463 my $fgs = ($gs->[$i] + $pseudo) / $total_i;
464 my $fts = ($ts->[$i] + $pseudo) / $total_i;
465 my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2));
466 push @ic, $ic_i;
467 }
468 $self->{'ic'} = \@ic;
469 }
470
471 sub _parse_matrix_line {
472 my ($pat,$line) = (shift,shift);
473 $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//;
474 return $line;
475 }
476
477 sub _max { return _min_max(shift, 0); }
478
479 sub _min { return _min_max(shift, 1); }
480
481 sub _min_max {
482 my ($list,$min) = (shift, shift);
483 my $min_max = $list->[0];
484 map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list;
485 return $min_max;
486 }
487
488
489 =head2 _max_bind
490
491 Arg [1] : (optional) double - maximum binding affinity
492 Example : $matrix->_max_bind(10.2);
493 Description: Private Getter and setter of max_bind attribute (not to be called directly)
494 Returntype : float with the maximum binding affinity of the matrix
495 Exceptions : None
496 Caller : Self
497 Status : At Risk
498
499 =cut
500
501 sub _max_bind {
502 my $self = shift;
503
504 $self->{'max_bind'} = shift if @_;
505
506 return $self->{'max_bind'};
507 }
508
509 =head2 _min_bind
510
511 Arg [1] : (optional) double - minimum binding affinity
512 Example : $matrix->_min_bind(-10.2);
513 Description: Private Getter and setter of min_bind attribute (not to be called directly)
514 Returntype : float with the minimum binding affinity of the matrix
515 Exceptions : None
516 Caller : Self
517 Status : At Risk
518
519 =cut
520
521 sub _min_bind {
522 my $self = shift;
523
524 $self->{'min_bind'} = shift if @_;
525
526 return $self->{'min_bind'};
527 }
528
529 1;