0
|
1 #
|
|
2 # Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix
|
|
3 #
|
|
4 # You may distribute this module under the same terms as Perl itself
|
|
5
|
|
6 =head1 LICENSE
|
|
7
|
|
8 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
9 Genome Research Limited. All rights reserved.
|
|
10
|
|
11 This software is distributed under a modified Apache license.
|
|
12 For license details, please see
|
|
13
|
|
14 http://www.ensembl.org/info/about/code_licence.html
|
|
15
|
|
16 =head1 CONTACT
|
|
17
|
|
18 Please email comments or questions to the public Ensembl
|
|
19 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
20
|
|
21 Questions may also be sent to the Ensembl help desk at
|
|
22 <helpdesk@ensembl.org>.
|
|
23
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix.
|
|
28 In EFG this represents the binding affinities of a Transcription Factor to DNA.
|
|
29
|
|
30 =head1 SYNOPSIS
|
|
31
|
|
32 use Bio::EnsEMBL::Funcgen::BindingMatrix;
|
|
33
|
|
34 my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
|
|
35 -name => "MA0122.1",
|
|
36 -type => "Jaspar",
|
|
37 -description => "Nkx3-2 Jaspar Matrix",
|
|
38 );
|
|
39 $matrix->frequencies("A [ 4 1 13 24 0 0 6 4 9 ]
|
|
40 C [ 7 4 1 0 0 0 0 6 7 ]
|
|
41 G [ 4 5 7 0 24 0 18 12 5 ]
|
|
42 T [ 9 14 3 0 0 24 0 2 3 ]");
|
|
43
|
|
44 print $matrix->relative_affinity("TGGCCACCA")."\n";
|
|
45
|
|
46 print $matrix->threshold."\n";
|
|
47
|
|
48 =head1 DESCRIPTION
|
|
49
|
|
50 This class represents information about a BindingMatrix, containing the name
|
|
51 (e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix
|
|
52 is always associated to an Analysis (indicating the origin of the matrix e.g.
|
|
53 Jaspar) and a FeatureType (the binding factor).
|
|
54
|
|
55 =head1 SEE ALSO
|
|
56
|
|
57 Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor
|
|
58
|
|
59 =cut
|
|
60
|
|
61
|
|
62 use strict;
|
|
63 use warnings;
|
|
64
|
|
65 package Bio::EnsEMBL::Funcgen::BindingMatrix;
|
|
66
|
|
67 use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ;
|
|
68 use Bio::EnsEMBL::Utils::Exception qw( throw warning );
|
|
69 use Bio::EnsEMBL::Funcgen::Storable;
|
|
70
|
|
71 use vars qw(@ISA);
|
|
72 @ISA = qw(Bio::EnsEMBL::Funcgen::Storable);
|
|
73
|
|
74 =head2 new
|
|
75
|
|
76 Arg [-name]: string - name of Matrix
|
|
77 Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained
|
|
78 Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix
|
|
79 Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix
|
|
80 Arg [-description]: (optional) string - descriptiom of Matrix
|
|
81 Example : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new(
|
|
82 -name => "MA0122.1",
|
|
83 -analysis => $analysis,
|
|
84 -description => "Jaspar Matrix",
|
|
85 );
|
|
86 Description: Constructor method for BindingMatrix class
|
|
87 Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix
|
|
88 Exceptions : Throws if name or/and type not defined
|
|
89 Caller : General
|
|
90 Status : Medium risk
|
|
91
|
|
92 =cut
|
|
93
|
|
94 sub new {
|
|
95 my $caller = shift;
|
|
96
|
|
97 my $obj_class = ref($caller) || $caller;
|
|
98 my $self = $obj_class->SUPER::new(@_);
|
|
99
|
|
100 my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange
|
|
101 ( [
|
|
102 'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD'
|
|
103 ], @_);
|
|
104
|
|
105
|
|
106 if(! defined $name){
|
|
107 throw("Must supply a name\n");
|
|
108 }
|
|
109
|
|
110 if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){
|
|
111 throw("You must define a valid Bio::EnsEMBL::Analysis");
|
|
112 #leave is stored test to adaptor
|
|
113 }
|
|
114
|
|
115 if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){
|
|
116 throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType");
|
|
117 #leave is stored test to adaptor
|
|
118 }
|
|
119
|
|
120 $self->name($name);
|
|
121 $self->{analysis} = $analysis;
|
|
122 $self->{feature_type} = $ftype;
|
|
123 $self->frequencies($freq) if $freq;
|
|
124 $self->description($desc) if $desc;
|
|
125 $self->threshold($thresh) if $thresh;
|
|
126
|
|
127 return $self;
|
|
128 }
|
|
129
|
|
130
|
|
131 =head2 feature_type
|
|
132
|
|
133 Example : my $ft_name = $matrix->feature_type()->name();
|
|
134 Description: Getter for the feature_type attribute for this matrix.
|
|
135 Returntype : Bio::EnsEMBL::Funcgen::FeatureType
|
|
136 Exceptions : None
|
|
137 Caller : General
|
|
138 Status : At risk
|
|
139
|
|
140 =cut
|
|
141
|
|
142 sub feature_type {
|
|
143 my $self = shift;
|
|
144
|
|
145 return $self->{'feature_type'};
|
|
146 }
|
|
147
|
|
148
|
|
149 =head2 name
|
|
150
|
|
151 Arg [1] : (optional) string - name
|
|
152 Example : my $name = $matrix->name();
|
|
153 Description: Getter and setter of name attribute
|
|
154 Returntype : string
|
|
155 Exceptions : None
|
|
156 Caller : General
|
|
157 Status : Low Risk
|
|
158
|
|
159 =cut
|
|
160
|
|
161 sub name {
|
|
162 my $self = shift;
|
|
163 $self->{'name'} = shift if @_;
|
|
164 return $self->{'name'};
|
|
165 }
|
|
166
|
|
167 =head2 description
|
|
168
|
|
169 Arg [1] : (optional) string - description
|
|
170 Example : my $desc = $matrix->description();
|
|
171 Description: Getter and setter of description attribute
|
|
172 Returntype : string
|
|
173 Exceptions : None
|
|
174 Caller : General
|
|
175 Status : Low Risk
|
|
176
|
|
177 =cut
|
|
178
|
|
179 sub description {
|
|
180 my $self = shift;
|
|
181 $self->{'description'} = shift if @_;
|
|
182 return $self->{'description'};
|
|
183 }
|
|
184
|
|
185 =head2 threshold
|
|
186
|
|
187 Arg [1] : (optional) float - threshold
|
|
188 Example : my $thresh = $matrix->threshold();
|
|
189 Description: Getter and setter of threshold attribute
|
|
190 Returntype : float
|
|
191 Exceptions : None
|
|
192 Caller : General
|
|
193 Status : At Risk
|
|
194
|
|
195 =cut
|
|
196
|
|
197 sub threshold {
|
|
198 my $self = shift;
|
|
199 $self->{'threshold'} = shift if @_;
|
|
200 return $self->{'threshold'};
|
|
201 }
|
|
202
|
|
203
|
|
204 =head2 analysis
|
|
205 Example : $matrix->analysis()->logic_name();
|
|
206 Description: Getter for the feature_type attribute for this matrix.
|
|
207 Returntype : Bio::EnsEMBL::Analysis
|
|
208 Exceptions : None
|
|
209 Caller : General
|
|
210 Status : At risk
|
|
211
|
|
212 =cut
|
|
213
|
|
214 sub analysis {
|
|
215 my $self = shift;
|
|
216
|
|
217 return $self->{'analysis'};
|
|
218 }
|
|
219
|
|
220
|
|
221 =head2 frequencies
|
|
222
|
|
223 Arg [1] : (optional) string - frequencies
|
|
224 Example : $matrix->frequencies($frequencies_string);
|
|
225 Description: Getter and setter of frequencies attribute
|
|
226
|
|
227 The attribute is a string representing the matrix binding
|
|
228 affinities in the Jaspar format. E.g.
|
|
229 ">
|
|
230 [ ]
|
|
231 "
|
|
232
|
|
233 Returntype : string
|
|
234 Exceptions : Throws if the string attribute is not a properly
|
|
235 formed matrix in the Jaspar format
|
|
236 Caller : General
|
|
237 Status : At Risk
|
|
238
|
|
239 =cut
|
|
240
|
|
241 sub frequencies {
|
|
242 my $self = shift;
|
|
243
|
|
244 my $frequencies = shift if @_;
|
|
245 if($frequencies){
|
|
246 $self->_weights($frequencies);
|
|
247 $self->{'frequencies'} = $frequencies;
|
|
248 }
|
|
249 return $self->{'frequencies'};
|
|
250 }
|
|
251
|
|
252 =head2 frequencies_revcomp
|
|
253
|
|
254 Example : $matrix->frequencies_revcomp();
|
|
255 Description: Getter for the reverse complement frequencies attribute
|
|
256
|
|
257 The attribute represents the reverse complement of frequencies
|
|
258
|
|
259 Returntype : string
|
|
260 Caller : General
|
|
261 Status : At Risk
|
|
262
|
|
263 =cut
|
|
264
|
|
265 sub frequencies_revcomp {
|
|
266 my $self = shift;
|
|
267
|
|
268 return $self->{'frequencies_revcomp'};
|
|
269 }
|
|
270
|
|
271
|
|
272 =head2 relative_affinity
|
|
273
|
|
274 Arg [1] : string - Binding Site Sequence
|
|
275 Arg [2] : (optional) boolean - 1 if results are to be in linear scale (default is log scale)
|
|
276 Example : $matrix->relative_affinity($sequence);
|
|
277 Description: Calculates the binding affinity of a given sequence
|
|
278 relative to the optimal site for the matrix
|
|
279 The site is taken as if it were in the proper orientation
|
|
280 Considers a purely random background p(A)=p(C)=p(G)=p(T)
|
|
281 Returntype : double
|
|
282 Exceptions : Throws if the sequence length does not have the matrix length
|
|
283 or if the sequence has unclear bases (N is not accepted)
|
|
284 Caller : General
|
|
285 Status : At Risk
|
|
286
|
|
287 =cut
|
|
288
|
|
289 sub relative_affinity {
|
|
290 my ($self, $sequence, $linear) = (shift, shift, shift);
|
|
291 $sequence =~ s/^\s+//;
|
|
292 $sequence =~ s/\s+$//;
|
|
293
|
|
294 throw "No sequence given" if !$sequence;
|
|
295 $sequence = uc($sequence);
|
|
296 if($sequence =~ /[^ACGT]/){
|
|
297 throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted";
|
|
298 }
|
|
299
|
|
300 my $weight_matrix = $self->_weights;
|
|
301 my $matrix_length = scalar(@{$weight_matrix->{'A'}});
|
|
302 if(length($sequence) != $matrix_length){
|
|
303 throw "Sequence $sequence does not have length $matrix_length";
|
|
304 }
|
|
305
|
|
306 my $log_odds = 0;
|
|
307 my @bases = split(//,$sequence);
|
|
308 for(my $i=0;$i<$matrix_length;$i++){
|
|
309 $log_odds += $weight_matrix->{$bases[$i]}->[$i];
|
|
310 }
|
|
311
|
|
312 #This log scale may be quite unrealistic... but usefull just for comparisons...
|
|
313 if(!$linear){
|
|
314 return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind);
|
|
315 } else {
|
|
316 return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind));
|
|
317 }
|
|
318
|
|
319 }
|
|
320
|
|
321 =head2 is_position_informative
|
|
322
|
|
323 Arg [1] : int - 1-based position withing the matrix
|
|
324 Arg [2] : (optional) double - threshold [0-2] for information content [default is 1.5]
|
|
325 Example : $matrix->is_position_informative($pos);
|
|
326 Description: Returns true if position information content is over threshold
|
|
327 Returntype : boolean
|
|
328 Exceptions : Throws if position or threshold out of bounds
|
|
329 Caller : General
|
|
330 Status : At High Risk
|
|
331
|
|
332 =cut
|
|
333
|
|
334 sub is_position_informative {
|
|
335 my ($self,$position,$threshold) = (shift,shift,shift);
|
|
336 throw "Need a position" if(!defined($position));
|
|
337 throw "Position out of bounds" if(($position<1) || ($position > $self->length));
|
|
338 if(!defined($threshold)){ $threshold = 1.5; }
|
|
339 throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2));
|
|
340 return ($self->{'ic'}->[$position-1] >= $threshold);
|
|
341 }
|
|
342
|
|
343
|
|
344
|
|
345 =head2 length
|
|
346
|
|
347 Example : $bm->length();
|
|
348 Description: Returns the length of the the matrix (e.g. 19bp long)
|
|
349 Returntype : int with the length of this binding matrix
|
|
350 Exceptions : none
|
|
351 Caller : General
|
|
352 Status : At Risk
|
|
353
|
|
354 =cut
|
|
355
|
|
356 sub length {
|
|
357 my $self = shift;
|
|
358
|
|
359 my $weight_matrix = $self->_weights;
|
|
360
|
|
361 return scalar(@{$weight_matrix->{'A'}});
|
|
362 }
|
|
363
|
|
364 =head2 _weights
|
|
365
|
|
366 Arg [1] : (optional) string - frequencies
|
|
367 Example : _weights($frequencies);
|
|
368 Description: Private Getter Setter for the weight matrix based on frequencies
|
|
369 Returntype : HASHREF with the weights of this binding matrix
|
|
370 Exceptions : Throws if the frequencies attribute string does not correspond
|
|
371 to 4 rows of an equal number of integer numbers.
|
|
372 Caller : Self
|
|
373 Status : At Risk
|
|
374
|
|
375 =cut
|
|
376
|
|
377 sub _weights {
|
|
378 my $self = shift;
|
|
379
|
|
380 #for the moment use equiprobability and constant pseudo-count
|
|
381 my $pseudo = 0.1;
|
|
382
|
|
383 #TODO allow for it to be passed as parameters?
|
|
384 my $frequencies = shift if @_;
|
|
385 if($frequencies){
|
|
386 $frequencies =~ s/^(>.*?\n)//;
|
|
387 my $header = $1;
|
|
388
|
|
389 my ($a,$c,$g,$t) = split(/\n/,$frequencies);
|
|
390 my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a));
|
|
391 my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c));
|
|
392 my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g));
|
|
393 my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t));
|
|
394 if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){
|
|
395 throw "Frequencies provided are not a valid frequency matrix"
|
|
396 }
|
|
397 $self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo);
|
|
398
|
|
399 #Create the reverse complement
|
|
400 my @revT = reverse(@As);
|
|
401 my @revA = reverse(@Ts);
|
|
402 my @revC = reverse(@Gs);
|
|
403 my @revG = reverse(@Cs);
|
|
404 my $revcomp = $header;
|
|
405 $revcomp.= "A [ ".join("\t",@revA)." ]\n";
|
|
406 $revcomp.= "C [ ".join("\t",@revC)." ]\n";
|
|
407 $revcomp.= "G [ ".join("\t",@revG)." ]\n";
|
|
408 $revcomp.= "T [ ".join("\t",@revT)." ]\n";
|
|
409 $self->{'frequencies_revcomp'} = $revcomp;
|
|
410
|
|
411 my @totals;
|
|
412 for(my $i=0;$i<scalar(@As);$i++){
|
|
413 $totals[$i]=$As[$i]+$Cs[$i]+$Gs[$i]+$Ts[$i];
|
|
414 }
|
|
415
|
|
416 my %weights;
|
|
417 #We can allow distinct background per nucleotide, instead of 0.25 for all... pass as parameter
|
|
418 #But if the matrix was obtained using in-vivo data, it shouldn't matter the organism nucleotide bias..
|
|
419 #We're using 0.1 as pseudo-count... the matrix cannot have very few elements... (e.g. <30 not good)
|
|
420 my @was; for(my $i=0;$i<scalar(@As);$i++){ $was[$i] = log((($As[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
|
|
421 $weights{'A'} = \@was;
|
|
422 my @wcs; for(my $i=0;$i<scalar(@Cs);$i++){ $wcs[$i] = log((($Cs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
|
|
423 $weights{'C'} = \@wcs;
|
|
424 my @wgs; for(my $i=0;$i<scalar(@Gs);$i++){ $wgs[$i] = log((($Gs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
|
|
425 $weights{'G'} = \@wgs;
|
|
426 my @wts; for(my $i=0;$i<scalar(@Ts);$i++){ $wts[$i] = log((($Ts[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); };
|
|
427 $weights{'T'} = \@wts;
|
|
428
|
|
429 $self->{'weights'} = \%weights;
|
|
430
|
|
431 my $max = 0; my $min = 0;
|
|
432 for(my $i=0;$i<scalar(@As);$i++){
|
|
433 my $col = [ $was[$i], $wcs[$i], $wgs[$i], $wts[$i] ];
|
|
434 $min += _min($col);
|
|
435 $max += _max($col);
|
|
436 }
|
|
437
|
|
438 #Log scale
|
|
439 $self->_max_bind($max);
|
|
440 $self->_min_bind($min);
|
|
441 }
|
|
442
|
|
443 return $self->{'weights'};
|
|
444
|
|
445 }
|
|
446
|
|
447 =head2 _calc_ic
|
|
448
|
|
449 Example : _calc_ic($as,$cs,$gs,$ts,$pseudo);
|
|
450 Description: Private function to calculate the matrix information content per position
|
|
451 Caller : self
|
|
452 Status : At Risk
|
|
453
|
|
454 =cut
|
|
455
|
|
456 sub _calc_ic {
|
|
457 my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift);
|
|
458 my @ic = ();
|
|
459 for (my $i=0;$i<scalar(@$as);$i++){
|
|
460 my $total_i = $as->[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo);
|
|
461 my $fas = ($as->[$i] + $pseudo) / $total_i;
|
|
462 my $fcs = ($cs->[$i] + $pseudo) / $total_i;
|
|
463 my $fgs = ($gs->[$i] + $pseudo) / $total_i;
|
|
464 my $fts = ($ts->[$i] + $pseudo) / $total_i;
|
|
465 my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2));
|
|
466 push @ic, $ic_i;
|
|
467 }
|
|
468 $self->{'ic'} = \@ic;
|
|
469 }
|
|
470
|
|
471 sub _parse_matrix_line {
|
|
472 my ($pat,$line) = (shift,shift);
|
|
473 $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//;
|
|
474 return $line;
|
|
475 }
|
|
476
|
|
477 sub _max { return _min_max(shift, 0); }
|
|
478
|
|
479 sub _min { return _min_max(shift, 1); }
|
|
480
|
|
481 sub _min_max {
|
|
482 my ($list,$min) = (shift, shift);
|
|
483 my $min_max = $list->[0];
|
|
484 map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list;
|
|
485 return $min_max;
|
|
486 }
|
|
487
|
|
488
|
|
489 =head2 _max_bind
|
|
490
|
|
491 Arg [1] : (optional) double - maximum binding affinity
|
|
492 Example : $matrix->_max_bind(10.2);
|
|
493 Description: Private Getter and setter of max_bind attribute (not to be called directly)
|
|
494 Returntype : float with the maximum binding affinity of the matrix
|
|
495 Exceptions : None
|
|
496 Caller : Self
|
|
497 Status : At Risk
|
|
498
|
|
499 =cut
|
|
500
|
|
501 sub _max_bind {
|
|
502 my $self = shift;
|
|
503
|
|
504 $self->{'max_bind'} = shift if @_;
|
|
505
|
|
506 return $self->{'max_bind'};
|
|
507 }
|
|
508
|
|
509 =head2 _min_bind
|
|
510
|
|
511 Arg [1] : (optional) double - minimum binding affinity
|
|
512 Example : $matrix->_min_bind(-10.2);
|
|
513 Description: Private Getter and setter of min_bind attribute (not to be called directly)
|
|
514 Returntype : float with the minimum binding affinity of the matrix
|
|
515 Exceptions : None
|
|
516 Caller : Self
|
|
517 Status : At Risk
|
|
518
|
|
519 =cut
|
|
520
|
|
521 sub _min_bind {
|
|
522 my $self = shift;
|
|
523
|
|
524 $self->{'min_bind'} = shift if @_;
|
|
525
|
|
526 return $self->{'min_bind'};
|
|
527 }
|
|
528
|
|
529 1;
|