Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # | |
2 # Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix | |
3 # | |
4 # You may distribute this module under the same terms as Perl itself | |
5 | |
6 =head1 LICENSE | |
7 | |
8 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
9 Genome Research Limited. All rights reserved. | |
10 | |
11 This software is distributed under a modified Apache license. | |
12 For license details, please see | |
13 | |
14 http://www.ensembl.org/info/about/code_licence.html | |
15 | |
16 =head1 CONTACT | |
17 | |
18 Please email comments or questions to the public Ensembl | |
19 developers list at <ensembl-dev@ebi.ac.uk>. | |
20 | |
21 Questions may also be sent to the Ensembl help desk at | |
22 <helpdesk@ensembl.org>. | |
23 | |
24 | |
25 =head1 NAME | |
26 | |
27 Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix. | |
28 In EFG this represents the binding affinities of a Transcription Factor to DNA. | |
29 | |
30 =head1 SYNOPSIS | |
31 | |
32 use Bio::EnsEMBL::Funcgen::BindingMatrix; | |
33 | |
34 my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new( | |
35 -name => "MA0122.1", | |
36 -type => "Jaspar", | |
37 -description => "Nkx3-2 Jaspar Matrix", | |
38 ); | |
39 $matrix->frequencies("A [ 4 1 13 24 0 0 6 4 9 ] | |
40 C [ 7 4 1 0 0 0 0 6 7 ] | |
41 G [ 4 5 7 0 24 0 18 12 5 ] | |
42 T [ 9 14 3 0 0 24 0 2 3 ]"); | |
43 | |
44 print $matrix->relative_affinity("TGGCCACCA")."\n"; | |
45 | |
46 print $matrix->threshold."\n"; | |
47 | |
48 =head1 DESCRIPTION | |
49 | |
50 This class represents information about a BindingMatrix, containing the name | |
51 (e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix | |
52 is always associated to an Analysis (indicating the origin of the matrix e.g. | |
53 Jaspar) and a FeatureType (the binding factor). | |
54 | |
55 =head1 SEE ALSO | |
56 | |
57 Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor | |
58 | |
59 =cut | |
60 | |
61 | |
62 use strict; | |
63 use warnings; | |
64 | |
65 package Bio::EnsEMBL::Funcgen::BindingMatrix; | |
66 | |
67 use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; | |
68 use Bio::EnsEMBL::Utils::Exception qw( throw warning ); | |
69 use Bio::EnsEMBL::Funcgen::Storable; | |
70 | |
71 use vars qw(@ISA); | |
72 @ISA = qw(Bio::EnsEMBL::Funcgen::Storable); | |
73 | |
74 =head2 new | |
75 | |
76 Arg [-name]: string - name of Matrix | |
77 Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained | |
78 Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix | |
79 Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix | |
80 Arg [-description]: (optional) string - descriptiom of Matrix | |
81 Example : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new( | |
82 -name => "MA0122.1", | |
83 -analysis => $analysis, | |
84 -description => "Jaspar Matrix", | |
85 ); | |
86 Description: Constructor method for BindingMatrix class | |
87 Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix | |
88 Exceptions : Throws if name or/and type not defined | |
89 Caller : General | |
90 Status : Medium risk | |
91 | |
92 =cut | |
93 | |
94 sub new { | |
95 my $caller = shift; | |
96 | |
97 my $obj_class = ref($caller) || $caller; | |
98 my $self = $obj_class->SUPER::new(@_); | |
99 | |
100 my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange | |
101 ( [ | |
102 'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD' | |
103 ], @_); | |
104 | |
105 | |
106 if(! defined $name){ | |
107 throw("Must supply a name\n"); | |
108 } | |
109 | |
110 if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){ | |
111 throw("You must define a valid Bio::EnsEMBL::Analysis"); | |
112 #leave is stored test to adaptor | |
113 } | |
114 | |
115 if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){ | |
116 throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType"); | |
117 #leave is stored test to adaptor | |
118 } | |
119 | |
120 $self->name($name); | |
121 $self->{analysis} = $analysis; | |
122 $self->{feature_type} = $ftype; | |
123 $self->frequencies($freq) if $freq; | |
124 $self->description($desc) if $desc; | |
125 $self->threshold($thresh) if $thresh; | |
126 | |
127 return $self; | |
128 } | |
129 | |
130 | |
131 =head2 feature_type | |
132 | |
133 Example : my $ft_name = $matrix->feature_type()->name(); | |
134 Description: Getter for the feature_type attribute for this matrix. | |
135 Returntype : Bio::EnsEMBL::Funcgen::FeatureType | |
136 Exceptions : None | |
137 Caller : General | |
138 Status : At risk | |
139 | |
140 =cut | |
141 | |
142 sub feature_type { | |
143 my $self = shift; | |
144 | |
145 return $self->{'feature_type'}; | |
146 } | |
147 | |
148 | |
149 =head2 name | |
150 | |
151 Arg [1] : (optional) string - name | |
152 Example : my $name = $matrix->name(); | |
153 Description: Getter and setter of name attribute | |
154 Returntype : string | |
155 Exceptions : None | |
156 Caller : General | |
157 Status : Low Risk | |
158 | |
159 =cut | |
160 | |
161 sub name { | |
162 my $self = shift; | |
163 $self->{'name'} = shift if @_; | |
164 return $self->{'name'}; | |
165 } | |
166 | |
167 =head2 description | |
168 | |
169 Arg [1] : (optional) string - description | |
170 Example : my $desc = $matrix->description(); | |
171 Description: Getter and setter of description attribute | |
172 Returntype : string | |
173 Exceptions : None | |
174 Caller : General | |
175 Status : Low Risk | |
176 | |
177 =cut | |
178 | |
179 sub description { | |
180 my $self = shift; | |
181 $self->{'description'} = shift if @_; | |
182 return $self->{'description'}; | |
183 } | |
184 | |
185 =head2 threshold | |
186 | |
187 Arg [1] : (optional) float - threshold | |
188 Example : my $thresh = $matrix->threshold(); | |
189 Description: Getter and setter of threshold attribute | |
190 Returntype : float | |
191 Exceptions : None | |
192 Caller : General | |
193 Status : At Risk | |
194 | |
195 =cut | |
196 | |
197 sub threshold { | |
198 my $self = shift; | |
199 $self->{'threshold'} = shift if @_; | |
200 return $self->{'threshold'}; | |
201 } | |
202 | |
203 | |
204 =head2 analysis | |
205 Example : $matrix->analysis()->logic_name(); | |
206 Description: Getter for the feature_type attribute for this matrix. | |
207 Returntype : Bio::EnsEMBL::Analysis | |
208 Exceptions : None | |
209 Caller : General | |
210 Status : At risk | |
211 | |
212 =cut | |
213 | |
214 sub analysis { | |
215 my $self = shift; | |
216 | |
217 return $self->{'analysis'}; | |
218 } | |
219 | |
220 | |
221 =head2 frequencies | |
222 | |
223 Arg [1] : (optional) string - frequencies | |
224 Example : $matrix->frequencies($frequencies_string); | |
225 Description: Getter and setter of frequencies attribute | |
226 | |
227 The attribute is a string representing the matrix binding | |
228 affinities in the Jaspar format. E.g. | |
229 "> | |
230 [ ] | |
231 " | |
232 | |
233 Returntype : string | |
234 Exceptions : Throws if the string attribute is not a properly | |
235 formed matrix in the Jaspar format | |
236 Caller : General | |
237 Status : At Risk | |
238 | |
239 =cut | |
240 | |
241 sub frequencies { | |
242 my $self = shift; | |
243 | |
244 my $frequencies = shift if @_; | |
245 if($frequencies){ | |
246 $self->_weights($frequencies); | |
247 $self->{'frequencies'} = $frequencies; | |
248 } | |
249 return $self->{'frequencies'}; | |
250 } | |
251 | |
252 =head2 frequencies_revcomp | |
253 | |
254 Example : $matrix->frequencies_revcomp(); | |
255 Description: Getter for the reverse complement frequencies attribute | |
256 | |
257 The attribute represents the reverse complement of frequencies | |
258 | |
259 Returntype : string | |
260 Caller : General | |
261 Status : At Risk | |
262 | |
263 =cut | |
264 | |
265 sub frequencies_revcomp { | |
266 my $self = shift; | |
267 | |
268 return $self->{'frequencies_revcomp'}; | |
269 } | |
270 | |
271 | |
272 =head2 relative_affinity | |
273 | |
274 Arg [1] : string - Binding Site Sequence | |
275 Arg [2] : (optional) boolean - 1 if results are to be in linear scale (default is log scale) | |
276 Example : $matrix->relative_affinity($sequence); | |
277 Description: Calculates the binding affinity of a given sequence | |
278 relative to the optimal site for the matrix | |
279 The site is taken as if it were in the proper orientation | |
280 Considers a purely random background p(A)=p(C)=p(G)=p(T) | |
281 Returntype : double | |
282 Exceptions : Throws if the sequence length does not have the matrix length | |
283 or if the sequence has unclear bases (N is not accepted) | |
284 Caller : General | |
285 Status : At Risk | |
286 | |
287 =cut | |
288 | |
289 sub relative_affinity { | |
290 my ($self, $sequence, $linear) = (shift, shift, shift); | |
291 $sequence =~ s/^\s+//; | |
292 $sequence =~ s/\s+$//; | |
293 | |
294 throw "No sequence given" if !$sequence; | |
295 $sequence = uc($sequence); | |
296 if($sequence =~ /[^ACGT]/){ | |
297 throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted"; | |
298 } | |
299 | |
300 my $weight_matrix = $self->_weights; | |
301 my $matrix_length = scalar(@{$weight_matrix->{'A'}}); | |
302 if(length($sequence) != $matrix_length){ | |
303 throw "Sequence $sequence does not have length $matrix_length"; | |
304 } | |
305 | |
306 my $log_odds = 0; | |
307 my @bases = split(//,$sequence); | |
308 for(my $i=0;$i<$matrix_length;$i++){ | |
309 $log_odds += $weight_matrix->{$bases[$i]}->[$i]; | |
310 } | |
311 | |
312 #This log scale may be quite unrealistic... but usefull just for comparisons... | |
313 if(!$linear){ | |
314 return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind); | |
315 } else { | |
316 return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind)); | |
317 } | |
318 | |
319 } | |
320 | |
321 =head2 is_position_informative | |
322 | |
323 Arg [1] : int - 1-based position withing the matrix | |
324 Arg [2] : (optional) double - threshold [0-2] for information content [default is 1.5] | |
325 Example : $matrix->is_position_informative($pos); | |
326 Description: Returns true if position information content is over threshold | |
327 Returntype : boolean | |
328 Exceptions : Throws if position or threshold out of bounds | |
329 Caller : General | |
330 Status : At High Risk | |
331 | |
332 =cut | |
333 | |
334 sub is_position_informative { | |
335 my ($self,$position,$threshold) = (shift,shift,shift); | |
336 throw "Need a position" if(!defined($position)); | |
337 throw "Position out of bounds" if(($position<1) || ($position > $self->length)); | |
338 if(!defined($threshold)){ $threshold = 1.5; } | |
339 throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2)); | |
340 return ($self->{'ic'}->[$position-1] >= $threshold); | |
341 } | |
342 | |
343 | |
344 | |
345 =head2 length | |
346 | |
347 Example : $bm->length(); | |
348 Description: Returns the length of the the matrix (e.g. 19bp long) | |
349 Returntype : int with the length of this binding matrix | |
350 Exceptions : none | |
351 Caller : General | |
352 Status : At Risk | |
353 | |
354 =cut | |
355 | |
356 sub length { | |
357 my $self = shift; | |
358 | |
359 my $weight_matrix = $self->_weights; | |
360 | |
361 return scalar(@{$weight_matrix->{'A'}}); | |
362 } | |
363 | |
364 =head2 _weights | |
365 | |
366 Arg [1] : (optional) string - frequencies | |
367 Example : _weights($frequencies); | |
368 Description: Private Getter Setter for the weight matrix based on frequencies | |
369 Returntype : HASHREF with the weights of this binding matrix | |
370 Exceptions : Throws if the frequencies attribute string does not correspond | |
371 to 4 rows of an equal number of integer numbers. | |
372 Caller : Self | |
373 Status : At Risk | |
374 | |
375 =cut | |
376 | |
377 sub _weights { | |
378 my $self = shift; | |
379 | |
380 #for the moment use equiprobability and constant pseudo-count | |
381 my $pseudo = 0.1; | |
382 | |
383 #TODO allow for it to be passed as parameters? | |
384 my $frequencies = shift if @_; | |
385 if($frequencies){ | |
386 $frequencies =~ s/^(>.*?\n)//; | |
387 my $header = $1; | |
388 | |
389 my ($a,$c,$g,$t) = split(/\n/,$frequencies); | |
390 my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a)); | |
391 my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c)); | |
392 my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g)); | |
393 my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t)); | |
394 if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){ | |
395 throw "Frequencies provided are not a valid frequency matrix" | |
396 } | |
397 $self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo); | |
398 | |
399 #Create the reverse complement | |
400 my @revT = reverse(@As); | |
401 my @revA = reverse(@Ts); | |
402 my @revC = reverse(@Gs); | |
403 my @revG = reverse(@Cs); | |
404 my $revcomp = $header; | |
405 $revcomp.= "A [ ".join("\t",@revA)." ]\n"; | |
406 $revcomp.= "C [ ".join("\t",@revC)." ]\n"; | |
407 $revcomp.= "G [ ".join("\t",@revG)." ]\n"; | |
408 $revcomp.= "T [ ".join("\t",@revT)." ]\n"; | |
409 $self->{'frequencies_revcomp'} = $revcomp; | |
410 | |
411 my @totals; | |
412 for(my $i=0;$i<scalar(@As);$i++){ | |
413 $totals[$i]=$As[$i]+$Cs[$i]+$Gs[$i]+$Ts[$i]; | |
414 } | |
415 | |
416 my %weights; | |
417 #We can allow distinct background per nucleotide, instead of 0.25 for all... pass as parameter | |
418 #But if the matrix was obtained using in-vivo data, it shouldn't matter the organism nucleotide bias.. | |
419 #We're using 0.1 as pseudo-count... the matrix cannot have very few elements... (e.g. <30 not good) | |
420 my @was; for(my $i=0;$i<scalar(@As);$i++){ $was[$i] = log((($As[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); }; | |
421 $weights{'A'} = \@was; | |
422 my @wcs; for(my $i=0;$i<scalar(@Cs);$i++){ $wcs[$i] = log((($Cs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); }; | |
423 $weights{'C'} = \@wcs; | |
424 my @wgs; for(my $i=0;$i<scalar(@Gs);$i++){ $wgs[$i] = log((($Gs[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); }; | |
425 $weights{'G'} = \@wgs; | |
426 my @wts; for(my $i=0;$i<scalar(@Ts);$i++){ $wts[$i] = log((($Ts[$i] + $pseudo) / ($totals[$i]+(4*$pseudo))) / 0.25); }; | |
427 $weights{'T'} = \@wts; | |
428 | |
429 $self->{'weights'} = \%weights; | |
430 | |
431 my $max = 0; my $min = 0; | |
432 for(my $i=0;$i<scalar(@As);$i++){ | |
433 my $col = [ $was[$i], $wcs[$i], $wgs[$i], $wts[$i] ]; | |
434 $min += _min($col); | |
435 $max += _max($col); | |
436 } | |
437 | |
438 #Log scale | |
439 $self->_max_bind($max); | |
440 $self->_min_bind($min); | |
441 } | |
442 | |
443 return $self->{'weights'}; | |
444 | |
445 } | |
446 | |
447 =head2 _calc_ic | |
448 | |
449 Example : _calc_ic($as,$cs,$gs,$ts,$pseudo); | |
450 Description: Private function to calculate the matrix information content per position | |
451 Caller : self | |
452 Status : At Risk | |
453 | |
454 =cut | |
455 | |
456 sub _calc_ic { | |
457 my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift); | |
458 my @ic = (); | |
459 for (my $i=0;$i<scalar(@$as);$i++){ | |
460 my $total_i = $as->[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo); | |
461 my $fas = ($as->[$i] + $pseudo) / $total_i; | |
462 my $fcs = ($cs->[$i] + $pseudo) / $total_i; | |
463 my $fgs = ($gs->[$i] + $pseudo) / $total_i; | |
464 my $fts = ($ts->[$i] + $pseudo) / $total_i; | |
465 my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2)); | |
466 push @ic, $ic_i; | |
467 } | |
468 $self->{'ic'} = \@ic; | |
469 } | |
470 | |
471 sub _parse_matrix_line { | |
472 my ($pat,$line) = (shift,shift); | |
473 $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//; | |
474 return $line; | |
475 } | |
476 | |
477 sub _max { return _min_max(shift, 0); } | |
478 | |
479 sub _min { return _min_max(shift, 1); } | |
480 | |
481 sub _min_max { | |
482 my ($list,$min) = (shift, shift); | |
483 my $min_max = $list->[0]; | |
484 map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list; | |
485 return $min_max; | |
486 } | |
487 | |
488 | |
489 =head2 _max_bind | |
490 | |
491 Arg [1] : (optional) double - maximum binding affinity | |
492 Example : $matrix->_max_bind(10.2); | |
493 Description: Private Getter and setter of max_bind attribute (not to be called directly) | |
494 Returntype : float with the maximum binding affinity of the matrix | |
495 Exceptions : None | |
496 Caller : Self | |
497 Status : At Risk | |
498 | |
499 =cut | |
500 | |
501 sub _max_bind { | |
502 my $self = shift; | |
503 | |
504 $self->{'max_bind'} = shift if @_; | |
505 | |
506 return $self->{'max_bind'}; | |
507 } | |
508 | |
509 =head2 _min_bind | |
510 | |
511 Arg [1] : (optional) double - minimum binding affinity | |
512 Example : $matrix->_min_bind(-10.2); | |
513 Description: Private Getter and setter of min_bind attribute (not to be called directly) | |
514 Returntype : float with the minimum binding affinity of the matrix | |
515 Exceptions : None | |
516 Caller : Self | |
517 Status : At Risk | |
518 | |
519 =cut | |
520 | |
521 sub _min_bind { | |
522 my $self = shift; | |
523 | |
524 $self->{'min_bind'} = shift if @_; | |
525 | |
526 return $self->{'min_bind'}; | |
527 } | |
528 | |
529 1; |