comparison variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix
24
25 =head1 SYNOPSIS
26
27 # create a new matrix for polyphen predictions
28
29 my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
30 -analysis => 'polyphen',
31 -peptide_length => 134,
32 );
33
34 # add some predictions
35
36 $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967);
37 $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09);
38
39 # serialize the matrix to a compressed binary string
40
41 my $binary_string = $pfpm->serialize;
42
43 # store the string somewhere, fetch it later, and then create a new matrix using it
44
45 my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
46 -analysis => 'polyphen',
47 -matrix => $binary_string
48 );
49
50 # retrieve predictions
51
52 my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C');
53
54 print "A mutation to 'C' at position 2 is predicted to be $prediction\n";
55
56 =head1 DESCRIPTION
57
58 This module defines a class representing a matrix of protein
59 function predictions, and provides method to access and set
60 predictions for a given position and amino acid, and also to
61 serialize the matrix for storage in a database, and deserialize
62 a matrix from the compressed format into a perl hash.
63
64 =cut
65
66 package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
67
68 use strict;
69 use warnings;
70
71 use POSIX qw(ceil);
72 use List::Util qw(max);
73 use Compress::Zlib;
74
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
76 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
77
78 use base qw(Exporter);
79
80 our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS);
81
82 my $DEBUG = 0;
83
84 # user-defined constants
85
86 # a header which lets us identify prediction matrices and
87 # to check if they may have been corrupted
88
89 our $HEADER = 'VEP';
90
91 # the format we use when 'pack'ing our predictions
92
93 my $PACK_FORMAT = 'v';
94
95 # a hash mapping qualitative predictions to numerical values
96 # for all the analyses we use
97
98 my $PREDICTION_TO_VAL = {
99 polyphen => {
100 'probably damaging' => 0,
101 'possibly damaging' => 1,
102 'benign' => 2,
103 'unknown' => 3,
104 },
105
106 sift => {
107 'tolerated' => 0,
108 'deleterious' => 1,
109 },
110 };
111
112 # all valid amino acids
113
114 our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y);
115
116 # we use a short with all bits set to represent the lack of a prediction in
117 # an (uncompressed) prediction matrix, we will never observe this value
118 # as a real prediction even if we set all the (6) prediction bits because we
119 # limit the max score to 1000 so the 10 score bits will never all be set
120
121 our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF);
122
123 # the number of bytes in a short
124
125 my $BYTES_PER_PREDICTION = 2;
126
127
128 # constants derived from the the user-defined constants
129
130 # the maximum number of distinct qualitative predictions used by any tool
131
132 my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL );
133
134 # the number of bits used to encode the qualitative prediction
135
136 my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) );
137
138 throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6;
139
140 # a hash mapping back from a numerical value to a qualitative prediction
141
142 my $VAL_TO_PREDICTION = {
143 map {
144 my $tool = $_;
145 $tool => {
146 map {
147 $PREDICTION_TO_VAL->{$tool}->{$_} => $_
148 } keys %{ $PREDICTION_TO_VAL->{$tool} }
149 }
150 } keys %$PREDICTION_TO_VAL
151 };
152
153 # a hash from amino acid single letter code to a numerical value
154
155 our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS };
156
157 # the number of valid amino acids
158
159 our $NUM_AAS = scalar(@ALL_AAS);
160
161 =head2 new
162
163 Arg [-ANALYSIS] :
164 The name of the analysis tool that made these predictions,
165 currently must be one of 'sift' or 'polyphen'
166
167 Arg [-MATRIX] :
168 A gzip compressed binary string encoding the predictions,
169 typically created using this class and fetched from the
170 variation database (optional)
171
172 Arg [-PEPTIDE_LENGTH] :
173 The length of the associated peptide, only required if
174 you want to serialize this matrix (optional)
175
176 Arg [-TRANSLATION_MD5] :
177 The hex MD5 hash of the associated peptide sequence
178
179 Description: Constructs a new ProteinFunctionPredictionMatrix object
180 Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance
181 Exceptions : throws unless ANALYSIS is supplied and recognised
182 Status : At Risk
183
184 =cut
185
186 sub new {
187 my $class = shift;
188
189 my (
190 $analysis,
191 $sub_analysis,
192 $matrix,
193 $peptide_length,
194 $translation_md5,
195 ) = rearrange([qw(
196 ANALYSIS
197 SUB_ANALYSIS
198 MATRIX
199 PEPTIDE_LENGTH
200 TRANSLATION_MD5
201 )], @_);
202
203 throw("analysis argument required") unless defined $analysis;
204
205 throw("Unrecognised analysis '$analysis'")
206 unless defined $PREDICTION_TO_VAL->{$analysis};
207
208 my $self = bless {
209 analysis => $analysis,
210 sub_analysis => $sub_analysis,
211 matrix => $matrix,
212 peptide_length => $peptide_length,
213 translation_md5 => $translation_md5,
214 }, $class;
215
216 $self->{matrix_compressed} = defined $matrix ? 1 : 0;
217
218 return $self;
219 }
220
221 =head2 analysis
222
223 Arg[1] : string $analysis - the name of the analysis tool (optional)
224 Description : Get/set the analysis name
225 Returntype : string
226 Exceptions : throws if the name is not recognised
227 Status : At Risk
228
229 =cut
230
231 sub analysis {
232 my ($self, $analysis) = @_;
233
234 if ($analysis) {
235 throw("Unrecognised analysis '$analysis'")
236 unless defined $PREDICTION_TO_VAL->{$analysis};
237
238 $self->{analysis} = $analysis;
239 }
240
241 return $self->{analysis};
242 }
243
244 =head2 sub_analysis
245
246 Arg[1] : string $sub_analysis - the name of the sub analysis (optional)
247 Description : Get/set the sub analysis name
248 Returntype : string
249 Exceptions : None
250 Status : At Risk
251
252 =cut
253
254 sub sub_analysis {
255 my ($self, $sub_analysis) = @_;
256
257 $self->{sub_analysis} = $sub_analysis if $sub_analysis;
258
259 return $self->{sub_analysis};
260 }
261
262 =head2 peptide_length
263
264 Arg[1] : int $peptide_length - the length of the peptide (optional)
265 Description : Get/set the length of the peptide - required when you want to
266 serialize a matrix, as we need to know how many rows the matrix has
267 Returntype : int
268 Exceptions : none
269 Status : At Risk
270
271 =cut
272
273 sub peptide_length {
274 my ($self, $peptide_length) = @_;
275 $self->{peptide_length} = $peptide_length if defined $peptide_length;
276 return $self->{peptide_length};
277 }
278
279 =head2 translation_md5
280
281 Arg[1] : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional)
282 Description : Get/set the MD5 hash of the peptide sequence
283 Returntype : string
284 Exceptions : none
285 Status : At Risk
286
287 =cut
288
289 sub translation_md5 {
290 my ($self, $translation_md5) = @_;
291 $self->{translation_md5} = $translation_md5 if defined $translation_md5;
292 return $self->{translation_md5};
293 }
294
295 =head2 get_prediction
296
297 Arg[1] : int $pos - the desired position
298 Arg[2] : string $aa - the mutant amino acid
299 Description : get the prediction and score for the given position and amino acid
300 Returntype : a list containing 2 values, the prediction and the score
301 Exceptions : throws if either the position or amino acid are invalid
302 Status : At Risk
303
304 =cut
305
306 sub get_prediction {
307 my ($self, $pos, $aa) = @_;
308
309 # if we have it in our uncompressed hash then just return it
310
311 if (defined $self->{preds}->{$pos}->{$aa}) {
312 return @{ $self->{preds}->{$pos}->{$aa} };
313 }
314
315 # otherwise we look in the serialized matrix string
316
317 return $self->prediction_from_matrix($pos, $aa);
318 }
319
320 =head2 add_prediction
321
322 Arg[1] : int $pos - the peptide position
323 Arg[2] : string $aa - the mutant amino acid
324 Arg[3] : string $prediction - the prediction to store
325 Arg[4] : float $score - the score to store
326 Description : add a prediction to the matrix for the specified position and amino acid,
327 note that this just adds the prediction to a perl hash. If you want to
328 encode the matrix in the binary format you should call serialize on the
329 matrix object after you have added all the predictions.
330 Exceptions : none
331 Status : At Risk
332
333 =cut
334
335 sub add_prediction {
336 my ($self, $pos, $aa, $prediction, $score) = @_;
337
338 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
339 }
340
341 =head2 serialize
342
343 Arg[1] : int $peptide_length - the length of the associated peptide (optional)
344 Description : serialize the matrix into a compressed binary format suitable for
345 storage in a database, file etc. The same string can later be used
346 to create a new matrix object and the predictions can be retrieved
347 Exceptions : throws if the peptide length has not been specified
348 Status : At Risk
349
350 =cut
351
352 sub serialize {
353 my ($self, $peptide_length) = @_;
354
355 $self->{peptide_length} = $peptide_length if defined $peptide_length;
356
357 throw("peptide_length required to serialize predictions")
358 unless defined $self->{peptide_length};
359
360 # convert predictions to the binary format, and concatenate them all
361 # together in the correct order, inserting our dummy $NO_PREDICTION
362 # value to fill in any gaps
363
364 if ($self->{preds}) {
365
366 $self->{matrix_compressed} = 0;
367
368 $self->{matrix} = $HEADER;
369
370 for my $pos (1 .. $self->{peptide_length}) {
371
372 for my $aa (@ALL_AAS) {
373
374 my $short;
375
376 if ($self->{preds}->{$pos}->{$aa}) {
377 my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} };
378
379 $short = $self->prediction_to_short($prediction, $score);
380 }
381
382 $self->{matrix} .= defined $short ? $short : $NO_PREDICTION;
383 }
384 }
385
386 # delete the hash copy, so things don't get out of sync
387
388 $self->{preds} = undef;
389 }
390 else {
391 warning("There don't seem to be any predictions in the matrix to serialize!");
392 }
393
394 # and return the compressed string for storage
395
396 return $self->compress_matrix;
397 }
398
399 =head2 deserialize
400
401 Arg [1] : coderef $coderef - an anonymous subroutine that will be called
402 as each prediction is decoded in order. The subroutine will be
403 called with 4 arguments: the peptide position, the amino acid,
404 the prediction and the score. This can be used, for example to
405 dump out the prediction matrix to a file. (optional)
406 Description : deserialize a binary formatted matrix into a perl hash reference
407 containing all the uncompressed predictions. This hash has an
408 entry for each position in the peptide, which is itself a hashref
409 with an entry for each possible alternate amino acid which is a
410 listref containing the prediction and score. For example, to retrieve
411 the prediction for a substitution of 'C' at position 23 from this
412 data structure, you could use code like:
413
414 my $prediction_hash = $pfpm->deserialize;
415 my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} };
416
417 Note that if you don't explicitly deserialize a matrix this
418 class will keep it in the memory-efficient encoded format, and
419 you can access individual predictions with the get_prediction()
420 method. You should only use this method if you want to decode
421 all predictions (for example to perform some large-scale
422 analysis, or to reformat the predictions somehow)
423
424 Returntype : hashref containing decoded predictions
425 Exceptions : throws if the binary matrix isn't in the expected format
426 Status : At Risk
427
428 =cut
429
430 sub deserialize {
431 my ($self, $coderef) = @_;
432
433 if ($self->{matrix_compressed}) {
434 $self->expand_matrix;
435 }
436
437 throw("Matrix looks corrupted") unless $self->header_ok;
438
439 # we can work out the length of the peptide by counting the rows in the matrix
440
441 my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS;
442
443 for my $pos (1 .. $length) {
444
445 for my $aa (@ALL_AAS) {
446
447 # we call prediction_from_short directly to avoid doing all the
448 # checks performed in prediction_from_string
449
450 my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION));
451
452 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
453
454 if ($coderef) {
455 $coderef->($pos, $aa, $prediction, $score);
456 }
457 }
458 }
459
460 return $self->{preds};
461 }
462
463 =head2 prediction_to_short
464
465 Arg[1] : string $pred - one of the possible qualitative predictions of the tool
466 Arg[2] : float $prob - the prediction score (with 3 d.p.s of precision)
467 Description : converts a prediction and corresponding score into a 2-byte short value
468 Returntype : the packed short value
469 Exceptions : throws if the prediction argument is invalid
470 Status : At Risk
471
472 =cut
473
474 sub prediction_to_short {
475 my ($self, $pred, $prob) = @_;
476
477 # we only store 3 d.p. so multiply by 1000 to turn our
478 # probability into a number between 0 and 1000.
479 # 2^10 == 1024 so we need 10 bits of our short to store
480 # this value
481
482 my $val = $prob * 1000;
483
484 # we store the prediction in the top $NUM_PRED_BITS bits
485 # so look up the numerical value for the prediction,
486 # shift this $NUM_PRED_BITS bits to the left and then set
487 # the appropriate bits of our short value
488
489 $pred = lc($pred);
490
491 my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred};
492
493 throw("No value defined for prediction: '$pred'?")
494 unless defined $pred_val;
495
496 $val |= ($pred_val << (16 - $NUM_PRED_BITS));
497
498 printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG;
499
500 $val = pack $PACK_FORMAT, $val;
501
502 return $val;
503 }
504
505 =head2 prediction_from_short
506
507 Arg[1] : string $pred - the packed short value
508 Description : converts a 2-byte short value back into a prediction and a score
509 Exceptions : none
510 Returntype : a list containing 2 values, the prediction and the score
511 Status : At Risk
512
513 =cut
514
515 sub prediction_from_short {
516 my ($self, $val) = @_;
517
518 # check it isn't our special null prediction
519
520 if ($val eq $NO_PREDICTION) {
521 print "no pred\n" if $DEBUG;
522 return undef;
523 }
524
525 # unpack the value as a short
526
527 $val = unpack $PACK_FORMAT, $val;
528
529 # shift the prediction bits down and look up the prediction string
530
531 my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)};
532
533 # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float
534
535 my $prob = ($val & (2**10 - 1)) / 1000;
536
537 printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG;
538
539 return ($pred, $prob);
540 }
541
542 =head2 compress_matrix
543
544 Description : compresses a prediction matrix with gzip
545 Returntype : the compressed matrix
546 Exceptions : throws if the matrix is an unexpected length, or if the compression fails
547 Status : At Risk
548
549 =cut
550
551 sub compress_matrix {
552 my ($self) = @_;
553
554 my $matrix = $self->{matrix};
555
556 return undef unless $matrix;
557
558 return $matrix if $self->{matrix_compressed};
559
560 # prepend a header, so we can tell if our matrix has been mangled, or
561 # is compressed etc.
562
563 unless ($self->header_ok) {
564 $matrix = $HEADER.$matrix;
565 }
566
567 throw("Prediction matrix is an unexpected length")
568 unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0;
569
570 $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno");
571
572 $self->{matrix_compressed} = 1;
573
574 return $self->{matrix};
575 }
576
577 =head2 header_ok
578
579 Description : checks if the binary matrix has the expected header
580 Returntype : boolean
581 Exceptions : none
582 Status : At Risk
583
584 =cut
585
586 sub header_ok {
587 my ($self) = @_;
588 return undef unless ($self->{matrix} && !$self->{matrix_compressed});
589 return substr($self->{matrix},0,length($HEADER)) eq $HEADER;
590 }
591
592 =head2 expand_matrix
593
594 Description : uncompresses a compressed prediction matrix
595 Returntype : the uncompressed binary matrix string
596 Exceptions : throws if the header is incorrect, or if the decompression fails
597 Status : At Risk
598
599 =cut
600
601 sub expand_matrix {
602 my ($self) = @_;
603
604 return undef unless $self->{matrix};
605
606 return $self->{matrix} unless $self->{matrix_compressed};
607
608 $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix})
609 or throw("Failed to gunzip: $gzerrno");
610
611 $self->{matrix_compressed} = 0;
612
613 throw("Malformed prediction matrix") unless $self->header_ok;
614
615 return $self->{matrix};
616 }
617
618 =head2 compute_offset
619
620 Arg[1] : int $pos - the desired position in the peptide
621 Arg[2] : string $aa - the desired mutant amino acid
622 Description : computes the correct offset into a prediction matrix for a given
623 peptide position and mutant amino acid
624 Returntype : the integer offset
625 Exceptions : none
626 Status : At Risk
627
628 =cut
629
630 sub compute_offset {
631 my ($self, $pos, $aa) = @_;
632
633 my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION );
634
635 return $offset;
636 }
637
638 =head2 prediction_from_matrix
639
640 Arg[1] : int $pos - the desired position in the peptide
641 Arg[2] : string $aa - the desired mutant amino acid
642 Description : returns the prediction and score for the requested
643 position and mutant amino acid in the matrix
644 Returntype : a list containing 2 values, the prediction and the score
645 Exceptions : throws if either the position or amino acid are invalid,
646 or if the prediction matrix looks to be malformed
647 Status : At Risk
648
649 =cut
650
651 sub prediction_from_matrix {
652 my ($self, $pos, $aa) = @_;
653
654 if ($self->{matrix_compressed}) {
655 # the matrix is still compressed so we try to uncompress it,
656 $self->expand_matrix;
657 }
658
659 $aa = uc($aa) if defined $aa;
660
661 throw("Invalid position: $pos") unless (defined $pos && $pos > 0);
662
663 throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa});
664
665 # compute our offset into the prediction matrix
666
667 my $offset = $self->compute_offset($pos, $aa);
668
669 print "offset: $offset\n" if $DEBUG;
670
671 if ($offset + 1 > length($self->{matrix})) {
672 warning("Offset outside of prediction matrix for position $pos and amino acid $aa?");
673 return undef;
674 }
675
676 my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION);
677
678 return $self->prediction_from_short($pred);
679 }
680
681 1;
682