0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 # create a new matrix for polyphen predictions
|
|
28
|
|
29 my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
|
|
30 -analysis => 'polyphen',
|
|
31 -peptide_length => 134,
|
|
32 );
|
|
33
|
|
34 # add some predictions
|
|
35
|
|
36 $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967);
|
|
37 $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09);
|
|
38
|
|
39 # serialize the matrix to a compressed binary string
|
|
40
|
|
41 my $binary_string = $pfpm->serialize;
|
|
42
|
|
43 # store the string somewhere, fetch it later, and then create a new matrix using it
|
|
44
|
|
45 my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
|
|
46 -analysis => 'polyphen',
|
|
47 -matrix => $binary_string
|
|
48 );
|
|
49
|
|
50 # retrieve predictions
|
|
51
|
|
52 my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C');
|
|
53
|
|
54 print "A mutation to 'C' at position 2 is predicted to be $prediction\n";
|
|
55
|
|
56 =head1 DESCRIPTION
|
|
57
|
|
58 This module defines a class representing a matrix of protein
|
|
59 function predictions, and provides method to access and set
|
|
60 predictions for a given position and amino acid, and also to
|
|
61 serialize the matrix for storage in a database, and deserialize
|
|
62 a matrix from the compressed format into a perl hash.
|
|
63
|
|
64 =cut
|
|
65
|
|
66 package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
|
|
67
|
|
68 use strict;
|
|
69 use warnings;
|
|
70
|
|
71 use POSIX qw(ceil);
|
|
72 use List::Util qw(max);
|
|
73 use Compress::Zlib;
|
|
74
|
|
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
76 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
77
|
|
78 use base qw(Exporter);
|
|
79
|
|
80 our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS);
|
|
81
|
|
82 my $DEBUG = 0;
|
|
83
|
|
84 # user-defined constants
|
|
85
|
|
86 # a header which lets us identify prediction matrices and
|
|
87 # to check if they may have been corrupted
|
|
88
|
|
89 our $HEADER = 'VEP';
|
|
90
|
|
91 # the format we use when 'pack'ing our predictions
|
|
92
|
|
93 my $PACK_FORMAT = 'v';
|
|
94
|
|
95 # a hash mapping qualitative predictions to numerical values
|
|
96 # for all the analyses we use
|
|
97
|
|
98 my $PREDICTION_TO_VAL = {
|
|
99 polyphen => {
|
|
100 'probably damaging' => 0,
|
|
101 'possibly damaging' => 1,
|
|
102 'benign' => 2,
|
|
103 'unknown' => 3,
|
|
104 },
|
|
105
|
|
106 sift => {
|
|
107 'tolerated' => 0,
|
|
108 'deleterious' => 1,
|
|
109 },
|
|
110 };
|
|
111
|
|
112 # all valid amino acids
|
|
113
|
|
114 our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y);
|
|
115
|
|
116 # we use a short with all bits set to represent the lack of a prediction in
|
|
117 # an (uncompressed) prediction matrix, we will never observe this value
|
|
118 # as a real prediction even if we set all the (6) prediction bits because we
|
|
119 # limit the max score to 1000 so the 10 score bits will never all be set
|
|
120
|
|
121 our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF);
|
|
122
|
|
123 # the number of bytes in a short
|
|
124
|
|
125 my $BYTES_PER_PREDICTION = 2;
|
|
126
|
|
127
|
|
128 # constants derived from the the user-defined constants
|
|
129
|
|
130 # the maximum number of distinct qualitative predictions used by any tool
|
|
131
|
|
132 my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL );
|
|
133
|
|
134 # the number of bits used to encode the qualitative prediction
|
|
135
|
|
136 my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) );
|
|
137
|
|
138 throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6;
|
|
139
|
|
140 # a hash mapping back from a numerical value to a qualitative prediction
|
|
141
|
|
142 my $VAL_TO_PREDICTION = {
|
|
143 map {
|
|
144 my $tool = $_;
|
|
145 $tool => {
|
|
146 map {
|
|
147 $PREDICTION_TO_VAL->{$tool}->{$_} => $_
|
|
148 } keys %{ $PREDICTION_TO_VAL->{$tool} }
|
|
149 }
|
|
150 } keys %$PREDICTION_TO_VAL
|
|
151 };
|
|
152
|
|
153 # a hash from amino acid single letter code to a numerical value
|
|
154
|
|
155 our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS };
|
|
156
|
|
157 # the number of valid amino acids
|
|
158
|
|
159 our $NUM_AAS = scalar(@ALL_AAS);
|
|
160
|
|
161 =head2 new
|
|
162
|
|
163 Arg [-ANALYSIS] :
|
|
164 The name of the analysis tool that made these predictions,
|
|
165 currently must be one of 'sift' or 'polyphen'
|
|
166
|
|
167 Arg [-MATRIX] :
|
|
168 A gzip compressed binary string encoding the predictions,
|
|
169 typically created using this class and fetched from the
|
|
170 variation database (optional)
|
|
171
|
|
172 Arg [-PEPTIDE_LENGTH] :
|
|
173 The length of the associated peptide, only required if
|
|
174 you want to serialize this matrix (optional)
|
|
175
|
|
176 Arg [-TRANSLATION_MD5] :
|
|
177 The hex MD5 hash of the associated peptide sequence
|
|
178
|
|
179 Description: Constructs a new ProteinFunctionPredictionMatrix object
|
|
180 Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance
|
|
181 Exceptions : throws unless ANALYSIS is supplied and recognised
|
|
182 Status : At Risk
|
|
183
|
|
184 =cut
|
|
185
|
|
186 sub new {
|
|
187 my $class = shift;
|
|
188
|
|
189 my (
|
|
190 $analysis,
|
|
191 $sub_analysis,
|
|
192 $matrix,
|
|
193 $peptide_length,
|
|
194 $translation_md5,
|
|
195 ) = rearrange([qw(
|
|
196 ANALYSIS
|
|
197 SUB_ANALYSIS
|
|
198 MATRIX
|
|
199 PEPTIDE_LENGTH
|
|
200 TRANSLATION_MD5
|
|
201 )], @_);
|
|
202
|
|
203 throw("analysis argument required") unless defined $analysis;
|
|
204
|
|
205 throw("Unrecognised analysis '$analysis'")
|
|
206 unless defined $PREDICTION_TO_VAL->{$analysis};
|
|
207
|
|
208 my $self = bless {
|
|
209 analysis => $analysis,
|
|
210 sub_analysis => $sub_analysis,
|
|
211 matrix => $matrix,
|
|
212 peptide_length => $peptide_length,
|
|
213 translation_md5 => $translation_md5,
|
|
214 }, $class;
|
|
215
|
|
216 $self->{matrix_compressed} = defined $matrix ? 1 : 0;
|
|
217
|
|
218 return $self;
|
|
219 }
|
|
220
|
|
221 =head2 analysis
|
|
222
|
|
223 Arg[1] : string $analysis - the name of the analysis tool (optional)
|
|
224 Description : Get/set the analysis name
|
|
225 Returntype : string
|
|
226 Exceptions : throws if the name is not recognised
|
|
227 Status : At Risk
|
|
228
|
|
229 =cut
|
|
230
|
|
231 sub analysis {
|
|
232 my ($self, $analysis) = @_;
|
|
233
|
|
234 if ($analysis) {
|
|
235 throw("Unrecognised analysis '$analysis'")
|
|
236 unless defined $PREDICTION_TO_VAL->{$analysis};
|
|
237
|
|
238 $self->{analysis} = $analysis;
|
|
239 }
|
|
240
|
|
241 return $self->{analysis};
|
|
242 }
|
|
243
|
|
244 =head2 sub_analysis
|
|
245
|
|
246 Arg[1] : string $sub_analysis - the name of the sub analysis (optional)
|
|
247 Description : Get/set the sub analysis name
|
|
248 Returntype : string
|
|
249 Exceptions : None
|
|
250 Status : At Risk
|
|
251
|
|
252 =cut
|
|
253
|
|
254 sub sub_analysis {
|
|
255 my ($self, $sub_analysis) = @_;
|
|
256
|
|
257 $self->{sub_analysis} = $sub_analysis if $sub_analysis;
|
|
258
|
|
259 return $self->{sub_analysis};
|
|
260 }
|
|
261
|
|
262 =head2 peptide_length
|
|
263
|
|
264 Arg[1] : int $peptide_length - the length of the peptide (optional)
|
|
265 Description : Get/set the length of the peptide - required when you want to
|
|
266 serialize a matrix, as we need to know how many rows the matrix has
|
|
267 Returntype : int
|
|
268 Exceptions : none
|
|
269 Status : At Risk
|
|
270
|
|
271 =cut
|
|
272
|
|
273 sub peptide_length {
|
|
274 my ($self, $peptide_length) = @_;
|
|
275 $self->{peptide_length} = $peptide_length if defined $peptide_length;
|
|
276 return $self->{peptide_length};
|
|
277 }
|
|
278
|
|
279 =head2 translation_md5
|
|
280
|
|
281 Arg[1] : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional)
|
|
282 Description : Get/set the MD5 hash of the peptide sequence
|
|
283 Returntype : string
|
|
284 Exceptions : none
|
|
285 Status : At Risk
|
|
286
|
|
287 =cut
|
|
288
|
|
289 sub translation_md5 {
|
|
290 my ($self, $translation_md5) = @_;
|
|
291 $self->{translation_md5} = $translation_md5 if defined $translation_md5;
|
|
292 return $self->{translation_md5};
|
|
293 }
|
|
294
|
|
295 =head2 get_prediction
|
|
296
|
|
297 Arg[1] : int $pos - the desired position
|
|
298 Arg[2] : string $aa - the mutant amino acid
|
|
299 Description : get the prediction and score for the given position and amino acid
|
|
300 Returntype : a list containing 2 values, the prediction and the score
|
|
301 Exceptions : throws if either the position or amino acid are invalid
|
|
302 Status : At Risk
|
|
303
|
|
304 =cut
|
|
305
|
|
306 sub get_prediction {
|
|
307 my ($self, $pos, $aa) = @_;
|
|
308
|
|
309 # if we have it in our uncompressed hash then just return it
|
|
310
|
|
311 if (defined $self->{preds}->{$pos}->{$aa}) {
|
|
312 return @{ $self->{preds}->{$pos}->{$aa} };
|
|
313 }
|
|
314
|
|
315 # otherwise we look in the serialized matrix string
|
|
316
|
|
317 return $self->prediction_from_matrix($pos, $aa);
|
|
318 }
|
|
319
|
|
320 =head2 add_prediction
|
|
321
|
|
322 Arg[1] : int $pos - the peptide position
|
|
323 Arg[2] : string $aa - the mutant amino acid
|
|
324 Arg[3] : string $prediction - the prediction to store
|
|
325 Arg[4] : float $score - the score to store
|
|
326 Description : add a prediction to the matrix for the specified position and amino acid,
|
|
327 note that this just adds the prediction to a perl hash. If you want to
|
|
328 encode the matrix in the binary format you should call serialize on the
|
|
329 matrix object after you have added all the predictions.
|
|
330 Exceptions : none
|
|
331 Status : At Risk
|
|
332
|
|
333 =cut
|
|
334
|
|
335 sub add_prediction {
|
|
336 my ($self, $pos, $aa, $prediction, $score) = @_;
|
|
337
|
|
338 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
|
|
339 }
|
|
340
|
|
341 =head2 serialize
|
|
342
|
|
343 Arg[1] : int $peptide_length - the length of the associated peptide (optional)
|
|
344 Description : serialize the matrix into a compressed binary format suitable for
|
|
345 storage in a database, file etc. The same string can later be used
|
|
346 to create a new matrix object and the predictions can be retrieved
|
|
347 Exceptions : throws if the peptide length has not been specified
|
|
348 Status : At Risk
|
|
349
|
|
350 =cut
|
|
351
|
|
352 sub serialize {
|
|
353 my ($self, $peptide_length) = @_;
|
|
354
|
|
355 $self->{peptide_length} = $peptide_length if defined $peptide_length;
|
|
356
|
|
357 throw("peptide_length required to serialize predictions")
|
|
358 unless defined $self->{peptide_length};
|
|
359
|
|
360 # convert predictions to the binary format, and concatenate them all
|
|
361 # together in the correct order, inserting our dummy $NO_PREDICTION
|
|
362 # value to fill in any gaps
|
|
363
|
|
364 if ($self->{preds}) {
|
|
365
|
|
366 $self->{matrix_compressed} = 0;
|
|
367
|
|
368 $self->{matrix} = $HEADER;
|
|
369
|
|
370 for my $pos (1 .. $self->{peptide_length}) {
|
|
371
|
|
372 for my $aa (@ALL_AAS) {
|
|
373
|
|
374 my $short;
|
|
375
|
|
376 if ($self->{preds}->{$pos}->{$aa}) {
|
|
377 my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} };
|
|
378
|
|
379 $short = $self->prediction_to_short($prediction, $score);
|
|
380 }
|
|
381
|
|
382 $self->{matrix} .= defined $short ? $short : $NO_PREDICTION;
|
|
383 }
|
|
384 }
|
|
385
|
|
386 # delete the hash copy, so things don't get out of sync
|
|
387
|
|
388 $self->{preds} = undef;
|
|
389 }
|
|
390 else {
|
|
391 warning("There don't seem to be any predictions in the matrix to serialize!");
|
|
392 }
|
|
393
|
|
394 # and return the compressed string for storage
|
|
395
|
|
396 return $self->compress_matrix;
|
|
397 }
|
|
398
|
|
399 =head2 deserialize
|
|
400
|
|
401 Arg [1] : coderef $coderef - an anonymous subroutine that will be called
|
|
402 as each prediction is decoded in order. The subroutine will be
|
|
403 called with 4 arguments: the peptide position, the amino acid,
|
|
404 the prediction and the score. This can be used, for example to
|
|
405 dump out the prediction matrix to a file. (optional)
|
|
406 Description : deserialize a binary formatted matrix into a perl hash reference
|
|
407 containing all the uncompressed predictions. This hash has an
|
|
408 entry for each position in the peptide, which is itself a hashref
|
|
409 with an entry for each possible alternate amino acid which is a
|
|
410 listref containing the prediction and score. For example, to retrieve
|
|
411 the prediction for a substitution of 'C' at position 23 from this
|
|
412 data structure, you could use code like:
|
|
413
|
|
414 my $prediction_hash = $pfpm->deserialize;
|
|
415 my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} };
|
|
416
|
|
417 Note that if you don't explicitly deserialize a matrix this
|
|
418 class will keep it in the memory-efficient encoded format, and
|
|
419 you can access individual predictions with the get_prediction()
|
|
420 method. You should only use this method if you want to decode
|
|
421 all predictions (for example to perform some large-scale
|
|
422 analysis, or to reformat the predictions somehow)
|
|
423
|
|
424 Returntype : hashref containing decoded predictions
|
|
425 Exceptions : throws if the binary matrix isn't in the expected format
|
|
426 Status : At Risk
|
|
427
|
|
428 =cut
|
|
429
|
|
430 sub deserialize {
|
|
431 my ($self, $coderef) = @_;
|
|
432
|
|
433 if ($self->{matrix_compressed}) {
|
|
434 $self->expand_matrix;
|
|
435 }
|
|
436
|
|
437 throw("Matrix looks corrupted") unless $self->header_ok;
|
|
438
|
|
439 # we can work out the length of the peptide by counting the rows in the matrix
|
|
440
|
|
441 my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS;
|
|
442
|
|
443 for my $pos (1 .. $length) {
|
|
444
|
|
445 for my $aa (@ALL_AAS) {
|
|
446
|
|
447 # we call prediction_from_short directly to avoid doing all the
|
|
448 # checks performed in prediction_from_string
|
|
449
|
|
450 my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION));
|
|
451
|
|
452 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
|
|
453
|
|
454 if ($coderef) {
|
|
455 $coderef->($pos, $aa, $prediction, $score);
|
|
456 }
|
|
457 }
|
|
458 }
|
|
459
|
|
460 return $self->{preds};
|
|
461 }
|
|
462
|
|
463 =head2 prediction_to_short
|
|
464
|
|
465 Arg[1] : string $pred - one of the possible qualitative predictions of the tool
|
|
466 Arg[2] : float $prob - the prediction score (with 3 d.p.s of precision)
|
|
467 Description : converts a prediction and corresponding score into a 2-byte short value
|
|
468 Returntype : the packed short value
|
|
469 Exceptions : throws if the prediction argument is invalid
|
|
470 Status : At Risk
|
|
471
|
|
472 =cut
|
|
473
|
|
474 sub prediction_to_short {
|
|
475 my ($self, $pred, $prob) = @_;
|
|
476
|
|
477 # we only store 3 d.p. so multiply by 1000 to turn our
|
|
478 # probability into a number between 0 and 1000.
|
|
479 # 2^10 == 1024 so we need 10 bits of our short to store
|
|
480 # this value
|
|
481
|
|
482 my $val = $prob * 1000;
|
|
483
|
|
484 # we store the prediction in the top $NUM_PRED_BITS bits
|
|
485 # so look up the numerical value for the prediction,
|
|
486 # shift this $NUM_PRED_BITS bits to the left and then set
|
|
487 # the appropriate bits of our short value
|
|
488
|
|
489 $pred = lc($pred);
|
|
490
|
|
491 my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred};
|
|
492
|
|
493 throw("No value defined for prediction: '$pred'?")
|
|
494 unless defined $pred_val;
|
|
495
|
|
496 $val |= ($pred_val << (16 - $NUM_PRED_BITS));
|
|
497
|
|
498 printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG;
|
|
499
|
|
500 $val = pack $PACK_FORMAT, $val;
|
|
501
|
|
502 return $val;
|
|
503 }
|
|
504
|
|
505 =head2 prediction_from_short
|
|
506
|
|
507 Arg[1] : string $pred - the packed short value
|
|
508 Description : converts a 2-byte short value back into a prediction and a score
|
|
509 Exceptions : none
|
|
510 Returntype : a list containing 2 values, the prediction and the score
|
|
511 Status : At Risk
|
|
512
|
|
513 =cut
|
|
514
|
|
515 sub prediction_from_short {
|
|
516 my ($self, $val) = @_;
|
|
517
|
|
518 # check it isn't our special null prediction
|
|
519
|
|
520 if ($val eq $NO_PREDICTION) {
|
|
521 print "no pred\n" if $DEBUG;
|
|
522 return undef;
|
|
523 }
|
|
524
|
|
525 # unpack the value as a short
|
|
526
|
|
527 $val = unpack $PACK_FORMAT, $val;
|
|
528
|
|
529 # shift the prediction bits down and look up the prediction string
|
|
530
|
|
531 my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)};
|
|
532
|
|
533 # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float
|
|
534
|
|
535 my $prob = ($val & (2**10 - 1)) / 1000;
|
|
536
|
|
537 printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG;
|
|
538
|
|
539 return ($pred, $prob);
|
|
540 }
|
|
541
|
|
542 =head2 compress_matrix
|
|
543
|
|
544 Description : compresses a prediction matrix with gzip
|
|
545 Returntype : the compressed matrix
|
|
546 Exceptions : throws if the matrix is an unexpected length, or if the compression fails
|
|
547 Status : At Risk
|
|
548
|
|
549 =cut
|
|
550
|
|
551 sub compress_matrix {
|
|
552 my ($self) = @_;
|
|
553
|
|
554 my $matrix = $self->{matrix};
|
|
555
|
|
556 return undef unless $matrix;
|
|
557
|
|
558 return $matrix if $self->{matrix_compressed};
|
|
559
|
|
560 # prepend a header, so we can tell if our matrix has been mangled, or
|
|
561 # is compressed etc.
|
|
562
|
|
563 unless ($self->header_ok) {
|
|
564 $matrix = $HEADER.$matrix;
|
|
565 }
|
|
566
|
|
567 throw("Prediction matrix is an unexpected length")
|
|
568 unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0;
|
|
569
|
|
570 $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno");
|
|
571
|
|
572 $self->{matrix_compressed} = 1;
|
|
573
|
|
574 return $self->{matrix};
|
|
575 }
|
|
576
|
|
577 =head2 header_ok
|
|
578
|
|
579 Description : checks if the binary matrix has the expected header
|
|
580 Returntype : boolean
|
|
581 Exceptions : none
|
|
582 Status : At Risk
|
|
583
|
|
584 =cut
|
|
585
|
|
586 sub header_ok {
|
|
587 my ($self) = @_;
|
|
588 return undef unless ($self->{matrix} && !$self->{matrix_compressed});
|
|
589 return substr($self->{matrix},0,length($HEADER)) eq $HEADER;
|
|
590 }
|
|
591
|
|
592 =head2 expand_matrix
|
|
593
|
|
594 Description : uncompresses a compressed prediction matrix
|
|
595 Returntype : the uncompressed binary matrix string
|
|
596 Exceptions : throws if the header is incorrect, or if the decompression fails
|
|
597 Status : At Risk
|
|
598
|
|
599 =cut
|
|
600
|
|
601 sub expand_matrix {
|
|
602 my ($self) = @_;
|
|
603
|
|
604 return undef unless $self->{matrix};
|
|
605
|
|
606 return $self->{matrix} unless $self->{matrix_compressed};
|
|
607
|
|
608 $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix})
|
|
609 or throw("Failed to gunzip: $gzerrno");
|
|
610
|
|
611 $self->{matrix_compressed} = 0;
|
|
612
|
|
613 throw("Malformed prediction matrix") unless $self->header_ok;
|
|
614
|
|
615 return $self->{matrix};
|
|
616 }
|
|
617
|
|
618 =head2 compute_offset
|
|
619
|
|
620 Arg[1] : int $pos - the desired position in the peptide
|
|
621 Arg[2] : string $aa - the desired mutant amino acid
|
|
622 Description : computes the correct offset into a prediction matrix for a given
|
|
623 peptide position and mutant amino acid
|
|
624 Returntype : the integer offset
|
|
625 Exceptions : none
|
|
626 Status : At Risk
|
|
627
|
|
628 =cut
|
|
629
|
|
630 sub compute_offset {
|
|
631 my ($self, $pos, $aa) = @_;
|
|
632
|
|
633 my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION );
|
|
634
|
|
635 return $offset;
|
|
636 }
|
|
637
|
|
638 =head2 prediction_from_matrix
|
|
639
|
|
640 Arg[1] : int $pos - the desired position in the peptide
|
|
641 Arg[2] : string $aa - the desired mutant amino acid
|
|
642 Description : returns the prediction and score for the requested
|
|
643 position and mutant amino acid in the matrix
|
|
644 Returntype : a list containing 2 values, the prediction and the score
|
|
645 Exceptions : throws if either the position or amino acid are invalid,
|
|
646 or if the prediction matrix looks to be malformed
|
|
647 Status : At Risk
|
|
648
|
|
649 =cut
|
|
650
|
|
651 sub prediction_from_matrix {
|
|
652 my ($self, $pos, $aa) = @_;
|
|
653
|
|
654 if ($self->{matrix_compressed}) {
|
|
655 # the matrix is still compressed so we try to uncompress it,
|
|
656 $self->expand_matrix;
|
|
657 }
|
|
658
|
|
659 $aa = uc($aa) if defined $aa;
|
|
660
|
|
661 throw("Invalid position: $pos") unless (defined $pos && $pos > 0);
|
|
662
|
|
663 throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa});
|
|
664
|
|
665 # compute our offset into the prediction matrix
|
|
666
|
|
667 my $offset = $self->compute_offset($pos, $aa);
|
|
668
|
|
669 print "offset: $offset\n" if $DEBUG;
|
|
670
|
|
671 if ($offset + 1 > length($self->{matrix})) {
|
|
672 warning("Offset outside of prediction matrix for position $pos and amino acid $aa?");
|
|
673 return undef;
|
|
674 }
|
|
675
|
|
676 my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION);
|
|
677
|
|
678 return $self->prediction_from_short($pred);
|
|
679 }
|
|
680
|
|
681 1;
|
|
682
|