annotate variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 =head1 LICENSE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 This software is distributed under a modified Apache license.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 For license details, please see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 =head1 CONTACT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 <helpdesk@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 # create a new matrix for polyphen predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 -analysis => 'polyphen',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 -peptide_length => 134,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 # add some predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 # serialize the matrix to a compressed binary string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 my $binary_string = $pfpm->serialize;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 # store the string somewhere, fetch it later, and then create a new matrix using it
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 -analysis => 'polyphen',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 -matrix => $binary_string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 # retrieve predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 print "A mutation to 'C' at position 2 is predicted to be $prediction\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 This module defines a class representing a matrix of protein
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 function predictions, and provides method to access and set
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60 predictions for a given position and amino acid, and also to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 serialize the matrix for storage in a database, and deserialize
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62 a matrix from the compressed format into a perl hash.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 use warnings;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 use POSIX qw(ceil);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 use List::Util qw(max);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 use Compress::Zlib;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 use base qw(Exporter);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 my $DEBUG = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 # user-defined constants
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 # a header which lets us identify prediction matrices and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 # to check if they may have been corrupted
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 our $HEADER = 'VEP';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 # the format we use when 'pack'ing our predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 my $PACK_FORMAT = 'v';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 # a hash mapping qualitative predictions to numerical values
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 # for all the analyses we use
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 my $PREDICTION_TO_VAL = {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 polyphen => {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 'probably damaging' => 0,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 'possibly damaging' => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102 'benign' => 2,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 'unknown' => 3,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104 },
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 sift => {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107 'tolerated' => 0,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 'deleterious' => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 },
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 # all valid amino acids
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 # we use a short with all bits set to represent the lack of a prediction in
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 # an (uncompressed) prediction matrix, we will never observe this value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 # as a real prediction even if we set all the (6) prediction bits because we
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 # limit the max score to 1000 so the 10 score bits will never all be set
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123 # the number of bytes in a short
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125 my $BYTES_PER_PREDICTION = 2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 # constants derived from the the user-defined constants
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 # the maximum number of distinct qualitative predictions used by any tool
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 # the number of bits used to encode the qualitative prediction
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138 throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 # a hash mapping back from a numerical value to a qualitative prediction
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 my $VAL_TO_PREDICTION = {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 map {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 my $tool = $_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 $tool => {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146 map {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 $PREDICTION_TO_VAL->{$tool}->{$_} => $_
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 } keys %{ $PREDICTION_TO_VAL->{$tool} }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 } keys %$PREDICTION_TO_VAL
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 # a hash from amino acid single letter code to a numerical value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 # the number of valid amino acids
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 our $NUM_AAS = scalar(@ALL_AAS);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 =head2 new
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 Arg [-ANALYSIS] :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 The name of the analysis tool that made these predictions,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 currently must be one of 'sift' or 'polyphen'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 Arg [-MATRIX] :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 A gzip compressed binary string encoding the predictions,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 typically created using this class and fetched from the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 variation database (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 Arg [-PEPTIDE_LENGTH] :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173 The length of the associated peptide, only required if
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 you want to serialize this matrix (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 Arg [-TRANSLATION_MD5] :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 The hex MD5 hash of the associated peptide sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 Description: Constructs a new ProteinFunctionPredictionMatrix object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 Exceptions : throws unless ANALYSIS is supplied and recognised
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 sub new {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 my $class = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 my (
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 $analysis,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 $sub_analysis,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 $matrix,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 $peptide_length,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 $translation_md5,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 ) = rearrange([qw(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 ANALYSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 SUB_ANALYSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 MATRIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 PEPTIDE_LENGTH
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 TRANSLATION_MD5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201 )], @_);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 throw("analysis argument required") unless defined $analysis;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 throw("Unrecognised analysis '$analysis'")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 unless defined $PREDICTION_TO_VAL->{$analysis};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 my $self = bless {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 analysis => $analysis,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 sub_analysis => $sub_analysis,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 matrix => $matrix,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 peptide_length => $peptide_length,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 translation_md5 => $translation_md5,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 }, $class;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 $self->{matrix_compressed} = defined $matrix ? 1 : 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 return $self;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 =head2 analysis
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 Arg[1] : string $analysis - the name of the analysis tool (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 Description : Get/set the analysis name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 Returntype : string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226 Exceptions : throws if the name is not recognised
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 sub analysis {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 my ($self, $analysis) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 if ($analysis) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 throw("Unrecognised analysis '$analysis'")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 unless defined $PREDICTION_TO_VAL->{$analysis};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 $self->{analysis} = $analysis;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 return $self->{analysis};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 =head2 sub_analysis
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 Arg[1] : string $sub_analysis - the name of the sub analysis (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 Description : Get/set the sub analysis name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 Returntype : string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 Exceptions : None
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 sub sub_analysis {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 my ($self, $sub_analysis) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 $self->{sub_analysis} = $sub_analysis if $sub_analysis;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 return $self->{sub_analysis};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 =head2 peptide_length
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 Arg[1] : int $peptide_length - the length of the peptide (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 Description : Get/set the length of the peptide - required when you want to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 serialize a matrix, as we need to know how many rows the matrix has
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 Returntype : int
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 sub peptide_length {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 my ($self, $peptide_length) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 $self->{peptide_length} = $peptide_length if defined $peptide_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 return $self->{peptide_length};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 =head2 translation_md5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 Arg[1] : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 Description : Get/set the MD5 hash of the peptide sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 Returntype : string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 sub translation_md5 {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 my ($self, $translation_md5) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 $self->{translation_md5} = $translation_md5 if defined $translation_md5;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 return $self->{translation_md5};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 =head2 get_prediction
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 Arg[1] : int $pos - the desired position
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 Arg[2] : string $aa - the mutant amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 Description : get the prediction and score for the given position and amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 Returntype : a list containing 2 values, the prediction and the score
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 Exceptions : throws if either the position or amino acid are invalid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 sub get_prediction {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 my ($self, $pos, $aa) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 # if we have it in our uncompressed hash then just return it
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 if (defined $self->{preds}->{$pos}->{$aa}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 return @{ $self->{preds}->{$pos}->{$aa} };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 # otherwise we look in the serialized matrix string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 return $self->prediction_from_matrix($pos, $aa);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 =head2 add_prediction
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 Arg[1] : int $pos - the peptide position
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 Arg[2] : string $aa - the mutant amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 Arg[3] : string $prediction - the prediction to store
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 Arg[4] : float $score - the score to store
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 Description : add a prediction to the matrix for the specified position and amino acid,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 note that this just adds the prediction to a perl hash. If you want to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 encode the matrix in the binary format you should call serialize on the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 matrix object after you have added all the predictions.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 sub add_prediction {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 my ($self, $pos, $aa, $prediction, $score) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 =head2 serialize
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 Arg[1] : int $peptide_length - the length of the associated peptide (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 Description : serialize the matrix into a compressed binary format suitable for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 storage in a database, file etc. The same string can later be used
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 to create a new matrix object and the predictions can be retrieved
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 Exceptions : throws if the peptide length has not been specified
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 sub serialize {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 my ($self, $peptide_length) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 $self->{peptide_length} = $peptide_length if defined $peptide_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 throw("peptide_length required to serialize predictions")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 unless defined $self->{peptide_length};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 # convert predictions to the binary format, and concatenate them all
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 # together in the correct order, inserting our dummy $NO_PREDICTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 # value to fill in any gaps
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 if ($self->{preds}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 $self->{matrix_compressed} = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 $self->{matrix} = $HEADER;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 for my $pos (1 .. $self->{peptide_length}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372 for my $aa (@ALL_AAS) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 my $short;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376 if ($self->{preds}->{$pos}->{$aa}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 $short = $self->prediction_to_short($prediction, $score);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 $self->{matrix} .= defined $short ? $short : $NO_PREDICTION;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 # delete the hash copy, so things don't get out of sync
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 $self->{preds} = undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 warning("There don't seem to be any predictions in the matrix to serialize!");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 # and return the compressed string for storage
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 return $self->compress_matrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 =head2 deserialize
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 Arg [1] : coderef $coderef - an anonymous subroutine that will be called
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402 as each prediction is decoded in order. The subroutine will be
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 called with 4 arguments: the peptide position, the amino acid,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 the prediction and the score. This can be used, for example to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 dump out the prediction matrix to a file. (optional)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 Description : deserialize a binary formatted matrix into a perl hash reference
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 containing all the uncompressed predictions. This hash has an
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408 entry for each position in the peptide, which is itself a hashref
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409 with an entry for each possible alternate amino acid which is a
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 listref containing the prediction and score. For example, to retrieve
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 the prediction for a substitution of 'C' at position 23 from this
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 data structure, you could use code like:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 my $prediction_hash = $pfpm->deserialize;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 Note that if you don't explicitly deserialize a matrix this
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 class will keep it in the memory-efficient encoded format, and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419 you can access individual predictions with the get_prediction()
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420 method. You should only use this method if you want to decode
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 all predictions (for example to perform some large-scale
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 analysis, or to reformat the predictions somehow)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424 Returntype : hashref containing decoded predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 Exceptions : throws if the binary matrix isn't in the expected format
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 sub deserialize {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 my ($self, $coderef) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433 if ($self->{matrix_compressed}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434 $self->expand_matrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437 throw("Matrix looks corrupted") unless $self->header_ok;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439 # we can work out the length of the peptide by counting the rows in the matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441 my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443 for my $pos (1 .. $length) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
445 for my $aa (@ALL_AAS) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
446
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
447 # we call prediction_from_short directly to avoid doing all the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
448 # checks performed in prediction_from_string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
449
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
450 my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
451
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
452 $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
453
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
454 if ($coderef) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
455 $coderef->($pos, $aa, $prediction, $score);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
456 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
457 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
458 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
459
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
460 return $self->{preds};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
461 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
462
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
463 =head2 prediction_to_short
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
464
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
465 Arg[1] : string $pred - one of the possible qualitative predictions of the tool
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
466 Arg[2] : float $prob - the prediction score (with 3 d.p.s of precision)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
467 Description : converts a prediction and corresponding score into a 2-byte short value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
468 Returntype : the packed short value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
469 Exceptions : throws if the prediction argument is invalid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
470 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
471
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
472 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
473
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
474 sub prediction_to_short {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
475 my ($self, $pred, $prob) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
476
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
477 # we only store 3 d.p. so multiply by 1000 to turn our
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
478 # probability into a number between 0 and 1000.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
479 # 2^10 == 1024 so we need 10 bits of our short to store
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
480 # this value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
481
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
482 my $val = $prob * 1000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
483
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
484 # we store the prediction in the top $NUM_PRED_BITS bits
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
485 # so look up the numerical value for the prediction,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
486 # shift this $NUM_PRED_BITS bits to the left and then set
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
487 # the appropriate bits of our short value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
488
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
489 $pred = lc($pred);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
490
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
491 my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
492
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
493 throw("No value defined for prediction: '$pred'?")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
494 unless defined $pred_val;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
495
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
496 $val |= ($pred_val << (16 - $NUM_PRED_BITS));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
497
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
498 printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
499
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
500 $val = pack $PACK_FORMAT, $val;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
501
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
502 return $val;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
503 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
504
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
505 =head2 prediction_from_short
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
506
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
507 Arg[1] : string $pred - the packed short value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
508 Description : converts a 2-byte short value back into a prediction and a score
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
509 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
510 Returntype : a list containing 2 values, the prediction and the score
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
511 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
512
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
513 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
514
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
515 sub prediction_from_short {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
516 my ($self, $val) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
517
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
518 # check it isn't our special null prediction
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
519
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
520 if ($val eq $NO_PREDICTION) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
521 print "no pred\n" if $DEBUG;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
522 return undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
523 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
524
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
525 # unpack the value as a short
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
526
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
527 $val = unpack $PACK_FORMAT, $val;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
528
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
529 # shift the prediction bits down and look up the prediction string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
530
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
531 my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
532
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
533 # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
534
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
535 my $prob = ($val & (2**10 - 1)) / 1000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
536
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
537 printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
538
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
539 return ($pred, $prob);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
540 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
541
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
542 =head2 compress_matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
543
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
544 Description : compresses a prediction matrix with gzip
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
545 Returntype : the compressed matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
546 Exceptions : throws if the matrix is an unexpected length, or if the compression fails
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
547 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
548
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
549 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
550
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
551 sub compress_matrix {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
552 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
553
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
554 my $matrix = $self->{matrix};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
555
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
556 return undef unless $matrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
557
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
558 return $matrix if $self->{matrix_compressed};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
559
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
560 # prepend a header, so we can tell if our matrix has been mangled, or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
561 # is compressed etc.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
562
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
563 unless ($self->header_ok) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
564 $matrix = $HEADER.$matrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
565 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
566
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
567 throw("Prediction matrix is an unexpected length")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
568 unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
569
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
570 $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
571
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
572 $self->{matrix_compressed} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
573
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
574 return $self->{matrix};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
575 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
576
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
577 =head2 header_ok
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
578
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
579 Description : checks if the binary matrix has the expected header
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
580 Returntype : boolean
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
581 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
582 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
583
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
584 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
585
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
586 sub header_ok {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
587 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
588 return undef unless ($self->{matrix} && !$self->{matrix_compressed});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
589 return substr($self->{matrix},0,length($HEADER)) eq $HEADER;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
590 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
591
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
592 =head2 expand_matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
593
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
594 Description : uncompresses a compressed prediction matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
595 Returntype : the uncompressed binary matrix string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
596 Exceptions : throws if the header is incorrect, or if the decompression fails
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
597 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
598
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
599 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
600
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
601 sub expand_matrix {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
602 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
603
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
604 return undef unless $self->{matrix};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
605
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
606 return $self->{matrix} unless $self->{matrix_compressed};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
607
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
608 $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix})
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
609 or throw("Failed to gunzip: $gzerrno");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
610
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
611 $self->{matrix_compressed} = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
612
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
613 throw("Malformed prediction matrix") unless $self->header_ok;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
614
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
615 return $self->{matrix};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
616 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
617
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
618 =head2 compute_offset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
619
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
620 Arg[1] : int $pos - the desired position in the peptide
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
621 Arg[2] : string $aa - the desired mutant amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
622 Description : computes the correct offset into a prediction matrix for a given
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
623 peptide position and mutant amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
624 Returntype : the integer offset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
625 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
626 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
627
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
628 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
629
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
630 sub compute_offset {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
631 my ($self, $pos, $aa) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
632
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
633 my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
634
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
635 return $offset;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
636 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
637
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
638 =head2 prediction_from_matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
639
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
640 Arg[1] : int $pos - the desired position in the peptide
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
641 Arg[2] : string $aa - the desired mutant amino acid
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
642 Description : returns the prediction and score for the requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
643 position and mutant amino acid in the matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
644 Returntype : a list containing 2 values, the prediction and the score
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
645 Exceptions : throws if either the position or amino acid are invalid,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
646 or if the prediction matrix looks to be malformed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
647 Status : At Risk
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
648
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
649 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
650
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
651 sub prediction_from_matrix {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
652 my ($self, $pos, $aa) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
653
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
654 if ($self->{matrix_compressed}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
655 # the matrix is still compressed so we try to uncompress it,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
656 $self->expand_matrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
657 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
658
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
659 $aa = uc($aa) if defined $aa;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
660
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
661 throw("Invalid position: $pos") unless (defined $pos && $pos > 0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
662
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
663 throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
664
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
665 # compute our offset into the prediction matrix
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
666
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
667 my $offset = $self->compute_offset($pos, $aa);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
668
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
669 print "offset: $offset\n" if $DEBUG;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
670
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
671 if ($offset + 1 > length($self->{matrix})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
672 warning("Offset outside of prediction matrix for position $pos and amino acid $aa?");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
673 return undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
674 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
675
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
676 my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
677
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
678 return $self->prediction_from_short($pred);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
679 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
680
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
681 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
682