Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,682 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix + +=head1 SYNOPSIS + + # create a new matrix for polyphen predictions + + my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'polyphen', + -peptide_length => 134, + ); + + # add some predictions + + $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967); + $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09); + + # serialize the matrix to a compressed binary string + + my $binary_string = $pfpm->serialize; + + # store the string somewhere, fetch it later, and then create a new matrix using it + + my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'polyphen', + -matrix => $binary_string + ); + + # retrieve predictions + + my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C'); + + print "A mutation to 'C' at position 2 is predicted to be $prediction\n"; + +=head1 DESCRIPTION + +This module defines a class representing a matrix of protein +function predictions, and provides method to access and set +predictions for a given position and amino acid, and also to +serialize the matrix for storage in a database, and deserialize +a matrix from the compressed format into a perl hash. + +=cut + +package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; + +use strict; +use warnings; + +use POSIX qw(ceil); +use List::Util qw(max); +use Compress::Zlib; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use base qw(Exporter); + +our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS); + +my $DEBUG = 0; + +# user-defined constants + +# a header which lets us identify prediction matrices and +# to check if they may have been corrupted + +our $HEADER = 'VEP'; + +# the format we use when 'pack'ing our predictions + +my $PACK_FORMAT = 'v'; + +# a hash mapping qualitative predictions to numerical values +# for all the analyses we use + +my $PREDICTION_TO_VAL = { + polyphen => { + 'probably damaging' => 0, + 'possibly damaging' => 1, + 'benign' => 2, + 'unknown' => 3, + }, + + sift => { + 'tolerated' => 0, + 'deleterious' => 1, + }, +}; + +# all valid amino acids + +our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y); + +# we use a short with all bits set to represent the lack of a prediction in +# an (uncompressed) prediction matrix, we will never observe this value +# as a real prediction even if we set all the (6) prediction bits because we +# limit the max score to 1000 so the 10 score bits will never all be set + +our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF); + +# the number of bytes in a short + +my $BYTES_PER_PREDICTION = 2; + + +# constants derived from the the user-defined constants + +# the maximum number of distinct qualitative predictions used by any tool + +my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL ); + +# the number of bits used to encode the qualitative prediction + +my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) ); + +throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6; + +# a hash mapping back from a numerical value to a qualitative prediction + +my $VAL_TO_PREDICTION = { + map { + my $tool = $_; + $tool => { + map { + $PREDICTION_TO_VAL->{$tool}->{$_} => $_ + } keys %{ $PREDICTION_TO_VAL->{$tool} } + } + } keys %$PREDICTION_TO_VAL +}; + +# a hash from amino acid single letter code to a numerical value + +our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS }; + +# the number of valid amino acids + +our $NUM_AAS = scalar(@ALL_AAS); + +=head2 new + + Arg [-ANALYSIS] : + The name of the analysis tool that made these predictions, + currently must be one of 'sift' or 'polyphen' + + Arg [-MATRIX] : + A gzip compressed binary string encoding the predictions, + typically created using this class and fetched from the + variation database (optional) + + Arg [-PEPTIDE_LENGTH] : + The length of the associated peptide, only required if + you want to serialize this matrix (optional) + + Arg [-TRANSLATION_MD5] : + The hex MD5 hash of the associated peptide sequence + + Description: Constructs a new ProteinFunctionPredictionMatrix object + Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance + Exceptions : throws unless ANALYSIS is supplied and recognised + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ( + $analysis, + $sub_analysis, + $matrix, + $peptide_length, + $translation_md5, + ) = rearrange([qw( + ANALYSIS + SUB_ANALYSIS + MATRIX + PEPTIDE_LENGTH + TRANSLATION_MD5 + )], @_); + + throw("analysis argument required") unless defined $analysis; + + throw("Unrecognised analysis '$analysis'") + unless defined $PREDICTION_TO_VAL->{$analysis}; + + my $self = bless { + analysis => $analysis, + sub_analysis => $sub_analysis, + matrix => $matrix, + peptide_length => $peptide_length, + translation_md5 => $translation_md5, + }, $class; + + $self->{matrix_compressed} = defined $matrix ? 1 : 0; + + return $self; +} + +=head2 analysis + + Arg[1] : string $analysis - the name of the analysis tool (optional) + Description : Get/set the analysis name + Returntype : string + Exceptions : throws if the name is not recognised + Status : At Risk + +=cut + +sub analysis { + my ($self, $analysis) = @_; + + if ($analysis) { + throw("Unrecognised analysis '$analysis'") + unless defined $PREDICTION_TO_VAL->{$analysis}; + + $self->{analysis} = $analysis; + } + + return $self->{analysis}; +} + +=head2 sub_analysis + + Arg[1] : string $sub_analysis - the name of the sub analysis (optional) + Description : Get/set the sub analysis name + Returntype : string + Exceptions : None + Status : At Risk + +=cut + +sub sub_analysis { + my ($self, $sub_analysis) = @_; + + $self->{sub_analysis} = $sub_analysis if $sub_analysis; + + return $self->{sub_analysis}; +} + +=head2 peptide_length + + Arg[1] : int $peptide_length - the length of the peptide (optional) + Description : Get/set the length of the peptide - required when you want to + serialize a matrix, as we need to know how many rows the matrix has + Returntype : int + Exceptions : none + Status : At Risk + +=cut + +sub peptide_length { + my ($self, $peptide_length) = @_; + $self->{peptide_length} = $peptide_length if defined $peptide_length; + return $self->{peptide_length}; +} + +=head2 translation_md5 + + Arg[1] : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional) + Description : Get/set the MD5 hash of the peptide sequence + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub translation_md5 { + my ($self, $translation_md5) = @_; + $self->{translation_md5} = $translation_md5 if defined $translation_md5; + return $self->{translation_md5}; +} + +=head2 get_prediction + + Arg[1] : int $pos - the desired position + Arg[2] : string $aa - the mutant amino acid + Description : get the prediction and score for the given position and amino acid + Returntype : a list containing 2 values, the prediction and the score + Exceptions : throws if either the position or amino acid are invalid + Status : At Risk + +=cut + +sub get_prediction { + my ($self, $pos, $aa) = @_; + + # if we have it in our uncompressed hash then just return it + + if (defined $self->{preds}->{$pos}->{$aa}) { + return @{ $self->{preds}->{$pos}->{$aa} }; + } + + # otherwise we look in the serialized matrix string + + return $self->prediction_from_matrix($pos, $aa); +} + +=head2 add_prediction + + Arg[1] : int $pos - the peptide position + Arg[2] : string $aa - the mutant amino acid + Arg[3] : string $prediction - the prediction to store + Arg[4] : float $score - the score to store + Description : add a prediction to the matrix for the specified position and amino acid, + note that this just adds the prediction to a perl hash. If you want to + encode the matrix in the binary format you should call serialize on the + matrix object after you have added all the predictions. + Exceptions : none + Status : At Risk + +=cut + +sub add_prediction { + my ($self, $pos, $aa, $prediction, $score) = @_; + + $self->{preds}->{$pos}->{$aa} = [$prediction, $score]; +} + +=head2 serialize + + Arg[1] : int $peptide_length - the length of the associated peptide (optional) + Description : serialize the matrix into a compressed binary format suitable for + storage in a database, file etc. The same string can later be used + to create a new matrix object and the predictions can be retrieved + Exceptions : throws if the peptide length has not been specified + Status : At Risk + +=cut + +sub serialize { + my ($self, $peptide_length) = @_; + + $self->{peptide_length} = $peptide_length if defined $peptide_length; + + throw("peptide_length required to serialize predictions") + unless defined $self->{peptide_length}; + + # convert predictions to the binary format, and concatenate them all + # together in the correct order, inserting our dummy $NO_PREDICTION + # value to fill in any gaps + + if ($self->{preds}) { + + $self->{matrix_compressed} = 0; + + $self->{matrix} = $HEADER; + + for my $pos (1 .. $self->{peptide_length}) { + + for my $aa (@ALL_AAS) { + + my $short; + + if ($self->{preds}->{$pos}->{$aa}) { + my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} }; + + $short = $self->prediction_to_short($prediction, $score); + } + + $self->{matrix} .= defined $short ? $short : $NO_PREDICTION; + } + } + + # delete the hash copy, so things don't get out of sync + + $self->{preds} = undef; + } + else { + warning("There don't seem to be any predictions in the matrix to serialize!"); + } + + # and return the compressed string for storage + + return $self->compress_matrix; +} + +=head2 deserialize + + Arg [1] : coderef $coderef - an anonymous subroutine that will be called + as each prediction is decoded in order. The subroutine will be + called with 4 arguments: the peptide position, the amino acid, + the prediction and the score. This can be used, for example to + dump out the prediction matrix to a file. (optional) + Description : deserialize a binary formatted matrix into a perl hash reference + containing all the uncompressed predictions. This hash has an + entry for each position in the peptide, which is itself a hashref + with an entry for each possible alternate amino acid which is a + listref containing the prediction and score. For example, to retrieve + the prediction for a substitution of 'C' at position 23 from this + data structure, you could use code like: + + my $prediction_hash = $pfpm->deserialize; + my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} }; + + Note that if you don't explicitly deserialize a matrix this + class will keep it in the memory-efficient encoded format, and + you can access individual predictions with the get_prediction() + method. You should only use this method if you want to decode + all predictions (for example to perform some large-scale + analysis, or to reformat the predictions somehow) + + Returntype : hashref containing decoded predictions + Exceptions : throws if the binary matrix isn't in the expected format + Status : At Risk + +=cut + +sub deserialize { + my ($self, $coderef) = @_; + + if ($self->{matrix_compressed}) { + $self->expand_matrix; + } + + throw("Matrix looks corrupted") unless $self->header_ok; + + # we can work out the length of the peptide by counting the rows in the matrix + + my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS; + + for my $pos (1 .. $length) { + + for my $aa (@ALL_AAS) { + + # we call prediction_from_short directly to avoid doing all the + # checks performed in prediction_from_string + + my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION)); + + $self->{preds}->{$pos}->{$aa} = [$prediction, $score]; + + if ($coderef) { + $coderef->($pos, $aa, $prediction, $score); + } + } + } + + return $self->{preds}; +} + +=head2 prediction_to_short + + Arg[1] : string $pred - one of the possible qualitative predictions of the tool + Arg[2] : float $prob - the prediction score (with 3 d.p.s of precision) + Description : converts a prediction and corresponding score into a 2-byte short value + Returntype : the packed short value + Exceptions : throws if the prediction argument is invalid + Status : At Risk + +=cut + +sub prediction_to_short { + my ($self, $pred, $prob) = @_; + + # we only store 3 d.p. so multiply by 1000 to turn our + # probability into a number between 0 and 1000. + # 2^10 == 1024 so we need 10 bits of our short to store + # this value + + my $val = $prob * 1000; + + # we store the prediction in the top $NUM_PRED_BITS bits + # so look up the numerical value for the prediction, + # shift this $NUM_PRED_BITS bits to the left and then set + # the appropriate bits of our short value + + $pred = lc($pred); + + my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred}; + + throw("No value defined for prediction: '$pred'?") + unless defined $pred_val; + + $val |= ($pred_val << (16 - $NUM_PRED_BITS)); + + printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG; + + $val = pack $PACK_FORMAT, $val; + + return $val; +} + +=head2 prediction_from_short + + Arg[1] : string $pred - the packed short value + Description : converts a 2-byte short value back into a prediction and a score + Exceptions : none + Returntype : a list containing 2 values, the prediction and the score + Status : At Risk + +=cut + +sub prediction_from_short { + my ($self, $val) = @_; + + # check it isn't our special null prediction + + if ($val eq $NO_PREDICTION) { + print "no pred\n" if $DEBUG; + return undef; + } + + # unpack the value as a short + + $val = unpack $PACK_FORMAT, $val; + + # shift the prediction bits down and look up the prediction string + + my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)}; + + # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float + + my $prob = ($val & (2**10 - 1)) / 1000; + + printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG; + + return ($pred, $prob); +} + +=head2 compress_matrix + + Description : compresses a prediction matrix with gzip + Returntype : the compressed matrix + Exceptions : throws if the matrix is an unexpected length, or if the compression fails + Status : At Risk + +=cut + +sub compress_matrix { + my ($self) = @_; + + my $matrix = $self->{matrix}; + + return undef unless $matrix; + + return $matrix if $self->{matrix_compressed}; + + # prepend a header, so we can tell if our matrix has been mangled, or + # is compressed etc. + + unless ($self->header_ok) { + $matrix = $HEADER.$matrix; + } + + throw("Prediction matrix is an unexpected length") + unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0; + + $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno"); + + $self->{matrix_compressed} = 1; + + return $self->{matrix}; +} + +=head2 header_ok + + Description : checks if the binary matrix has the expected header + Returntype : boolean + Exceptions : none + Status : At Risk + +=cut + +sub header_ok { + my ($self) = @_; + return undef unless ($self->{matrix} && !$self->{matrix_compressed}); + return substr($self->{matrix},0,length($HEADER)) eq $HEADER; +} + +=head2 expand_matrix + + Description : uncompresses a compressed prediction matrix + Returntype : the uncompressed binary matrix string + Exceptions : throws if the header is incorrect, or if the decompression fails + Status : At Risk + +=cut + +sub expand_matrix { + my ($self) = @_; + + return undef unless $self->{matrix}; + + return $self->{matrix} unless $self->{matrix_compressed}; + + $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix}) + or throw("Failed to gunzip: $gzerrno"); + + $self->{matrix_compressed} = 0; + + throw("Malformed prediction matrix") unless $self->header_ok; + + return $self->{matrix}; +} + +=head2 compute_offset + + Arg[1] : int $pos - the desired position in the peptide + Arg[2] : string $aa - the desired mutant amino acid + Description : computes the correct offset into a prediction matrix for a given + peptide position and mutant amino acid + Returntype : the integer offset + Exceptions : none + Status : At Risk + +=cut + +sub compute_offset { + my ($self, $pos, $aa) = @_; + + my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION ); + + return $offset; +} + +=head2 prediction_from_matrix + + Arg[1] : int $pos - the desired position in the peptide + Arg[2] : string $aa - the desired mutant amino acid + Description : returns the prediction and score for the requested + position and mutant amino acid in the matrix + Returntype : a list containing 2 values, the prediction and the score + Exceptions : throws if either the position or amino acid are invalid, + or if the prediction matrix looks to be malformed + Status : At Risk + +=cut + +sub prediction_from_matrix { + my ($self, $pos, $aa) = @_; + + if ($self->{matrix_compressed}) { + # the matrix is still compressed so we try to uncompress it, + $self->expand_matrix; + } + + $aa = uc($aa) if defined $aa; + + throw("Invalid position: $pos") unless (defined $pos && $pos > 0); + + throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa}); + + # compute our offset into the prediction matrix + + my $offset = $self->compute_offset($pos, $aa); + + print "offset: $offset\n" if $DEBUG; + + if ($offset + 1 > length($self->{matrix})) { + warning("Offset outside of prediction matrix for position $pos and amino acid $aa?"); + return undef; + } + + my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION); + + return $self->prediction_from_short($pred); +} + +1; +