diff variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,682 @@
+=head1 LICENSE
+
+ Copyright (c) 1999-2012 The European Bioinformatics Institute and
+ Genome Research Limited.  All rights reserved.
+
+ This software is distributed under a modified Apache license.
+ For license details, please see
+
+   http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+ Please email comments or questions to the public Ensembl
+ developers list at <dev@ensembl.org>.
+
+ Questions may also be sent to the Ensembl help desk at
+ <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix
+
+=head1 SYNOPSIS
+
+  # create a new matrix for polyphen predictions
+
+  my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
+      -analysis       => 'polyphen',
+      -peptide_length => 134,
+  );
+
+  # add some predictions
+
+  $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967);
+  $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09);
+
+  # serialize the matrix to a compressed binary string
+
+  my $binary_string = $pfpm->serialize;
+
+  # store the string somewhere, fetch it later, and then create a new matrix using it
+
+  my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
+      -analysis   => 'polyphen',
+      -matrix     => $binary_string
+  );
+
+  # retrieve predictions
+
+  my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C');
+
+  print "A mutation to 'C' at position 2 is predicted to be $prediction\n";
+
+=head1 DESCRIPTION
+
+This module defines a class representing a matrix of protein
+function predictions, and provides method to access and set
+predictions for a given position and amino acid, and also to
+serialize the matrix for storage in a database, and deserialize
+a matrix from the compressed format into a perl hash.
+
+=cut
+
+package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
+
+use strict;
+use warnings;
+
+use POSIX qw(ceil);
+use List::Util qw(max);
+use Compress::Zlib;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+
+use base qw(Exporter);
+
+our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS);
+
+my $DEBUG = 0;
+
+# user-defined constants
+
+# a header which lets us identify prediction matrices and
+# to check if they may have been corrupted
+
+our $HEADER     = 'VEP';
+
+# the format we use when 'pack'ing our predictions
+
+my $PACK_FORMAT = 'v';
+
+# a hash mapping qualitative predictions to numerical values
+# for all the analyses we use
+
+my $PREDICTION_TO_VAL = {
+    polyphen => {
+        'probably damaging' => 0,
+        'possibly damaging' => 1,
+        'benign'            => 2,
+        'unknown'           => 3,
+    },
+
+    sift => {
+        'tolerated'     => 0,
+        'deleterious'   => 1,
+    },
+};
+
+# all valid amino acids
+
+our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y);
+
+# we use a short with all bits set to represent the lack of a prediction in 
+# an (uncompressed) prediction matrix, we will never observe this value
+# as a real prediction even if we set all the (6) prediction bits because we 
+# limit the max score to 1000 so the 10 score bits will never all be set
+
+our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF);
+
+# the number of bytes in a short
+
+my $BYTES_PER_PREDICTION = 2;
+
+
+# constants derived from the the user-defined constants
+
+# the maximum number of distinct qualitative predictions used by any tool
+
+my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL ); 
+
+# the number of bits used to encode the qualitative prediction
+
+my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) );
+
+throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6;
+
+# a hash mapping back from a numerical value to a qualitative prediction
+
+my $VAL_TO_PREDICTION = {
+    map {
+        my $tool = $_; 
+        $tool => {
+            map {
+                $PREDICTION_TO_VAL->{$tool}->{$_} => $_ 
+            } keys %{ $PREDICTION_TO_VAL->{$tool} }
+        }
+    } keys %$PREDICTION_TO_VAL
+};
+
+# a hash from amino acid single letter code to a numerical value
+
+our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS };
+
+# the number of valid amino acids
+
+our $NUM_AAS = scalar(@ALL_AAS);
+
+=head2 new
+
+  Arg [-ANALYSIS] : 
+    The name of the analysis tool that made these predictions,
+    currently must be one of 'sift' or 'polyphen'
+
+  Arg [-MATRIX] :
+    A gzip compressed binary string encoding the predictions,
+    typically created using this class and fetched from the 
+    variation database (optional)
+
+  Arg [-PEPTIDE_LENGTH] :
+    The length of the associated peptide, only required if
+    you want to serialize this matrix (optional)
+
+  Arg [-TRANSLATION_MD5] :
+    The hex MD5 hash of the associated peptide sequence
+
+  Description: Constructs a new ProteinFunctionPredictionMatrix object
+  Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance 
+  Exceptions : throws unless ANALYSIS is supplied and recognised
+  Status     : At Risk
+
+=cut 
+
+sub new {
+    my $class = shift;
+    
+    my (
+        $analysis,
+        $sub_analysis,
+        $matrix,
+        $peptide_length,
+        $translation_md5,
+    ) = rearrange([qw(
+            ANALYSIS
+            SUB_ANALYSIS
+            MATRIX
+            PEPTIDE_LENGTH
+            TRANSLATION_MD5
+        )], @_);
+    
+    throw("analysis argument required") unless defined $analysis;
+    
+    throw("Unrecognised analysis '$analysis'") 
+        unless defined $PREDICTION_TO_VAL->{$analysis};
+
+    my $self = bless {
+        analysis        => $analysis,
+        sub_analysis    => $sub_analysis,
+        matrix          => $matrix,
+        peptide_length  => $peptide_length,
+        translation_md5 => $translation_md5,
+    }, $class;
+
+    $self->{matrix_compressed} = defined $matrix ? 1 : 0;
+
+    return $self;
+}
+
+=head2 analysis
+
+  Arg[1]      : string $analysis - the name of the analysis tool (optional)
+  Description : Get/set the analysis name
+  Returntype  : string
+  Exceptions  : throws if the name is not recognised 
+  Status      : At Risk
+  
+=cut
+
+sub analysis {
+    my ($self, $analysis) = @_;
+    
+    if ($analysis) {
+        throw("Unrecognised analysis '$analysis'") 
+            unless defined $PREDICTION_TO_VAL->{$analysis};
+
+        $self->{analysis} = $analysis;
+    }
+
+    return $self->{analysis};
+}
+
+=head2 sub_analysis
+
+  Arg[1]      : string $sub_analysis - the name of the sub analysis (optional)
+  Description : Get/set the sub analysis name
+  Returntype  : string
+  Exceptions  : None 
+  Status      : At Risk
+  
+=cut
+
+sub sub_analysis {
+    my ($self, $sub_analysis) = @_;
+    
+    $self->{sub_analysis} = $sub_analysis if $sub_analysis;
+    
+    return $self->{sub_analysis};
+}
+
+=head2 peptide_length
+
+  Arg[1]      : int $peptide_length - the length of the peptide (optional)
+  Description : Get/set the length of the peptide - required when you want to
+                serialize a matrix, as we need to know how many rows the matrix has
+  Returntype  : int
+  Exceptions  : none 
+  Status      : At Risk
+  
+=cut
+
+sub peptide_length {
+    my ($self, $peptide_length) = @_;
+    $self->{peptide_length} = $peptide_length if defined $peptide_length;
+    return $self->{peptide_length};
+}
+
+=head2 translation_md5
+
+  Arg[1]      : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional)
+  Description : Get/set the MD5 hash of the peptide sequence
+  Returntype  : string
+  Exceptions  : none 
+  Status      : At Risk
+  
+=cut
+
+sub translation_md5 {
+    my ($self, $translation_md5) = @_;
+    $self->{translation_md5} = $translation_md5 if defined $translation_md5;
+    return $self->{translation_md5};
+}
+
+=head2 get_prediction
+
+  Arg[1]      : int $pos - the desired position
+  Arg[2]      : string $aa - the mutant amino acid
+  Description : get the prediction and score for the given position and amino acid
+  Returntype  : a list containing 2 values, the prediction and the score
+  Exceptions  : throws if either the position or amino acid are invalid 
+  Status      : At Risk
+  
+=cut
+
+sub get_prediction {
+    my ($self, $pos, $aa) = @_;
+
+    # if we have it in our uncompressed hash then just return it
+
+    if (defined $self->{preds}->{$pos}->{$aa}) {
+        return @{ $self->{preds}->{$pos}->{$aa} };
+    }
+
+    # otherwise we look in the serialized matrix string
+
+    return $self->prediction_from_matrix($pos, $aa);
+}
+
+=head2 add_prediction
+
+  Arg[1]      : int $pos - the peptide position
+  Arg[2]      : string $aa - the mutant amino acid
+  Arg[3]      : string $prediction - the prediction to store
+  Arg[4]      : float $score - the score to store
+  Description : add a prediction to the matrix for the specified position and amino acid,
+                note that this just adds the prediction to a perl hash. If you want to
+                encode the matrix in the binary format you should call serialize on the 
+                matrix object after you have added all the predictions.
+  Exceptions  : none
+  Status      : At Risk
+  
+=cut
+
+sub add_prediction {
+    my ($self, $pos, $aa, $prediction, $score) = @_;
+
+    $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
+}
+
+=head2 serialize
+
+  Arg[1]      : int $peptide_length - the length of the associated peptide (optional)
+  Description : serialize the matrix into a compressed binary format suitable for
+                storage in a database, file etc. The same string can later be used
+                to create a new matrix object and the predictions can be retrieved
+  Exceptions  : throws if the peptide length has not been specified
+  Status      : At Risk
+  
+=cut
+
+sub serialize {
+    my ($self, $peptide_length) = @_;
+
+    $self->{peptide_length} = $peptide_length if defined $peptide_length;
+
+    throw("peptide_length required to serialize predictions") 
+        unless defined $self->{peptide_length};
+
+    # convert predictions to the binary format, and concatenate them all 
+    # together in the correct order, inserting our dummy $NO_PREDICTION
+    # value to fill in any gaps
+
+    if ($self->{preds}) {
+
+        $self->{matrix_compressed} = 0;
+
+        $self->{matrix} = $HEADER;
+
+        for my $pos (1 .. $self->{peptide_length}) {
+        
+            for my $aa (@ALL_AAS) {
+                
+                my $short;
+
+                if ($self->{preds}->{$pos}->{$aa}) {
+                    my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} };
+                
+                    $short = $self->prediction_to_short($prediction, $score);
+                }
+
+                $self->{matrix} .= defined $short ? $short : $NO_PREDICTION;
+            }
+        }
+
+        # delete the hash copy, so things don't get out of sync
+
+        $self->{preds} = undef;
+    }
+    else {
+        warning("There don't seem to be any predictions in the matrix to serialize!");
+    }
+
+    # and return the compressed string for storage
+
+    return $self->compress_matrix;
+}
+
+=head2 deserialize
+
+  Arg [1]     : coderef $coderef - an anonymous subroutine that will be called 
+                as each prediction is decoded in order. The subroutine will be 
+                called with 4 arguments: the peptide position, the amino acid, 
+                the prediction and the score. This can be used, for example to 
+                dump out the prediction matrix to a file. (optional)
+  Description : deserialize a binary formatted matrix into a perl hash reference
+                containing all the uncompressed predictions. This hash has an 
+                entry for each position in the peptide, which is itself a hashref
+                with an entry for each possible alternate amino acid which is a 
+                listref containing the prediction and score. For example, to retrieve
+                the prediction for a substitution of 'C' at position 23 from this
+                data structure, you could use code like:
+
+                my $prediction_hash = $pfpm->deserialize;
+                my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} };
+
+                Note that if you don't explicitly deserialize a matrix this
+                class will keep it in the memory-efficient encoded format, and
+                you can access individual predictions with the get_prediction()
+                method. You should only use this method if you want to decode
+                all predictions (for example to perform some large-scale 
+                analysis, or to reformat the predictions somehow)
+
+  Returntype  : hashref containing decoded predictions
+  Exceptions  : throws if the binary matrix isn't in the expected format
+  Status      : At Risk
+  
+=cut
+
+sub deserialize {
+    my ($self, $coderef) = @_;
+
+    if ($self->{matrix_compressed}) {
+        $self->expand_matrix;
+    }
+
+    throw("Matrix looks corrupted") unless $self->header_ok;
+
+    # we can work out the length of the peptide by counting the rows in the matrix
+
+    my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS;
+
+    for my $pos (1 .. $length) {
+        
+        for my $aa (@ALL_AAS) {
+
+            # we call prediction_from_short directly to avoid doing all the
+            # checks performed in prediction_from_string
+
+            my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION));
+
+            $self->{preds}->{$pos}->{$aa} = [$prediction, $score];
+
+            if ($coderef) {
+                $coderef->($pos, $aa, $prediction, $score);
+            }
+        }
+    }
+    
+    return $self->{preds};
+}
+
+=head2 prediction_to_short
+
+  Arg[1]      : string $pred - one of the possible qualitative predictions of the tool 
+  Arg[2]      : float $prob - the prediction score (with 3 d.p.s of precision)
+  Description : converts a prediction and corresponding score into a 2-byte short value
+  Returntype  : the packed short value
+  Exceptions  : throws if the prediction argument is invalid 
+  Status      : At Risk
+  
+=cut
+
+sub prediction_to_short {
+    my ($self, $pred, $prob) = @_;
+
+    # we only store 3 d.p. so multiply by 1000 to turn our
+    # probability into a number between 0 and 1000. 
+    # 2^10 == 1024 so we need 10 bits of our short to store 
+    # this value
+    
+    my $val = $prob * 1000;
+
+    # we store the prediction in the top $NUM_PRED_BITS bits
+    # so look up the numerical value for the prediction, 
+    # shift this $NUM_PRED_BITS bits to the left and then set
+    # the appropriate bits of our short value
+    
+    $pred = lc($pred);
+    
+    my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred};
+
+    throw("No value defined for prediction: '$pred'?")
+        unless defined $pred_val;
+
+    $val |= ($pred_val << (16 - $NUM_PRED_BITS));
+
+    printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG;
+    
+    $val = pack $PACK_FORMAT, $val;
+
+    return $val;
+}
+
+=head2 prediction_from_short
+
+  Arg[1]      : string $pred - the packed short value 
+  Description : converts a 2-byte short value back into a prediction and a score
+  Exceptions  : none
+  Returntype  : a list containing 2 values, the prediction and the score
+  Status      : At Risk
+  
+=cut
+
+sub prediction_from_short {
+    my ($self, $val) = @_;
+
+    # check it isn't our special null prediction
+
+    if ($val eq $NO_PREDICTION) {
+        print "no pred\n" if $DEBUG;
+        return undef;
+    }
+
+    # unpack the value as a short
+
+    $val = unpack $PACK_FORMAT, $val;
+
+    # shift the prediction bits down and look up the prediction string
+
+    my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)};
+
+    # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float
+
+    my $prob = ($val & (2**10 - 1)) / 1000;
+
+    printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG;
+
+    return ($pred, $prob);
+}
+
+=head2 compress_matrix
+
+  Description : compresses a prediction matrix with gzip
+  Returntype  : the compressed matrix
+  Exceptions  : throws if the matrix is an unexpected length, or if the compression fails
+  Status      : At Risk
+  
+=cut
+
+sub compress_matrix {
+    my ($self) = @_;
+
+    my $matrix = $self->{matrix};
+
+    return undef unless $matrix;
+
+    return $matrix if $self->{matrix_compressed};
+    
+    # prepend a header, so we can tell if our matrix has been mangled, or
+    # is compressed etc.
+    
+    unless ($self->header_ok) {
+        $matrix = $HEADER.$matrix;
+    }
+
+    throw("Prediction matrix is an unexpected length")
+         unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0;
+
+    $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno");
+
+    $self->{matrix_compressed} = 1;
+
+    return $self->{matrix};
+}
+
+=head2 header_ok
+
+  Description : checks if the binary matrix has the expected header
+  Returntype  : boolean
+  Exceptions  : none
+  Status      : At Risk
+  
+=cut
+
+sub header_ok {
+    my ($self) = @_;
+    return undef unless ($self->{matrix} && !$self->{matrix_compressed});
+    return substr($self->{matrix},0,length($HEADER)) eq $HEADER;
+}
+
+=head2 expand_matrix
+
+  Description : uncompresses a compressed prediction matrix
+  Returntype  : the uncompressed binary matrix string
+  Exceptions  : throws if the header is incorrect, or if the decompression fails
+  Status      : At Risk
+  
+=cut
+
+sub expand_matrix {
+    my ($self) = @_;
+
+    return undef unless $self->{matrix};
+
+    return $self->{matrix} unless $self->{matrix_compressed};
+
+    $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix})
+        or throw("Failed to gunzip: $gzerrno");
+
+    $self->{matrix_compressed} = 0;
+
+    throw("Malformed prediction matrix") unless $self->header_ok;
+    
+    return $self->{matrix};
+}
+
+=head2 compute_offset
+
+  Arg[1]      : int $pos - the desired position in the peptide 
+  Arg[2]      : string $aa - the desired mutant amino acid
+  Description : computes the correct offset into a prediction matrix for a given
+                peptide position and mutant amino acid
+  Returntype  : the integer offset
+  Exceptions  : none
+  Status      : At Risk
+  
+=cut
+
+sub compute_offset {
+    my ($self, $pos, $aa) = @_;
+
+    my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION );
+
+    return $offset;
+}
+
+=head2 prediction_from_matrix
+
+  Arg[1]      : int $pos - the desired position in the peptide 
+  Arg[2]      : string $aa - the desired mutant amino acid
+  Description : returns the prediction and score for the requested 
+                position and mutant amino acid in the matrix
+  Returntype  : a list containing 2 values, the prediction and the score
+  Exceptions  : throws if either the position or amino acid are invalid, 
+                or if the prediction matrix looks to be malformed
+  Status      : At Risk
+  
+=cut
+
+sub prediction_from_matrix {
+    my ($self, $pos, $aa) = @_;
+
+    if ($self->{matrix_compressed}) {
+        # the matrix is still compressed so we try to uncompress it, 
+        $self->expand_matrix;
+    }
+
+    $aa = uc($aa) if defined $aa;
+
+    throw("Invalid position: $pos") unless (defined $pos && $pos > 0);
+    
+    throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa});
+
+    # compute our offset into the prediction matrix
+
+    my $offset = $self->compute_offset($pos, $aa);
+    
+    print "offset: $offset\n" if $DEBUG;
+    
+    if ($offset + 1 > length($self->{matrix})) {
+        warning("Offset outside of prediction matrix for position $pos and amino acid $aa?");
+        return undef;
+    }
+    
+    my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION);
+
+    return $self->prediction_from_short($pred);
+}
+
+1;
+