view dir_plugins/MaxEntScan.pm @ 10:f594c6bed58f draft default tip

Uploaded
author dvanzessen
date Tue, 21 Apr 2020 11:40:19 +0000
parents e545d0a25ffe
children
line wrap: on
line source

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2018] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 CONTACT

 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
    
=cut

=head1 NAME

 MaxEntScan

=head1 SYNOPSIS

 mv MaxEntScan.pm ~/.vep/Plugins
 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS

=head1 DESCRIPTION

 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
 runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
 to get splice site predictions.

 The plugin copies most of the code verbatim from the score5.pl and score3.pl
 scripts provided in the MaxEntScan download. To run the plugin you must get and
 unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
 to this unpacked directory is then the param you pass to the --plugin flag.

 The plugin executes the logic from one of the scripts depending on which
 splice region the variant overlaps:

 score5.pl : last 3 bases of exon    --> first 6 bases of intron
 score3.pl : last 20 bases of intron --> first 3 bases of exon

 The plugin reports the reference, alternate and difference (REF - ALT) maximum
 entropy scores.

 If 'SWA' is specified as a command-line argument, a sliding window algorithm
 is applied to subsequences containing the reference and alternate alleles to
 identify k-mers with the highest donor and acceptor splice site scores. To assess
 the impact of variants, reference comparison scores are also provided. For SNVs,
 the comparison scores are derived from sequence in the same frame as the highest
 scoring k-mers containing the alternate allele. For all other variants, the
 comparison scores are derived from the highest scoring k-mers containing the
 reference allele. The difference between the reference comparison and alternate
 scores (SWA_REF_COMP - SWA_ALT) are also provided.

 If 'NCSS' is specified as a command-line argument, scores for the nearest
 upstream and downstream canonical splice sites are also included.

 By default, only scores are reported. Add 'verbose' to the list of command-
 line arguments to include the sequence output associated with those scores.

=cut

package MaxEntScan;

use strict;
use warnings;

use Digest::MD5 qw(md5_hex);

use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);

use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);

# how many seq/score pairs to cache in memory
our $CACHE_SIZE = 50;

sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);
  
  # we need sequence, so no offline mode unless we have FASTA
  die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};

  my $params = $self->params;

  my $dir = shift @$params;
  die("ERROR: MaxEntScan directory not specified\n") unless $dir;
  die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
  $self->{_dir} = $dir;

  ## setup from score5.pl
  $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
  $self->{'score5_seq'}   = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');

  ## setup from score3.pl
  $self->{'score3_metables'} = $self->score3_makemaxentscores;

  my %opts = map { $_ => undef } @$params;

  $self->{'run_SWA'} = 1 if exists $opts{'SWA'};
  $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};

  $self->{'verbose'} = 1 if exists $opts{'verbose'};

  return $self;
}

sub feature_types {
  return ['Transcript'];
}

sub get_header_info {
  my $self = shift;

  my $v = $self->{'verbose'};
  my $headers = $self->get_MES_header_info($v);

  if ($self->{'run_SWA'}) {
    my $swa_headers = $self->get_SWA_header_info($v);
    $headers = {%$headers, %$swa_headers};
  }

  if ($self->{'run_NCSS'}) {
    my $ncss_headers = $self->get_NCSS_header_info($v);
    $headers = {%$headers, %$ncss_headers};
  }

  return $headers;
}

sub get_MES_header_info {
  my ($self, $verbose) = @_;

  my $headers = {
    MaxEntScan_ref => "MaxEntScan reference sequence score",
    MaxEntScan_alt => "MaxEntScan alternate sequence score",
    MaxEntScan_diff => "MaxEntScan score difference",
  };

  if ($verbose) {

    $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
    $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
  }

  return $headers;
}

sub get_SWA_header_info {
  my ($self, $verbose) = @_;

  my $headers = {
    "MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
    "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
    "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
    "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",

    "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
    "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
    "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
    "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
  };

  if ($verbose) {

    $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
    $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
    $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
    $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
    $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
    $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
    $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";

    $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
    $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
    $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
    $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
    $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
    $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
    $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
  }

  return $headers;
}

sub get_NCSS_header_info {
  my ($self, $verbose) = @_;

  my $headers = {
    "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
    "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",

    "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
    "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
  };

  if ($verbose) {

    $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
    $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";

    $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
    $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
  }

  return $headers;
}

sub run {
  my ($self, $tva) = @_;

  my $seq_headers = $self->get_MES_header_info();
  my $results = $self->run_MES($tva);

  if ($self->{'run_SWA'}) {
    my $swa_seq_headers = $self->get_SWA_header_info();
    $seq_headers = {%$seq_headers, %$swa_seq_headers};
    my $swa_results = $self->run_SWA($tva);
    $results = {%$results, %$swa_results};
  }

  if ($self->{'run_NCSS'}) {
    my $ncss_seq_headers = $self->get_NCSS_header_info();
    $seq_headers = {%$seq_headers, %$ncss_seq_headers};
    my $ncss_results = $self->run_NCSS($tva);
    $results = {%$results, %$ncss_results};
  }

  my %data;

  # add the scores
  my @scores = grep { exists $results->{$_} } keys %$seq_headers;
  @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};

  if ($self->{'verbose'}) {
    # add any remaining results
    my @non_scores = grep { ! exists $data{$_} } keys %$results;
    @data{@non_scores} = @{$results}{@non_scores};
  }

  return \%data;
}

sub run_MES {
  my ($self, $tva) = @_;

  my $vf = $tva->variation_feature;
  return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;

  my $tv = $tva->transcript_variation;
  my $tr = $tva->transcript;
  my $tr_strand = $tr->strand;
  my ($vf_start, $vf_end) = ($vf->start, $vf->end);

  # use _overlapped_introns() method from BaseTranscriptVariation
  # this will use an interval tree if available for superfast lookup of overlapping introns
  # we have to expand the search space around $vf because we're looking for the splice region not the intron per se
  foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
    
    # get coords depending on strand
    # MaxEntScan does different predictions for 5 and 3 prime
    # and we need to feed it different bits of sequence for each
    #
    # 5prime, 3 bases of exon, 6 bases of intron:
    # ===------
    #
    # 3prime, 20 bases of intron, 3 bases of exon
    # --------------------===

    my ($five_start, $five_end, $three_start, $three_end);

    if($tr_strand > 0) {
      ($five_start, $five_end)   = ($intron->start - 3, $intron->start + 5);
      ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
    }

    else {
      ($five_start, $five_end)   = ($intron->end - 5, $intron->end + 3);
      ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
    }

    if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
      my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};

      return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
      return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;

      my $ref_score = $self->score5($ref_seq);
      my $alt_score = $self->score5($alt_seq);

      return {
        MaxEntScan_ref => $ref_score,
        MaxEntScan_ref_seq => $ref_seq,
        MaxEntScan_alt => $alt_score,
        MaxEntScan_alt_seq => $alt_seq,
        MaxEntScan_diff => $ref_score - $alt_score,
      }
    }

    if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
      my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};

      return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
      return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;

      my $ref_score = $self->score3($ref_seq);
      my $alt_score = $self->score3($alt_seq);

      return {
        MaxEntScan_ref => $ref_score,
        MaxEntScan_ref_seq => $ref_seq,
        MaxEntScan_alt => $alt_score,
        MaxEntScan_alt_seq => $alt_seq,
        MaxEntScan_diff => $ref_score - $alt_score,
      }
    }
  }

  return {};
}

sub run_SWA {
  my ($self, $tva) = @_;

  my $vf = $tva->variation_feature;

  my %results;

  # get the donor reference and alternate sequence contexts
  my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};

  if (defined($donor_ref_context)) {
    $results{'MES-SWA_donor_ref_context'} = $donor_ref_context;

    if ($donor_ref_context  =~ /^[ACGT]+$/) {
      my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
      $results{'MES-SWA_donor_ref_seq'} = $seq;
      $results{'MES-SWA_donor_ref_frame'} = $frame;
      $results{'MES-SWA_donor_ref'} = $score;
    }
  }

  if (defined($donor_alt_context)) {
    $results{'MES-SWA_donor_alt_context'} = $donor_alt_context;

    if ($donor_alt_context  =~ /^[ACGT]+$/) {
      my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
      $results{'MES-SWA_donor_alt_seq'} = $seq;
      $results{'MES-SWA_donor_alt_frame'} = $frame;
      $results{'MES-SWA_donor_alt'} = $score;

      if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {

        if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
          # for SNVs, compare to the same frame as the highest scoring ALT k-mer
          $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
        }

        $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
        $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);

        $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
      }
    }
  }

  # get the acceptor reference and alternate sequence contexts
  my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};

  if (defined($acceptor_ref_context)) {
    $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;

    if ($acceptor_ref_context  =~ /^[ACGT]+$/) {
      my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
      $results{'MES-SWA_acceptor_ref_seq'} = $seq;
      $results{'MES-SWA_acceptor_ref_frame'} = $frame;
      $results{'MES-SWA_acceptor_ref'} = $score;
    }
  }

  if (defined($acceptor_alt_context)) {
    $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;

    if ($acceptor_alt_context  =~ /^[ACGT]+$/) {
      my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
      $results{'MES-SWA_acceptor_alt_seq'} = $seq;
      $results{'MES-SWA_acceptor_alt_frame'} = $frame;
      $results{'MES-SWA_acceptor_alt'} = $score;

      if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {

        if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
          # for SNVs, compare to the same frame as the highest scoring ALT k-mer
          $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
        }

        $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
        $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);

        $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
      }
    }
  }

  return \%results;
}

sub run_NCSS {
  my ($self, $tva) = @_;

  my $tv = $tva->transcript_variation;
  my $tr = $tva->transcript;

  my %results;

  if ($tv->intron_number) {

    my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
    my $intron_number = (split(/-/, $intron_numbers))[0];

    my $introns = $tr->get_all_Introns;

    my $intron_idx = $intron_number - 1;
    my $intron = $introns->[$intron_idx];

    if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
      $results{'MES-NCSS_upstream_donor_seq'} = $seq;
      $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
    }

    if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
      $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
      $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
    }

    # don't calculate an upstream acceptor score if the intron is the first in the transcript
    unless ($intron_number == 1) {
      my $upstream_intron = $introns->[$intron_idx - 1];

      if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
        $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
        $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
      }
    }

    # don't calculate a downstream donor score if the intron is the last in the transcript
    unless ($intron_number == $total_introns) {
      my $downstream_intron = $introns->[$intron_idx + 1];

      if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
        $results{'MES-NCSS_downstream_donor_seq'} = $seq;
        $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
      }
    }
  }

  elsif ($tv->exon_number) {

    my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
    my $exon_number = (split(/-/, $exon_numbers))[0];

    my $exons = $tr->get_all_Exons;

    my $exon_idx = $exon_number - 1;
    my $exon = $exons->[$exon_idx];

    # don't calculate upstream scores if the exon is the first in the transcript
    unless ($exon_number == 1) {
      my $upstream_exon = $exons->[$exon_idx - 1];

      if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
        $results{'MES-NCSS_upstream_donor_seq'} = $seq;
        $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
      }

      if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
        $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
        $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
      }
    }

    # don't calculate downstream scores if the exon is the last exon in the transcript
    unless ($exon_number == $total_exons) {
      my $downstream_exon = $exons->[$exon_idx + 1];

      if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
        $results{'MES-NCSS_downstream_donor_seq'} = $seq;
        $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
      }

      if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
        $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
        $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
      }
    }
  }

  return \%results;
}


## Sliding window approach methods
##################################

sub get_max_donor {
  my ($self, $sequence) = @_;

  my ($seq, $frame, $max);
  my @kmers = @{$self->sliding_window($sequence, 9)};

  for my $i (0 .. $#kmers) {
    my $kmer = $kmers[$i];
    my $score = $self->score5($kmer);
    if(!$max || $score > $max) {
      $seq = $kmer;
      $frame = $i + 1;
      $max = $score;
    }
  }
  return [$seq, $frame, $max];
}

sub get_max_acceptor {
  my ($self, $sequence) = @_;

  my ($seq, $frame, $max);
  my @kmers = @{$self->sliding_window($sequence, 23)};

  for my $i (0 .. $#kmers) {
    my $kmer = $kmers[$i];
    my $score = $self->score3($kmer);
    if(!$max || $score > $max) {
      $seq = $kmer;
      $frame = $i + 1;
      $max = $score;
    }
  }
  return [$seq, $frame, $max];
}

sub sliding_window {
  my ($self, $sequence, $winsize) = @_;
  my @seqs;
  for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
    push @seqs, substr($sequence, $i - 1, $winsize);
  }
  return \@seqs;
}


## Nearest canonical splice site methods
########################################

sub get_donor_seq_from_exon {
  my ($self, $exon) = @_;

  my ($start, $end);

  if ($exon->strand > 0) {
    ($start, $end) = ($exon->end - 2, $exon->end + 6);
  }
  else {
    ($start, $end) = ($exon->start - 6, $exon->start + 2);
  }

  my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
  my $seq = $slice->seq() if defined($slice);

  return $seq;
}

sub get_acceptor_seq_from_exon {
  my ($self, $exon) = @_;

  my ($start, $end);

  if ($exon->strand > 0) {
    ($start, $end) = ($exon->start - 20, $exon->start + 2);
  }
  else {
    ($start, $end) = ($exon->end - 2, $exon->end + 20);
  }

  my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
  my $seq = $slice->seq() if defined($slice);

  return $seq;
}

sub get_donor_seq_from_intron {
  my ($self, $intron) = @_;

  my ($start, $end);

  if ($intron->strand > 0) {
    ($start, $end) = ($intron->start - 3, $intron->start + 5);
  }
  else {
    ($start, $end) = ($intron->end - 5, $intron->end + 3);
  }

  my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
  my $seq = $slice->seq() if defined($slice);

  return $seq;
}

sub get_acceptor_seq_from_intron {
  my ($self, $intron) = @_;

  my ($start, $end);

  if ($intron->strand > 0) {
    ($start, $end) = ($intron->end - 19, $intron->end + 3);
  }
  else {
    ($start, $end) = ($intron->start - 3, $intron->start + 19);
  }

  my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
  my $seq = $slice->seq() if defined($slice);

  return $seq;
}


## Common methods
#################

sub get_seqs {
  my ($self, $tva, $start, $end) = @_;
  my $vf = $tva->variation_feature;

  my $tr_strand = $tva->transcript->strand;

  my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);

  my ($ref_seq, $alt_seq);

  if (defined $ref_slice) {

    $ref_seq = $alt_seq = $ref_slice->seq();

    my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
    my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';

    substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
  }

  return [$ref_seq, $alt_seq];
}

sub score5 {
  my $self = shift;
  my $seq = shift;
  my $hex = md5_hex($seq);

  # check cache
  if($self->{cache}) {
    my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};

    return $res->{score} if $res; 
  }

  my $a = $self->score5_scoreconsensus($seq);
  die("ERROR: No score5_scoreconsensus\n") unless defined($a);

  my $b = $self->score5_getrest($seq);
  die("ERROR: No score5_getrest\n") unless defined($b);

  my $c = $self->{'score5_seq'}->{$b};
  die("ERROR: No score5_seq for $b\n") unless defined($c);

  my $d = $self->{'score5_me2x5'}->{$c};
  die("ERROR: No score5_me2x5 for $c\n") unless defined($d);

  my $score = $self->log2($a * $d);

  # cache it
  push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
  shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;

  return $score;
}

sub score3 {
  my $self = shift;
  my $seq = shift;
  my $hex = md5_hex($seq);

  # check cache
  if($self->{cache}) {
    my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};

    return $res->{score} if $res; 
  }

  my $a = $self->score3_scoreconsensus($seq);
  die("ERROR: No score3_scoreconsensus\n") unless defined($a);

  my $b = $self->score3_getrest($seq);
  die("ERROR: No score3_getrest\n") unless defined($b);

  my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
  die("ERROR: No score3_maxentscore for $b\n") unless defined($c);

  my $score = $self->log2($a * $c);

  # cache it
  push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
  shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;

  return $score;
}


## methods copied from score5.pl
################################

sub score5_makesequencematrix {
  my $self = shift;
  my $file = shift;
  my %matrix;
  my $n=0;
  open(SCOREF, $file) || die "Can't open $file!\n";
  while(<SCOREF>) { 
    chomp;
    $_=~ s/\s//;
    $matrix{$_} = $n;
    $n++;
  }
  close(SCOREF);
  return \%matrix;
}

sub score5_makescorematrix {
  my $self = shift;
  my $file = shift;
  my %matrix;
  my $n=0;
  open(SCOREF, $file) || die "Can't open $file!\n";
  while(<SCOREF>) { 
    chomp;
    $_=~ s/\s//;
    $matrix{$n} = $_;
    $n++;
  }
  close(SCOREF);
  return \%matrix;
}

sub score5_getrest {
  my $self = shift;
  my $seq = shift;
  my @seqa = split(//,uc($seq));
  return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
}

sub score5_scoreconsensus {
  my $self = shift;
  my $seq = shift;
  my @seqa = split(//,uc($seq));
  my %bgd; 
  $bgd{'A'} = 0.27; 
  $bgd{'C'} = 0.23; 
  $bgd{'G'} = 0.23; 
  $bgd{'T'} = 0.27;  
  my %cons1;
  $cons1{'A'} = 0.004;
  $cons1{'C'} = 0.0032;
  $cons1{'G'} = 0.9896;
  $cons1{'T'} = 0.0032;
  my %cons2;
  $cons2{'A'} = 0.0034; 
  $cons2{'C'} = 0.0039; 
  $cons2{'G'} = 0.0042; 
  $cons2{'T'} = 0.9884;
  my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]}); 
  return $addscore;
}

sub log2 {
  my ($self, $val) = @_;
  return log($val)/log(2);
}


## methods copied from score3.pl
################################

sub score3_hashseq {
  #returns hash of sequence in base 4
  # $self->score3_hashseq('CAGAAGT') returns 4619
  my $self = shift;
  my $seq = shift;
  $seq = uc($seq);
  $seq =~ tr/ACGT/0123/;
  my @seqa = split(//,$seq);
  my $sum = 0;
  my $len = length($seq);
  my @four = (1,4,16,64,256,1024,4096,16384);
  my $i=0;
  while ($i<$len) {
    $sum+= $seqa[$i] * $four[$len - $i -1] ;
    $i++;
  }
  return $sum;
}

sub score3_makemaxentscores {
  my $self = shift;
  my $dir = $self->{'_dir'}."/splicemodels/";
  my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
    'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
  my @metables;
  my $num = 0 ;
  foreach my $file (@list) {
    my $n = 0;
    open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
    while(<SCOREF>) {
      chomp;
      $_=~ s/\s//;
      $metables[$num]{$n} = $_;
      $n++;
    }
    close(SCOREF);
    #print STDERR $file."\t".$num."\t".$n."\n";
    $num++;
  }
  return \@metables;
}

sub score3_maxentscore {
  my $self = shift;
  my $seq = shift;
  my $table_ref = shift;
  my @metables = @$table_ref;
  my @sc;
  $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
  $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
  $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
  $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
  $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
  $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
  $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
  $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
  $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
  my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
  return $finalscore;
}

sub score3_getrest {
  my $self = shift;
  my $seq = shift;
  my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
  return $seq_noconsensus;
}

sub score3_scoreconsensus {
  my $self = shift;
  my $seq = shift;
  my @seqa = split(//,uc($seq));
  my %bgd; 
  $bgd{'A'} = 0.27; 
  $bgd{'C'} = 0.23; 
  $bgd{'G'} = 0.23; 
  $bgd{'T'} = 0.27;  
  my %cons1;
  $cons1{'A'} = 0.9903;
  $cons1{'C'} = 0.0032;
  $cons1{'G'} = 0.0034;
  $cons1{'T'} = 0.0030;
  my %cons2;
  $cons2{'A'} = 0.0027; 
  $cons2{'C'} = 0.0037; 
  $cons2{'G'} = 0.9905; 
  $cons2{'T'} = 0.0030;
  my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]}); 
  return $addscore;
}

1;