diff dir_plugins/MaxEntScan.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dir_plugins/MaxEntScan.pm	Mon Jul 15 05:17:17 2019 -0400
@@ -0,0 +1,895 @@
+=head1 LICENSE
+
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=head1 CONTACT
+
+ Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+    
+=cut
+
+=head1 NAME
+
+ MaxEntScan
+
+=head1 SYNOPSIS
+
+ mv MaxEntScan.pm ~/.vep/Plugins
+ ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
+ ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS
+
+=head1 DESCRIPTION
+
+ This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
+ runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
+ to get splice site predictions.
+
+ The plugin copies most of the code verbatim from the score5.pl and score3.pl
+ scripts provided in the MaxEntScan download. To run the plugin you must get and
+ unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
+ to this unpacked directory is then the param you pass to the --plugin flag.
+
+ The plugin executes the logic from one of the scripts depending on which
+ splice region the variant overlaps:
+
+ score5.pl : last 3 bases of exon    --> first 6 bases of intron
+ score3.pl : last 20 bases of intron --> first 3 bases of exon
+
+ The plugin reports the reference, alternate and difference (REF - ALT) maximum
+ entropy scores.
+
+ If 'SWA' is specified as a command-line argument, a sliding window algorithm
+ is applied to subsequences containing the reference and alternate alleles to
+ identify k-mers with the highest donor and acceptor splice site scores. To assess
+ the impact of variants, reference comparison scores are also provided. For SNVs,
+ the comparison scores are derived from sequence in the same frame as the highest
+ scoring k-mers containing the alternate allele. For all other variants, the
+ comparison scores are derived from the highest scoring k-mers containing the
+ reference allele. The difference between the reference comparison and alternate
+ scores (SWA_REF_COMP - SWA_ALT) are also provided.
+
+ If 'NCSS' is specified as a command-line argument, scores for the nearest
+ upstream and downstream canonical splice sites are also included.
+
+ By default, only scores are reported. Add 'verbose' to the list of command-
+ line arguments to include the sequence output associated with those scores.
+
+=cut
+
+package MaxEntScan;
+
+use strict;
+use warnings;
+
+use Digest::MD5 qw(md5_hex);
+
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
+
+use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
+
+# how many seq/score pairs to cache in memory
+our $CACHE_SIZE = 50;
+
+sub new {
+  my $class = shift;
+
+  my $self = $class->SUPER::new(@_);
+  
+  # we need sequence, so no offline mode unless we have FASTA
+  die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
+
+  my $params = $self->params;
+
+  my $dir = shift @$params;
+  die("ERROR: MaxEntScan directory not specified\n") unless $dir;
+  die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
+  $self->{_dir} = $dir;
+
+  ## setup from score5.pl
+  $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
+  $self->{'score5_seq'}   = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');
+
+  ## setup from score3.pl
+  $self->{'score3_metables'} = $self->score3_makemaxentscores;
+
+  my %opts = map { $_ => undef } @$params;
+
+  $self->{'run_SWA'} = 1 if exists $opts{'SWA'};
+  $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};
+
+  $self->{'verbose'} = 1 if exists $opts{'verbose'};
+
+  return $self;
+}
+
+sub feature_types {
+  return ['Transcript'];
+}
+
+sub get_header_info {
+  my $self = shift;
+
+  my $v = $self->{'verbose'};
+  my $headers = $self->get_MES_header_info($v);
+
+  if ($self->{'run_SWA'}) {
+    my $swa_headers = $self->get_SWA_header_info($v);
+    $headers = {%$headers, %$swa_headers};
+  }
+
+  if ($self->{'run_NCSS'}) {
+    my $ncss_headers = $self->get_NCSS_header_info($v);
+    $headers = {%$headers, %$ncss_headers};
+  }
+
+  return $headers;
+}
+
+sub get_MES_header_info {
+  my ($self, $verbose) = @_;
+
+  my $headers = {
+    MaxEntScan_ref => "MaxEntScan reference sequence score",
+    MaxEntScan_alt => "MaxEntScan alternate sequence score",
+    MaxEntScan_diff => "MaxEntScan score difference",
+  };
+
+  if ($verbose) {
+
+    $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
+    $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
+  }
+
+  return $headers;
+}
+
+sub get_SWA_header_info {
+  my ($self, $verbose) = @_;
+
+  my $headers = {
+    "MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
+    "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
+    "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
+    "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",
+
+    "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
+    "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
+    "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
+    "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
+  };
+
+  if ($verbose) {
+
+    $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
+    $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
+    $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
+    $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
+    $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
+    $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
+    $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";
+
+    $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
+    $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
+    $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
+    $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
+    $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
+    $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
+    $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
+  }
+
+  return $headers;
+}
+
+sub get_NCSS_header_info {
+  my ($self, $verbose) = @_;
+
+  my $headers = {
+    "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
+    "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",
+
+    "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
+    "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
+  };
+
+  if ($verbose) {
+
+    $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
+    $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";
+
+    $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
+    $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
+  }
+
+  return $headers;
+}
+
+sub run {
+  my ($self, $tva) = @_;
+
+  my $seq_headers = $self->get_MES_header_info();
+  my $results = $self->run_MES($tva);
+
+  if ($self->{'run_SWA'}) {
+    my $swa_seq_headers = $self->get_SWA_header_info();
+    $seq_headers = {%$seq_headers, %$swa_seq_headers};
+    my $swa_results = $self->run_SWA($tva);
+    $results = {%$results, %$swa_results};
+  }
+
+  if ($self->{'run_NCSS'}) {
+    my $ncss_seq_headers = $self->get_NCSS_header_info();
+    $seq_headers = {%$seq_headers, %$ncss_seq_headers};
+    my $ncss_results = $self->run_NCSS($tva);
+    $results = {%$results, %$ncss_results};
+  }
+
+  my %data;
+
+  # add the scores
+  my @scores = grep { exists $results->{$_} } keys %$seq_headers;
+  @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};
+
+  if ($self->{'verbose'}) {
+    # add any remaining results
+    my @non_scores = grep { ! exists $data{$_} } keys %$results;
+    @data{@non_scores} = @{$results}{@non_scores};
+  }
+
+  return \%data;
+}
+
+sub run_MES {
+  my ($self, $tva) = @_;
+
+  my $vf = $tva->variation_feature;
+  return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;
+
+  my $tv = $tva->transcript_variation;
+  my $tr = $tva->transcript;
+  my $tr_strand = $tr->strand;
+  my ($vf_start, $vf_end) = ($vf->start, $vf->end);
+
+  # use _overlapped_introns() method from BaseTranscriptVariation
+  # this will use an interval tree if available for superfast lookup of overlapping introns
+  # we have to expand the search space around $vf because we're looking for the splice region not the intron per se
+  foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
+    
+    # get coords depending on strand
+    # MaxEntScan does different predictions for 5 and 3 prime
+    # and we need to feed it different bits of sequence for each
+    #
+    # 5prime, 3 bases of exon, 6 bases of intron:
+    # ===------
+    #
+    # 3prime, 20 bases of intron, 3 bases of exon
+    # --------------------===
+
+    my ($five_start, $five_end, $three_start, $three_end);
+
+    if($tr_strand > 0) {
+      ($five_start, $five_end)   = ($intron->start - 3, $intron->start + 5);
+      ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
+    }
+
+    else {
+      ($five_start, $five_end)   = ($intron->end - 5, $intron->end + 3);
+      ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
+    }
+
+    if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
+      my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};
+
+      return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
+      return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
+
+      my $ref_score = $self->score5($ref_seq);
+      my $alt_score = $self->score5($alt_seq);
+
+      return {
+        MaxEntScan_ref => $ref_score,
+        MaxEntScan_ref_seq => $ref_seq,
+        MaxEntScan_alt => $alt_score,
+        MaxEntScan_alt_seq => $alt_seq,
+        MaxEntScan_diff => $ref_score - $alt_score,
+      }
+    }
+
+    if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
+      my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};
+
+      return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
+      return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
+
+      my $ref_score = $self->score3($ref_seq);
+      my $alt_score = $self->score3($alt_seq);
+
+      return {
+        MaxEntScan_ref => $ref_score,
+        MaxEntScan_ref_seq => $ref_seq,
+        MaxEntScan_alt => $alt_score,
+        MaxEntScan_alt_seq => $alt_seq,
+        MaxEntScan_diff => $ref_score - $alt_score,
+      }
+    }
+  }
+
+  return {};
+}
+
+sub run_SWA {
+  my ($self, $tva) = @_;
+
+  my $vf = $tva->variation_feature;
+
+  my %results;
+
+  # get the donor reference and alternate sequence contexts
+  my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};
+
+  if (defined($donor_ref_context)) {
+    $results{'MES-SWA_donor_ref_context'} = $donor_ref_context;
+
+    if ($donor_ref_context  =~ /^[ACGT]+$/) {
+      my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
+      $results{'MES-SWA_donor_ref_seq'} = $seq;
+      $results{'MES-SWA_donor_ref_frame'} = $frame;
+      $results{'MES-SWA_donor_ref'} = $score;
+    }
+  }
+
+  if (defined($donor_alt_context)) {
+    $results{'MES-SWA_donor_alt_context'} = $donor_alt_context;
+
+    if ($donor_alt_context  =~ /^[ACGT]+$/) {
+      my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
+      $results{'MES-SWA_donor_alt_seq'} = $seq;
+      $results{'MES-SWA_donor_alt_frame'} = $frame;
+      $results{'MES-SWA_donor_alt'} = $score;
+
+      if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {
+
+        if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
+          # for SNVs, compare to the same frame as the highest scoring ALT k-mer
+          $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
+        }
+
+        $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
+        $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);
+
+        $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
+      }
+    }
+  }
+
+  # get the acceptor reference and alternate sequence contexts
+  my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};
+
+  if (defined($acceptor_ref_context)) {
+    $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;
+
+    if ($acceptor_ref_context  =~ /^[ACGT]+$/) {
+      my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
+      $results{'MES-SWA_acceptor_ref_seq'} = $seq;
+      $results{'MES-SWA_acceptor_ref_frame'} = $frame;
+      $results{'MES-SWA_acceptor_ref'} = $score;
+    }
+  }
+
+  if (defined($acceptor_alt_context)) {
+    $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;
+
+    if ($acceptor_alt_context  =~ /^[ACGT]+$/) {
+      my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
+      $results{'MES-SWA_acceptor_alt_seq'} = $seq;
+      $results{'MES-SWA_acceptor_alt_frame'} = $frame;
+      $results{'MES-SWA_acceptor_alt'} = $score;
+
+      if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {
+
+        if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
+          # for SNVs, compare to the same frame as the highest scoring ALT k-mer
+          $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
+        }
+
+        $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
+        $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);
+
+        $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
+      }
+    }
+  }
+
+  return \%results;
+}
+
+sub run_NCSS {
+  my ($self, $tva) = @_;
+
+  my $tv = $tva->transcript_variation;
+  my $tr = $tva->transcript;
+
+  my %results;
+
+  if ($tv->intron_number) {
+
+    my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
+    my $intron_number = (split(/-/, $intron_numbers))[0];
+
+    my $introns = $tr->get_all_Introns;
+
+    my $intron_idx = $intron_number - 1;
+    my $intron = $introns->[$intron_idx];
+
+    if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
+      $results{'MES-NCSS_upstream_donor_seq'} = $seq;
+      $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+    }
+
+    if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
+      $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
+      $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+    }
+
+    # don't calculate an upstream acceptor score if the intron is the first in the transcript
+    unless ($intron_number == 1) {
+      my $upstream_intron = $introns->[$intron_idx - 1];
+
+      if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
+        $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
+        $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+      }
+    }
+
+    # don't calculate a downstream donor score if the intron is the last in the transcript
+    unless ($intron_number == $total_introns) {
+      my $downstream_intron = $introns->[$intron_idx + 1];
+
+      if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
+        $results{'MES-NCSS_downstream_donor_seq'} = $seq;
+        $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+      }
+    }
+  }
+
+  elsif ($tv->exon_number) {
+
+    my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
+    my $exon_number = (split(/-/, $exon_numbers))[0];
+
+    my $exons = $tr->get_all_Exons;
+
+    my $exon_idx = $exon_number - 1;
+    my $exon = $exons->[$exon_idx];
+
+    # don't calculate upstream scores if the exon is the first in the transcript
+    unless ($exon_number == 1) {
+      my $upstream_exon = $exons->[$exon_idx - 1];
+
+      if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
+        $results{'MES-NCSS_upstream_donor_seq'} = $seq;
+        $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+      }
+
+      if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
+        $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
+        $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+      }
+    }
+
+    # don't calculate downstream scores if the exon is the last exon in the transcript
+    unless ($exon_number == $total_exons) {
+      my $downstream_exon = $exons->[$exon_idx + 1];
+
+      if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
+        $results{'MES-NCSS_downstream_donor_seq'} = $seq;
+        $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+      }
+
+      if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
+        $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
+        $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+      }
+    }
+  }
+
+  return \%results;
+}
+
+
+## Sliding window approach methods
+##################################
+
+sub get_max_donor {
+  my ($self, $sequence) = @_;
+
+  my ($seq, $frame, $max);
+  my @kmers = @{$self->sliding_window($sequence, 9)};
+
+  for my $i (0 .. $#kmers) {
+    my $kmer = $kmers[$i];
+    my $score = $self->score5($kmer);
+    if(!$max || $score > $max) {
+      $seq = $kmer;
+      $frame = $i + 1;
+      $max = $score;
+    }
+  }
+  return [$seq, $frame, $max];
+}
+
+sub get_max_acceptor {
+  my ($self, $sequence) = @_;
+
+  my ($seq, $frame, $max);
+  my @kmers = @{$self->sliding_window($sequence, 23)};
+
+  for my $i (0 .. $#kmers) {
+    my $kmer = $kmers[$i];
+    my $score = $self->score3($kmer);
+    if(!$max || $score > $max) {
+      $seq = $kmer;
+      $frame = $i + 1;
+      $max = $score;
+    }
+  }
+  return [$seq, $frame, $max];
+}
+
+sub sliding_window {
+  my ($self, $sequence, $winsize) = @_;
+  my @seqs;
+  for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
+    push @seqs, substr($sequence, $i - 1, $winsize);
+  }
+  return \@seqs;
+}
+
+
+## Nearest canonical splice site methods
+########################################
+
+sub get_donor_seq_from_exon {
+  my ($self, $exon) = @_;
+
+  my ($start, $end);
+
+  if ($exon->strand > 0) {
+    ($start, $end) = ($exon->end - 2, $exon->end + 6);
+  }
+  else {
+    ($start, $end) = ($exon->start - 6, $exon->start + 2);
+  }
+
+  my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
+  my $seq = $slice->seq() if defined($slice);
+
+  return $seq;
+}
+
+sub get_acceptor_seq_from_exon {
+  my ($self, $exon) = @_;
+
+  my ($start, $end);
+
+  if ($exon->strand > 0) {
+    ($start, $end) = ($exon->start - 20, $exon->start + 2);
+  }
+  else {
+    ($start, $end) = ($exon->end - 2, $exon->end + 20);
+  }
+
+  my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
+  my $seq = $slice->seq() if defined($slice);
+
+  return $seq;
+}
+
+sub get_donor_seq_from_intron {
+  my ($self, $intron) = @_;
+
+  my ($start, $end);
+
+  if ($intron->strand > 0) {
+    ($start, $end) = ($intron->start - 3, $intron->start + 5);
+  }
+  else {
+    ($start, $end) = ($intron->end - 5, $intron->end + 3);
+  }
+
+  my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
+  my $seq = $slice->seq() if defined($slice);
+
+  return $seq;
+}
+
+sub get_acceptor_seq_from_intron {
+  my ($self, $intron) = @_;
+
+  my ($start, $end);
+
+  if ($intron->strand > 0) {
+    ($start, $end) = ($intron->end - 19, $intron->end + 3);
+  }
+  else {
+    ($start, $end) = ($intron->start - 3, $intron->start + 19);
+  }
+
+  my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
+  my $seq = $slice->seq() if defined($slice);
+
+  return $seq;
+}
+
+
+## Common methods
+#################
+
+sub get_seqs {
+  my ($self, $tva, $start, $end) = @_;
+  my $vf = $tva->variation_feature;
+
+  my $tr_strand = $tva->transcript->strand;
+
+  my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);
+
+  my ($ref_seq, $alt_seq);
+
+  if (defined $ref_slice) {
+
+    $ref_seq = $alt_seq = $ref_slice->seq();
+
+    my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
+    my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';
+
+    substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
+  }
+
+  return [$ref_seq, $alt_seq];
+}
+
+sub score5 {
+  my $self = shift;
+  my $seq = shift;
+  my $hex = md5_hex($seq);
+
+  # check cache
+  if($self->{cache}) {
+    my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};
+
+    return $res->{score} if $res; 
+  }
+
+  my $a = $self->score5_scoreconsensus($seq);
+  die("ERROR: No score5_scoreconsensus\n") unless defined($a);
+
+  my $b = $self->score5_getrest($seq);
+  die("ERROR: No score5_getrest\n") unless defined($b);
+
+  my $c = $self->{'score5_seq'}->{$b};
+  die("ERROR: No score5_seq for $b\n") unless defined($c);
+
+  my $d = $self->{'score5_me2x5'}->{$c};
+  die("ERROR: No score5_me2x5 for $c\n") unless defined($d);
+
+  my $score = $self->log2($a * $d);
+
+  # cache it
+  push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
+  shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;
+
+  return $score;
+}
+
+sub score3 {
+  my $self = shift;
+  my $seq = shift;
+  my $hex = md5_hex($seq);
+
+  # check cache
+  if($self->{cache}) {
+    my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};
+
+    return $res->{score} if $res; 
+  }
+
+  my $a = $self->score3_scoreconsensus($seq);
+  die("ERROR: No score3_scoreconsensus\n") unless defined($a);
+
+  my $b = $self->score3_getrest($seq);
+  die("ERROR: No score3_getrest\n") unless defined($b);
+
+  my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
+  die("ERROR: No score3_maxentscore for $b\n") unless defined($c);
+
+  my $score = $self->log2($a * $c);
+
+  # cache it
+  push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
+  shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;
+
+  return $score;
+}
+
+
+## methods copied from score5.pl
+################################
+
+sub score5_makesequencematrix {
+  my $self = shift;
+  my $file = shift;
+  my %matrix;
+  my $n=0;
+  open(SCOREF, $file) || die "Can't open $file!\n";
+  while(<SCOREF>) { 
+    chomp;
+    $_=~ s/\s//;
+    $matrix{$_} = $n;
+    $n++;
+  }
+  close(SCOREF);
+  return \%matrix;
+}
+
+sub score5_makescorematrix {
+  my $self = shift;
+  my $file = shift;
+  my %matrix;
+  my $n=0;
+  open(SCOREF, $file) || die "Can't open $file!\n";
+  while(<SCOREF>) { 
+    chomp;
+    $_=~ s/\s//;
+    $matrix{$n} = $_;
+    $n++;
+  }
+  close(SCOREF);
+  return \%matrix;
+}
+
+sub score5_getrest {
+  my $self = shift;
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
+}
+
+sub score5_scoreconsensus {
+  my $self = shift;
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  my %bgd; 
+  $bgd{'A'} = 0.27; 
+  $bgd{'C'} = 0.23; 
+  $bgd{'G'} = 0.23; 
+  $bgd{'T'} = 0.27;  
+  my %cons1;
+  $cons1{'A'} = 0.004;
+  $cons1{'C'} = 0.0032;
+  $cons1{'G'} = 0.9896;
+  $cons1{'T'} = 0.0032;
+  my %cons2;
+  $cons2{'A'} = 0.0034; 
+  $cons2{'C'} = 0.0039; 
+  $cons2{'G'} = 0.0042; 
+  $cons2{'T'} = 0.9884;
+  my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]}); 
+  return $addscore;
+}
+
+sub log2 {
+  my ($self, $val) = @_;
+  return log($val)/log(2);
+}
+
+
+## methods copied from score3.pl
+################################
+
+sub score3_hashseq {
+  #returns hash of sequence in base 4
+  # $self->score3_hashseq('CAGAAGT') returns 4619
+  my $self = shift;
+  my $seq = shift;
+  $seq = uc($seq);
+  $seq =~ tr/ACGT/0123/;
+  my @seqa = split(//,$seq);
+  my $sum = 0;
+  my $len = length($seq);
+  my @four = (1,4,16,64,256,1024,4096,16384);
+  my $i=0;
+  while ($i<$len) {
+    $sum+= $seqa[$i] * $four[$len - $i -1] ;
+    $i++;
+  }
+  return $sum;
+}
+
+sub score3_makemaxentscores {
+  my $self = shift;
+  my $dir = $self->{'_dir'}."/splicemodels/";
+  my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
+    'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
+  my @metables;
+  my $num = 0 ;
+  foreach my $file (@list) {
+    my $n = 0;
+    open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
+    while(<SCOREF>) {
+      chomp;
+      $_=~ s/\s//;
+      $metables[$num]{$n} = $_;
+      $n++;
+    }
+    close(SCOREF);
+    #print STDERR $file."\t".$num."\t".$n."\n";
+    $num++;
+  }
+  return \@metables;
+}
+
+sub score3_maxentscore {
+  my $self = shift;
+  my $seq = shift;
+  my $table_ref = shift;
+  my @metables = @$table_ref;
+  my @sc;
+  $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
+  $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
+  $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
+  $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
+  $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
+  $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
+  $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
+  $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
+  $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
+  my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
+  return $finalscore;
+}
+
+sub score3_getrest {
+  my $self = shift;
+  my $seq = shift;
+  my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
+  return $seq_noconsensus;
+}
+
+sub score3_scoreconsensus {
+  my $self = shift;
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  my %bgd; 
+  $bgd{'A'} = 0.27; 
+  $bgd{'C'} = 0.23; 
+  $bgd{'G'} = 0.23; 
+  $bgd{'T'} = 0.27;  
+  my %cons1;
+  $cons1{'A'} = 0.9903;
+  $cons1{'C'} = 0.0032;
+  $cons1{'G'} = 0.0034;
+  $cons1{'T'} = 0.0030;
+  my %cons2;
+  $cons2{'A'} = 0.0027; 
+  $cons2{'C'} = 0.0037; 
+  $cons2{'G'} = 0.9905; 
+  $cons2{'T'} = 0.0030;
+  my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]}); 
+  return $addscore;
+}
+
+1;
+