vep_emc: dir_plugins/MaxEntScan.pm comparison

comparison dir_plugins/MaxEntScan.pm @ 0:e545d0a25ffe draft

Uploaded

author	dvanzessen
date	Mon, 15 Jul 2019 05:17:17 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e545d0a25ffe
+=head1 LICENSE
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2018] EMBL-European Bioinformatics Institute
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=head1 CONTACT
+Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+=cut
+=head1 NAME
+MaxEntScan
+=head1 SYNOPSIS
+mv MaxEntScan.pm ~/.vep/Plugins
+./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
+./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS
+=head1 DESCRIPTION
+This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
+runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
+to get splice site predictions.
+The plugin copies most of the code verbatim from the score5.pl and score3.pl
+scripts provided in the MaxEntScan download. To run the plugin you must get and
+unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
+to this unpacked directory is then the param you pass to the --plugin flag.
+The plugin executes the logic from one of the scripts depending on which
+splice region the variant overlaps:
+score5.pl : last 3 bases of exon    --> first 6 bases of intron
+score3.pl : last 20 bases of intron --> first 3 bases of exon
+The plugin reports the reference, alternate and difference (REF - ALT) maximum
+entropy scores.
+If 'SWA' is specified as a command-line argument, a sliding window algorithm
+is applied to subsequences containing the reference and alternate alleles to
+identify k-mers with the highest donor and acceptor splice site scores. To assess
+the impact of variants, reference comparison scores are also provided. For SNVs,
+the comparison scores are derived from sequence in the same frame as the highest
+scoring k-mers containing the alternate allele. For all other variants, the
+comparison scores are derived from the highest scoring k-mers containing the
+reference allele. The difference between the reference comparison and alternate
+scores (SWA_REF_COMP - SWA_ALT) are also provided.
+If 'NCSS' is specified as a command-line argument, scores for the nearest
+upstream and downstream canonical splice sites are also included.
+By default, only scores are reported. Add 'verbose' to the list of command-
+line arguments to include the sequence output associated with those scores.
+=cut
+package MaxEntScan;
+use strict;
+use warnings;
+use Digest::MD5 qw(md5_hex);
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
+use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
+# how many seq/score pairs to cache in memory
+our $CACHE_SIZE = 50;
+sub new {
+my $class = shift;
+my $self = $class->SUPER::new(@_);
+# we need sequence, so no offline mode unless we have FASTA
+die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
+my $params = $self->params;
+my $dir = shift @$params;
+die("ERROR: MaxEntScan directory not specified\n") unless $dir;
+die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
+$self->{_dir} = $dir;
+## setup from score5.pl
+$self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
+$self->{'score5_seq'}   = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');
+## setup from score3.pl
+$self->{'score3_metables'} = $self->score3_makemaxentscores;
+my %opts = map { $_ => undef } @$params;
+$self->{'run_SWA'} = 1 if exists $opts{'SWA'};
+$self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};
+$self->{'verbose'} = 1 if exists $opts{'verbose'};
+return $self;
+}
+sub feature_types {
+return ['Transcript'];
+}
+sub get_header_info {
+my $self = shift;
+my $v = $self->{'verbose'};
+my $headers = $self->get_MES_header_info($v);
+if ($self->{'run_SWA'}) {
+my $swa_headers = $self->get_SWA_header_info($v);
+$headers = {%$headers, %$swa_headers};
+}
+if ($self->{'run_NCSS'}) {
+my $ncss_headers = $self->get_NCSS_header_info($v);
+$headers = {%$headers, %$ncss_headers};
+}
+return $headers;
+}
+sub get_MES_header_info {
+my ($self, $verbose) = @_;
+my $headers = {
+MaxEntScan_ref => "MaxEntScan reference sequence score",
+MaxEntScan_alt => "MaxEntScan alternate sequence score",
+MaxEntScan_diff => "MaxEntScan score difference",
+};
+if ($verbose) {
+$headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
+$headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
+}
+return $headers;
+}
+sub get_SWA_header_info {
+my ($self, $verbose) = @_;
+my $headers = {
+"MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
+"MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
+"MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
+"MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",
+"MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
+"MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
+"MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
+"MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
+};
+if ($verbose) {
+$headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
+$headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
+$headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
+$headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
+$headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
+$headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
+$headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";
+$headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
+$headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
+$headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
+$headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
+$headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
+$headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
+$headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
+}
+return $headers;
+}
+sub get_NCSS_header_info {
+my ($self, $verbose) = @_;
+my $headers = {
+"MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
+"MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",
+"MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
+"MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
+};
+if ($verbose) {
+$headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
+$headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";
+$headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
+$headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
+}
+return $headers;
+}
+sub run {
+my ($self, $tva) = @_;
+my $seq_headers = $self->get_MES_header_info();
+my $results = $self->run_MES($tva);
+if ($self->{'run_SWA'}) {
+my $swa_seq_headers = $self->get_SWA_header_info();
+$seq_headers = {%$seq_headers, %$swa_seq_headers};
+my $swa_results = $self->run_SWA($tva);
+$results = {%$results, %$swa_results};
+}
+if ($self->{'run_NCSS'}) {
+my $ncss_seq_headers = $self->get_NCSS_header_info();
+$seq_headers = {%$seq_headers, %$ncss_seq_headers};
+my $ncss_results = $self->run_NCSS($tva);
+$results = {%$results, %$ncss_results};
+}
+my %data;
+# add the scores
+my @scores = grep { exists $results->{$_} } keys %$seq_headers;
+@data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};
+if ($self->{'verbose'}) {
+# add any remaining results
+my @non_scores = grep { ! exists $data{$_} } keys %$results;
+@data{@non_scores} = @{$results}{@non_scores};
+}
+return \%data;
+}
+sub run_MES {
+my ($self, $tva) = @_;
+my $vf = $tva->variation_feature;
+return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;
+my $tv = $tva->transcript_variation;
+my $tr = $tva->transcript;
+my $tr_strand = $tr->strand;
+my ($vf_start, $vf_end) = ($vf->start, $vf->end);
+# use _overlapped_introns() method from BaseTranscriptVariation
+# this will use an interval tree if available for superfast lookup of overlapping introns
+# we have to expand the search space around $vf because we're looking for the splice region not the intron per se
+foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
+# get coords depending on strand
+# MaxEntScan does different predictions for 5 and 3 prime
+# and we need to feed it different bits of sequence for each
+#
+# 5prime, 3 bases of exon, 6 bases of intron:
+# ===------
+#
+# 3prime, 20 bases of intron, 3 bases of exon
+# --------------------===
+my ($five_start, $five_end, $three_start, $three_end);
+if($tr_strand > 0) {
+($five_start, $five_end)   = ($intron->start - 3, $intron->start + 5);
+($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
+}
+else {
+($five_start, $five_end)   = ($intron->end - 5, $intron->end + 3);
+($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
+}
+if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
+my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};
+return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
+return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
+my $ref_score = $self->score5($ref_seq);
+my $alt_score = $self->score5($alt_seq);
+return {
+MaxEntScan_ref => $ref_score,
+MaxEntScan_ref_seq => $ref_seq,
+MaxEntScan_alt => $alt_score,
+MaxEntScan_alt_seq => $alt_seq,
+MaxEntScan_diff => $ref_score - $alt_score,
+}
+}
+if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
+my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};
+return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
+return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
+my $ref_score = $self->score3($ref_seq);
+my $alt_score = $self->score3($alt_seq);
+return {
+MaxEntScan_ref => $ref_score,
+MaxEntScan_ref_seq => $ref_seq,
+MaxEntScan_alt => $alt_score,
+MaxEntScan_alt_seq => $alt_seq,
+MaxEntScan_diff => $ref_score - $alt_score,
+}
+}
+}
+return {};
+}
+sub run_SWA {
+my ($self, $tva) = @_;
+my $vf = $tva->variation_feature;
+my %results;
+# get the donor reference and alternate sequence contexts
+my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};
+if (defined($donor_ref_context)) {
+$results{'MES-SWA_donor_ref_context'} = $donor_ref_context;
+if ($donor_ref_context  =~ /^[ACGT]+$/) {
+my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
+$results{'MES-SWA_donor_ref_seq'} = $seq;
+$results{'MES-SWA_donor_ref_frame'} = $frame;
+$results{'MES-SWA_donor_ref'} = $score;
+}
+}
+if (defined($donor_alt_context)) {
+$results{'MES-SWA_donor_alt_context'} = $donor_alt_context;
+if ($donor_alt_context  =~ /^[ACGT]+$/) {
+my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
+$results{'MES-SWA_donor_alt_seq'} = $seq;
+$results{'MES-SWA_donor_alt_frame'} = $frame;
+$results{'MES-SWA_donor_alt'} = $score;
+if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {
+if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
+# for SNVs, compare to the same frame as the highest scoring ALT k-mer
+$ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
+}
+$results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
+$results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);
+$results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
+}
+}
+}
+# get the acceptor reference and alternate sequence contexts
+my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};
+if (defined($acceptor_ref_context)) {
+$results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;
+if ($acceptor_ref_context  =~ /^[ACGT]+$/) {
+my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
+$results{'MES-SWA_acceptor_ref_seq'} = $seq;
+$results{'MES-SWA_acceptor_ref_frame'} = $frame;
+$results{'MES-SWA_acceptor_ref'} = $score;
+}
+}
+if (defined($acceptor_alt_context)) {
+$results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;
+if ($acceptor_alt_context  =~ /^[ACGT]+$/) {
+my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
+$results{'MES-SWA_acceptor_alt_seq'} = $seq;
+$results{'MES-SWA_acceptor_alt_frame'} = $frame;
+$results{'MES-SWA_acceptor_alt'} = $score;
+if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {
+if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
+# for SNVs, compare to the same frame as the highest scoring ALT k-mer
+$ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
+}
+$results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
+$results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);
+$results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
+}
+}
+}
+return \%results;
+}
+sub run_NCSS {
+my ($self, $tva) = @_;
+my $tv = $tva->transcript_variation;
+my $tr = $tva->transcript;
+my %results;
+if ($tv->intron_number) {
+my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
+my $intron_number = (split(/-/, $intron_numbers))[0];
+my $introns = $tr->get_all_Introns;
+my $intron_idx = $intron_number - 1;
+my $intron = $introns->[$intron_idx];
+if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
+$results{'MES-NCSS_upstream_donor_seq'} = $seq;
+$results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+}
+if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
+$results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
+$results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+}
+# don't calculate an upstream acceptor score if the intron is the first in the transcript
+unless ($intron_number == 1) {
+my $upstream_intron = $introns->[$intron_idx - 1];
+if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
+$results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
+$results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+}
+}
+# don't calculate a downstream donor score if the intron is the last in the transcript
+unless ($intron_number == $total_introns) {
+my $downstream_intron = $introns->[$intron_idx + 1];
+if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
+$results{'MES-NCSS_downstream_donor_seq'} = $seq;
+$results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+}
+}
+}
+elsif ($tv->exon_number) {
+my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
+my $exon_number = (split(/-/, $exon_numbers))[0];
+my $exons = $tr->get_all_Exons;
+my $exon_idx = $exon_number - 1;
+my $exon = $exons->[$exon_idx];
+# don't calculate upstream scores if the exon is the first in the transcript
+unless ($exon_number == 1) {
+my $upstream_exon = $exons->[$exon_idx - 1];
+if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
+$results{'MES-NCSS_upstream_donor_seq'} = $seq;
+$results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+}
+if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
+$results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
+$results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+}
+}
+# don't calculate downstream scores if the exon is the last exon in the transcript
+unless ($exon_number == $total_exons) {
+my $downstream_exon = $exons->[$exon_idx + 1];
+if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
+$results{'MES-NCSS_downstream_donor_seq'} = $seq;
+$results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
+}
+if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
+$results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
+$results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
+}
+}
+}
+return \%results;
+}
+## Sliding window approach methods
+##################################
+sub get_max_donor {
+my ($self, $sequence) = @_;
+my ($seq, $frame, $max);
+my @kmers = @{$self->sliding_window($sequence, 9)};
+for my $i (0 .. $#kmers) {
+my $kmer = $kmers[$i];
+my $score = $self->score5($kmer);
+if(!$max || $score > $max) {
+$seq = $kmer;
+$frame = $i + 1;
+$max = $score;
+}
+}
+return [$seq, $frame, $max];
+}
+sub get_max_acceptor {
+my ($self, $sequence) = @_;
+my ($seq, $frame, $max);
+my @kmers = @{$self->sliding_window($sequence, 23)};
+for my $i (0 .. $#kmers) {
+my $kmer = $kmers[$i];
+my $score = $self->score3($kmer);
+if(!$max || $score > $max) {
+$seq = $kmer;
+$frame = $i + 1;
+$max = $score;
+}
+}
+return [$seq, $frame, $max];
+}
+sub sliding_window {
+my ($self, $sequence, $winsize) = @_;
+my @seqs;
+for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
+push @seqs, substr($sequence, $i - 1, $winsize);
+}
+return \@seqs;
+}
+## Nearest canonical splice site methods
+########################################
+sub get_donor_seq_from_exon {
+my ($self, $exon) = @_;
+my ($start, $end);
+if ($exon->strand > 0) {
+($start, $end) = ($exon->end - 2, $exon->end + 6);
+}
+else {
+($start, $end) = ($exon->start - 6, $exon->start + 2);
+}
+my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
+my $seq = $slice->seq() if defined($slice);
+return $seq;
+}
+sub get_acceptor_seq_from_exon {
+my ($self, $exon) = @_;
+my ($start, $end);
+if ($exon->strand > 0) {
+($start, $end) = ($exon->start - 20, $exon->start + 2);
+}
+else {
+($start, $end) = ($exon->end - 2, $exon->end + 20);
+}
+my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
+my $seq = $slice->seq() if defined($slice);
+return $seq;
+}
+sub get_donor_seq_from_intron {
+my ($self, $intron) = @_;
+my ($start, $end);
+if ($intron->strand > 0) {
+($start, $end) = ($intron->start - 3, $intron->start + 5);
+}
+else {
+($start, $end) = ($intron->end - 5, $intron->end + 3);
+}
+my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
+my $seq = $slice->seq() if defined($slice);
+return $seq;
+}
+sub get_acceptor_seq_from_intron {
+my ($self, $intron) = @_;
+my ($start, $end);
+if ($intron->strand > 0) {
+($start, $end) = ($intron->end - 19, $intron->end + 3);
+}
+else {
+($start, $end) = ($intron->start - 3, $intron->start + 19);
+}
+my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
+my $seq = $slice->seq() if defined($slice);
+return $seq;
+}
+## Common methods
+#################
+sub get_seqs {
+my ($self, $tva, $start, $end) = @_;
+my $vf = $tva->variation_feature;
+my $tr_strand = $tva->transcript->strand;
+my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);
+my ($ref_seq, $alt_seq);
+if (defined $ref_slice) {
+$ref_seq = $alt_seq = $ref_slice->seq();
+my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
+my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';
+substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
+}
+return [$ref_seq, $alt_seq];
+}
+sub score5 {
+my $self = shift;
+my $seq = shift;
+my $hex = md5_hex($seq);
+# check cache
+if($self->{cache}) {
+my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};
+return $res->{score} if $res;
+}
+my $a = $self->score5_scoreconsensus($seq);
+die("ERROR: No score5_scoreconsensus\n") unless defined($a);
+my $b = $self->score5_getrest($seq);
+die("ERROR: No score5_getrest\n") unless defined($b);
+my $c = $self->{'score5_seq'}->{$b};
+die("ERROR: No score5_seq for $b\n") unless defined($c);
+my $d = $self->{'score5_me2x5'}->{$c};
+die("ERROR: No score5_me2x5 for $c\n") unless defined($d);
+my $score = $self->log2($a * $d);
+# cache it
+push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
+shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;
+return $score;
+}
+sub score3 {
+my $self = shift;
+my $seq = shift;
+my $hex = md5_hex($seq);
+# check cache
+if($self->{cache}) {
+my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};
+return $res->{score} if $res;
+}
+my $a = $self->score3_scoreconsensus($seq);
+die("ERROR: No score3_scoreconsensus\n") unless defined($a);
+my $b = $self->score3_getrest($seq);
+die("ERROR: No score3_getrest\n") unless defined($b);
+my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
+die("ERROR: No score3_maxentscore for $b\n") unless defined($c);
+my $score = $self->log2($a * $c);
+# cache it
+push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
+shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;
+return $score;
+}
+## methods copied from score5.pl
+################################
+sub score5_makesequencematrix {
+my $self = shift;
+my $file = shift;
+my %matrix;
+my $n=0;
+open(SCOREF, $file) || die "Can't open $file!\n";
+while(<SCOREF>) {
+chomp;
+$_=~ s/\s//;
+$matrix{$_} = $n;
+$n++;
+}
+close(SCOREF);
+return \%matrix;
+}
+sub score5_makescorematrix {
+my $self = shift;
+my $file = shift;
+my %matrix;
+my $n=0;
+open(SCOREF, $file) || die "Can't open $file!\n";
+while(<SCOREF>) {
+chomp;
+$_=~ s/\s//;
+$matrix{$n} = $_;
+$n++;
+}
+close(SCOREF);
+return \%matrix;
+}
+sub score5_getrest {
+my $self = shift;
+my $seq = shift;
+my @seqa = split(//,uc($seq));
+return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
+}
+sub score5_scoreconsensus {
+my $self = shift;
+my $seq = shift;
+my @seqa = split(//,uc($seq));
+my %bgd;
+$bgd{'A'} = 0.27;
+$bgd{'C'} = 0.23;
+$bgd{'G'} = 0.23;
+$bgd{'T'} = 0.27;
+my %cons1;
+$cons1{'A'} = 0.004;
+$cons1{'C'} = 0.0032;
+$cons1{'G'} = 0.9896;
+$cons1{'T'} = 0.0032;
+my %cons2;
+$cons2{'A'} = 0.0034;
+$cons2{'C'} = 0.0039;
+$cons2{'G'} = 0.0042;
+$cons2{'T'} = 0.9884;
+my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]});
+return $addscore;
+}
+sub log2 {
+my ($self, $val) = @_;
+return log($val)/log(2);
+}
+## methods copied from score3.pl
+################################
+sub score3_hashseq {
+#returns hash of sequence in base 4
+# $self->score3_hashseq('CAGAAGT') returns 4619
+my $self = shift;
+my $seq = shift;
+$seq = uc($seq);
+$seq =~ tr/ACGT/0123/;
+my @seqa = split(//,$seq);
+my $sum = 0;
+my $len = length($seq);
+my @four = (1,4,16,64,256,1024,4096,16384);
+my $i=0;
+while ($i<$len) {
+$sum+= $seqa[$i] * $four[$len - $i -1] ;
+$i++;
+}
+return $sum;
+}
+sub score3_makemaxentscores {
+my $self = shift;
+my $dir = $self->{'_dir'}."/splicemodels/";
+my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
+'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
+my @metables;
+my $num = 0 ;
+foreach my $file (@list) {
+my $n = 0;
+open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
+while(<SCOREF>) {
+chomp;
+$_=~ s/\s//;
+$metables[$num]{$n} = $_;
+$n++;
+}
+close(SCOREF);
+#print STDERR $file."\t".$num."\t".$n."\n";
+$num++;
+}
+return \@metables;
+}
+sub score3_maxentscore {
+my $self = shift;
+my $seq = shift;
+my $table_ref = shift;
+my @metables = @$table_ref;
+my @sc;
+$sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
+$sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
+$sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
+$sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
+$sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
+$sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
+$sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
+$sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
+$sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
+my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
+return $finalscore;
+}
+sub score3_getrest {
+my $self = shift;
+my $seq = shift;
+my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
+return $seq_noconsensus;
+}
+sub score3_scoreconsensus {
+my $self = shift;
+my $seq = shift;
+my @seqa = split(//,uc($seq));
+my %bgd;
+$bgd{'A'} = 0.27;
+$bgd{'C'} = 0.23;
+$bgd{'G'} = 0.23;
+$bgd{'T'} = 0.27;
+my %cons1;
+$cons1{'A'} = 0.9903;
+$cons1{'C'} = 0.0032;
+$cons1{'G'} = 0.0034;
+$cons1{'T'} = 0.0030;
+my %cons2;
+$cons2{'A'} = 0.0027;
+$cons2{'C'} = 0.0037;
+$cons2{'G'} = 0.9905;
+$cons2{'T'} = 0.0030;
+my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]});
+return $addscore;
+}
+1;

Mercurial > repos > dvanzessen > vep_emc

comparison dir_plugins/MaxEntScan.pm @ 0:e545d0a25ffe draft