| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  MaxEntScan | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv MaxEntScan.pm ~/.vep/Plugins | 
|  | 31  ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload | 
|  | 32  ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS | 
|  | 33 | 
|  | 34 =head1 DESCRIPTION | 
|  | 35 | 
|  | 36  This is a plugin for the Ensembl Variant Effect Predictor (VEP) that | 
|  | 37  runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html) | 
|  | 38  to get splice site predictions. | 
|  | 39 | 
|  | 40  The plugin copies most of the code verbatim from the score5.pl and score3.pl | 
|  | 41  scripts provided in the MaxEntScan download. To run the plugin you must get and | 
|  | 42  unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path | 
|  | 43  to this unpacked directory is then the param you pass to the --plugin flag. | 
|  | 44 | 
|  | 45  The plugin executes the logic from one of the scripts depending on which | 
|  | 46  splice region the variant overlaps: | 
|  | 47 | 
|  | 48  score5.pl : last 3 bases of exon    --> first 6 bases of intron | 
|  | 49  score3.pl : last 20 bases of intron --> first 3 bases of exon | 
|  | 50 | 
|  | 51  The plugin reports the reference, alternate and difference (REF - ALT) maximum | 
|  | 52  entropy scores. | 
|  | 53 | 
|  | 54  If 'SWA' is specified as a command-line argument, a sliding window algorithm | 
|  | 55  is applied to subsequences containing the reference and alternate alleles to | 
|  | 56  identify k-mers with the highest donor and acceptor splice site scores. To assess | 
|  | 57  the impact of variants, reference comparison scores are also provided. For SNVs, | 
|  | 58  the comparison scores are derived from sequence in the same frame as the highest | 
|  | 59  scoring k-mers containing the alternate allele. For all other variants, the | 
|  | 60  comparison scores are derived from the highest scoring k-mers containing the | 
|  | 61  reference allele. The difference between the reference comparison and alternate | 
|  | 62  scores (SWA_REF_COMP - SWA_ALT) are also provided. | 
|  | 63 | 
|  | 64  If 'NCSS' is specified as a command-line argument, scores for the nearest | 
|  | 65  upstream and downstream canonical splice sites are also included. | 
|  | 66 | 
|  | 67  By default, only scores are reported. Add 'verbose' to the list of command- | 
|  | 68  line arguments to include the sequence output associated with those scores. | 
|  | 69 | 
|  | 70 =cut | 
|  | 71 | 
|  | 72 package MaxEntScan; | 
|  | 73 | 
|  | 74 use strict; | 
|  | 75 use warnings; | 
|  | 76 | 
|  | 77 use Digest::MD5 qw(md5_hex); | 
|  | 78 | 
|  | 79 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | 
|  | 80 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); | 
|  | 81 | 
|  | 82 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | 
|  | 83 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | 
|  | 84 | 
|  | 85 # how many seq/score pairs to cache in memory | 
|  | 86 our $CACHE_SIZE = 50; | 
|  | 87 | 
|  | 88 sub new { | 
|  | 89   my $class = shift; | 
|  | 90 | 
|  | 91   my $self = $class->SUPER::new(@_); | 
|  | 92 | 
|  | 93   # we need sequence, so no offline mode unless we have FASTA | 
|  | 94   die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; | 
|  | 95 | 
|  | 96   my $params = $self->params; | 
|  | 97 | 
|  | 98   my $dir = shift @$params; | 
|  | 99   die("ERROR: MaxEntScan directory not specified\n") unless $dir; | 
|  | 100   die("ERROR: MaxEntScan directory not found\n") unless -d $dir; | 
|  | 101   $self->{_dir} = $dir; | 
|  | 102 | 
|  | 103   ## setup from score5.pl | 
|  | 104   $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5'); | 
|  | 105   $self->{'score5_seq'}   = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences'); | 
|  | 106 | 
|  | 107   ## setup from score3.pl | 
|  | 108   $self->{'score3_metables'} = $self->score3_makemaxentscores; | 
|  | 109 | 
|  | 110   my %opts = map { $_ => undef } @$params; | 
|  | 111 | 
|  | 112   $self->{'run_SWA'} = 1 if exists $opts{'SWA'}; | 
|  | 113   $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'}; | 
|  | 114 | 
|  | 115   $self->{'verbose'} = 1 if exists $opts{'verbose'}; | 
|  | 116 | 
|  | 117   return $self; | 
|  | 118 } | 
|  | 119 | 
|  | 120 sub feature_types { | 
|  | 121   return ['Transcript']; | 
|  | 122 } | 
|  | 123 | 
|  | 124 sub get_header_info { | 
|  | 125   my $self = shift; | 
|  | 126 | 
|  | 127   my $v = $self->{'verbose'}; | 
|  | 128   my $headers = $self->get_MES_header_info($v); | 
|  | 129 | 
|  | 130   if ($self->{'run_SWA'}) { | 
|  | 131     my $swa_headers = $self->get_SWA_header_info($v); | 
|  | 132     $headers = {%$headers, %$swa_headers}; | 
|  | 133   } | 
|  | 134 | 
|  | 135   if ($self->{'run_NCSS'}) { | 
|  | 136     my $ncss_headers = $self->get_NCSS_header_info($v); | 
|  | 137     $headers = {%$headers, %$ncss_headers}; | 
|  | 138   } | 
|  | 139 | 
|  | 140   return $headers; | 
|  | 141 } | 
|  | 142 | 
|  | 143 sub get_MES_header_info { | 
|  | 144   my ($self, $verbose) = @_; | 
|  | 145 | 
|  | 146   my $headers = { | 
|  | 147     MaxEntScan_ref => "MaxEntScan reference sequence score", | 
|  | 148     MaxEntScan_alt => "MaxEntScan alternate sequence score", | 
|  | 149     MaxEntScan_diff => "MaxEntScan score difference", | 
|  | 150   }; | 
|  | 151 | 
|  | 152   if ($verbose) { | 
|  | 153 | 
|  | 154     $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence"; | 
|  | 155     $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence"; | 
|  | 156   } | 
|  | 157 | 
|  | 158   return $headers; | 
|  | 159 } | 
|  | 160 | 
|  | 161 sub get_SWA_header_info { | 
|  | 162   my ($self, $verbose) = @_; | 
|  | 163 | 
|  | 164   my $headers = { | 
|  | 165     "MES-SWA_donor_ref" => "Highest splice donor reference sequence score", | 
|  | 166     "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score", | 
|  | 167     "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score", | 
|  | 168     "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores", | 
|  | 169 | 
|  | 170     "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score", | 
|  | 171     "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score", | 
|  | 172     "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score", | 
|  | 173     "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores", | 
|  | 174   }; | 
|  | 175 | 
|  | 176   if ($verbose) { | 
|  | 177 | 
|  | 178     $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence"; | 
|  | 179     $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence"; | 
|  | 180     $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele"; | 
|  | 181     $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence"; | 
|  | 182     $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence"; | 
|  | 183     $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele"; | 
|  | 184     $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence"; | 
|  | 185 | 
|  | 186     $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence"; | 
|  | 187     $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence"; | 
|  | 188     $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele"; | 
|  | 189     $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence"; | 
|  | 190     $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence"; | 
|  | 191     $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele"; | 
|  | 192     $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence"; | 
|  | 193   } | 
|  | 194 | 
|  | 195   return $headers; | 
|  | 196 } | 
|  | 197 | 
|  | 198 sub get_NCSS_header_info { | 
|  | 199   my ($self, $verbose) = @_; | 
|  | 200 | 
|  | 201   my $headers = { | 
|  | 202     "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score", | 
|  | 203     "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score", | 
|  | 204 | 
|  | 205     "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score", | 
|  | 206     "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score", | 
|  | 207   }; | 
|  | 208 | 
|  | 209   if ($verbose) { | 
|  | 210 | 
|  | 211     $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence"; | 
|  | 212     $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence"; | 
|  | 213 | 
|  | 214     $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence"; | 
|  | 215     $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence"; | 
|  | 216   } | 
|  | 217 | 
|  | 218   return $headers; | 
|  | 219 } | 
|  | 220 | 
|  | 221 sub run { | 
|  | 222   my ($self, $tva) = @_; | 
|  | 223 | 
|  | 224   my $seq_headers = $self->get_MES_header_info(); | 
|  | 225   my $results = $self->run_MES($tva); | 
|  | 226 | 
|  | 227   if ($self->{'run_SWA'}) { | 
|  | 228     my $swa_seq_headers = $self->get_SWA_header_info(); | 
|  | 229     $seq_headers = {%$seq_headers, %$swa_seq_headers}; | 
|  | 230     my $swa_results = $self->run_SWA($tva); | 
|  | 231     $results = {%$results, %$swa_results}; | 
|  | 232   } | 
|  | 233 | 
|  | 234   if ($self->{'run_NCSS'}) { | 
|  | 235     my $ncss_seq_headers = $self->get_NCSS_header_info(); | 
|  | 236     $seq_headers = {%$seq_headers, %$ncss_seq_headers}; | 
|  | 237     my $ncss_results = $self->run_NCSS($tva); | 
|  | 238     $results = {%$results, %$ncss_results}; | 
|  | 239   } | 
|  | 240 | 
|  | 241   my %data; | 
|  | 242 | 
|  | 243   # add the scores | 
|  | 244   my @scores = grep { exists $results->{$_} } keys %$seq_headers; | 
|  | 245   @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores}; | 
|  | 246 | 
|  | 247   if ($self->{'verbose'}) { | 
|  | 248     # add any remaining results | 
|  | 249     my @non_scores = grep { ! exists $data{$_} } keys %$results; | 
|  | 250     @data{@non_scores} = @{$results}{@non_scores}; | 
|  | 251   } | 
|  | 252 | 
|  | 253   return \%data; | 
|  | 254 } | 
|  | 255 | 
|  | 256 sub run_MES { | 
|  | 257   my ($self, $tva) = @_; | 
|  | 258 | 
|  | 259   my $vf = $tva->variation_feature; | 
|  | 260   return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/; | 
|  | 261 | 
|  | 262   my $tv = $tva->transcript_variation; | 
|  | 263   my $tr = $tva->transcript; | 
|  | 264   my $tr_strand = $tr->strand; | 
|  | 265   my ($vf_start, $vf_end) = ($vf->start, $vf->end); | 
|  | 266 | 
|  | 267   # use _overlapped_introns() method from BaseTranscriptVariation | 
|  | 268   # this will use an interval tree if available for superfast lookup of overlapping introns | 
|  | 269   # we have to expand the search space around $vf because we're looking for the splice region not the intron per se | 
|  | 270   foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) { | 
|  | 271 | 
|  | 272     # get coords depending on strand | 
|  | 273     # MaxEntScan does different predictions for 5 and 3 prime | 
|  | 274     # and we need to feed it different bits of sequence for each | 
|  | 275     # | 
|  | 276     # 5prime, 3 bases of exon, 6 bases of intron: | 
|  | 277     # ===------ | 
|  | 278     # | 
|  | 279     # 3prime, 20 bases of intron, 3 bases of exon | 
|  | 280     # --------------------=== | 
|  | 281 | 
|  | 282     my ($five_start, $five_end, $three_start, $three_end); | 
|  | 283 | 
|  | 284     if($tr_strand > 0) { | 
|  | 285       ($five_start, $five_end)   = ($intron->start - 3, $intron->start + 5); | 
|  | 286       ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3); | 
|  | 287     } | 
|  | 288 | 
|  | 289     else { | 
|  | 290       ($five_start, $five_end)   = ($intron->end - 5, $intron->end + 3); | 
|  | 291       ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19); | 
|  | 292     } | 
|  | 293 | 
|  | 294     if(overlap($vf->start, $vf->end, $five_start, $five_end)) { | 
|  | 295       my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)}; | 
|  | 296 | 
|  | 297       return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/; | 
|  | 298       return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/; | 
|  | 299 | 
|  | 300       my $ref_score = $self->score5($ref_seq); | 
|  | 301       my $alt_score = $self->score5($alt_seq); | 
|  | 302 | 
|  | 303       return { | 
|  | 304         MaxEntScan_ref => $ref_score, | 
|  | 305         MaxEntScan_ref_seq => $ref_seq, | 
|  | 306         MaxEntScan_alt => $alt_score, | 
|  | 307         MaxEntScan_alt_seq => $alt_seq, | 
|  | 308         MaxEntScan_diff => $ref_score - $alt_score, | 
|  | 309       } | 
|  | 310     } | 
|  | 311 | 
|  | 312     if(overlap($vf->start, $vf->end, $three_start, $three_end)) { | 
|  | 313       my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)}; | 
|  | 314 | 
|  | 315       return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/; | 
|  | 316       return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/; | 
|  | 317 | 
|  | 318       my $ref_score = $self->score3($ref_seq); | 
|  | 319       my $alt_score = $self->score3($alt_seq); | 
|  | 320 | 
|  | 321       return { | 
|  | 322         MaxEntScan_ref => $ref_score, | 
|  | 323         MaxEntScan_ref_seq => $ref_seq, | 
|  | 324         MaxEntScan_alt => $alt_score, | 
|  | 325         MaxEntScan_alt_seq => $alt_seq, | 
|  | 326         MaxEntScan_diff => $ref_score - $alt_score, | 
|  | 327       } | 
|  | 328     } | 
|  | 329   } | 
|  | 330 | 
|  | 331   return {}; | 
|  | 332 } | 
|  | 333 | 
|  | 334 sub run_SWA { | 
|  | 335   my ($self, $tva) = @_; | 
|  | 336 | 
|  | 337   my $vf = $tva->variation_feature; | 
|  | 338 | 
|  | 339   my %results; | 
|  | 340 | 
|  | 341   # get the donor reference and alternate sequence contexts | 
|  | 342   my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)}; | 
|  | 343 | 
|  | 344   if (defined($donor_ref_context)) { | 
|  | 345     $results{'MES-SWA_donor_ref_context'} = $donor_ref_context; | 
|  | 346 | 
|  | 347     if ($donor_ref_context  =~ /^[ACGT]+$/) { | 
|  | 348       my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)}; | 
|  | 349       $results{'MES-SWA_donor_ref_seq'} = $seq; | 
|  | 350       $results{'MES-SWA_donor_ref_frame'} = $frame; | 
|  | 351       $results{'MES-SWA_donor_ref'} = $score; | 
|  | 352     } | 
|  | 353   } | 
|  | 354 | 
|  | 355   if (defined($donor_alt_context)) { | 
|  | 356     $results{'MES-SWA_donor_alt_context'} = $donor_alt_context; | 
|  | 357 | 
|  | 358     if ($donor_alt_context  =~ /^[ACGT]+$/) { | 
|  | 359       my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)}; | 
|  | 360       $results{'MES-SWA_donor_alt_seq'} = $seq; | 
|  | 361       $results{'MES-SWA_donor_alt_frame'} = $frame; | 
|  | 362       $results{'MES-SWA_donor_alt'} = $score; | 
|  | 363 | 
|  | 364       if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) { | 
|  | 365 | 
|  | 366         if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) { | 
|  | 367           # for SNVs, compare to the same frame as the highest scoring ALT k-mer | 
|  | 368           $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9); | 
|  | 369         } | 
|  | 370 | 
|  | 371         $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq; | 
|  | 372         $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq); | 
|  | 373 | 
|  | 374         $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score; | 
|  | 375       } | 
|  | 376     } | 
|  | 377   } | 
|  | 378 | 
|  | 379   # get the acceptor reference and alternate sequence contexts | 
|  | 380   my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)}; | 
|  | 381 | 
|  | 382   if (defined($acceptor_ref_context)) { | 
|  | 383     $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context; | 
|  | 384 | 
|  | 385     if ($acceptor_ref_context  =~ /^[ACGT]+$/) { | 
|  | 386       my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)}; | 
|  | 387       $results{'MES-SWA_acceptor_ref_seq'} = $seq; | 
|  | 388       $results{'MES-SWA_acceptor_ref_frame'} = $frame; | 
|  | 389       $results{'MES-SWA_acceptor_ref'} = $score; | 
|  | 390     } | 
|  | 391   } | 
|  | 392 | 
|  | 393   if (defined($acceptor_alt_context)) { | 
|  | 394     $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context; | 
|  | 395 | 
|  | 396     if ($acceptor_alt_context  =~ /^[ACGT]+$/) { | 
|  | 397       my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)}; | 
|  | 398       $results{'MES-SWA_acceptor_alt_seq'} = $seq; | 
|  | 399       $results{'MES-SWA_acceptor_alt_frame'} = $frame; | 
|  | 400       $results{'MES-SWA_acceptor_alt'} = $score; | 
|  | 401 | 
|  | 402       if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) { | 
|  | 403 | 
|  | 404         if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) { | 
|  | 405           # for SNVs, compare to the same frame as the highest scoring ALT k-mer | 
|  | 406           $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23); | 
|  | 407         } | 
|  | 408 | 
|  | 409         $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq; | 
|  | 410         $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq); | 
|  | 411 | 
|  | 412         $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score; | 
|  | 413       } | 
|  | 414     } | 
|  | 415   } | 
|  | 416 | 
|  | 417   return \%results; | 
|  | 418 } | 
|  | 419 | 
|  | 420 sub run_NCSS { | 
|  | 421   my ($self, $tva) = @_; | 
|  | 422 | 
|  | 423   my $tv = $tva->transcript_variation; | 
|  | 424   my $tr = $tva->transcript; | 
|  | 425 | 
|  | 426   my %results; | 
|  | 427 | 
|  | 428   if ($tv->intron_number) { | 
|  | 429 | 
|  | 430     my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number); | 
|  | 431     my $intron_number = (split(/-/, $intron_numbers))[0]; | 
|  | 432 | 
|  | 433     my $introns = $tr->get_all_Introns; | 
|  | 434 | 
|  | 435     my $intron_idx = $intron_number - 1; | 
|  | 436     my $intron = $introns->[$intron_idx]; | 
|  | 437 | 
|  | 438     if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) { | 
|  | 439       $results{'MES-NCSS_upstream_donor_seq'} = $seq; | 
|  | 440       $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 441     } | 
|  | 442 | 
|  | 443     if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) { | 
|  | 444       $results{'MES-NCSS_downstream_acceptor_seq'} = $seq; | 
|  | 445       $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 446     } | 
|  | 447 | 
|  | 448     # don't calculate an upstream acceptor score if the intron is the first in the transcript | 
|  | 449     unless ($intron_number == 1) { | 
|  | 450       my $upstream_intron = $introns->[$intron_idx - 1]; | 
|  | 451 | 
|  | 452       if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) { | 
|  | 453         $results{'MES-NCSS_upstream_acceptor_seq'} = $seq; | 
|  | 454         $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 455       } | 
|  | 456     } | 
|  | 457 | 
|  | 458     # don't calculate a downstream donor score if the intron is the last in the transcript | 
|  | 459     unless ($intron_number == $total_introns) { | 
|  | 460       my $downstream_intron = $introns->[$intron_idx + 1]; | 
|  | 461 | 
|  | 462       if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) { | 
|  | 463         $results{'MES-NCSS_downstream_donor_seq'} = $seq; | 
|  | 464         $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 465       } | 
|  | 466     } | 
|  | 467   } | 
|  | 468 | 
|  | 469   elsif ($tv->exon_number) { | 
|  | 470 | 
|  | 471     my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number); | 
|  | 472     my $exon_number = (split(/-/, $exon_numbers))[0]; | 
|  | 473 | 
|  | 474     my $exons = $tr->get_all_Exons; | 
|  | 475 | 
|  | 476     my $exon_idx = $exon_number - 1; | 
|  | 477     my $exon = $exons->[$exon_idx]; | 
|  | 478 | 
|  | 479     # don't calculate upstream scores if the exon is the first in the transcript | 
|  | 480     unless ($exon_number == 1) { | 
|  | 481       my $upstream_exon = $exons->[$exon_idx - 1]; | 
|  | 482 | 
|  | 483       if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) { | 
|  | 484         $results{'MES-NCSS_upstream_donor_seq'} = $seq; | 
|  | 485         $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 486       } | 
|  | 487 | 
|  | 488       if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) { | 
|  | 489         $results{'MES-NCSS_upstream_acceptor_seq'} = $seq; | 
|  | 490         $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 491       } | 
|  | 492     } | 
|  | 493 | 
|  | 494     # don't calculate downstream scores if the exon is the last exon in the transcript | 
|  | 495     unless ($exon_number == $total_exons) { | 
|  | 496       my $downstream_exon = $exons->[$exon_idx + 1]; | 
|  | 497 | 
|  | 498       if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) { | 
|  | 499         $results{'MES-NCSS_downstream_donor_seq'} = $seq; | 
|  | 500         $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 501       } | 
|  | 502 | 
|  | 503       if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) { | 
|  | 504         $results{'MES-NCSS_downstream_acceptor_seq'} = $seq; | 
|  | 505         $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | 
|  | 506       } | 
|  | 507     } | 
|  | 508   } | 
|  | 509 | 
|  | 510   return \%results; | 
|  | 511 } | 
|  | 512 | 
|  | 513 | 
|  | 514 ## Sliding window approach methods | 
|  | 515 ################################## | 
|  | 516 | 
|  | 517 sub get_max_donor { | 
|  | 518   my ($self, $sequence) = @_; | 
|  | 519 | 
|  | 520   my ($seq, $frame, $max); | 
|  | 521   my @kmers = @{$self->sliding_window($sequence, 9)}; | 
|  | 522 | 
|  | 523   for my $i (0 .. $#kmers) { | 
|  | 524     my $kmer = $kmers[$i]; | 
|  | 525     my $score = $self->score5($kmer); | 
|  | 526     if(!$max || $score > $max) { | 
|  | 527       $seq = $kmer; | 
|  | 528       $frame = $i + 1; | 
|  | 529       $max = $score; | 
|  | 530     } | 
|  | 531   } | 
|  | 532   return [$seq, $frame, $max]; | 
|  | 533 } | 
|  | 534 | 
|  | 535 sub get_max_acceptor { | 
|  | 536   my ($self, $sequence) = @_; | 
|  | 537 | 
|  | 538   my ($seq, $frame, $max); | 
|  | 539   my @kmers = @{$self->sliding_window($sequence, 23)}; | 
|  | 540 | 
|  | 541   for my $i (0 .. $#kmers) { | 
|  | 542     my $kmer = $kmers[$i]; | 
|  | 543     my $score = $self->score3($kmer); | 
|  | 544     if(!$max || $score > $max) { | 
|  | 545       $seq = $kmer; | 
|  | 546       $frame = $i + 1; | 
|  | 547       $max = $score; | 
|  | 548     } | 
|  | 549   } | 
|  | 550   return [$seq, $frame, $max]; | 
|  | 551 } | 
|  | 552 | 
|  | 553 sub sliding_window { | 
|  | 554   my ($self, $sequence, $winsize) = @_; | 
|  | 555   my @seqs; | 
|  | 556   for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) { | 
|  | 557     push @seqs, substr($sequence, $i - 1, $winsize); | 
|  | 558   } | 
|  | 559   return \@seqs; | 
|  | 560 } | 
|  | 561 | 
|  | 562 | 
|  | 563 ## Nearest canonical splice site methods | 
|  | 564 ######################################## | 
|  | 565 | 
|  | 566 sub get_donor_seq_from_exon { | 
|  | 567   my ($self, $exon) = @_; | 
|  | 568 | 
|  | 569   my ($start, $end); | 
|  | 570 | 
|  | 571   if ($exon->strand > 0) { | 
|  | 572     ($start, $end) = ($exon->end - 2, $exon->end + 6); | 
|  | 573   } | 
|  | 574   else { | 
|  | 575     ($start, $end) = ($exon->start - 6, $exon->start + 2); | 
|  | 576   } | 
|  | 577 | 
|  | 578   my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand); | 
|  | 579   my $seq = $slice->seq() if defined($slice); | 
|  | 580 | 
|  | 581   return $seq; | 
|  | 582 } | 
|  | 583 | 
|  | 584 sub get_acceptor_seq_from_exon { | 
|  | 585   my ($self, $exon) = @_; | 
|  | 586 | 
|  | 587   my ($start, $end); | 
|  | 588 | 
|  | 589   if ($exon->strand > 0) { | 
|  | 590     ($start, $end) = ($exon->start - 20, $exon->start + 2); | 
|  | 591   } | 
|  | 592   else { | 
|  | 593     ($start, $end) = ($exon->end - 2, $exon->end + 20); | 
|  | 594   } | 
|  | 595 | 
|  | 596   my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand); | 
|  | 597   my $seq = $slice->seq() if defined($slice); | 
|  | 598 | 
|  | 599   return $seq; | 
|  | 600 } | 
|  | 601 | 
|  | 602 sub get_donor_seq_from_intron { | 
|  | 603   my ($self, $intron) = @_; | 
|  | 604 | 
|  | 605   my ($start, $end); | 
|  | 606 | 
|  | 607   if ($intron->strand > 0) { | 
|  | 608     ($start, $end) = ($intron->start - 3, $intron->start + 5); | 
|  | 609   } | 
|  | 610   else { | 
|  | 611     ($start, $end) = ($intron->end - 5, $intron->end + 3); | 
|  | 612   } | 
|  | 613 | 
|  | 614   my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand); | 
|  | 615   my $seq = $slice->seq() if defined($slice); | 
|  | 616 | 
|  | 617   return $seq; | 
|  | 618 } | 
|  | 619 | 
|  | 620 sub get_acceptor_seq_from_intron { | 
|  | 621   my ($self, $intron) = @_; | 
|  | 622 | 
|  | 623   my ($start, $end); | 
|  | 624 | 
|  | 625   if ($intron->strand > 0) { | 
|  | 626     ($start, $end) = ($intron->end - 19, $intron->end + 3); | 
|  | 627   } | 
|  | 628   else { | 
|  | 629     ($start, $end) = ($intron->start - 3, $intron->start + 19); | 
|  | 630   } | 
|  | 631 | 
|  | 632   my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand); | 
|  | 633   my $seq = $slice->seq() if defined($slice); | 
|  | 634 | 
|  | 635   return $seq; | 
|  | 636 } | 
|  | 637 | 
|  | 638 | 
|  | 639 ## Common methods | 
|  | 640 ################# | 
|  | 641 | 
|  | 642 sub get_seqs { | 
|  | 643   my ($self, $tva, $start, $end) = @_; | 
|  | 644   my $vf = $tva->variation_feature; | 
|  | 645 | 
|  | 646   my $tr_strand = $tva->transcript->strand; | 
|  | 647 | 
|  | 648   my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand); | 
|  | 649 | 
|  | 650   my ($ref_seq, $alt_seq); | 
|  | 651 | 
|  | 652   if (defined $ref_slice) { | 
|  | 653 | 
|  | 654     $ref_seq = $alt_seq = $ref_slice->seq(); | 
|  | 655 | 
|  | 656     my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end}; | 
|  | 657     my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : ''; | 
|  | 658 | 
|  | 659     substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq; | 
|  | 660   } | 
|  | 661 | 
|  | 662   return [$ref_seq, $alt_seq]; | 
|  | 663 } | 
|  | 664 | 
|  | 665 sub score5 { | 
|  | 666   my $self = shift; | 
|  | 667   my $seq = shift; | 
|  | 668   my $hex = md5_hex($seq); | 
|  | 669 | 
|  | 670   # check cache | 
|  | 671   if($self->{cache}) { | 
|  | 672     my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}}; | 
|  | 673 | 
|  | 674     return $res->{score} if $res; | 
|  | 675   } | 
|  | 676 | 
|  | 677   my $a = $self->score5_scoreconsensus($seq); | 
|  | 678   die("ERROR: No score5_scoreconsensus\n") unless defined($a); | 
|  | 679 | 
|  | 680   my $b = $self->score5_getrest($seq); | 
|  | 681   die("ERROR: No score5_getrest\n") unless defined($b); | 
|  | 682 | 
|  | 683   my $c = $self->{'score5_seq'}->{$b}; | 
|  | 684   die("ERROR: No score5_seq for $b\n") unless defined($c); | 
|  | 685 | 
|  | 686   my $d = $self->{'score5_me2x5'}->{$c}; | 
|  | 687   die("ERROR: No score5_me2x5 for $c\n") unless defined($d); | 
|  | 688 | 
|  | 689   my $score = $self->log2($a * $d); | 
|  | 690 | 
|  | 691   # cache it | 
|  | 692   push @{$self->{cache}->{score5}}, { hex => $hex, score => $score }; | 
|  | 693   shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE; | 
|  | 694 | 
|  | 695   return $score; | 
|  | 696 } | 
|  | 697 | 
|  | 698 sub score3 { | 
|  | 699   my $self = shift; | 
|  | 700   my $seq = shift; | 
|  | 701   my $hex = md5_hex($seq); | 
|  | 702 | 
|  | 703   # check cache | 
|  | 704   if($self->{cache}) { | 
|  | 705     my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}}; | 
|  | 706 | 
|  | 707     return $res->{score} if $res; | 
|  | 708   } | 
|  | 709 | 
|  | 710   my $a = $self->score3_scoreconsensus($seq); | 
|  | 711   die("ERROR: No score3_scoreconsensus\n") unless defined($a); | 
|  | 712 | 
|  | 713   my $b = $self->score3_getrest($seq); | 
|  | 714   die("ERROR: No score3_getrest\n") unless defined($b); | 
|  | 715 | 
|  | 716   my $c = $self->score3_maxentscore($b, $self->{'score3_metables'}); | 
|  | 717   die("ERROR: No score3_maxentscore for $b\n") unless defined($c); | 
|  | 718 | 
|  | 719   my $score = $self->log2($a * $c); | 
|  | 720 | 
|  | 721   # cache it | 
|  | 722   push @{$self->{cache}->{score3}}, { hex => $hex, score => $score }; | 
|  | 723   shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE; | 
|  | 724 | 
|  | 725   return $score; | 
|  | 726 } | 
|  | 727 | 
|  | 728 | 
|  | 729 ## methods copied from score5.pl | 
|  | 730 ################################ | 
|  | 731 | 
|  | 732 sub score5_makesequencematrix { | 
|  | 733   my $self = shift; | 
|  | 734   my $file = shift; | 
|  | 735   my %matrix; | 
|  | 736   my $n=0; | 
|  | 737   open(SCOREF, $file) || die "Can't open $file!\n"; | 
|  | 738   while(<SCOREF>) { | 
|  | 739     chomp; | 
|  | 740     $_=~ s/\s//; | 
|  | 741     $matrix{$_} = $n; | 
|  | 742     $n++; | 
|  | 743   } | 
|  | 744   close(SCOREF); | 
|  | 745   return \%matrix; | 
|  | 746 } | 
|  | 747 | 
|  | 748 sub score5_makescorematrix { | 
|  | 749   my $self = shift; | 
|  | 750   my $file = shift; | 
|  | 751   my %matrix; | 
|  | 752   my $n=0; | 
|  | 753   open(SCOREF, $file) || die "Can't open $file!\n"; | 
|  | 754   while(<SCOREF>) { | 
|  | 755     chomp; | 
|  | 756     $_=~ s/\s//; | 
|  | 757     $matrix{$n} = $_; | 
|  | 758     $n++; | 
|  | 759   } | 
|  | 760   close(SCOREF); | 
|  | 761   return \%matrix; | 
|  | 762 } | 
|  | 763 | 
|  | 764 sub score5_getrest { | 
|  | 765   my $self = shift; | 
|  | 766   my $seq = shift; | 
|  | 767   my @seqa = split(//,uc($seq)); | 
|  | 768   return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8]; | 
|  | 769 } | 
|  | 770 | 
|  | 771 sub score5_scoreconsensus { | 
|  | 772   my $self = shift; | 
|  | 773   my $seq = shift; | 
|  | 774   my @seqa = split(//,uc($seq)); | 
|  | 775   my %bgd; | 
|  | 776   $bgd{'A'} = 0.27; | 
|  | 777   $bgd{'C'} = 0.23; | 
|  | 778   $bgd{'G'} = 0.23; | 
|  | 779   $bgd{'T'} = 0.27; | 
|  | 780   my %cons1; | 
|  | 781   $cons1{'A'} = 0.004; | 
|  | 782   $cons1{'C'} = 0.0032; | 
|  | 783   $cons1{'G'} = 0.9896; | 
|  | 784   $cons1{'T'} = 0.0032; | 
|  | 785   my %cons2; | 
|  | 786   $cons2{'A'} = 0.0034; | 
|  | 787   $cons2{'C'} = 0.0039; | 
|  | 788   $cons2{'G'} = 0.0042; | 
|  | 789   $cons2{'T'} = 0.9884; | 
|  | 790   my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]}); | 
|  | 791   return $addscore; | 
|  | 792 } | 
|  | 793 | 
|  | 794 sub log2 { | 
|  | 795   my ($self, $val) = @_; | 
|  | 796   return log($val)/log(2); | 
|  | 797 } | 
|  | 798 | 
|  | 799 | 
|  | 800 ## methods copied from score3.pl | 
|  | 801 ################################ | 
|  | 802 | 
|  | 803 sub score3_hashseq { | 
|  | 804   #returns hash of sequence in base 4 | 
|  | 805   # $self->score3_hashseq('CAGAAGT') returns 4619 | 
|  | 806   my $self = shift; | 
|  | 807   my $seq = shift; | 
|  | 808   $seq = uc($seq); | 
|  | 809   $seq =~ tr/ACGT/0123/; | 
|  | 810   my @seqa = split(//,$seq); | 
|  | 811   my $sum = 0; | 
|  | 812   my $len = length($seq); | 
|  | 813   my @four = (1,4,16,64,256,1024,4096,16384); | 
|  | 814   my $i=0; | 
|  | 815   while ($i<$len) { | 
|  | 816     $sum+= $seqa[$i] * $four[$len - $i -1] ; | 
|  | 817     $i++; | 
|  | 818   } | 
|  | 819   return $sum; | 
|  | 820 } | 
|  | 821 | 
|  | 822 sub score3_makemaxentscores { | 
|  | 823   my $self = shift; | 
|  | 824   my $dir = $self->{'_dir'}."/splicemodels/"; | 
|  | 825   my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4', | 
|  | 826     'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9'); | 
|  | 827   my @metables; | 
|  | 828   my $num = 0 ; | 
|  | 829   foreach my $file (@list) { | 
|  | 830     my $n = 0; | 
|  | 831     open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n"; | 
|  | 832     while(<SCOREF>) { | 
|  | 833       chomp; | 
|  | 834       $_=~ s/\s//; | 
|  | 835       $metables[$num]{$n} = $_; | 
|  | 836       $n++; | 
|  | 837     } | 
|  | 838     close(SCOREF); | 
|  | 839     #print STDERR $file."\t".$num."\t".$n."\n"; | 
|  | 840     $num++; | 
|  | 841   } | 
|  | 842   return \@metables; | 
|  | 843 } | 
|  | 844 | 
|  | 845 sub score3_maxentscore { | 
|  | 846   my $self = shift; | 
|  | 847   my $seq = shift; | 
|  | 848   my $table_ref = shift; | 
|  | 849   my @metables = @$table_ref; | 
|  | 850   my @sc; | 
|  | 851   $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))}; | 
|  | 852   $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))}; | 
|  | 853   $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))}; | 
|  | 854   $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))}; | 
|  | 855   $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))}; | 
|  | 856   $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))}; | 
|  | 857   $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))}; | 
|  | 858   $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))}; | 
|  | 859   $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))}; | 
|  | 860   my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]); | 
|  | 861   return $finalscore; | 
|  | 862 } | 
|  | 863 | 
|  | 864 sub score3_getrest { | 
|  | 865   my $self = shift; | 
|  | 866   my $seq = shift; | 
|  | 867   my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3); | 
|  | 868   return $seq_noconsensus; | 
|  | 869 } | 
|  | 870 | 
|  | 871 sub score3_scoreconsensus { | 
|  | 872   my $self = shift; | 
|  | 873   my $seq = shift; | 
|  | 874   my @seqa = split(//,uc($seq)); | 
|  | 875   my %bgd; | 
|  | 876   $bgd{'A'} = 0.27; | 
|  | 877   $bgd{'C'} = 0.23; | 
|  | 878   $bgd{'G'} = 0.23; | 
|  | 879   $bgd{'T'} = 0.27; | 
|  | 880   my %cons1; | 
|  | 881   $cons1{'A'} = 0.9903; | 
|  | 882   $cons1{'C'} = 0.0032; | 
|  | 883   $cons1{'G'} = 0.0034; | 
|  | 884   $cons1{'T'} = 0.0030; | 
|  | 885   my %cons2; | 
|  | 886   $cons2{'A'} = 0.0027; | 
|  | 887   $cons2{'C'} = 0.0037; | 
|  | 888   $cons2{'G'} = 0.9905; | 
|  | 889   $cons2{'T'} = 0.0030; | 
|  | 890   my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]}); | 
|  | 891   return $addscore; | 
|  | 892 } | 
|  | 893 | 
|  | 894 1; | 
|  | 895 |