comparison dir_plugins/MaxEntScan.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e545d0a25ffe
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 MaxEntScan
27
28 =head1 SYNOPSIS
29
30 mv MaxEntScan.pm ~/.vep/Plugins
31 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
32 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS
33
34 =head1 DESCRIPTION
35
36 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
37 runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
38 to get splice site predictions.
39
40 The plugin copies most of the code verbatim from the score5.pl and score3.pl
41 scripts provided in the MaxEntScan download. To run the plugin you must get and
42 unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
43 to this unpacked directory is then the param you pass to the --plugin flag.
44
45 The plugin executes the logic from one of the scripts depending on which
46 splice region the variant overlaps:
47
48 score5.pl : last 3 bases of exon --> first 6 bases of intron
49 score3.pl : last 20 bases of intron --> first 3 bases of exon
50
51 The plugin reports the reference, alternate and difference (REF - ALT) maximum
52 entropy scores.
53
54 If 'SWA' is specified as a command-line argument, a sliding window algorithm
55 is applied to subsequences containing the reference and alternate alleles to
56 identify k-mers with the highest donor and acceptor splice site scores. To assess
57 the impact of variants, reference comparison scores are also provided. For SNVs,
58 the comparison scores are derived from sequence in the same frame as the highest
59 scoring k-mers containing the alternate allele. For all other variants, the
60 comparison scores are derived from the highest scoring k-mers containing the
61 reference allele. The difference between the reference comparison and alternate
62 scores (SWA_REF_COMP - SWA_ALT) are also provided.
63
64 If 'NCSS' is specified as a command-line argument, scores for the nearest
65 upstream and downstream canonical splice sites are also included.
66
67 By default, only scores are reported. Add 'verbose' to the list of command-
68 line arguments to include the sequence output associated with those scores.
69
70 =cut
71
72 package MaxEntScan;
73
74 use strict;
75 use warnings;
76
77 use Digest::MD5 qw(md5_hex);
78
79 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
80 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
81
82 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
83 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
84
85 # how many seq/score pairs to cache in memory
86 our $CACHE_SIZE = 50;
87
88 sub new {
89 my $class = shift;
90
91 my $self = $class->SUPER::new(@_);
92
93 # we need sequence, so no offline mode unless we have FASTA
94 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
95
96 my $params = $self->params;
97
98 my $dir = shift @$params;
99 die("ERROR: MaxEntScan directory not specified\n") unless $dir;
100 die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
101 $self->{_dir} = $dir;
102
103 ## setup from score5.pl
104 $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
105 $self->{'score5_seq'} = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');
106
107 ## setup from score3.pl
108 $self->{'score3_metables'} = $self->score3_makemaxentscores;
109
110 my %opts = map { $_ => undef } @$params;
111
112 $self->{'run_SWA'} = 1 if exists $opts{'SWA'};
113 $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};
114
115 $self->{'verbose'} = 1 if exists $opts{'verbose'};
116
117 return $self;
118 }
119
120 sub feature_types {
121 return ['Transcript'];
122 }
123
124 sub get_header_info {
125 my $self = shift;
126
127 my $v = $self->{'verbose'};
128 my $headers = $self->get_MES_header_info($v);
129
130 if ($self->{'run_SWA'}) {
131 my $swa_headers = $self->get_SWA_header_info($v);
132 $headers = {%$headers, %$swa_headers};
133 }
134
135 if ($self->{'run_NCSS'}) {
136 my $ncss_headers = $self->get_NCSS_header_info($v);
137 $headers = {%$headers, %$ncss_headers};
138 }
139
140 return $headers;
141 }
142
143 sub get_MES_header_info {
144 my ($self, $verbose) = @_;
145
146 my $headers = {
147 MaxEntScan_ref => "MaxEntScan reference sequence score",
148 MaxEntScan_alt => "MaxEntScan alternate sequence score",
149 MaxEntScan_diff => "MaxEntScan score difference",
150 };
151
152 if ($verbose) {
153
154 $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
155 $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
156 }
157
158 return $headers;
159 }
160
161 sub get_SWA_header_info {
162 my ($self, $verbose) = @_;
163
164 my $headers = {
165 "MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
166 "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
167 "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
168 "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",
169
170 "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
171 "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
172 "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
173 "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
174 };
175
176 if ($verbose) {
177
178 $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
179 $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
180 $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
181 $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
182 $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
183 $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
184 $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";
185
186 $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
187 $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
188 $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
189 $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
190 $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
191 $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
192 $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
193 }
194
195 return $headers;
196 }
197
198 sub get_NCSS_header_info {
199 my ($self, $verbose) = @_;
200
201 my $headers = {
202 "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
203 "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",
204
205 "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
206 "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
207 };
208
209 if ($verbose) {
210
211 $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
212 $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";
213
214 $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
215 $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
216 }
217
218 return $headers;
219 }
220
221 sub run {
222 my ($self, $tva) = @_;
223
224 my $seq_headers = $self->get_MES_header_info();
225 my $results = $self->run_MES($tva);
226
227 if ($self->{'run_SWA'}) {
228 my $swa_seq_headers = $self->get_SWA_header_info();
229 $seq_headers = {%$seq_headers, %$swa_seq_headers};
230 my $swa_results = $self->run_SWA($tva);
231 $results = {%$results, %$swa_results};
232 }
233
234 if ($self->{'run_NCSS'}) {
235 my $ncss_seq_headers = $self->get_NCSS_header_info();
236 $seq_headers = {%$seq_headers, %$ncss_seq_headers};
237 my $ncss_results = $self->run_NCSS($tva);
238 $results = {%$results, %$ncss_results};
239 }
240
241 my %data;
242
243 # add the scores
244 my @scores = grep { exists $results->{$_} } keys %$seq_headers;
245 @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};
246
247 if ($self->{'verbose'}) {
248 # add any remaining results
249 my @non_scores = grep { ! exists $data{$_} } keys %$results;
250 @data{@non_scores} = @{$results}{@non_scores};
251 }
252
253 return \%data;
254 }
255
256 sub run_MES {
257 my ($self, $tva) = @_;
258
259 my $vf = $tva->variation_feature;
260 return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;
261
262 my $tv = $tva->transcript_variation;
263 my $tr = $tva->transcript;
264 my $tr_strand = $tr->strand;
265 my ($vf_start, $vf_end) = ($vf->start, $vf->end);
266
267 # use _overlapped_introns() method from BaseTranscriptVariation
268 # this will use an interval tree if available for superfast lookup of overlapping introns
269 # we have to expand the search space around $vf because we're looking for the splice region not the intron per se
270 foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
271
272 # get coords depending on strand
273 # MaxEntScan does different predictions for 5 and 3 prime
274 # and we need to feed it different bits of sequence for each
275 #
276 # 5prime, 3 bases of exon, 6 bases of intron:
277 # ===------
278 #
279 # 3prime, 20 bases of intron, 3 bases of exon
280 # --------------------===
281
282 my ($five_start, $five_end, $three_start, $three_end);
283
284 if($tr_strand > 0) {
285 ($five_start, $five_end) = ($intron->start - 3, $intron->start + 5);
286 ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
287 }
288
289 else {
290 ($five_start, $five_end) = ($intron->end - 5, $intron->end + 3);
291 ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
292 }
293
294 if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
295 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};
296
297 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
298 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
299
300 my $ref_score = $self->score5($ref_seq);
301 my $alt_score = $self->score5($alt_seq);
302
303 return {
304 MaxEntScan_ref => $ref_score,
305 MaxEntScan_ref_seq => $ref_seq,
306 MaxEntScan_alt => $alt_score,
307 MaxEntScan_alt_seq => $alt_seq,
308 MaxEntScan_diff => $ref_score - $alt_score,
309 }
310 }
311
312 if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
313 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};
314
315 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
316 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
317
318 my $ref_score = $self->score3($ref_seq);
319 my $alt_score = $self->score3($alt_seq);
320
321 return {
322 MaxEntScan_ref => $ref_score,
323 MaxEntScan_ref_seq => $ref_seq,
324 MaxEntScan_alt => $alt_score,
325 MaxEntScan_alt_seq => $alt_seq,
326 MaxEntScan_diff => $ref_score - $alt_score,
327 }
328 }
329 }
330
331 return {};
332 }
333
334 sub run_SWA {
335 my ($self, $tva) = @_;
336
337 my $vf = $tva->variation_feature;
338
339 my %results;
340
341 # get the donor reference and alternate sequence contexts
342 my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};
343
344 if (defined($donor_ref_context)) {
345 $results{'MES-SWA_donor_ref_context'} = $donor_ref_context;
346
347 if ($donor_ref_context =~ /^[ACGT]+$/) {
348 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
349 $results{'MES-SWA_donor_ref_seq'} = $seq;
350 $results{'MES-SWA_donor_ref_frame'} = $frame;
351 $results{'MES-SWA_donor_ref'} = $score;
352 }
353 }
354
355 if (defined($donor_alt_context)) {
356 $results{'MES-SWA_donor_alt_context'} = $donor_alt_context;
357
358 if ($donor_alt_context =~ /^[ACGT]+$/) {
359 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
360 $results{'MES-SWA_donor_alt_seq'} = $seq;
361 $results{'MES-SWA_donor_alt_frame'} = $frame;
362 $results{'MES-SWA_donor_alt'} = $score;
363
364 if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {
365
366 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
367 # for SNVs, compare to the same frame as the highest scoring ALT k-mer
368 $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
369 }
370
371 $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
372 $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);
373
374 $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
375 }
376 }
377 }
378
379 # get the acceptor reference and alternate sequence contexts
380 my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};
381
382 if (defined($acceptor_ref_context)) {
383 $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;
384
385 if ($acceptor_ref_context =~ /^[ACGT]+$/) {
386 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
387 $results{'MES-SWA_acceptor_ref_seq'} = $seq;
388 $results{'MES-SWA_acceptor_ref_frame'} = $frame;
389 $results{'MES-SWA_acceptor_ref'} = $score;
390 }
391 }
392
393 if (defined($acceptor_alt_context)) {
394 $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;
395
396 if ($acceptor_alt_context =~ /^[ACGT]+$/) {
397 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
398 $results{'MES-SWA_acceptor_alt_seq'} = $seq;
399 $results{'MES-SWA_acceptor_alt_frame'} = $frame;
400 $results{'MES-SWA_acceptor_alt'} = $score;
401
402 if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {
403
404 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
405 # for SNVs, compare to the same frame as the highest scoring ALT k-mer
406 $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
407 }
408
409 $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
410 $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);
411
412 $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
413 }
414 }
415 }
416
417 return \%results;
418 }
419
420 sub run_NCSS {
421 my ($self, $tva) = @_;
422
423 my $tv = $tva->transcript_variation;
424 my $tr = $tva->transcript;
425
426 my %results;
427
428 if ($tv->intron_number) {
429
430 my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
431 my $intron_number = (split(/-/, $intron_numbers))[0];
432
433 my $introns = $tr->get_all_Introns;
434
435 my $intron_idx = $intron_number - 1;
436 my $intron = $introns->[$intron_idx];
437
438 if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
439 $results{'MES-NCSS_upstream_donor_seq'} = $seq;
440 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
441 }
442
443 if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
444 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
445 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
446 }
447
448 # don't calculate an upstream acceptor score if the intron is the first in the transcript
449 unless ($intron_number == 1) {
450 my $upstream_intron = $introns->[$intron_idx - 1];
451
452 if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
453 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
454 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
455 }
456 }
457
458 # don't calculate a downstream donor score if the intron is the last in the transcript
459 unless ($intron_number == $total_introns) {
460 my $downstream_intron = $introns->[$intron_idx + 1];
461
462 if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
463 $results{'MES-NCSS_downstream_donor_seq'} = $seq;
464 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
465 }
466 }
467 }
468
469 elsif ($tv->exon_number) {
470
471 my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
472 my $exon_number = (split(/-/, $exon_numbers))[0];
473
474 my $exons = $tr->get_all_Exons;
475
476 my $exon_idx = $exon_number - 1;
477 my $exon = $exons->[$exon_idx];
478
479 # don't calculate upstream scores if the exon is the first in the transcript
480 unless ($exon_number == 1) {
481 my $upstream_exon = $exons->[$exon_idx - 1];
482
483 if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
484 $results{'MES-NCSS_upstream_donor_seq'} = $seq;
485 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
486 }
487
488 if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
489 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
490 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
491 }
492 }
493
494 # don't calculate downstream scores if the exon is the last exon in the transcript
495 unless ($exon_number == $total_exons) {
496 my $downstream_exon = $exons->[$exon_idx + 1];
497
498 if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
499 $results{'MES-NCSS_downstream_donor_seq'} = $seq;
500 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
501 }
502
503 if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
504 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
505 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
506 }
507 }
508 }
509
510 return \%results;
511 }
512
513
514 ## Sliding window approach methods
515 ##################################
516
517 sub get_max_donor {
518 my ($self, $sequence) = @_;
519
520 my ($seq, $frame, $max);
521 my @kmers = @{$self->sliding_window($sequence, 9)};
522
523 for my $i (0 .. $#kmers) {
524 my $kmer = $kmers[$i];
525 my $score = $self->score5($kmer);
526 if(!$max || $score > $max) {
527 $seq = $kmer;
528 $frame = $i + 1;
529 $max = $score;
530 }
531 }
532 return [$seq, $frame, $max];
533 }
534
535 sub get_max_acceptor {
536 my ($self, $sequence) = @_;
537
538 my ($seq, $frame, $max);
539 my @kmers = @{$self->sliding_window($sequence, 23)};
540
541 for my $i (0 .. $#kmers) {
542 my $kmer = $kmers[$i];
543 my $score = $self->score3($kmer);
544 if(!$max || $score > $max) {
545 $seq = $kmer;
546 $frame = $i + 1;
547 $max = $score;
548 }
549 }
550 return [$seq, $frame, $max];
551 }
552
553 sub sliding_window {
554 my ($self, $sequence, $winsize) = @_;
555 my @seqs;
556 for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
557 push @seqs, substr($sequence, $i - 1, $winsize);
558 }
559 return \@seqs;
560 }
561
562
563 ## Nearest canonical splice site methods
564 ########################################
565
566 sub get_donor_seq_from_exon {
567 my ($self, $exon) = @_;
568
569 my ($start, $end);
570
571 if ($exon->strand > 0) {
572 ($start, $end) = ($exon->end - 2, $exon->end + 6);
573 }
574 else {
575 ($start, $end) = ($exon->start - 6, $exon->start + 2);
576 }
577
578 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
579 my $seq = $slice->seq() if defined($slice);
580
581 return $seq;
582 }
583
584 sub get_acceptor_seq_from_exon {
585 my ($self, $exon) = @_;
586
587 my ($start, $end);
588
589 if ($exon->strand > 0) {
590 ($start, $end) = ($exon->start - 20, $exon->start + 2);
591 }
592 else {
593 ($start, $end) = ($exon->end - 2, $exon->end + 20);
594 }
595
596 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
597 my $seq = $slice->seq() if defined($slice);
598
599 return $seq;
600 }
601
602 sub get_donor_seq_from_intron {
603 my ($self, $intron) = @_;
604
605 my ($start, $end);
606
607 if ($intron->strand > 0) {
608 ($start, $end) = ($intron->start - 3, $intron->start + 5);
609 }
610 else {
611 ($start, $end) = ($intron->end - 5, $intron->end + 3);
612 }
613
614 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
615 my $seq = $slice->seq() if defined($slice);
616
617 return $seq;
618 }
619
620 sub get_acceptor_seq_from_intron {
621 my ($self, $intron) = @_;
622
623 my ($start, $end);
624
625 if ($intron->strand > 0) {
626 ($start, $end) = ($intron->end - 19, $intron->end + 3);
627 }
628 else {
629 ($start, $end) = ($intron->start - 3, $intron->start + 19);
630 }
631
632 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
633 my $seq = $slice->seq() if defined($slice);
634
635 return $seq;
636 }
637
638
639 ## Common methods
640 #################
641
642 sub get_seqs {
643 my ($self, $tva, $start, $end) = @_;
644 my $vf = $tva->variation_feature;
645
646 my $tr_strand = $tva->transcript->strand;
647
648 my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);
649
650 my ($ref_seq, $alt_seq);
651
652 if (defined $ref_slice) {
653
654 $ref_seq = $alt_seq = $ref_slice->seq();
655
656 my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
657 my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';
658
659 substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
660 }
661
662 return [$ref_seq, $alt_seq];
663 }
664
665 sub score5 {
666 my $self = shift;
667 my $seq = shift;
668 my $hex = md5_hex($seq);
669
670 # check cache
671 if($self->{cache}) {
672 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};
673
674 return $res->{score} if $res;
675 }
676
677 my $a = $self->score5_scoreconsensus($seq);
678 die("ERROR: No score5_scoreconsensus\n") unless defined($a);
679
680 my $b = $self->score5_getrest($seq);
681 die("ERROR: No score5_getrest\n") unless defined($b);
682
683 my $c = $self->{'score5_seq'}->{$b};
684 die("ERROR: No score5_seq for $b\n") unless defined($c);
685
686 my $d = $self->{'score5_me2x5'}->{$c};
687 die("ERROR: No score5_me2x5 for $c\n") unless defined($d);
688
689 my $score = $self->log2($a * $d);
690
691 # cache it
692 push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
693 shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;
694
695 return $score;
696 }
697
698 sub score3 {
699 my $self = shift;
700 my $seq = shift;
701 my $hex = md5_hex($seq);
702
703 # check cache
704 if($self->{cache}) {
705 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};
706
707 return $res->{score} if $res;
708 }
709
710 my $a = $self->score3_scoreconsensus($seq);
711 die("ERROR: No score3_scoreconsensus\n") unless defined($a);
712
713 my $b = $self->score3_getrest($seq);
714 die("ERROR: No score3_getrest\n") unless defined($b);
715
716 my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
717 die("ERROR: No score3_maxentscore for $b\n") unless defined($c);
718
719 my $score = $self->log2($a * $c);
720
721 # cache it
722 push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
723 shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;
724
725 return $score;
726 }
727
728
729 ## methods copied from score5.pl
730 ################################
731
732 sub score5_makesequencematrix {
733 my $self = shift;
734 my $file = shift;
735 my %matrix;
736 my $n=0;
737 open(SCOREF, $file) || die "Can't open $file!\n";
738 while(<SCOREF>) {
739 chomp;
740 $_=~ s/\s//;
741 $matrix{$_} = $n;
742 $n++;
743 }
744 close(SCOREF);
745 return \%matrix;
746 }
747
748 sub score5_makescorematrix {
749 my $self = shift;
750 my $file = shift;
751 my %matrix;
752 my $n=0;
753 open(SCOREF, $file) || die "Can't open $file!\n";
754 while(<SCOREF>) {
755 chomp;
756 $_=~ s/\s//;
757 $matrix{$n} = $_;
758 $n++;
759 }
760 close(SCOREF);
761 return \%matrix;
762 }
763
764 sub score5_getrest {
765 my $self = shift;
766 my $seq = shift;
767 my @seqa = split(//,uc($seq));
768 return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
769 }
770
771 sub score5_scoreconsensus {
772 my $self = shift;
773 my $seq = shift;
774 my @seqa = split(//,uc($seq));
775 my %bgd;
776 $bgd{'A'} = 0.27;
777 $bgd{'C'} = 0.23;
778 $bgd{'G'} = 0.23;
779 $bgd{'T'} = 0.27;
780 my %cons1;
781 $cons1{'A'} = 0.004;
782 $cons1{'C'} = 0.0032;
783 $cons1{'G'} = 0.9896;
784 $cons1{'T'} = 0.0032;
785 my %cons2;
786 $cons2{'A'} = 0.0034;
787 $cons2{'C'} = 0.0039;
788 $cons2{'G'} = 0.0042;
789 $cons2{'T'} = 0.9884;
790 my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]});
791 return $addscore;
792 }
793
794 sub log2 {
795 my ($self, $val) = @_;
796 return log($val)/log(2);
797 }
798
799
800 ## methods copied from score3.pl
801 ################################
802
803 sub score3_hashseq {
804 #returns hash of sequence in base 4
805 # $self->score3_hashseq('CAGAAGT') returns 4619
806 my $self = shift;
807 my $seq = shift;
808 $seq = uc($seq);
809 $seq =~ tr/ACGT/0123/;
810 my @seqa = split(//,$seq);
811 my $sum = 0;
812 my $len = length($seq);
813 my @four = (1,4,16,64,256,1024,4096,16384);
814 my $i=0;
815 while ($i<$len) {
816 $sum+= $seqa[$i] * $four[$len - $i -1] ;
817 $i++;
818 }
819 return $sum;
820 }
821
822 sub score3_makemaxentscores {
823 my $self = shift;
824 my $dir = $self->{'_dir'}."/splicemodels/";
825 my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
826 'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
827 my @metables;
828 my $num = 0 ;
829 foreach my $file (@list) {
830 my $n = 0;
831 open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
832 while(<SCOREF>) {
833 chomp;
834 $_=~ s/\s//;
835 $metables[$num]{$n} = $_;
836 $n++;
837 }
838 close(SCOREF);
839 #print STDERR $file."\t".$num."\t".$n."\n";
840 $num++;
841 }
842 return \@metables;
843 }
844
845 sub score3_maxentscore {
846 my $self = shift;
847 my $seq = shift;
848 my $table_ref = shift;
849 my @metables = @$table_ref;
850 my @sc;
851 $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
852 $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
853 $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
854 $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
855 $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
856 $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
857 $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
858 $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
859 $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
860 my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
861 return $finalscore;
862 }
863
864 sub score3_getrest {
865 my $self = shift;
866 my $seq = shift;
867 my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
868 return $seq_noconsensus;
869 }
870
871 sub score3_scoreconsensus {
872 my $self = shift;
873 my $seq = shift;
874 my @seqa = split(//,uc($seq));
875 my %bgd;
876 $bgd{'A'} = 0.27;
877 $bgd{'C'} = 0.23;
878 $bgd{'G'} = 0.23;
879 $bgd{'T'} = 0.27;
880 my %cons1;
881 $cons1{'A'} = 0.9903;
882 $cons1{'C'} = 0.0032;
883 $cons1{'G'} = 0.0034;
884 $cons1{'T'} = 0.0030;
885 my %cons2;
886 $cons2{'A'} = 0.0027;
887 $cons2{'C'} = 0.0037;
888 $cons2{'G'} = 0.9905;
889 $cons2{'T'} = 0.0030;
890 my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]});
891 return $addscore;
892 }
893
894 1;
895