|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 MaxEntScan
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv MaxEntScan.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload
|
|
|
32 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS
|
|
|
33
|
|
|
34 =head1 DESCRIPTION
|
|
|
35
|
|
|
36 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
|
|
|
37 runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html)
|
|
|
38 to get splice site predictions.
|
|
|
39
|
|
|
40 The plugin copies most of the code verbatim from the score5.pl and score3.pl
|
|
|
41 scripts provided in the MaxEntScan download. To run the plugin you must get and
|
|
|
42 unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path
|
|
|
43 to this unpacked directory is then the param you pass to the --plugin flag.
|
|
|
44
|
|
|
45 The plugin executes the logic from one of the scripts depending on which
|
|
|
46 splice region the variant overlaps:
|
|
|
47
|
|
|
48 score5.pl : last 3 bases of exon --> first 6 bases of intron
|
|
|
49 score3.pl : last 20 bases of intron --> first 3 bases of exon
|
|
|
50
|
|
|
51 The plugin reports the reference, alternate and difference (REF - ALT) maximum
|
|
|
52 entropy scores.
|
|
|
53
|
|
|
54 If 'SWA' is specified as a command-line argument, a sliding window algorithm
|
|
|
55 is applied to subsequences containing the reference and alternate alleles to
|
|
|
56 identify k-mers with the highest donor and acceptor splice site scores. To assess
|
|
|
57 the impact of variants, reference comparison scores are also provided. For SNVs,
|
|
|
58 the comparison scores are derived from sequence in the same frame as the highest
|
|
|
59 scoring k-mers containing the alternate allele. For all other variants, the
|
|
|
60 comparison scores are derived from the highest scoring k-mers containing the
|
|
|
61 reference allele. The difference between the reference comparison and alternate
|
|
|
62 scores (SWA_REF_COMP - SWA_ALT) are also provided.
|
|
|
63
|
|
|
64 If 'NCSS' is specified as a command-line argument, scores for the nearest
|
|
|
65 upstream and downstream canonical splice sites are also included.
|
|
|
66
|
|
|
67 By default, only scores are reported. Add 'verbose' to the list of command-
|
|
|
68 line arguments to include the sequence output associated with those scores.
|
|
|
69
|
|
|
70 =cut
|
|
|
71
|
|
|
72 package MaxEntScan;
|
|
|
73
|
|
|
74 use strict;
|
|
|
75 use warnings;
|
|
|
76
|
|
|
77 use Digest::MD5 qw(md5_hex);
|
|
|
78
|
|
|
79 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
80 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
|
|
|
81
|
|
|
82 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
83 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
84
|
|
|
85 # how many seq/score pairs to cache in memory
|
|
|
86 our $CACHE_SIZE = 50;
|
|
|
87
|
|
|
88 sub new {
|
|
|
89 my $class = shift;
|
|
|
90
|
|
|
91 my $self = $class->SUPER::new(@_);
|
|
|
92
|
|
|
93 # we need sequence, so no offline mode unless we have FASTA
|
|
|
94 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
|
|
|
95
|
|
|
96 my $params = $self->params;
|
|
|
97
|
|
|
98 my $dir = shift @$params;
|
|
|
99 die("ERROR: MaxEntScan directory not specified\n") unless $dir;
|
|
|
100 die("ERROR: MaxEntScan directory not found\n") unless -d $dir;
|
|
|
101 $self->{_dir} = $dir;
|
|
|
102
|
|
|
103 ## setup from score5.pl
|
|
|
104 $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5');
|
|
|
105 $self->{'score5_seq'} = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences');
|
|
|
106
|
|
|
107 ## setup from score3.pl
|
|
|
108 $self->{'score3_metables'} = $self->score3_makemaxentscores;
|
|
|
109
|
|
|
110 my %opts = map { $_ => undef } @$params;
|
|
|
111
|
|
|
112 $self->{'run_SWA'} = 1 if exists $opts{'SWA'};
|
|
|
113 $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'};
|
|
|
114
|
|
|
115 $self->{'verbose'} = 1 if exists $opts{'verbose'};
|
|
|
116
|
|
|
117 return $self;
|
|
|
118 }
|
|
|
119
|
|
|
120 sub feature_types {
|
|
|
121 return ['Transcript'];
|
|
|
122 }
|
|
|
123
|
|
|
124 sub get_header_info {
|
|
|
125 my $self = shift;
|
|
|
126
|
|
|
127 my $v = $self->{'verbose'};
|
|
|
128 my $headers = $self->get_MES_header_info($v);
|
|
|
129
|
|
|
130 if ($self->{'run_SWA'}) {
|
|
|
131 my $swa_headers = $self->get_SWA_header_info($v);
|
|
|
132 $headers = {%$headers, %$swa_headers};
|
|
|
133 }
|
|
|
134
|
|
|
135 if ($self->{'run_NCSS'}) {
|
|
|
136 my $ncss_headers = $self->get_NCSS_header_info($v);
|
|
|
137 $headers = {%$headers, %$ncss_headers};
|
|
|
138 }
|
|
|
139
|
|
|
140 return $headers;
|
|
|
141 }
|
|
|
142
|
|
|
143 sub get_MES_header_info {
|
|
|
144 my ($self, $verbose) = @_;
|
|
|
145
|
|
|
146 my $headers = {
|
|
|
147 MaxEntScan_ref => "MaxEntScan reference sequence score",
|
|
|
148 MaxEntScan_alt => "MaxEntScan alternate sequence score",
|
|
|
149 MaxEntScan_diff => "MaxEntScan score difference",
|
|
|
150 };
|
|
|
151
|
|
|
152 if ($verbose) {
|
|
|
153
|
|
|
154 $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence";
|
|
|
155 $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence";
|
|
|
156 }
|
|
|
157
|
|
|
158 return $headers;
|
|
|
159 }
|
|
|
160
|
|
|
161 sub get_SWA_header_info {
|
|
|
162 my ($self, $verbose) = @_;
|
|
|
163
|
|
|
164 my $headers = {
|
|
|
165 "MES-SWA_donor_ref" => "Highest splice donor reference sequence score",
|
|
|
166 "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score",
|
|
|
167 "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score",
|
|
|
168 "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores",
|
|
|
169
|
|
|
170 "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score",
|
|
|
171 "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score",
|
|
|
172 "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score",
|
|
|
173 "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores",
|
|
|
174 };
|
|
|
175
|
|
|
176 if ($verbose) {
|
|
|
177
|
|
|
178 $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence";
|
|
|
179 $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence";
|
|
|
180 $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele";
|
|
|
181 $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence";
|
|
|
182 $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence";
|
|
|
183 $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele";
|
|
|
184 $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence";
|
|
|
185
|
|
|
186 $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence";
|
|
|
187 $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence";
|
|
|
188 $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele";
|
|
|
189 $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence";
|
|
|
190 $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence";
|
|
|
191 $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele";
|
|
|
192 $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence";
|
|
|
193 }
|
|
|
194
|
|
|
195 return $headers;
|
|
|
196 }
|
|
|
197
|
|
|
198 sub get_NCSS_header_info {
|
|
|
199 my ($self, $verbose) = @_;
|
|
|
200
|
|
|
201 my $headers = {
|
|
|
202 "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score",
|
|
|
203 "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score",
|
|
|
204
|
|
|
205 "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score",
|
|
|
206 "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score",
|
|
|
207 };
|
|
|
208
|
|
|
209 if ($verbose) {
|
|
|
210
|
|
|
211 $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence";
|
|
|
212 $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence";
|
|
|
213
|
|
|
214 $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence";
|
|
|
215 $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence";
|
|
|
216 }
|
|
|
217
|
|
|
218 return $headers;
|
|
|
219 }
|
|
|
220
|
|
|
221 sub run {
|
|
|
222 my ($self, $tva) = @_;
|
|
|
223
|
|
|
224 my $seq_headers = $self->get_MES_header_info();
|
|
|
225 my $results = $self->run_MES($tva);
|
|
|
226
|
|
|
227 if ($self->{'run_SWA'}) {
|
|
|
228 my $swa_seq_headers = $self->get_SWA_header_info();
|
|
|
229 $seq_headers = {%$seq_headers, %$swa_seq_headers};
|
|
|
230 my $swa_results = $self->run_SWA($tva);
|
|
|
231 $results = {%$results, %$swa_results};
|
|
|
232 }
|
|
|
233
|
|
|
234 if ($self->{'run_NCSS'}) {
|
|
|
235 my $ncss_seq_headers = $self->get_NCSS_header_info();
|
|
|
236 $seq_headers = {%$seq_headers, %$ncss_seq_headers};
|
|
|
237 my $ncss_results = $self->run_NCSS($tva);
|
|
|
238 $results = {%$results, %$ncss_results};
|
|
|
239 }
|
|
|
240
|
|
|
241 my %data;
|
|
|
242
|
|
|
243 # add the scores
|
|
|
244 my @scores = grep { exists $results->{$_} } keys %$seq_headers;
|
|
|
245 @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores};
|
|
|
246
|
|
|
247 if ($self->{'verbose'}) {
|
|
|
248 # add any remaining results
|
|
|
249 my @non_scores = grep { ! exists $data{$_} } keys %$results;
|
|
|
250 @data{@non_scores} = @{$results}{@non_scores};
|
|
|
251 }
|
|
|
252
|
|
|
253 return \%data;
|
|
|
254 }
|
|
|
255
|
|
|
256 sub run_MES {
|
|
|
257 my ($self, $tva) = @_;
|
|
|
258
|
|
|
259 my $vf = $tva->variation_feature;
|
|
|
260 return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/;
|
|
|
261
|
|
|
262 my $tv = $tva->transcript_variation;
|
|
|
263 my $tr = $tva->transcript;
|
|
|
264 my $tr_strand = $tr->strand;
|
|
|
265 my ($vf_start, $vf_end) = ($vf->start, $vf->end);
|
|
|
266
|
|
|
267 # use _overlapped_introns() method from BaseTranscriptVariation
|
|
|
268 # this will use an interval tree if available for superfast lookup of overlapping introns
|
|
|
269 # we have to expand the search space around $vf because we're looking for the splice region not the intron per se
|
|
|
270 foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) {
|
|
|
271
|
|
|
272 # get coords depending on strand
|
|
|
273 # MaxEntScan does different predictions for 5 and 3 prime
|
|
|
274 # and we need to feed it different bits of sequence for each
|
|
|
275 #
|
|
|
276 # 5prime, 3 bases of exon, 6 bases of intron:
|
|
|
277 # ===------
|
|
|
278 #
|
|
|
279 # 3prime, 20 bases of intron, 3 bases of exon
|
|
|
280 # --------------------===
|
|
|
281
|
|
|
282 my ($five_start, $five_end, $three_start, $three_end);
|
|
|
283
|
|
|
284 if($tr_strand > 0) {
|
|
|
285 ($five_start, $five_end) = ($intron->start - 3, $intron->start + 5);
|
|
|
286 ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3);
|
|
|
287 }
|
|
|
288
|
|
|
289 else {
|
|
|
290 ($five_start, $five_end) = ($intron->end - 5, $intron->end + 3);
|
|
|
291 ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19);
|
|
|
292 }
|
|
|
293
|
|
|
294 if(overlap($vf->start, $vf->end, $five_start, $five_end)) {
|
|
|
295 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)};
|
|
|
296
|
|
|
297 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
|
|
|
298 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
|
|
|
299
|
|
|
300 my $ref_score = $self->score5($ref_seq);
|
|
|
301 my $alt_score = $self->score5($alt_seq);
|
|
|
302
|
|
|
303 return {
|
|
|
304 MaxEntScan_ref => $ref_score,
|
|
|
305 MaxEntScan_ref_seq => $ref_seq,
|
|
|
306 MaxEntScan_alt => $alt_score,
|
|
|
307 MaxEntScan_alt_seq => $alt_seq,
|
|
|
308 MaxEntScan_diff => $ref_score - $alt_score,
|
|
|
309 }
|
|
|
310 }
|
|
|
311
|
|
|
312 if(overlap($vf->start, $vf->end, $three_start, $three_end)) {
|
|
|
313 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)};
|
|
|
314
|
|
|
315 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/;
|
|
|
316 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/;
|
|
|
317
|
|
|
318 my $ref_score = $self->score3($ref_seq);
|
|
|
319 my $alt_score = $self->score3($alt_seq);
|
|
|
320
|
|
|
321 return {
|
|
|
322 MaxEntScan_ref => $ref_score,
|
|
|
323 MaxEntScan_ref_seq => $ref_seq,
|
|
|
324 MaxEntScan_alt => $alt_score,
|
|
|
325 MaxEntScan_alt_seq => $alt_seq,
|
|
|
326 MaxEntScan_diff => $ref_score - $alt_score,
|
|
|
327 }
|
|
|
328 }
|
|
|
329 }
|
|
|
330
|
|
|
331 return {};
|
|
|
332 }
|
|
|
333
|
|
|
334 sub run_SWA {
|
|
|
335 my ($self, $tva) = @_;
|
|
|
336
|
|
|
337 my $vf = $tva->variation_feature;
|
|
|
338
|
|
|
339 my %results;
|
|
|
340
|
|
|
341 # get the donor reference and alternate sequence contexts
|
|
|
342 my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)};
|
|
|
343
|
|
|
344 if (defined($donor_ref_context)) {
|
|
|
345 $results{'MES-SWA_donor_ref_context'} = $donor_ref_context;
|
|
|
346
|
|
|
347 if ($donor_ref_context =~ /^[ACGT]+$/) {
|
|
|
348 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)};
|
|
|
349 $results{'MES-SWA_donor_ref_seq'} = $seq;
|
|
|
350 $results{'MES-SWA_donor_ref_frame'} = $frame;
|
|
|
351 $results{'MES-SWA_donor_ref'} = $score;
|
|
|
352 }
|
|
|
353 }
|
|
|
354
|
|
|
355 if (defined($donor_alt_context)) {
|
|
|
356 $results{'MES-SWA_donor_alt_context'} = $donor_alt_context;
|
|
|
357
|
|
|
358 if ($donor_alt_context =~ /^[ACGT]+$/) {
|
|
|
359 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)};
|
|
|
360 $results{'MES-SWA_donor_alt_seq'} = $seq;
|
|
|
361 $results{'MES-SWA_donor_alt_frame'} = $frame;
|
|
|
362 $results{'MES-SWA_donor_alt'} = $score;
|
|
|
363
|
|
|
364 if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) {
|
|
|
365
|
|
|
366 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
|
|
|
367 # for SNVs, compare to the same frame as the highest scoring ALT k-mer
|
|
|
368 $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9);
|
|
|
369 }
|
|
|
370
|
|
|
371 $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq;
|
|
|
372 $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq);
|
|
|
373
|
|
|
374 $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score;
|
|
|
375 }
|
|
|
376 }
|
|
|
377 }
|
|
|
378
|
|
|
379 # get the acceptor reference and alternate sequence contexts
|
|
|
380 my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)};
|
|
|
381
|
|
|
382 if (defined($acceptor_ref_context)) {
|
|
|
383 $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context;
|
|
|
384
|
|
|
385 if ($acceptor_ref_context =~ /^[ACGT]+$/) {
|
|
|
386 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)};
|
|
|
387 $results{'MES-SWA_acceptor_ref_seq'} = $seq;
|
|
|
388 $results{'MES-SWA_acceptor_ref_frame'} = $frame;
|
|
|
389 $results{'MES-SWA_acceptor_ref'} = $score;
|
|
|
390 }
|
|
|
391 }
|
|
|
392
|
|
|
393 if (defined($acceptor_alt_context)) {
|
|
|
394 $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context;
|
|
|
395
|
|
|
396 if ($acceptor_alt_context =~ /^[ACGT]+$/) {
|
|
|
397 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)};
|
|
|
398 $results{'MES-SWA_acceptor_alt_seq'} = $seq;
|
|
|
399 $results{'MES-SWA_acceptor_alt_frame'} = $frame;
|
|
|
400 $results{'MES-SWA_acceptor_alt'} = $score;
|
|
|
401
|
|
|
402 if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) {
|
|
|
403
|
|
|
404 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) {
|
|
|
405 # for SNVs, compare to the same frame as the highest scoring ALT k-mer
|
|
|
406 $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23);
|
|
|
407 }
|
|
|
408
|
|
|
409 $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq;
|
|
|
410 $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq);
|
|
|
411
|
|
|
412 $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score;
|
|
|
413 }
|
|
|
414 }
|
|
|
415 }
|
|
|
416
|
|
|
417 return \%results;
|
|
|
418 }
|
|
|
419
|
|
|
420 sub run_NCSS {
|
|
|
421 my ($self, $tva) = @_;
|
|
|
422
|
|
|
423 my $tv = $tva->transcript_variation;
|
|
|
424 my $tr = $tva->transcript;
|
|
|
425
|
|
|
426 my %results;
|
|
|
427
|
|
|
428 if ($tv->intron_number) {
|
|
|
429
|
|
|
430 my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number);
|
|
|
431 my $intron_number = (split(/-/, $intron_numbers))[0];
|
|
|
432
|
|
|
433 my $introns = $tr->get_all_Introns;
|
|
|
434
|
|
|
435 my $intron_idx = $intron_number - 1;
|
|
|
436 my $intron = $introns->[$intron_idx];
|
|
|
437
|
|
|
438 if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) {
|
|
|
439 $results{'MES-NCSS_upstream_donor_seq'} = $seq;
|
|
|
440 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
441 }
|
|
|
442
|
|
|
443 if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) {
|
|
|
444 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
|
|
|
445 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
446 }
|
|
|
447
|
|
|
448 # don't calculate an upstream acceptor score if the intron is the first in the transcript
|
|
|
449 unless ($intron_number == 1) {
|
|
|
450 my $upstream_intron = $introns->[$intron_idx - 1];
|
|
|
451
|
|
|
452 if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) {
|
|
|
453 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
|
|
|
454 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
455 }
|
|
|
456 }
|
|
|
457
|
|
|
458 # don't calculate a downstream donor score if the intron is the last in the transcript
|
|
|
459 unless ($intron_number == $total_introns) {
|
|
|
460 my $downstream_intron = $introns->[$intron_idx + 1];
|
|
|
461
|
|
|
462 if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) {
|
|
|
463 $results{'MES-NCSS_downstream_donor_seq'} = $seq;
|
|
|
464 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
465 }
|
|
|
466 }
|
|
|
467 }
|
|
|
468
|
|
|
469 elsif ($tv->exon_number) {
|
|
|
470
|
|
|
471 my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number);
|
|
|
472 my $exon_number = (split(/-/, $exon_numbers))[0];
|
|
|
473
|
|
|
474 my $exons = $tr->get_all_Exons;
|
|
|
475
|
|
|
476 my $exon_idx = $exon_number - 1;
|
|
|
477 my $exon = $exons->[$exon_idx];
|
|
|
478
|
|
|
479 # don't calculate upstream scores if the exon is the first in the transcript
|
|
|
480 unless ($exon_number == 1) {
|
|
|
481 my $upstream_exon = $exons->[$exon_idx - 1];
|
|
|
482
|
|
|
483 if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) {
|
|
|
484 $results{'MES-NCSS_upstream_donor_seq'} = $seq;
|
|
|
485 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
486 }
|
|
|
487
|
|
|
488 if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) {
|
|
|
489 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq;
|
|
|
490 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
491 }
|
|
|
492 }
|
|
|
493
|
|
|
494 # don't calculate downstream scores if the exon is the last exon in the transcript
|
|
|
495 unless ($exon_number == $total_exons) {
|
|
|
496 my $downstream_exon = $exons->[$exon_idx + 1];
|
|
|
497
|
|
|
498 if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) {
|
|
|
499 $results{'MES-NCSS_downstream_donor_seq'} = $seq;
|
|
|
500 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
501 }
|
|
|
502
|
|
|
503 if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) {
|
|
|
504 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq;
|
|
|
505 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/;
|
|
|
506 }
|
|
|
507 }
|
|
|
508 }
|
|
|
509
|
|
|
510 return \%results;
|
|
|
511 }
|
|
|
512
|
|
|
513
|
|
|
514 ## Sliding window approach methods
|
|
|
515 ##################################
|
|
|
516
|
|
|
517 sub get_max_donor {
|
|
|
518 my ($self, $sequence) = @_;
|
|
|
519
|
|
|
520 my ($seq, $frame, $max);
|
|
|
521 my @kmers = @{$self->sliding_window($sequence, 9)};
|
|
|
522
|
|
|
523 for my $i (0 .. $#kmers) {
|
|
|
524 my $kmer = $kmers[$i];
|
|
|
525 my $score = $self->score5($kmer);
|
|
|
526 if(!$max || $score > $max) {
|
|
|
527 $seq = $kmer;
|
|
|
528 $frame = $i + 1;
|
|
|
529 $max = $score;
|
|
|
530 }
|
|
|
531 }
|
|
|
532 return [$seq, $frame, $max];
|
|
|
533 }
|
|
|
534
|
|
|
535 sub get_max_acceptor {
|
|
|
536 my ($self, $sequence) = @_;
|
|
|
537
|
|
|
538 my ($seq, $frame, $max);
|
|
|
539 my @kmers = @{$self->sliding_window($sequence, 23)};
|
|
|
540
|
|
|
541 for my $i (0 .. $#kmers) {
|
|
|
542 my $kmer = $kmers[$i];
|
|
|
543 my $score = $self->score3($kmer);
|
|
|
544 if(!$max || $score > $max) {
|
|
|
545 $seq = $kmer;
|
|
|
546 $frame = $i + 1;
|
|
|
547 $max = $score;
|
|
|
548 }
|
|
|
549 }
|
|
|
550 return [$seq, $frame, $max];
|
|
|
551 }
|
|
|
552
|
|
|
553 sub sliding_window {
|
|
|
554 my ($self, $sequence, $winsize) = @_;
|
|
|
555 my @seqs;
|
|
|
556 for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) {
|
|
|
557 push @seqs, substr($sequence, $i - 1, $winsize);
|
|
|
558 }
|
|
|
559 return \@seqs;
|
|
|
560 }
|
|
|
561
|
|
|
562
|
|
|
563 ## Nearest canonical splice site methods
|
|
|
564 ########################################
|
|
|
565
|
|
|
566 sub get_donor_seq_from_exon {
|
|
|
567 my ($self, $exon) = @_;
|
|
|
568
|
|
|
569 my ($start, $end);
|
|
|
570
|
|
|
571 if ($exon->strand > 0) {
|
|
|
572 ($start, $end) = ($exon->end - 2, $exon->end + 6);
|
|
|
573 }
|
|
|
574 else {
|
|
|
575 ($start, $end) = ($exon->start - 6, $exon->start + 2);
|
|
|
576 }
|
|
|
577
|
|
|
578 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
|
|
|
579 my $seq = $slice->seq() if defined($slice);
|
|
|
580
|
|
|
581 return $seq;
|
|
|
582 }
|
|
|
583
|
|
|
584 sub get_acceptor_seq_from_exon {
|
|
|
585 my ($self, $exon) = @_;
|
|
|
586
|
|
|
587 my ($start, $end);
|
|
|
588
|
|
|
589 if ($exon->strand > 0) {
|
|
|
590 ($start, $end) = ($exon->start - 20, $exon->start + 2);
|
|
|
591 }
|
|
|
592 else {
|
|
|
593 ($start, $end) = ($exon->end - 2, $exon->end + 20);
|
|
|
594 }
|
|
|
595
|
|
|
596 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand);
|
|
|
597 my $seq = $slice->seq() if defined($slice);
|
|
|
598
|
|
|
599 return $seq;
|
|
|
600 }
|
|
|
601
|
|
|
602 sub get_donor_seq_from_intron {
|
|
|
603 my ($self, $intron) = @_;
|
|
|
604
|
|
|
605 my ($start, $end);
|
|
|
606
|
|
|
607 if ($intron->strand > 0) {
|
|
|
608 ($start, $end) = ($intron->start - 3, $intron->start + 5);
|
|
|
609 }
|
|
|
610 else {
|
|
|
611 ($start, $end) = ($intron->end - 5, $intron->end + 3);
|
|
|
612 }
|
|
|
613
|
|
|
614 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
|
|
|
615 my $seq = $slice->seq() if defined($slice);
|
|
|
616
|
|
|
617 return $seq;
|
|
|
618 }
|
|
|
619
|
|
|
620 sub get_acceptor_seq_from_intron {
|
|
|
621 my ($self, $intron) = @_;
|
|
|
622
|
|
|
623 my ($start, $end);
|
|
|
624
|
|
|
625 if ($intron->strand > 0) {
|
|
|
626 ($start, $end) = ($intron->end - 19, $intron->end + 3);
|
|
|
627 }
|
|
|
628 else {
|
|
|
629 ($start, $end) = ($intron->start - 3, $intron->start + 19);
|
|
|
630 }
|
|
|
631
|
|
|
632 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand);
|
|
|
633 my $seq = $slice->seq() if defined($slice);
|
|
|
634
|
|
|
635 return $seq;
|
|
|
636 }
|
|
|
637
|
|
|
638
|
|
|
639 ## Common methods
|
|
|
640 #################
|
|
|
641
|
|
|
642 sub get_seqs {
|
|
|
643 my ($self, $tva, $start, $end) = @_;
|
|
|
644 my $vf = $tva->variation_feature;
|
|
|
645
|
|
|
646 my $tr_strand = $tva->transcript->strand;
|
|
|
647
|
|
|
648 my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand);
|
|
|
649
|
|
|
650 my ($ref_seq, $alt_seq);
|
|
|
651
|
|
|
652 if (defined $ref_slice) {
|
|
|
653
|
|
|
654 $ref_seq = $alt_seq = $ref_slice->seq();
|
|
|
655
|
|
|
656 my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end};
|
|
|
657 my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : '';
|
|
|
658
|
|
|
659 substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq;
|
|
|
660 }
|
|
|
661
|
|
|
662 return [$ref_seq, $alt_seq];
|
|
|
663 }
|
|
|
664
|
|
|
665 sub score5 {
|
|
|
666 my $self = shift;
|
|
|
667 my $seq = shift;
|
|
|
668 my $hex = md5_hex($seq);
|
|
|
669
|
|
|
670 # check cache
|
|
|
671 if($self->{cache}) {
|
|
|
672 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}};
|
|
|
673
|
|
|
674 return $res->{score} if $res;
|
|
|
675 }
|
|
|
676
|
|
|
677 my $a = $self->score5_scoreconsensus($seq);
|
|
|
678 die("ERROR: No score5_scoreconsensus\n") unless defined($a);
|
|
|
679
|
|
|
680 my $b = $self->score5_getrest($seq);
|
|
|
681 die("ERROR: No score5_getrest\n") unless defined($b);
|
|
|
682
|
|
|
683 my $c = $self->{'score5_seq'}->{$b};
|
|
|
684 die("ERROR: No score5_seq for $b\n") unless defined($c);
|
|
|
685
|
|
|
686 my $d = $self->{'score5_me2x5'}->{$c};
|
|
|
687 die("ERROR: No score5_me2x5 for $c\n") unless defined($d);
|
|
|
688
|
|
|
689 my $score = $self->log2($a * $d);
|
|
|
690
|
|
|
691 # cache it
|
|
|
692 push @{$self->{cache}->{score5}}, { hex => $hex, score => $score };
|
|
|
693 shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE;
|
|
|
694
|
|
|
695 return $score;
|
|
|
696 }
|
|
|
697
|
|
|
698 sub score3 {
|
|
|
699 my $self = shift;
|
|
|
700 my $seq = shift;
|
|
|
701 my $hex = md5_hex($seq);
|
|
|
702
|
|
|
703 # check cache
|
|
|
704 if($self->{cache}) {
|
|
|
705 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}};
|
|
|
706
|
|
|
707 return $res->{score} if $res;
|
|
|
708 }
|
|
|
709
|
|
|
710 my $a = $self->score3_scoreconsensus($seq);
|
|
|
711 die("ERROR: No score3_scoreconsensus\n") unless defined($a);
|
|
|
712
|
|
|
713 my $b = $self->score3_getrest($seq);
|
|
|
714 die("ERROR: No score3_getrest\n") unless defined($b);
|
|
|
715
|
|
|
716 my $c = $self->score3_maxentscore($b, $self->{'score3_metables'});
|
|
|
717 die("ERROR: No score3_maxentscore for $b\n") unless defined($c);
|
|
|
718
|
|
|
719 my $score = $self->log2($a * $c);
|
|
|
720
|
|
|
721 # cache it
|
|
|
722 push @{$self->{cache}->{score3}}, { hex => $hex, score => $score };
|
|
|
723 shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE;
|
|
|
724
|
|
|
725 return $score;
|
|
|
726 }
|
|
|
727
|
|
|
728
|
|
|
729 ## methods copied from score5.pl
|
|
|
730 ################################
|
|
|
731
|
|
|
732 sub score5_makesequencematrix {
|
|
|
733 my $self = shift;
|
|
|
734 my $file = shift;
|
|
|
735 my %matrix;
|
|
|
736 my $n=0;
|
|
|
737 open(SCOREF, $file) || die "Can't open $file!\n";
|
|
|
738 while(<SCOREF>) {
|
|
|
739 chomp;
|
|
|
740 $_=~ s/\s//;
|
|
|
741 $matrix{$_} = $n;
|
|
|
742 $n++;
|
|
|
743 }
|
|
|
744 close(SCOREF);
|
|
|
745 return \%matrix;
|
|
|
746 }
|
|
|
747
|
|
|
748 sub score5_makescorematrix {
|
|
|
749 my $self = shift;
|
|
|
750 my $file = shift;
|
|
|
751 my %matrix;
|
|
|
752 my $n=0;
|
|
|
753 open(SCOREF, $file) || die "Can't open $file!\n";
|
|
|
754 while(<SCOREF>) {
|
|
|
755 chomp;
|
|
|
756 $_=~ s/\s//;
|
|
|
757 $matrix{$n} = $_;
|
|
|
758 $n++;
|
|
|
759 }
|
|
|
760 close(SCOREF);
|
|
|
761 return \%matrix;
|
|
|
762 }
|
|
|
763
|
|
|
764 sub score5_getrest {
|
|
|
765 my $self = shift;
|
|
|
766 my $seq = shift;
|
|
|
767 my @seqa = split(//,uc($seq));
|
|
|
768 return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
|
|
|
769 }
|
|
|
770
|
|
|
771 sub score5_scoreconsensus {
|
|
|
772 my $self = shift;
|
|
|
773 my $seq = shift;
|
|
|
774 my @seqa = split(//,uc($seq));
|
|
|
775 my %bgd;
|
|
|
776 $bgd{'A'} = 0.27;
|
|
|
777 $bgd{'C'} = 0.23;
|
|
|
778 $bgd{'G'} = 0.23;
|
|
|
779 $bgd{'T'} = 0.27;
|
|
|
780 my %cons1;
|
|
|
781 $cons1{'A'} = 0.004;
|
|
|
782 $cons1{'C'} = 0.0032;
|
|
|
783 $cons1{'G'} = 0.9896;
|
|
|
784 $cons1{'T'} = 0.0032;
|
|
|
785 my %cons2;
|
|
|
786 $cons2{'A'} = 0.0034;
|
|
|
787 $cons2{'C'} = 0.0039;
|
|
|
788 $cons2{'G'} = 0.0042;
|
|
|
789 $cons2{'T'} = 0.9884;
|
|
|
790 my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]});
|
|
|
791 return $addscore;
|
|
|
792 }
|
|
|
793
|
|
|
794 sub log2 {
|
|
|
795 my ($self, $val) = @_;
|
|
|
796 return log($val)/log(2);
|
|
|
797 }
|
|
|
798
|
|
|
799
|
|
|
800 ## methods copied from score3.pl
|
|
|
801 ################################
|
|
|
802
|
|
|
803 sub score3_hashseq {
|
|
|
804 #returns hash of sequence in base 4
|
|
|
805 # $self->score3_hashseq('CAGAAGT') returns 4619
|
|
|
806 my $self = shift;
|
|
|
807 my $seq = shift;
|
|
|
808 $seq = uc($seq);
|
|
|
809 $seq =~ tr/ACGT/0123/;
|
|
|
810 my @seqa = split(//,$seq);
|
|
|
811 my $sum = 0;
|
|
|
812 my $len = length($seq);
|
|
|
813 my @four = (1,4,16,64,256,1024,4096,16384);
|
|
|
814 my $i=0;
|
|
|
815 while ($i<$len) {
|
|
|
816 $sum+= $seqa[$i] * $four[$len - $i -1] ;
|
|
|
817 $i++;
|
|
|
818 }
|
|
|
819 return $sum;
|
|
|
820 }
|
|
|
821
|
|
|
822 sub score3_makemaxentscores {
|
|
|
823 my $self = shift;
|
|
|
824 my $dir = $self->{'_dir'}."/splicemodels/";
|
|
|
825 my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
|
|
|
826 'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
|
|
|
827 my @metables;
|
|
|
828 my $num = 0 ;
|
|
|
829 foreach my $file (@list) {
|
|
|
830 my $n = 0;
|
|
|
831 open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
|
|
|
832 while(<SCOREF>) {
|
|
|
833 chomp;
|
|
|
834 $_=~ s/\s//;
|
|
|
835 $metables[$num]{$n} = $_;
|
|
|
836 $n++;
|
|
|
837 }
|
|
|
838 close(SCOREF);
|
|
|
839 #print STDERR $file."\t".$num."\t".$n."\n";
|
|
|
840 $num++;
|
|
|
841 }
|
|
|
842 return \@metables;
|
|
|
843 }
|
|
|
844
|
|
|
845 sub score3_maxentscore {
|
|
|
846 my $self = shift;
|
|
|
847 my $seq = shift;
|
|
|
848 my $table_ref = shift;
|
|
|
849 my @metables = @$table_ref;
|
|
|
850 my @sc;
|
|
|
851 $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))};
|
|
|
852 $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))};
|
|
|
853 $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))};
|
|
|
854 $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))};
|
|
|
855 $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))};
|
|
|
856 $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))};
|
|
|
857 $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))};
|
|
|
858 $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))};
|
|
|
859 $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))};
|
|
|
860 my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
|
|
|
861 return $finalscore;
|
|
|
862 }
|
|
|
863
|
|
|
864 sub score3_getrest {
|
|
|
865 my $self = shift;
|
|
|
866 my $seq = shift;
|
|
|
867 my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
|
|
|
868 return $seq_noconsensus;
|
|
|
869 }
|
|
|
870
|
|
|
871 sub score3_scoreconsensus {
|
|
|
872 my $self = shift;
|
|
|
873 my $seq = shift;
|
|
|
874 my @seqa = split(//,uc($seq));
|
|
|
875 my %bgd;
|
|
|
876 $bgd{'A'} = 0.27;
|
|
|
877 $bgd{'C'} = 0.23;
|
|
|
878 $bgd{'G'} = 0.23;
|
|
|
879 $bgd{'T'} = 0.27;
|
|
|
880 my %cons1;
|
|
|
881 $cons1{'A'} = 0.9903;
|
|
|
882 $cons1{'C'} = 0.0032;
|
|
|
883 $cons1{'G'} = 0.0034;
|
|
|
884 $cons1{'T'} = 0.0030;
|
|
|
885 my %cons2;
|
|
|
886 $cons2{'A'} = 0.0027;
|
|
|
887 $cons2{'C'} = 0.0037;
|
|
|
888 $cons2{'G'} = 0.9905;
|
|
|
889 $cons2{'T'} = 0.0030;
|
|
|
890 my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]});
|
|
|
891 return $addscore;
|
|
|
892 }
|
|
|
893
|
|
|
894 1;
|
|
|
895
|